1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
|
from collections.abc import Iterator
from datetime import (
    date,
    datetime,
    time,
)

from selectolax.parser import HTMLParser

from cip_paris_client.schemas import Session
def is_released_next_year(
    current_month: int,
    target_month: int,
    month_next_year_max: int = 9,
    month_next_year_min: int = 3,
) -> bool:
    """Tell whether a target month should be dated in the following year.

    The answer is ``True`` only when ``current_month`` is at or past
    ``month_next_year_max`` and ``target_month`` is at or before
    ``month_next_year_min``; with the defaults, that means the current
    month is September or later and the target month is March or earlier.

    Args:
        current_month: Month number used as the reference point.
        target_month: Month number being classified.
        month_next_year_max: Lowest ``current_month`` for which a
            roll-over to next year is considered.
        month_next_year_min: Highest ``target_month`` that is treated as
            belonging to next year.

    Returns:
        ``True`` if both conditions hold, ``False`` otherwise.
    """
    # Too early in the year for any target month to wrap around.
    if current_month < month_next_year_max:
        return False
    return target_month <= month_next_year_min
def parse_date(date_str: str) -> date:
    """Parse a session date from the CIP webpage format.

    The second whitespace-separated token of ``date_str`` is read as
    ``day/month`` (two integers separated by ``/``); the first token is
    ignored. The string carries no year, so the current year is used,
    incremented by one when ``is_released_next_year`` reports that the
    parsed month belongs to next year relative to the current month.

    Args:
        date_str: Text whose second token has the form ``DD/MM``.

    Returns:
        The resolved calendar date.

    Raises:
        IndexError: If ``date_str`` has fewer than two tokens.
        ValueError: If the token is not exactly two ``/``-separated
            integers, or the values do not form a valid date.
    """
    day, month = map(int, date_str.split()[1].split("/"))
    # Read the clock once so the year and the month come from the same
    # instant; two separate now() calls could straddle a month/year change.
    now = datetime.now()
    year = now.year
    if is_released_next_year(now.month, month):
        year += 1
    return date(year=year, month=month, day=day)
def parse_time(time_str: str) -> time:
    """Convert an ``HH:MM`` string from the CIP webpage into a ``time``.

    Args:
        time_str: Two integers separated by a single ``:``.

    Returns:
        A ``time`` with the parsed hour and minute and seconds set to 0.

    Raises:
        ValueError: If ``time_str`` does not split into exactly two
            integer fields, or the values are out of range for ``time``.
    """
    fields = [int(piece) for piece in time_str.split(":")]
    # Unpacking enforces that exactly two fields were present.
    hour, minute = fields
    return time(hour=hour, minute=minute, second=0)
def parse_sessions(
    html: bytes,
    movie_container_query: str = "div.movie-results-container",
    movie_name_query: str = "div.desc h3",
    sessions_container_query: str = "div.session-date > div.item",
    session_url_selector: str = "a",
    session_date_selector: str = "div.sessionDate",
    session_time_selector: str = "div p.time",
) -> Iterator[Session]:
    """Lazily parse movie sessions from an HTML webpage.

    This is a generator: the document is parsed and sessions are produced
    only as the result is iterated. Wrap the call in ``list(...)`` to get
    them all at once.

    Args:
        html: Raw HTML of the page.
        movie_container_query: CSS query matching one container per movie.
        movie_name_query: CSS query, relative to a movie container, for
            the node whose text is the movie name.
        sessions_container_query: CSS query, relative to a movie
            container, matching one node per session.
        session_url_selector: CSS query, relative to a session node, for
            the element whose ``href`` and ``title`` attributes supply the
            session URL and the cinema name.
        session_date_selector: CSS query, relative to a session node, for
            the node whose text is passed to ``parse_date``.
        session_time_selector: CSS query, relative to a session node, for
            the node whose text is passed to ``parse_time``.

    Yields:
        One ``Session`` per matched session node. ``url`` and ``cinema``
        fall back to ``""`` when the attribute is missing or empty.

    NOTE(review): ``css_first`` presumably returns ``None`` when a selector
    matches nothing, in which case the ``.text()`` / ``.attributes`` access
    below raises ``AttributeError`` -- confirm against the selectolax docs.
    """
    for movie_tree in HTMLParser(html).css(movie_container_query):
        # The movie name is shared by every session in this container.
        movie_name = movie_tree.css_first(movie_name_query).text()
        for session_tree in movie_tree.css(sessions_container_query):
            # Extract attributes first, conversion is done later.
            attributes = session_tree.css_first(
                session_url_selector
            ).attributes
            date_str: str = session_tree.css_first(
                session_date_selector
            ).text()
            time_str: str = session_tree.css_first(
                session_time_selector
            ).text()
            yield Session(
                # `or ""` covers both a missing attribute and a None value.
                url=attributes.get("href") or "",
                cinema=attributes.get("title") or "",
                movie=movie_name,
                date=parse_date(date_str),
                time=parse_time(time_str),
            )
|