Syntrex committed on
Commit
38cb69d
·
verified ·
1 Parent(s): cfd6eaf

Update data/schedule.py

Browse files
Files changed (1) hide show
  1. data/schedule.py +227 -15
data/schedule.py CHANGED
@@ -1,22 +1,234 @@
1
  from __future__ import annotations
2
 
 
3
  from datetime import datetime
 
 
4
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
 
# NOTE: this is the pre-commit implementation (the lines the diff marks as removed).
def fetch_schedule_for_date(date_str: str) -> pd.DataFrame:
    """Return a one-row placeholder schedule frame for ``date_str``.

    No data is fetched: the single row carries ``date_str`` as ``game_date``,
    the status ``"loaded"``, empty team/venue/game-id strings and ``None``
    scores.

    Args:
        date_str: Date label copied verbatim into the ``game_date`` column.

    Returns:
        A DataFrame with exactly one row and the columns ``fetched_at``,
        ``game_id``, ``game_date``, ``status``, ``away_team``, ``home_team``,
        ``away_score``, ``home_score`` and ``venue``.
    """
    from datetime import timezone

    # Naive UTC timestamp, equivalent to the deprecated ``datetime.utcnow()``.
    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None)

    placeholder = {
        "fetched_at": fetched_at,
        "game_id": "",
        "game_date": date_str,
        "status": "loaded",
        "away_team": "",
        "home_team": "",
        "away_score": None,
        "home_score": None,
        "venue": "",
    }
    return pd.DataFrame([placeholder])
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
+ import re
4
  from datetime import datetime
5
+ from typing import Any
6
+
7
  import pandas as pd
8
+ import requests
9
+
# Schedule page URL; ``{date_str}`` is filled in by ``fetch_schedule_for_date``.
SCHEDULE_URL_TEMPLATE = "https://www.mlb.com/world-baseball-classic/schedule/{date_str}"

# HTTP headers sent with the schedule request.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "en-US,en;q=0.9",
}

# Three-letter team abbreviation -> display name.
TEAM_MAP = {
    "AUS": "Australia",
    "BRA": "Brazil",
    "CAN": "Canada",
    "CHN": "China",
    "TPE": "Chinese Taipei",
    "COL": "Colombia",
    "CUB": "Cuba",
    "CZE": "Czechia",
    "DOM": "Dominican Republic",
    "GBR": "Great Britain",
    "ISR": "Israel",
    "ITA": "Italy",
    "JPN": "Japan",
    "KOR": "Korea",
    "MEX": "Mexico",
    "NED": "Netherlands",
    "NCA": "Nicaragua",
    "PAN": "Panama",
    "PUR": "Puerto Rico",
    "USA": "United States",
    "VEN": "Venezuela",
}

# Tokens recognised as a broadcaster label following a game's status token.
TV_MARKERS = {"FS1", "FS2", "FOX", "Tubi"}

# Text lines beginning with any of these are dropped when the page is reduced
# to text (presumably site navigation and footer labels -- verify against the
# live page).
IGNORE_PREFIXES = (
    "Tickets",
    "Schedule",
    "Scores",
    "Stats",
    "Standings",
    "Bracket",
    "Teams",
    "Watch",
    "News",
    "Venues",
    "Experiences",
    "History",
    "Shop",
    "MLB.com",
    "Lang",
    "Official Info",
    "About MLB",
    "Team Information",
    "Official Rules",
    "Replay Review Regulations",
    "Umpires",
    "Advertise with Us",
    "Press Releases",
    "Accessibility Information",
    "Help/Contact Us",
    "MLB App FAQs",
    "MLB.TV Help Center",
    "Shop Help",
    "Careers Home",
    "Terms of Use",
    "Privacy Policy",
    "Legal Notices",
)

# A bare three-letter uppercase team abbreviation, e.g. "USA".
ABBR_RE = re.compile(r"^[A-Z]{3}$")
# A final-score line, e.g. "USA 5, JPN 3".
FINAL_RE = re.compile(r"^([A-Z]{3})\s+(\d+),\s+([A-Z]{3})\s+(\d+)$")
# A start time in Eastern time, e.g. "7:00 PM ET".
TIME_RE = re.compile(r"^\d{1,2}:\d{2}\s+[AP]M\s+ET$")
# A weekday name on its own line (case-insensitive).
DATE_HEADING_RE = re.compile(r"^(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)$", re.I)
82
+
83
+
def _strip_html_to_lines(html: str) -> list[str]:
    """Reduce an HTML document to its non-empty text lines.

    ``<script>`` and ``<style>`` elements are removed together with their
    contents, every remaining tag becomes a line break, and character
    references are decoded. Each resulting line is stripped; lines that are
    empty or start with ``"Image:"``, ``"calendar-"``, ``"schedule-tickets-"``,
    ``"search-"`` or any entry of ``IGNORE_PREFIXES`` are dropped, and runs of
    identical consecutive lines are collapsed to one.

    Args:
        html: Raw HTML text. An empty value yields an empty list.

    Returns:
        The surviving text lines, in document order.
    """
    # Imported locally: the ``html`` parameter shadows the module name here.
    from html import unescape

    if not html:
        return []

    text = re.sub(r"<script.*?</script>", " ", html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<style.*?</style>", " ", text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<[^>]+>", "\n", text)
    # Decode entities only after tags are gone, so an escaped "&lt;b&gt;" stays
    # literal text; non-breaking spaces become plain spaces for the regexes.
    text = unescape(text).replace("\xa0", " ")

    skip_prefixes = (
        "Image:",
        "calendar-",
        "schedule-tickets-",
        "search-",
        *IGNORE_PREFIXES,
    )

    kept: list[str] = []
    for raw_line in re.split(r"[\r\n]+", text):
        line = raw_line.strip()
        if not line or line.startswith(skip_prefixes):
            continue
        # Collapse consecutive duplicates among the lines that are kept.
        if kept and kept[-1] == line:
            continue
        kept.append(line)

    return kept
113
+
114
+
def _full_team(abbr: str) -> str:
    """Return the ``TEAM_MAP`` display name for ``abbr``, or ``abbr`` itself if unmapped."""
    return TEAM_MAP.get(abbr, abbr)
117
+
118
+
def _parse_schedule_lines(lines: list[str], date_str: str) -> pd.DataFrame:
    """Build a games table from the text lines of a schedule page.

    A game is recognised as the token sequence::

        AWAY [AWAY ...] "@" HOME [HOME ...] [status] [tv]

    where ``AWAY``/``HOME`` match ``ABBR_RE`` and repeated copies of the same
    abbreviation are skipped. The optional status token is one of:

    * a final score matching ``FINAL_RE`` that names both teams -> ``"Final"``
      with the scores assigned to the correct side;
    * ``"LIVE"`` -> ``"Live"``;
    * a time matching ``TIME_RE`` -> ``"Scheduled"`` and ``start_time_et``;
    * anything starting with ``"Preview"`` -> ``"Preview"``.

    If none applies, ``status`` is the empty string. A following token found in
    ``TV_MARKERS`` is stored in ``tv``.

    Args:
        lines: Text lines in page order.
        date_str: Date label stored in ``game_date`` and used in ``game_id``.

    Returns:
        One row per recognised game. The column set is always present, even
        when no game was found.
    """
    from datetime import timezone

    columns = [
        "fetched_at",
        "game_id",
        "game_date",
        "status",
        "away_team",
        "home_team",
        "away_score",
        "home_score",
        "venue",
        "tv",
        "start_time_et",
    ]

    # One naive-UTC timestamp shared by every row of this parse; equivalent to
    # the deprecated ``datetime.utcnow()``.
    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None)

    rows: list[dict[str, Any]] = []
    total = len(lines)
    i = 0

    while i < total:
        away_abbr = lines[i]
        if not ABBR_RE.match(away_abbr):
            i += 1
            continue

        # Skip repeated copies of the away abbreviation.
        j = i + 1
        while j < total and lines[j] == away_abbr:
            j += 1

        # Not a matchup unless "@" and a home abbreviation follow; resume the
        # scan at the very next line so nothing is skipped.
        if j >= total or lines[j] != "@":
            i += 1
            continue

        j += 1
        if j >= total or not ABBR_RE.match(lines[j]):
            i += 1
            continue

        home_abbr = lines[j]
        j += 1

        # Skip repeated copies of the home abbreviation.
        while j < total and lines[j] == home_abbr:
            j += 1

        status = ""
        away_score = None
        home_score = None
        game_time = ""
        tv = ""

        if j < total:
            token = lines[j]

            final_match = FINAL_RE.match(token)
            if final_match:
                a1, s1, a2, s2 = final_match.groups()
                # Accept the score only when it names exactly these two teams;
                # the score line may list them in either order.
                if {a1, a2} == {away_abbr, home_abbr}:
                    status = "Final"
                    if a1 == away_abbr:
                        away_score, home_score = int(s1), int(s2)
                    else:
                        away_score, home_score = int(s2), int(s1)
                    j += 1
            elif token == "LIVE":
                status = "Live"
                j += 1
            elif TIME_RE.match(token):
                status = "Scheduled"
                game_time = token
                j += 1
            elif token.startswith("Preview"):
                status = "Preview"
                j += 1

        if j < total and lines[j] in TV_MARKERS:
            tv = lines[j]
            j += 1

        rows.append(
            {
                "fetched_at": fetched_at,
                "game_id": f"{date_str}:{away_abbr}:{home_abbr}",
                "game_date": date_str,
                "status": status,
                "away_team": _full_team(away_abbr),
                "home_team": _full_team(home_abbr),
                "away_score": away_score,
                "home_score": home_score,
                "venue": "",
                "tv": tv,
                "start_time_et": game_time,
            }
        )

        # Continue after everything consumed for this game.
        i = j

    return pd.DataFrame(rows, columns=columns)
207
 
208
 
def fetch_schedule_for_date(date_str: str) -> pd.DataFrame:
    """Download the schedule page for ``date_str`` and return its games.

    The page at ``SCHEDULE_URL_TEMPLATE`` is requested with ``HEADERS`` and a
    30-second timeout, reduced to text lines and parsed into one row per game.

    Args:
        date_str: Date label substituted into the URL and stored in the
            ``game_date`` column.

    Returns:
        Games sorted by ``game_date``, ``away_team`` and ``home_team`` with a
        fresh 0-based index, or an empty frame carrying the same columns when
        no game was recognised.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    columns = [
        "fetched_at",
        "game_id",
        "game_date",
        "status",
        "away_team",
        "home_team",
        "away_score",
        "home_score",
        "venue",
        "tv",
        "start_time_et",
    ]

    url = SCHEDULE_URL_TEMPLATE.format(date_str=date_str)
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()

    lines = _strip_html_to_lines(response.text)
    df = _parse_schedule_lines(lines, date_str=date_str)

    # Return the full schema even when nothing was parsed, so callers can rely
    # on the columns being present.
    if df.empty:
        return pd.DataFrame(columns=columns)

    return df.sort_values(["game_date", "away_team", "home_team"]).reset_index(drop=True)