Syntrex committed on
Commit
e7bfd11
·
verified ·
1 Parent(s): d9ede80

Update data/schedule.py

Browse files
Files changed (1) hide show
  1. data/schedule.py +129 -4
data/schedule.py CHANGED
@@ -7,7 +7,8 @@ from typing import Any
7
  import pandas as pd
8
  import requests
9
 
10
- SCHEDULE_URL_TEMPLATE = "https://www.mlb.com/world-baseball-classic/schedule/{date_str}"
 
11
 
12
  HEADERS = {
13
  "User-Agent": "Mozilla/5.0",
@@ -38,12 +39,27 @@ TEAM_MAP = {
38
  "VEN": "Venezuela",
39
  }
40
 
 
 
 
 
 
 
 
 
41
  TV_MARKERS = {"FS1", "FS2", "FOX", "Tubi"}
42
  TIME_RE = re.compile(r"^\d{1,2}:\d{2}\s+[AP]M\s+ET$")
43
  ABBR_RE = re.compile(r"^[A-Z]{3}$")
44
  GAME_PK_RE = re.compile(r"/gameday/(\d+)")
45
 
46
 
 
 
 
 
 
 
 
47
  def _strip_html_to_lines(html: str) -> list[str]:
48
  text = re.sub(r"<script.*?</script>", " ", html, flags=re.DOTALL | re.IGNORECASE)
49
  text = re.sub(r"<style.*?</style>", " ", text, flags=re.DOTALL | re.IGNORECASE)
@@ -83,8 +99,8 @@ def _extract_game_pks(html: str) -> list[str]:
83
  return seen
84
 
85
 
86
- def fetch_schedule_for_date(date_str: str) -> pd.DataFrame:
87
- url = SCHEDULE_URL_TEMPLATE.format(date_str=date_str)
88
  response = requests.get(url, headers=HEADERS, timeout=30)
89
  response.raise_for_status()
90
 
@@ -168,6 +184,7 @@ def fetch_schedule_for_date(date_str: str) -> pd.DataFrame:
168
  "venue": "",
169
  "tv": tv,
170
  "start_time_et": start_time_et,
 
171
  }
172
  )
173
 
@@ -176,4 +193,112 @@ def fetch_schedule_for_date(date_str: str) -> pd.DataFrame:
176
 
177
  i += 1
178
 
179
- return pd.DataFrame(rows)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  import pandas as pd
8
  import requests
9
 
10
+ WBC_SCHEDULE_URL_TEMPLATE = "https://www.mlb.com/world-baseball-classic/schedule/{date_str}"
11
+ SCHEDULE_API_URL = "https://statsapi.mlb.com/api/v1/schedule"
12
 
13
  HEADERS = {
14
  "User-Agent": "Mozilla/5.0",
 
39
  "VEN": "Venezuela",
40
  }
41
 
42
+ TEAM_NORMALIZATION = {
43
+ "Chinese Taipei": "Chinese Taipei",
44
+ "Czech Republic": "Czechia",
45
+ "South Korea": "Korea",
46
+ "USA": "United States",
47
+ "U.S.A.": "United States",
48
+ }
49
+
50
  TV_MARKERS = {"FS1", "FS2", "FOX", "Tubi"}
51
  TIME_RE = re.compile(r"^\d{1,2}:\d{2}\s+[AP]M\s+ET$")
52
  ABBR_RE = re.compile(r"^[A-Z]{3}$")
53
  GAME_PK_RE = re.compile(r"/gameday/(\d+)")
54
 
55
 
56
def _normalize_team_name(name: Any) -> str:
    """Return the canonical spelling of a team name.

    Falsy input (``None``, ``""``, ``0``) and whitespace-only strings yield
    ``""``. Any other value is stringified, stripped of surrounding
    whitespace and looked up in ``TEAM_NORMALIZATION``; names without an
    entry there are returned stripped but otherwise unchanged.
    """
    cleaned = ("" if not name else str(name)).strip()
    return TEAM_NORMALIZATION.get(cleaned, cleaned) if cleaned else ""
61
+
62
+
63
  def _strip_html_to_lines(html: str) -> list[str]:
64
  text = re.sub(r"<script.*?</script>", " ", html, flags=re.DOTALL | re.IGNORECASE)
65
  text = re.sub(r"<style.*?</style>", " ", text, flags=re.DOTALL | re.IGNORECASE)
 
99
  return seen
100
 
101
 
102
+ def _fetch_wbc_schedule_for_date(date_str: str) -> pd.DataFrame:
103
+ url = WBC_SCHEDULE_URL_TEMPLATE.format(date_str=date_str)
104
  response = requests.get(url, headers=HEADERS, timeout=30)
105
  response.raise_for_status()
106
 
 
184
  "venue": "",
185
  "tv": tv,
186
  "start_time_et": start_time_et,
187
+ "sport_id": 51,
188
  }
189
  )
190
 
 
193
 
194
  i += 1
195
 
196
+ return pd.DataFrame(rows)
197
+
198
+
199
def _fetch_mlb_schedule_for_date(date_str: str) -> pd.DataFrame:
    """Fetch the MLB (``sportId=1``) schedule for one date from the Stats API.

    Args:
        date_str: Date string sent verbatim as the ``date`` query parameter
            and copied into each row's ``game_date`` and ``game_id``.

    Returns:
        A DataFrame with one row per game for which both team names are
        present. Score, hit and error columns are always ``None`` and
        ``venue`` is always ``""``; this function only fills in the
        schedule-level fields. An empty DataFrame is returned when the
        payload contains no usable games.

    Raises:
        requests.HTTPError: If the API answers with an error status
            (via ``raise_for_status``). Other ``requests`` errors and JSON
            decoding errors propagate unchanged.
    """
    params = {
        "sportId": 1,
        "date": date_str,
        "hydrate": "broadcasts",
    }

    response = requests.get(SCHEDULE_API_URL, headers=HEADERS, params=params, timeout=30)
    response.raise_for_status()
    payload = response.json()

    # abstractGameState (lower-cased) -> status label used in the rows.
    # Any other state falls back to the API's detailedState text.
    status_labels = {"live": "Live", "final": "Final", "preview": "Scheduled"}

    rows: list[dict[str, Any]] = []

    for date_block in payload.get("dates", []) or []:
        for game in date_block.get("games", []) or []:
            teams = game.get("teams", {}) or {}
            away = teams.get("away", {}) or {}
            home = teams.get("home", {}) or {}

            away_team = _normalize_team_name((away.get("team", {}) or {}).get("name"))
            home_team = _normalize_team_name((home.get("team", {}) or {}).get("name"))

            # A game without both participants cannot form a usable row.
            if not (away_team and home_team):
                continue

            status_info = game.get("status", {}) or {}
            detailed_state = str(status_info.get("detailedState", "") or "").strip()
            abstract_state = str(status_info.get("abstractGameState", "") or "").strip().lower()
            status = status_labels.get(abstract_state, detailed_state)

            start_time_et = ""
            game_datetime = game.get("gameDate", "")
            if game_datetime:
                try:
                    ts = pd.to_datetime(game_datetime, utc=True).tz_convert("America/New_York")
                    # "%-I" is a glibc-only extension and raises ValueError on
                    # other platforms (e.g. Windows), which previously blanked
                    # every start time. "%I" is portable; drop its zero pad.
                    start_time_et = ts.strftime("%I:%M %p ET").lstrip("0")
                except Exception:
                    # Deliberately best-effort: an unparseable timestamp must
                    # not discard the whole game row.
                    start_time_et = ""

            # Unique, non-empty broadcast names in first-seen order.
            broadcast_names = (
                str(b.get("name") or "").strip()
                for b in (game.get("broadcasts", []) or [])
            )
            tv = ", ".join(dict.fromkeys(n for n in broadcast_names if n))

            game_pk = game.get("gamePk")

            rows.append(
                {
                    # NOTE(review): naive UTC timestamp kept on purpose so the
                    # column type is unchanged for existing consumers; confirm
                    # before switching to a timezone-aware value.
                    "fetched_at": datetime.utcnow(),
                    "game_id": f"{date_str}:{away_team}:{home_team}",
                    "game_date": date_str,
                    "game_pk": str(game_pk) if game_pk is not None else "",
                    "status": status,
                    "away_team": away_team,
                    "home_team": home_team,
                    "away_score": None,
                    "home_score": None,
                    "away_hits": None,
                    "home_hits": None,
                    "away_errors": None,
                    "home_errors": None,
                    "venue": "",
                    "tv": tv,
                    "start_time_et": start_time_et,
                    "sport_id": 1,
                }
            )

    return pd.DataFrame(rows)
280
+
281
+
282
def fetch_schedule_for_date(date_str: str) -> pd.DataFrame:
    """Return the combined WBC and MLB schedule for one date.

    Each source is fetched independently and on a best-effort basis: if one
    fetch fails, the failure is logged as a warning (with traceback) and the
    other source's rows are still returned.

    Args:
        date_str: Date string forwarded unchanged to both source fetchers.

    Returns:
        The non-empty source frames concatenated (WBC first, then MLB),
        de-duplicated on ``game_pk`` / ``away_team`` / ``home_team`` keeping
        the last occurrence, with a fresh 0..n-1 index. A column-less empty
        DataFrame is returned when neither source produced rows.
    """
    import logging  # local import keeps this block self-contained

    log = logging.getLogger(__name__)

    sources = (
        ("WBC", _fetch_wbc_schedule_for_date),
        ("MLB", _fetch_mlb_schedule_for_date),
    )

    parts: list[pd.DataFrame] = []
    for label, fetch in sources:
        try:
            frame = fetch(date_str)
            if frame is not None and not frame.empty:
                parts.append(frame)
        except Exception:
            # Still best-effort, but no longer silent: without this an outage
            # is indistinguishable from "no games scheduled".
            log.warning(
                "%s schedule fetch failed for %s", label, date_str, exc_info=True
            )

    if not parts:
        return pd.DataFrame()

    combined = pd.concat(parts, ignore_index=True)
    # keep="last" means a later source (MLB) wins over an earlier one (WBC)
    # when both report the same game_pk and team pair.
    combined = combined.drop_duplicates(
        subset=["game_pk", "away_team", "home_team"], keep="last"
    )
    return combined.reset_index(drop=True)