"""Fetch a single day's baseball schedule as a pandas DataFrame.

Two sources are implemented: the MLB Stats API schedule endpoint (JSON) and
the mlb.com World Baseball Classic schedule page (HTML, scraped as text).
Both produce rows with the same columns; see ``_make_row``.
"""

from __future__ import annotations

import re
from datetime import datetime, timezone
from itertools import groupby
from typing import Any

import pandas as pd
import requests

from utils.logger import logger

WBC_SCHEDULE_URL_TEMPLATE = "https://www.mlb.com/world-baseball-classic/schedule/{date_str}"
SCHEDULE_API_URL = "https://statsapi.mlb.com/api/v1/schedule"

HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "en-US,en;q=0.9",
}

# Three-letter abbreviations shown on the WBC schedule page -> display name.
TEAM_MAP = {
    "AUS": "Australia",
    "BRA": "Brazil",
    "CAN": "Canada",
    "CHN": "China",
    "TPE": "Chinese Taipei",
    "COL": "Colombia",
    "CUB": "Cuba",
    "CZE": "Czechia",
    "DOM": "Dominican Republic",
    "GBR": "Great Britain",
    "ISR": "Israel",
    "ITA": "Italy",
    "JPN": "Japan",
    "KOR": "Korea",
    "MEX": "Mexico",
    "NED": "Netherlands",
    "NCA": "Nicaragua",
    "PAN": "Panama",
    "PUR": "Puerto Rico",
    "USA": "United States",
    "VEN": "Venezuela",
}

# Alternate spellings -> the display name used in TEAM_MAP.
TEAM_NORMALIZATION = {
    "Chinese Taipei": "Chinese Taipei",
    "Czech Republic": "Czechia",
    "South Korea": "Korea",
    "USA": "United States",
    "U.S.A.": "United States",
}

TV_MARKERS = {"FS1", "FS2", "FOX", "Tubi"}

TIME_RE = re.compile(r"^\d{1,2}:\d{2}\s+[AP]M\s+ET$")
ABBR_RE = re.compile(r"^[A-Z]{3}$")
GAME_PK_RE = re.compile(r"/gameday/(\d+)")

# Whole <script>/<style> elements are removed before tags are stripped so
# their contents never show up as text lines.
_SCRIPT_RE = re.compile(r"<script\b[^>]*>.*?</script\s*>", re.DOTALL | re.IGNORECASE)
_STYLE_RE = re.compile(r"<style\b[^>]*>.*?</style\s*>", re.DOTALL | re.IGNORECASE)

# Text lines that are page chrome rather than schedule content.
_NOISE_PREFIXES = ("Image:", "calendar-", "schedule-tickets-", "search-")

# abstractGameState (lower-cased) -> status string stored in the row.
_ABSTRACT_STATE_TO_STATUS = {
    "live": "Live",
    "final": "Final",
    "preview": "Scheduled",
}


def _utc_now_naive() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Produces the same kind of value as the deprecated ``datetime.utcnow()``,
    so the ``fetched_at`` column stays timezone-naive.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


def _make_row(
    *,
    date_str: str,
    game_id: str,
    game_pk: str,
    status: str,
    away_team: str,
    home_team: str,
    venue: str,
    game_datetime_utc: str,
    tv: str,
    start_time_et: str,
    sport_id: int,
) -> dict[str, Any]:
    """Build one schedule row; key order defines the DataFrame column order.

    Score, hit and error fields are always ``None`` here; neither source in
    this module fills them in.
    """
    return {
        "fetched_at": _utc_now_naive(),
        "game_id": game_id,
        "game_date": date_str,
        "game_pk": game_pk,
        "status": status,
        "away_team": away_team,
        "home_team": home_team,
        "away_score": None,
        "home_score": None,
        "away_hits": None,
        "home_hits": None,
        "away_errors": None,
        "home_errors": None,
        "venue": venue,
        "game_datetime_utc": game_datetime_utc,
        "tv": tv,
        "start_time_et": start_time_et,
        "sport_id": sport_id,
    }


def _normalize_team_name(name: Any) -> str:
    """Return *name* stripped and mapped through ``TEAM_NORMALIZATION``.

    Falsy input (``None``, ``""``) yields ``""``; names without a mapping are
    returned stripped but otherwise unchanged.
    """
    text = str(name or "").strip()
    if not text:
        return ""
    return TEAM_NORMALIZATION.get(text, text)


def _strip_html_to_lines(html: str) -> list[str]:
    """Reduce an HTML document to a list of non-empty visible text lines.

    Script and style elements are dropped, every remaining tag becomes a line
    break, non-breaking spaces become plain spaces, lines starting with a
    known noise prefix are discarded, and runs of identical consecutive lines
    are collapsed to one.
    """
    text = _SCRIPT_RE.sub(" ", html)
    text = _STYLE_RE.sub(" ", text)
    text = re.sub(r"<[^>]+>", "\n", text)
    # Handle both the entity and the literal non-breaking space character.
    text = text.replace("&nbsp;", " ").replace("\xa0", " ")
    text = text.replace("\r", "\n")

    stripped = (line.strip() for line in text.split("\n"))
    kept = [line for line in stripped if line and not line.startswith(_NOISE_PREFIXES)]
    # groupby yields one key per run of equal neighbours.
    return [line for line, _ in groupby(kept)]


def _full_team(abbr: str) -> str:
    """Return the display name for *abbr*, or *abbr* itself if unknown."""
    return TEAM_MAP.get(abbr, abbr)


def _extract_game_pks(html: str) -> list[str]:
    """Return the distinct ``/gameday/<digits>`` ids in *html*, in first-seen order."""
    return list(dict.fromkeys(GAME_PK_RE.findall(html)))


def _skip_repeats(lines: list[str], start: int, value: str) -> int:
    """Return the first index at or after *start* whose line differs from *value*."""
    idx = start
    while idx < len(lines) and lines[idx] == value:
        idx += 1
    return idx


def _classify_status_token(token: str) -> tuple[str, str] | None:
    """Interpret *token* as a game-status marker.

    Returns ``(status, start_time_et)`` when the token is recognised, or
    ``None`` when it is not a status marker (so the caller must not consume it).
    """
    if token == "LIVE":
        return "Live", ""
    if token.startswith("Final"):
        # Keep the full text (e.g. any suffix after "Final") as the status.
        return token, ""
    if TIME_RE.match(token):
        return "Scheduled", token
    if token.startswith("Preview"):
        return "Preview", ""
    return None


def _parse_wbc_schedule_lines(
    lines: list[str], game_pks: list[str], date_str: str
) -> list[dict[str, Any]]:
    """Extract game rows from the text lines of a WBC schedule page.

    A game is recognised as the sequence ``<ABBR> @ <ABBR>``, optionally
    followed by a status token and then a TV marker. Game ids from *game_pks*
    are assigned to games positionally; games beyond the end of *game_pks*
    get an empty ``game_pk``.
    """
    rows: list[dict[str, Any]] = []
    total = len(lines)
    remaining_pks = iter(game_pks)

    i = 0
    while i < total:
        away_abbr = lines[i]
        if not ABBR_RE.match(away_abbr):
            i += 1
            continue

        j = _skip_repeats(lines, i + 1, away_abbr)
        if j >= total or lines[j] != "@":
            i += 1
            continue

        j += 1
        if j >= total or not ABBR_RE.match(lines[j]):
            i += 1
            continue

        home_abbr = lines[j]
        j = _skip_repeats(lines, j + 1, home_abbr)

        status = ""
        start_time_et = ""
        if j < total:
            classified = _classify_status_token(lines[j])
            if classified is not None:
                status, start_time_et = classified
                j += 1

        tv = ""
        if j < total and lines[j] in TV_MARKERS:
            tv = lines[j]
            j += 1

        rows.append(
            _make_row(
                date_str=date_str,
                game_id=f"{date_str}:{away_abbr}:{home_abbr}",
                game_pk=next(remaining_pks, ""),
                status=status,
                away_team=_full_team(away_abbr),
                home_team=_full_team(home_abbr),
                venue="",
                game_datetime_utc="",
                tv=tv,
                start_time_et=start_time_et,
                sport_id=51,
            )
        )
        # Resume scanning after everything consumed for this game.
        i = j

    return rows


def _fetch_wbc_schedule_for_date(date_str: str) -> pd.DataFrame:
    """Scrape the mlb.com WBC schedule page for *date_str*.

    Args:
        date_str: Date string substituted into ``WBC_SCHEDULE_URL_TEMPLATE``.

    Returns:
        One row per game found on the page (``sport_id`` 51); an empty
        DataFrame when none are found.

    Raises:
        requests.RequestException: On network failure or a non-2xx response.
    """
    url = WBC_SCHEDULE_URL_TEMPLATE.format(date_str=date_str)
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()

    html = response.text
    rows = _parse_wbc_schedule_lines(
        _strip_html_to_lines(html), _extract_game_pks(html), date_str
    )
    return pd.DataFrame(rows)


def _format_start_time_et(game_datetime: Any) -> str:
    """Format an API ``gameDate`` value as ``"H:MM AM/PM ET"``.

    Returns ``""`` for falsy input or when the value cannot be parsed or
    converted; a bad timestamp must not discard the rest of the schedule.
    """
    if not game_datetime:
        return ""
    try:
        ts = pd.to_datetime(game_datetime, utc=True).tz_convert("America/New_York")
        # "%-I" is a glibc extension; strip the zero padding portably instead.
        return ts.strftime("%I:%M %p ET").lstrip("0")
    except Exception:
        return ""


def _parse_mlb_schedule_payload(payload: Any, date_str: str) -> list[dict[str, Any]]:
    """Convert a Stats API schedule response into schedule rows.

    Games missing either team name are skipped. Broadcast names are joined
    with ``", "`` after removing blanks and duplicates.
    """
    rows: list[dict[str, Any]] = []

    for date_block in payload.get("dates", []) or []:
        for game in date_block.get("games", []) or []:
            teams = game.get("teams", {}) or {}
            away = teams.get("away", {}) or {}
            home = teams.get("home", {}) or {}
            away_team = _normalize_team_name((away.get("team", {}) or {}).get("name"))
            home_team = _normalize_team_name((home.get("team", {}) or {}).get("name"))
            if not (away_team and home_team):
                continue

            status_info = game.get("status", {}) or {}
            detailed_state = str(status_info.get("detailedState", "") or "").strip()
            abstract_state = str(status_info.get("abstractGameState", "") or "").strip().lower()
            # Unknown abstract states fall back to the detailed state text.
            status = _ABSTRACT_STATE_TO_STATUS.get(abstract_state, detailed_state)

            broadcasts = game.get("broadcasts", []) or []
            names = (str(b.get("name") or "").strip() for b in broadcasts)
            tv = ", ".join(dict.fromkeys(name for name in names if name))

            game_pk = game.get("gamePk")
            rows.append(
                _make_row(
                    date_str=date_str,
                    game_id=f"{date_str}:{away_team}:{home_team}",
                    game_pk=str(game_pk) if game_pk is not None else "",
                    status=status,
                    away_team=away_team,
                    home_team=home_team,
                    venue=str((game.get("venue", {}) or {}).get("name", "") or "").strip(),
                    game_datetime_utc=str(game.get("gameDate", "") or "").strip(),
                    tv=tv,
                    start_time_et=_format_start_time_et(game.get("gameDate", "")),
                    sport_id=1,
                )
            )

    return rows


def _fetch_mlb_schedule_for_date(date_str: str) -> pd.DataFrame:
    """Query the Stats API schedule endpoint for *date_str* (``sportId`` 1).

    Args:
        date_str: Value sent as the ``date`` query parameter.

    Returns:
        One row per game with both team names present; an empty DataFrame
        when the response lists no such games.

    Raises:
        requests.RequestException: On network failure or a non-2xx response.
    """
    params = {
        "sportId": 1,
        "date": date_str,
        "hydrate": "broadcasts",
    }
    response = requests.get(SCHEDULE_API_URL, headers=HEADERS, params=params, timeout=30)
    response.raise_for_status()
    return pd.DataFrame(_parse_mlb_schedule_payload(response.json(), date_str))


def fetch_schedule_for_date(date_str: str) -> pd.DataFrame:
    """Return the schedule for *date_str*, or an empty DataFrame.

    Any exception from the Stats API fetch is logged as a warning and
    swallowed, so callers always receive a DataFrame.

    NOTE(review): ``_fetch_wbc_schedule_for_date`` is not called from here;
    confirm whether the WBC page is meant to be a fallback.
    """
    try:
        mlb_df = _fetch_mlb_schedule_for_date(date_str)
        if mlb_df is not None and not mlb_df.empty:
            return mlb_df
    except Exception as e:
        logger.warning(f"[schedule_fetch] failure: {e}", exc_info=True)

    return pd.DataFrame()