Spaces:
Running
Running
| from __future__ import annotations | |
| import re | |
| from datetime import datetime | |
| from typing import Any | |
| import pandas as pd | |
| import requests | |
| from utils.logger import logger | |
# mlb.com World Baseball Classic schedule page; ``{date_str}`` is substituted
# verbatim into the URL path.
WBC_SCHEDULE_URL_TEMPLATE: str = "https://www.mlb.com/world-baseball-classic/schedule/{date_str}"

# JSON schedule endpoint queried with ``sportId`` / ``date`` / ``hydrate``.
SCHEDULE_API_URL: str = "https://statsapi.mlb.com/api/v1/schedule"

# Headers sent with every outgoing request in this module.
HEADERS: dict[str, str] = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "en-US,en;q=0.9",
}

# Three-letter team code (as matched by ABBR_RE) -> display name.
TEAM_MAP: dict[str, str] = {
    "AUS": "Australia",
    "BRA": "Brazil",
    "CAN": "Canada",
    "CHN": "China",
    "TPE": "Chinese Taipei",
    "COL": "Colombia",
    "CUB": "Cuba",
    "CZE": "Czechia",
    "DOM": "Dominican Republic",
    "GBR": "Great Britain",
    "ISR": "Israel",
    "ITA": "Italy",
    "JPN": "Japan",
    "KOR": "Korea",
    "MEX": "Mexico",
    "NED": "Netherlands",
    "NCA": "Nicaragua",
    "PAN": "Panama",
    "PUR": "Puerto Rico",
    "USA": "United States",
    "VEN": "Venezuela",
}

# Alternate spellings of a team name -> the spelling used in TEAM_MAP values.
# Names that are not listed here are passed through unchanged by the lookup.
TEAM_NORMALIZATION: dict[str, str] = {
    "Chinese Taipei": "Chinese Taipei",
    "Czech Republic": "Czechia",
    "South Korea": "Korea",
    "USA": "United States",
    "U.S.A.": "United States",
}

# Text tokens recognised as a broadcaster when parsing the schedule page.
TV_MARKERS: set[str] = {"FS1", "FS2", "FOX", "Tubi"}

# A whole line such as "7:05 PM ET".
TIME_RE: re.Pattern[str] = re.compile(r"^\d{1,2}:\d{2}\s+[AP]M\s+ET$")

# A whole line made of exactly three capital letters.
ABBR_RE: re.Pattern[str] = re.compile(r"^[A-Z]{3}$")

# Captures the numeric id that follows "/gameday/" in a link.
GAME_PK_RE: re.Pattern[str] = re.compile(r"/gameday/(\d+)")
def _normalize_team_name(name: Any) -> str:
    """Return the canonical spelling of a team name.

    Falsy input (``None``, ``""``, ``0`` ...) yields ``""``. Anything else is
    converted with ``str`` and stripped of surrounding whitespace; if the
    result is a key of ``TEAM_NORMALIZATION`` the mapped value is returned,
    otherwise the stripped text itself.
    """
    cleaned = str(name).strip() if name else ""
    if cleaned == "":
        return ""
    if cleaned in TEAM_NORMALIZATION:
        return TEAM_NORMALIZATION[cleaned]
    return cleaned
| def _strip_html_to_lines(html: str) -> list[str]: | |
| text = re.sub(r"<script.*?</script>", " ", html, flags=re.DOTALL | re.IGNORECASE) | |
| text = re.sub(r"<style.*?</style>", " ", text, flags=re.DOTALL | re.IGNORECASE) | |
| text = re.sub(r"<[^>]+>", "\n", text) | |
| text = text.replace(" ", " ") | |
| text = re.sub(r"\r", "\n", text) | |
| text = re.sub(r"\n+", "\n", text) | |
| raw_lines = [line.strip() for line in text.split("\n") if line.strip()] | |
| cleaned: list[str] = [] | |
| for line in raw_lines: | |
| if line.startswith("Image:"): | |
| continue | |
| if line.startswith("calendar-") or line.startswith("schedule-tickets-") or line.startswith("search-"): | |
| continue | |
| cleaned.append(line) | |
| deduped: list[str] = [] | |
| for line in cleaned: | |
| if not deduped or deduped[-1] != line: | |
| deduped.append(line) | |
| return deduped | |
def _full_team(abbr: str) -> str:
    """Expand a team code via ``TEAM_MAP``; unknown codes are returned as-is."""
    try:
        return TEAM_MAP[abbr]
    except KeyError:
        return abbr
def _extract_game_pks(html: str) -> list[str]:
    """Collect the distinct ids captured by ``GAME_PK_RE`` in *html*.

    Args:
        html: Raw markup to scan.

    Returns:
        Each captured id once, in order of first appearance.
    """
    # dict keys keep insertion order, giving an order-preserving dedupe in
    # linear time instead of a list membership scan per match.
    return list(dict.fromkeys(GAME_PK_RE.findall(html)))
def _parse_wbc_schedule_lines(
    lines: list[str],
    game_pks: list[str],
    date_str: str,
    fetched_at: datetime,
) -> list[dict[str, Any]]:
    """Build one row dict per ``AWAY @ HOME`` matchup found in *lines*.

    A matchup is a line matching ``ABBR_RE``, then a line equal to ``"@"``,
    then another ``ABBR_RE`` line (immediate repeats of either code are
    skipped). The line after the matchup, if any, decides the status:

    * ``"LIVE"``                -> status ``"Live"``
    * starts with ``"Final"``   -> status is that line verbatim
    * matches ``TIME_RE``       -> status ``"Scheduled"``, line kept as
      ``start_time_et``
    * starts with ``"Preview"`` -> status ``"Preview"``
    * anything else             -> status ``""`` and the line is not consumed

    A following line that is in ``TV_MARKERS`` is stored as ``tv``.

    Args:
        lines: Text lines of the page, in document order.
        game_pks: Ids to attach to the matchups.
        date_str: Date the page was requested for; stored in each row and
            used as the prefix of ``game_id``.
        fetched_at: Timestamp stored in every row.

    Returns:
        Row dicts in the order the matchups appear.
    """
    rows: list[dict[str, Any]] = []
    total = len(lines)
    i = 0
    while i < total:
        away_abbr = lines[i]
        if not ABBR_RE.match(away_abbr):
            i += 1
            continue

        j = i + 1
        while j < total and lines[j] == away_abbr:
            j += 1
        # Not a matchup after all: resume scanning at the very next line.
        if j >= total or lines[j] != "@":
            i += 1
            continue
        j += 1
        if j >= total or not ABBR_RE.match(lines[j]):
            i += 1
            continue

        home_abbr = lines[j]
        j += 1
        while j < total and lines[j] == home_abbr:
            j += 1

        status = ""
        start_time_et = ""
        if j < total:
            token = lines[j]
            if token == "LIVE":
                status = "Live"
                j += 1
            elif token.startswith("Final"):
                status = token
                j += 1
            elif TIME_RE.match(token):
                status = "Scheduled"
                start_time_et = token
                j += 1
            elif token.startswith("Preview"):
                status = "Preview"
                j += 1

        tv = ""
        if j < total and lines[j] in TV_MARKERS:
            tv = lines[j]
            j += 1

        # The n-th matchup gets the n-th id.
        # NOTE(review): this presumes the page carries one gameday link per
        # matchup in the same order as the matchups - confirm against the
        # live markup.
        ordinal = len(rows)
        game_pk = game_pks[ordinal] if ordinal < len(game_pks) else ""

        rows.append(
            {
                "fetched_at": fetched_at,
                "game_id": f"{date_str}:{away_abbr}:{home_abbr}",
                "game_date": date_str,
                "game_pk": game_pk,
                "status": status,
                "away_team": _full_team(away_abbr),
                "home_team": _full_team(home_abbr),
                "away_score": None,
                "home_score": None,
                "away_hits": None,
                "home_hits": None,
                "away_errors": None,
                "home_errors": None,
                "venue": "",
                "game_datetime_utc": "",
                "tv": tv,
                "start_time_et": start_time_et,
                "sport_id": 51,
            }
        )
        # Continue after everything this matchup consumed.
        i = j
    return rows


def _fetch_wbc_schedule_for_date(date_str: str) -> pd.DataFrame:
    """Download the schedule page for *date_str* and parse it into rows.

    The page at ``WBC_SCHEDULE_URL_TEMPLATE`` is fetched, flattened to text
    lines and scanned for ``AWAY @ HOME`` matchups. Score, hit, error, venue
    and ``game_datetime_utc`` columns are always empty for this source.

    Args:
        date_str: Date token substituted into the page URL.

    Returns:
        One row per matchup found; an empty frame if none were found.

    Raises:
        requests.RequestException: If the request fails or the server
            answers with an error status.
    """
    # Local import: the module-level import only brings in `datetime`.
    from datetime import timezone

    url = WBC_SCHEDULE_URL_TEMPLATE.format(date_str=date_str)
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    html = response.text

    # `datetime.utcnow()` is deprecated; this yields the same naive-UTC value.
    # Taken once so every row of a single fetch shares one timestamp.
    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None)

    rows = _parse_wbc_schedule_lines(
        _strip_html_to_lines(html),
        _extract_game_pks(html),
        date_str,
        fetched_at,
    )
    return pd.DataFrame(rows)
def _format_start_time_et(game_datetime: Any) -> str:
    """Render a timestamp as ``H:MM AM/PM ET`` in America/New_York time.

    Returns ``""`` for falsy input or when the value cannot be parsed or
    converted. The hour has no leading zero (``7:05 PM ET``).
    """
    if not game_datetime:
        return ""
    try:
        ts = pd.to_datetime(game_datetime, utc=True).tz_convert("America/New_York")
        if ts is pd.NaT:
            return ""
        # Built by hand instead of strftime("%-I:%M %p"): the "%-I" directive
        # is a platform extension that is not available everywhere, and "%p"
        # depends on the process locale.
        hour_12 = ts.hour % 12 or 12
        meridiem = "AM" if ts.hour < 12 else "PM"
        return f"{hour_12}:{ts.minute:02d} {meridiem} ET"
    except Exception:
        # Deliberately best-effort: an unparseable timestamp (or a missing
        # time-zone database) must not cost us the whole game row.
        return ""


def _fetch_mlb_schedule_for_date(date_str: str) -> pd.DataFrame:
    """Fetch the games of *date_str* from ``SCHEDULE_API_URL``.

    The endpoint is queried with ``sportId=1`` and ``hydrate=broadcasts``.
    Every game that has both an away and a home team name becomes one row.
    Team names go through ``_normalize_team_name``. ``status`` is ``"Live"``,
    ``"Final"`` or ``"Scheduled"`` when ``abstractGameState`` is ``live``,
    ``final`` or ``preview`` (case-insensitive), otherwise the game's
    ``detailedState``. ``tv`` is the comma-joined list of distinct broadcast
    names. Score, hit and error columns are always ``None``.

    Args:
        date_str: Value passed as the ``date`` query parameter and stored in
            each row.

    Returns:
        One row per usable game; an empty frame if there are none.

    Raises:
        requests.RequestException: If the request fails or the server
            answers with an error status.
        ValueError: If the response body is not valid JSON.
    """
    # Local import: the module-level import only brings in `datetime`.
    from datetime import timezone

    response = requests.get(
        SCHEDULE_API_URL,
        headers=HEADERS,
        params={"sportId": 1, "date": date_str, "hydrate": "broadcasts"},
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()

    # `datetime.utcnow()` is deprecated; this yields the same naive-UTC value.
    # Taken once so every row of a single fetch shares one timestamp.
    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None)

    status_by_abstract_state = {
        "live": "Live",
        "final": "Final",
        "preview": "Scheduled",
    }

    rows: list[dict[str, Any]] = []
    for date_block in payload.get("dates", []) or []:
        for game in date_block.get("games", []) or []:
            teams = game.get("teams", {}) or {}
            away = teams.get("away", {}) or {}
            home = teams.get("home", {}) or {}
            away_team = _normalize_team_name((away.get("team", {}) or {}).get("name"))
            home_team = _normalize_team_name((home.get("team", {}) or {}).get("name"))
            # A row without both team names is of no use downstream.
            if not (away_team and home_team):
                continue

            status_info = game.get("status", {}) or {}
            detailed_state = str(status_info.get("detailedState", "") or "").strip()
            abstract_state = str(status_info.get("abstractGameState", "") or "").strip().lower()
            status = status_by_abstract_state.get(abstract_state, detailed_state)

            # Distinct, non-empty broadcast names in first-seen order.
            broadcast_names = (
                str(b.get("name") or "").strip()
                for b in (game.get("broadcasts", []) or [])
            )
            tv = ", ".join(dict.fromkeys(n for n in broadcast_names if n))

            game_pk = game.get("gamePk")
            rows.append(
                {
                    "fetched_at": fetched_at,
                    "game_id": f"{date_str}:{away_team}:{home_team}",
                    "game_date": date_str,
                    "game_pk": str(game_pk) if game_pk is not None else "",
                    "status": status,
                    "away_team": away_team,
                    "home_team": home_team,
                    "away_score": None,
                    "home_score": None,
                    "away_hits": None,
                    "home_hits": None,
                    "away_errors": None,
                    "home_errors": None,
                    "venue": str((game.get("venue", {}) or {}).get("name", "") or "").strip(),
                    "game_datetime_utc": str(game.get("gameDate", "") or "").strip(),
                    "tv": tv,
                    "start_time_et": _format_start_time_et(game.get("gameDate", "")),
                    "sport_id": 1,
                }
            )
    return pd.DataFrame(rows)
def fetch_schedule_for_date(date_str: str) -> pd.DataFrame:
    """Return the schedule for *date_str*, or an empty frame on any problem.

    Delegates to ``_fetch_mlb_schedule_for_date``. Any exception raised while
    fetching or inspecting the result is logged as a warning (with traceback)
    and swallowed, so callers always receive a ``DataFrame``.

    Args:
        date_str: Date to look up, passed through unchanged.

    Returns:
        The fetched rows when there is at least one; otherwise an empty
        ``DataFrame``.
    """
    has_rows = False
    schedule = None
    try:
        schedule = _fetch_mlb_schedule_for_date(date_str)
        has_rows = schedule is not None and not schedule.empty
    except Exception as e:
        logger.warning(f"[schedule_fetch] failure: {e}", exc_info=True)

    if has_rows:
        return schedule
    return pd.DataFrame()