"""Scrape World Baseball Classic scores from the MLB.com scoreboard page."""

from __future__ import annotations

import re
from datetime import datetime, timezone
from typing import Any

import pandas as pd
import requests

HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "en-US,en;q=0.9",
}

SCORES_URL_TEMPLATE = "https://www.mlb.com/world-baseball-classic/scores/{date_str}"

# Three-letter team codes as they appear on the scoreboard -> display names.
TEAM_MAP = {
    "AUS": "Australia",
    "BRA": "Brazil",
    "CAN": "Canada",
    "CHN": "China",
    "TPE": "Chinese Taipei",
    "COL": "Colombia",
    "CUB": "Cuba",
    "CZE": "Czechia",
    "DOM": "Dominican Republic",
    "GBR": "Great Britain",
    "ISR": "Israel",
    "ITA": "Italy",
    "JPN": "Japan",
    "KOR": "Korea",
    "MEX": "Mexico",
    "NED": "Netherlands",
    "NCA": "Nicaragua",
    "PAN": "Panama",
    "PUR": "Puerto Rico",
    "USA": "United States",
    "VEN": "Venezuela",
}

# "USA 5, JPN 3" -> (away code, away score, home code, home score).
FINAL_RE = re.compile(r"^([A-Z]{3})\s+(\d+),\s+([A-Z]{3})\s+(\d+)$")
# "USA @ JPN" -> (away code, home code).
TEAM_AT_RE = re.compile(r"^([A-Z]{3})\s+@\s+([A-Z]{3})$")
# "7:00 PM ET"
TIME_RE = re.compile(r"^\d{1,2}:\d{2}\s+[AP]M\s+ET$")
TV_MARKERS = {"FS1", "FS2", "FOX", "Tubi"}

# Whole <script>/<style> elements are dropped so their contents never reach
# the line scanner. These patterns must not be empty: an empty pattern matches
# between every character and would pad the entire document with spaces.
_SCRIPT_RE = re.compile(r"<script\b[^>]*>.*?</script\s*>", re.DOTALL | re.IGNORECASE)
_STYLE_RE = re.compile(r"<style\b[^>]*>.*?</style\s*>", re.DOTALL | re.IGNORECASE)
_TAG_RE = re.compile(r"<[^>]+>")

# Text fragments that are dropped from the extracted lines.
_NOISE_PREFIXES = ("Image:", "calendar-")
_NOISE_LINES = frozenset({"Wrap", "Box", "Story", "Preview", "Tickets"})

# Column order of the frame returned by fetch_scores_for_date.
_COLUMNS = (
    "score_fetch_time",
    "game_date",
    "away_team",
    "home_team",
    "away_score",
    "home_score",
    "status",
    "start_time_et",
    "tv",
)
_DEDUP_COLUMNS = [
    "game_date",
    "away_team",
    "home_team",
    "status",
    "away_score",
    "home_score",
]


def _full_team(code: str) -> str:
    """Return the display name for a team code, or the code itself if unknown."""
    return TEAM_MAP.get(code, code)


def _strip_html_to_lines(html: str) -> list[str]:
    """Reduce an HTML document to a list of meaningful text lines.

    Script and style elements are removed, every remaining tag becomes a line
    break, and non-breaking spaces are normalised to plain spaces. Each line
    is stripped; empty lines and known noise fragments are dropped, and
    consecutive duplicate lines are collapsed to one.
    """
    text = _SCRIPT_RE.sub(" ", html)
    text = _STYLE_RE.sub(" ", text)
    text = _TAG_RE.sub("\n", text)
    # Handle both the entity and the literal U+00A0 character.
    text = text.replace("&nbsp;", " ").replace("\xa0", " ")

    lines: list[str] = []
    for raw in text.replace("\r", "\n").split("\n"):
        line = raw.strip()
        if not line or line in _NOISE_LINES or line.startswith(_NOISE_PREFIXES):
            continue
        # Collapse runs of identical lines (compared after noise filtering).
        if lines and lines[-1] == line:
            continue
        lines.append(line)
    return lines


def _game_row(
    *,
    fetched_at: datetime,
    game_date: str,
    away_team: str,
    home_team: str,
    status: str,
    away_score: int | None = None,
    home_score: int | None = None,
    start_time_et: str = "",
    tv: str = "",
) -> dict[str, Any]:
    """Build one output record with keys in ``_COLUMNS`` order."""
    return {
        "score_fetch_time": fetched_at,
        "game_date": game_date,
        "away_team": away_team,
        "home_team": home_team,
        "away_score": away_score,
        "home_score": home_score,
        "status": status,
        "start_time_et": start_time_et,
        "tv": tv,
    }


def _parse_score_lines(
    lines: list[str], date_str: str, fetched_at: datetime
) -> list[dict[str, Any]]:
    """Turn scoreboard text lines into game records.

    A line matching ``FINAL_RE`` yields a "Final" record on its own. A line
    matching ``TEAM_AT_RE`` opens a pending matchup, which is completed by the
    next "LIVE" line (status "Live") or the next line matching ``TIME_RE``
    (status "Scheduled"); a TV marker directly after that line is attached to
    the record. All other lines are ignored.
    """
    rows: list[dict[str, Any]] = []
    pending_matchup: tuple[str, str] | None = None
    i = 0
    while i < len(lines):
        line = lines[i]

        final_match = FINAL_RE.match(line)
        if final_match:
            away_code, away_score, home_code, home_score = final_match.groups()
            rows.append(
                _game_row(
                    fetched_at=fetched_at,
                    game_date=date_str,
                    away_team=_full_team(away_code),
                    home_team=_full_team(home_code),
                    away_score=int(away_score),
                    home_score=int(home_score),
                    status="Final",
                )
            )
            # NOTE: a final score does not clear a pending matchup.
            i += 1
            continue

        matchup_match = TEAM_AT_RE.match(line)
        if matchup_match:
            away_code, home_code = matchup_match.groups()
            pending_matchup = (_full_team(away_code), _full_team(home_code))
            i += 1
            continue

        if pending_matchup is not None:
            if line == "LIVE":
                status, start_time_et = "Live", ""
            elif TIME_RE.match(line):
                status, start_time_et = "Scheduled", line
            else:
                status, start_time_et = "", ""

            if status:
                tv = ""
                if i + 1 < len(lines) and lines[i + 1] in TV_MARKERS:
                    tv = lines[i + 1]
                    i += 1  # the TV marker line is consumed with this game
                away_team, home_team = pending_matchup
                rows.append(
                    _game_row(
                        fetched_at=fetched_at,
                        game_date=date_str,
                        away_team=away_team,
                        home_team=home_team,
                        status=status,
                        start_time_et=start_time_et,
                        tv=tv,
                    )
                )
                pending_matchup = None

        i += 1
    return rows


def fetch_scores_for_date(date_str: str) -> pd.DataFrame:
    """Download and parse the scoreboard page for one date.

    Args:
        date_str: Date segment substituted into ``SCORES_URL_TEMPLATE``; it is
            also stored verbatim in the ``game_date`` column.

    Returns:
        A DataFrame with the columns ``score_fetch_time``, ``game_date``,
        ``away_team``, ``home_team``, ``away_score``, ``home_score``,
        ``status`` ("Final", "Live" or "Scheduled"), ``start_time_et`` and
        ``tv``. Scores are ``None`` for games that are not final. The frame is
        empty (columns only) when no games are recognised.

    Raises:
        requests.HTTPError: If the response status indicates an error.
        requests.RequestException: On connection failures or timeouts.
    """
    url = SCORES_URL_TEMPLATE.format(date_str=date_str)
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()

    # One naive-UTC timestamp per fetch; datetime.utcnow() is deprecated.
    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None)
    rows = _parse_score_lines(
        _strip_html_to_lines(response.text), date_str, fetched_at
    )

    if not rows:
        return pd.DataFrame(columns=list(_COLUMNS))

    df = pd.DataFrame(rows, columns=list(_COLUMNS))
    df = df.drop_duplicates(subset=_DEDUP_COLUMNS)
    return df.reset_index(drop=True)