Spaces:
Running
Running
| from __future__ import annotations | |
| import re | |
| from datetime import datetime | |
| from typing import Any | |
| import pandas as pd | |
| import requests | |
# Request headers sent with every scoreboard fetch.
HEADERS: dict[str, str] = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "en-US,en;q=0.9",
}

# Scoreboard page URL; ``date_str`` is substituted verbatim into the path.
SCORES_URL_TEMPLATE: str = "https://www.mlb.com/world-baseball-classic/scores/{date_str}"

# Three-letter team code -> full team name used in the output rows.
TEAM_MAP: dict[str, str] = {
    "AUS": "Australia",
    "BRA": "Brazil",
    "CAN": "Canada",
    "CHN": "China",
    "TPE": "Chinese Taipei",
    "COL": "Colombia",
    "CUB": "Cuba",
    "CZE": "Czechia",
    "DOM": "Dominican Republic",
    "GBR": "Great Britain",
    "ISR": "Israel",
    "ITA": "Italy",
    "JPN": "Japan",
    "KOR": "Korea",
    "MEX": "Mexico",
    "NED": "Netherlands",
    "NCA": "Nicaragua",
    "PAN": "Panama",
    "PUR": "Puerto Rico",
    "USA": "United States",
    "VEN": "Venezuela",
}

# A whole line of the form "<CODE> <runs>, <CODE> <runs>", e.g. "JPN 3, KOR 2".
FINAL_RE = re.compile(r"^([A-Z]{3})\s+(\d+),\s+([A-Z]{3})\s+(\d+)$")

# A whole line of the form "<CODE> @ <CODE>", e.g. "AUS @ JPN".
TEAM_AT_RE = re.compile(r"^([A-Z]{3})\s+@\s+([A-Z]{3})$")

# A whole line holding an Eastern-time start, e.g. "7:00 PM ET".
TIME_RE = re.compile(r"^\d{1,2}:\d{2}\s+[AP]M\s+ET$")

# Lines recognised as a broadcaster label when they directly follow a
# "LIVE" marker or a start time.
TV_MARKERS = {"FS1", "FS2", "FOX", "Tubi"}
def _full_team(code: str) -> str:
    """Translate a team code into its full name.

    Codes that have no entry in ``TEAM_MAP`` are returned unchanged.
    """
    try:
        return TEAM_MAP[code]
    except KeyError:
        return code
| def _strip_html_to_lines(html: str) -> list[str]: | |
| text = re.sub(r"<script.*?</script>", " ", html, flags=re.DOTALL | re.IGNORECASE) | |
| text = re.sub(r"<style.*?</style>", " ", text, flags=re.DOTALL | re.IGNORECASE) | |
| text = re.sub(r"<[^>]+>", "\n", text) | |
| text = text.replace(" ", " ") | |
| text = re.sub(r"\r", "\n", text) | |
| text = re.sub(r"\n+", "\n", text) | |
| lines = [line.strip() for line in text.split("\n") if line.strip()] | |
| cleaned: list[str] = [] | |
| for line in lines: | |
| if line.startswith("Image:"): | |
| continue | |
| if line.startswith("calendar-"): | |
| continue | |
| if line in {"Wrap", "Box", "Story", "Preview", "Tickets"}: | |
| continue | |
| cleaned.append(line) | |
| deduped: list[str] = [] | |
| for line in cleaned: | |
| if not deduped or deduped[-1] != line: | |
| deduped.append(line) | |
| return deduped | |
def _game_row(
    fetched_at: datetime,
    date_str: str,
    away_team: str,
    home_team: str,
    *,
    status: str,
    away_score: int | None = None,
    home_score: int | None = None,
    start_time_et: str = "",
    tv: str = "",
) -> dict[str, Any]:
    """Build one scoreboard record; key order sets the DataFrame column order."""
    return {
        "score_fetch_time": fetched_at,
        "game_date": date_str,
        "away_team": away_team,
        "home_team": home_team,
        "away_score": away_score,
        "home_score": home_score,
        "status": status,
        "start_time_et": start_time_et,
        "tv": tv,
    }


def fetch_scores_for_date(date_str: str) -> pd.DataFrame:
    """Download and parse the World Baseball Classic scoreboard for one date.

    The page at ``SCORES_URL_TEMPLATE`` is fetched, reduced to text lines and
    scanned top to bottom:

    * A line matching ``FINAL_RE`` ("AAA 3, BBB 2") yields a "Final" row with
      both scores.
    * A line matching ``TEAM_AT_RE`` ("AAA @ BBB") is remembered as the
      pending matchup.
    * While a matchup is pending, a "LIVE" line yields a "Live" row and a line
      matching ``TIME_RE`` yields a "Scheduled" row carrying that start time.
      If the very next line is one of ``TV_MARKERS`` it is consumed as the
      broadcaster. The pending matchup is then cleared.

    Args:
        date_str: Date fragment substituted into the scoreboard URL and
            copied into the ``game_date`` column.

    Returns:
        A DataFrame with the columns ``score_fetch_time``, ``game_date``,
        ``away_team``, ``home_team``, ``away_score``, ``home_score``,
        ``status``, ``start_time_et`` and ``tv``. Rows that repeat the same
        date, teams, status and scores are dropped. The frame is empty (with
        the same columns) when nothing was recognised.

    Raises:
        requests.HTTPError: If the response status indicates an error.
        requests.RequestException: On connection failure or timeout.
    """
    # Function-scope import keeps this block self-contained.
    from datetime import timezone

    url = SCORES_URL_TEMPLATE.format(date_str=date_str)
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    lines = _strip_html_to_lines(response.text)

    # One timestamp per fetch, so every row of the same page agrees.
    # ``datetime.utcnow()`` is deprecated since Python 3.12; this produces the
    # same naive-UTC value, so the column dtype seen by callers is unchanged.
    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None)

    rows: list[dict[str, Any]] = []
    pending_matchup: tuple[str, str] | None = None
    i = 0
    while i < len(lines):
        line = lines[i]
        i += 1  # ``i`` now points at the look-ahead line.

        final_match = FINAL_RE.match(line)
        if final_match:
            away_code, away_score, home_code, home_score = final_match.groups()
            rows.append(
                _game_row(
                    fetched_at,
                    date_str,
                    _full_team(away_code),
                    _full_team(home_code),
                    status="Final",
                    away_score=int(away_score),
                    home_score=int(home_score),
                )
            )
            continue

        matchup_match = TEAM_AT_RE.match(line)
        if matchup_match:
            away_code, home_code = matchup_match.groups()
            pending_matchup = (_full_team(away_code), _full_team(home_code))
            continue

        # Status lines only mean something right after an "AAA @ BBB" line.
        if pending_matchup is None:
            continue

        if line == "LIVE":
            status, start_time_et = "Live", ""
        elif TIME_RE.match(line):
            status, start_time_et = "Scheduled", line
        else:
            continue

        # An immediately following broadcaster label belongs to this game
        # and is consumed so it is not scanned again.
        tv = ""
        if i < len(lines) and lines[i] in TV_MARKERS:
            tv = lines[i]
            i += 1

        away_team, home_team = pending_matchup
        rows.append(
            _game_row(
                fetched_at,
                date_str,
                away_team,
                home_team,
                status=status,
                start_time_et=start_time_et,
                tv=tv,
            )
        )
        pending_matchup = None

    if not rows:
        return pd.DataFrame(
            columns=[
                "score_fetch_time",
                "game_date",
                "away_team",
                "home_team",
                "away_score",
                "home_score",
                "status",
                "start_time_et",
                "tv",
            ]
        )

    df = pd.DataFrame(rows)
    # The page text can repeat a game; identical entries are kept once.
    df = df.drop_duplicates(
        subset=["game_date", "away_team", "home_team", "status", "away_score", "home_score"]
    )
    return df.reset_index(drop=True)