# Spaces: Syntrex / 2026_MLB_Model
# Running
# 2026_MLB_Model
# File size: 5,629 Bytes
# e0d0e3c

from __future__ import annotations

import re
from datetime import datetime
from typing import Any

import pandas as pd
import requests

# HTTP headers sent with the scores-page request made in fetch_scores_for_date.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "en-US,en;q=0.9",
}

# URL of one day's scores page; ``{date_str}`` is substituted verbatim by the
# caller (presumably an ISO ``YYYY-MM-DD`` date -- confirm against callers).
SCORES_URL_TEMPLATE = "https://www.mlb.com/world-baseball-classic/scores/{date_str}"

# Three-letter team code -> display name. Codes that are not listed here are
# passed through unchanged by _full_team.
TEAM_MAP = {
    "AUS": "Australia",
    "BRA": "Brazil",
    "CAN": "Canada",
    "CHN": "China",
    "TPE": "Chinese Taipei",
    "COL": "Colombia",
    "CUB": "Cuba",
    "CZE": "Czechia",
    "DOM": "Dominican Republic",
    "GBR": "Great Britain",
    "ISR": "Israel",
    "ITA": "Italy",
    "JPN": "Japan",
    "KOR": "Korea",
    "MEX": "Mexico",
    "NED": "Netherlands",
    "NCA": "Nicaragua",
    "PAN": "Panama",
    "PUR": "Puerto Rico",
    "USA": "United States",
    "VEN": "Venezuela",
}

# Whole-line patterns applied to the text lines extracted from the page.
# FINAL_RE: "AAA 5, BBB 3" -> (code, score, code, score); the parser treats the
# first pair as the away side and the second as the home side.
FINAL_RE = re.compile(r"^([A-Z]{3})\s+(\d+),\s+([A-Z]{3})\s+(\d+)$")
# TEAM_AT_RE: "AAA @ BBB" -> (away code, home code).
TEAM_AT_RE = re.compile(r"^([A-Z]{3})\s+@\s+([A-Z]{3})$")
# TIME_RE: a start time such as "7:05 PM ET" (no capture groups).
TIME_RE = re.compile(r"^\d{1,2}:\d{2}\s+[AP]M\s+ET$")
# Broadcaster labels recognised on the line directly after a LIVE/time line.
TV_MARKERS = {"FS1", "FS2", "FOX", "Tubi"}


def _full_team(code: str) -> str:
    """Return the display name for a team code, or the code itself if unknown."""
    try:
        return TEAM_MAP[code]
    except KeyError:
        # Unmapped codes are passed through untouched rather than rejected.
        return code


def _strip_html_to_lines(html: str) -> list[str]:
    text = re.sub(r"<script.*?</script>", " ", html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<style.*?</style>", " ", text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<[^>]+>", "\n", text)
    text = text.replace("&nbsp;", " ")
    text = re.sub(r"\r", "\n", text)
    text = re.sub(r"\n+", "\n", text)

    lines = [line.strip() for line in text.split("\n") if line.strip()]

    cleaned: list[str] = []
    for line in lines:
        if line.startswith("Image:"):
            continue
        if line.startswith("calendar-"):
            continue
        if line in {"Wrap", "Box", "Story", "Preview", "Tickets"}:
            continue
        cleaned.append(line)

    deduped: list[str] = []
    for line in cleaned:
        if not deduped or deduped[-1] != line:
            deduped.append(line)

    return deduped


# Column order of the frame returned by fetch_scores_for_date.
_SCORE_COLUMNS = (
    "score_fetch_time",
    "game_date",
    "away_team",
    "home_team",
    "away_score",
    "home_score",
    "status",
    "start_time_et",
    "tv",
)


def _parse_score_lines(
    lines: list[str], date_str: str, fetch_time: datetime
) -> list[dict[str, Any]]:
    """Turn the text lines of a scores page into one record per game.

    Three line shapes drive the parser:

    * ``"AAA 5, BBB 3"`` (``FINAL_RE``) emits a ``"Final"`` record immediately.
    * ``"AAA @ BBB"`` (``TEAM_AT_RE``) is remembered as the pending matchup.
    * While a matchup is pending, ``"LIVE"`` or a start time (``TIME_RE``) emits
      a ``"Live"`` / ``"Scheduled"`` record and clears the pending matchup. If
      the very next line is one of ``TV_MARKERS`` it is consumed as the
      broadcaster.

    All other lines are ignored.

    Args:
        lines: Text lines in page order.
        date_str: Value stored in each record's ``game_date`` field.
        fetch_time: Value stored in each record's ``score_fetch_time`` field.

    Returns:
        Records keyed by the names in ``_SCORE_COLUMNS``, in page order.
    """
    shared = {"score_fetch_time": fetch_time, "game_date": date_str}
    records: list[dict[str, Any]] = []
    pending: tuple[str, str] | None = None

    total = len(lines)
    pos = 0
    while pos < total:
        line = lines[pos]
        pos += 1  # from here on, ``pos`` indexes the line AFTER ``line``

        final = FINAL_RE.match(line)
        if final:
            away_code, away_score, home_code, home_score = final.groups()
            records.append(
                {
                    **shared,
                    "away_team": _full_team(away_code),
                    "home_team": _full_team(home_code),
                    "away_score": int(away_score),
                    "home_score": int(home_score),
                    "status": "Final",
                    "start_time_et": "",
                    "tv": "",
                }
            )
            continue

        matchup = TEAM_AT_RE.match(line)
        if matchup:
            away_code, home_code = matchup.groups()
            pending = (_full_team(away_code), _full_team(home_code))
            continue

        if pending is None:
            continue

        is_live = line == "LIVE"
        if not is_live and not TIME_RE.match(line):
            # Neither a status nor a start time: keep waiting with the same
            # pending matchup.
            continue

        tv = ""
        if pos < total and lines[pos] in TV_MARKERS:
            tv = lines[pos]
            pos += 1  # the broadcaster line belongs to this game; skip it

        away_team, home_team = pending
        records.append(
            {
                **shared,
                "away_team": away_team,
                "home_team": home_team,
                "away_score": None,
                "home_score": None,
                "status": "Live" if is_live else "Scheduled",
                "start_time_et": "" if is_live else line,
                "tv": tv,
            }
        )
        pending = None

    return records


def fetch_scores_for_date(date_str: str) -> pd.DataFrame:
    """Download the scores page for one date and return its games as a frame.

    Args:
        date_str: Date component inserted verbatim into ``SCORES_URL_TEMPLATE``
            and copied into the ``game_date`` column (presumably
            ``YYYY-MM-DD`` -- confirm against callers).

    Returns:
        A DataFrame with the columns listed in ``_SCORE_COLUMNS``, one row per
        game, de-duplicated on date, teams, status and scores. ``away_score`` /
        ``home_score`` are set only for ``"Final"`` games; ``start_time_et`` is
        set only for ``"Scheduled"`` games. ``score_fetch_time`` is a naive UTC
        timestamp shared by all rows of the call. If no game is recognised, an
        empty frame with the same columns is returned.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: For connection problems or a timeout
            (30 seconds).
    """
    # Function-scope import: the module only imports ``datetime`` at file level.
    from datetime import timezone

    url = SCORES_URL_TEMPLATE.format(date_str=date_str)
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()

    # One timestamp per fetch, so every row of this snapshot agrees.
    # ``datetime.utcnow()`` is deprecated since Python 3.12; the tzinfo is
    # dropped again to keep the naive-UTC values the column has always held.
    fetch_time = datetime.now(timezone.utc).replace(tzinfo=None)

    rows = _parse_score_lines(_strip_html_to_lines(response.text), date_str, fetch_time)
    columns = list(_SCORE_COLUMNS)

    if not rows:
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(rows, columns=columns)
    df = df.drop_duplicates(
        subset=["game_date", "away_team", "home_team", "status", "away_score", "home_score"]
    )
    return df.reset_index(drop=True)