Spaces:

Syntrex
/

2026_MLB_Model

Sleeping

App Files Files

Syntrex committed on Mar 7

Commit

38cb69d

verified ·

1 Parent(s): cfd6eaf

Update data/schedule.py

Browse files

Files changed (1) hide show

data/schedule.py +227 -15

data/schedule.py CHANGED Viewed

@@ -1,22 +1,234 @@
 from __future__ import annotations
 from datetime import datetime
 import pandas as pd
 def fetch_schedule_for_date(date_str: str) -> pd.DataFrame:
-    return pd.DataFrame(
-        [
-            {
-                "fetched_at": datetime.utcnow(),
-                "game_id": "",
-                "game_date": date_str,
-                "status": "loaded",
-                "away_team": "",
-                "home_team": "",
-                "away_score": None,
-                "home_score": None,
-                "venue": "",
-            }
-        ]
-    )

 from __future__ import annotations
+import re
 from datetime import datetime
+from typing import Any
 import pandas as pd
+import requests
# URL of the schedule page for one day; ``{date_str}`` is substituted verbatim.
SCHEDULE_URL_TEMPLATE = "https://www.mlb.com/world-baseball-classic/schedule/{date_str}"

# Headers sent with every schedule request.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "en-US,en;q=0.9",
}

# Three-letter team abbreviation -> full team name.
TEAM_MAP = {
    "AUS": "Australia",
    "BRA": "Brazil",
    "CAN": "Canada",
    "CHN": "China",
    "TPE": "Chinese Taipei",
    "COL": "Colombia",
    "CUB": "Cuba",
    "CZE": "Czechia",
    "DOM": "Dominican Republic",
    "GBR": "Great Britain",
    "ISR": "Israel",
    "ITA": "Italy",
    "JPN": "Japan",
    "KOR": "Korea",
    "MEX": "Mexico",
    "NED": "Netherlands",
    "NCA": "Nicaragua",
    "PAN": "Panama",
    "PUR": "Puerto Rico",
    "USA": "United States",
    "VEN": "Venezuela",
}

# Lines that, when they directly follow a matchup, are recorded as the broadcaster.
TV_MARKERS = {"FS1", "FS2", "FOX", "Tubi"}

# Any text line starting with one of these prefixes is dropped as page chrome.
# Kept as a tuple so it can be handed straight to ``str.startswith``.
IGNORE_PREFIXES = (
    "Tickets",
    "Schedule",
    "Scores",
    "Stats",
    "Standings",
    "Bracket",
    "Teams",
    "Watch",
    "News",
    "Venues",
    "Experiences",
    "History",
    "Shop",
    "MLB.com",
    "Lang",
    "Official Info",
    "About MLB",
    "Team Information",
    "Official Rules",
    "Replay Review Regulations",
    "Umpires",
    "Advertise with Us",
    "Press Releases",
    "Accessibility Information",
    "Help/Contact Us",
    "MLB App FAQs",
    "MLB.TV Help Center",
    "Shop Help",
    "Careers Home",
    "Terms of Use",
    "Privacy Policy",
    "Legal Notices",
)

# A line consisting of exactly three capital letters (a team abbreviation).
ABBR_RE = re.compile(r"^[A-Z]{3}$")
# A final-score line such as "JPN 8, KOR 6": (team, runs, team, runs).
FINAL_RE = re.compile(r"^([A-Z]{3})\s+(\d+),\s+([A-Z]{3})\s+(\d+)$")
# A start-time line such as "7:00 PM ET".
TIME_RE = re.compile(r"^\d{1,2}:\d{2}\s+[AP]M\s+ET$")
# A line that is just a weekday name (case-insensitive).
# NOTE(review): not referenced by the code visible in this file section.
DATE_HEADING_RE = re.compile(r"^(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)$", re.I)
def _strip_html_to_lines(html: str) -> list[str]:
    """Reduce an HTML document to its meaningful visible text lines.

    ``<script>`` and ``<style>`` elements are removed together with their
    contents, every remaining tag becomes a line break, and HTML entities are
    decoded. Each resulting line is stripped; empty lines, lines that start
    with a known noise prefix, and consecutive duplicates are dropped.

    Args:
        html: Raw HTML markup.

    Returns:
        The surviving text lines in document order.
    """
    # Imported by name at function scope: the ``html`` parameter shadows the
    # stdlib module of the same name inside this function.
    from html import unescape

    text = re.sub(r"<script.*?</script>", " ", html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<style.*?</style>", " ", text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<[^>]+>", "\n", text)

    # Decode entities only after tags are gone so an escaped "&lt;" can never
    # be mistaken for markup. Non-breaking spaces are normalised to plain
    # spaces both before and after decoding.
    text = text.replace("&nbsp;", " ")
    text = unescape(text).replace("\xa0", " ")
    text = text.replace("\r", "\n")

    noise_prefixes = ("Image:", "calendar-", "schedule-tickets-", "search-") + tuple(IGNORE_PREFIXES)

    kept: list[str] = []
    for raw in text.split("\n"):
        line = raw.strip()
        if not line or line.startswith(noise_prefixes):
            continue
        # Collapse runs of identical neighbouring lines into one.
        if kept and kept[-1] == line:
            continue
        kept.append(line)
    return kept
def _full_team(abbr: str) -> str:
    """Return the full team name for *abbr*, or *abbr* itself if it is not in ``TEAM_MAP``."""
    return TEAM_MAP.get(abbr, abbr)
def _skip_repeats(lines: list[str], start: int, value: str) -> int:
    """Return the first index at or after *start* whose line differs from *value*."""
    idx = start
    while idx < len(lines) and lines[idx] == value:
        idx += 1
    return idx


def _parse_schedule_lines(lines: list[str], date_str: str) -> pd.DataFrame:
    """Extract one row per ``AWAY @ HOME`` matchup found in *lines*.

    A matchup is a three-letter abbreviation, an ``"@"`` line, and a second
    three-letter abbreviation; repeated copies of either abbreviation are
    skipped. The line after the matchup, if recognised, sets the status:
    a final score for these two teams (``"Final"``), ``"LIVE"`` (``"Live"``),
    a start time (``"Scheduled"``) or a line starting with ``"Preview"``
    (``"Preview"``). Otherwise the status stays empty. A following TV marker
    is recorded as the broadcaster.

    Args:
        lines: Text lines in page order.
        date_str: Date label copied into ``game_date`` and used in ``game_id``.

    Returns:
        A DataFrame with a fixed set of columns, one row per matchup. The
        columns are present even when no matchup is found.
    """
    from datetime import timezone

    columns = [
        "fetched_at",
        "game_id",
        "game_date",
        "status",
        "away_team",
        "home_team",
        "away_score",
        "home_score",
        "venue",
        "tv",
        "start_time_et",
    ]

    # One naive-UTC timestamp for the whole parse, so every row of a fetch
    # agrees. Built without the deprecated ``datetime.utcnow()``.
    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None)

    rows: list[dict[str, Any]] = []
    total = len(lines)
    i = 0
    while i < total:
        away_abbr = lines[i]
        if not ABBR_RE.match(away_abbr):
            i += 1
            continue

        # Expect: AWAY [AWAY ...] "@" HOME [HOME ...]
        j = _skip_repeats(lines, i + 1, away_abbr)
        if j >= total or lines[j] != "@":
            i += 1
            continue
        j += 1
        if j >= total or not ABBR_RE.match(lines[j]):
            i += 1
            continue
        home_abbr = lines[j]
        j = _skip_repeats(lines, j + 1, home_abbr)

        status = ""
        away_score = None
        home_score = None
        game_time = ""
        tv = ""

        if j < total:
            token = lines[j]
            final_match = FINAL_RE.match(token)
            if final_match:
                first, first_runs, second, second_runs = final_match.groups()
                # Only accept a score line that names exactly these two teams;
                # otherwise leave it unconsumed.
                if {first, second} == {away_abbr, home_abbr}:
                    status = "Final"
                    if first == away_abbr:
                        away_score, home_score = int(first_runs), int(second_runs)
                    else:
                        away_score, home_score = int(second_runs), int(first_runs)
                    j += 1
            elif token == "LIVE":
                status = "Live"
                j += 1
            elif TIME_RE.match(token):
                status = "Scheduled"
                game_time = token
                j += 1
            elif token.startswith("Preview"):
                status = "Preview"
                j += 1

        if j < total and lines[j] in TV_MARKERS:
            tv = lines[j]
            j += 1

        rows.append(
            {
                "fetched_at": fetched_at,
                "game_id": f"{date_str}:{away_abbr}:{home_abbr}",
                "game_date": date_str,
                "status": status,
                "away_team": _full_team(away_abbr),
                "home_team": _full_team(home_abbr),
                "away_score": away_score,
                "home_score": home_score,
                "venue": "",
                "tv": tv,
                "start_time_et": game_time,
            }
        )
        # Resume scanning after everything consumed for this game.
        i = j

    return pd.DataFrame(rows, columns=columns)
 def fetch_schedule_for_date(date_str: str) -> pd.DataFrame:
+    url = SCHEDULE_URL_TEMPLATE.format(date_str=date_str)
+    response = requests.get(url, headers=HEADERS, timeout=30)
+    response.raise_for_status()
+    lines = _strip_html_to_lines(response.text)
+    df = _parse_schedule_lines(lines, date_str=date_str)
+    if df.empty:
+        return pd.DataFrame(
+            columns=[
+                "fetched_at",
+                "game_id",
+                "game_date",
+                "status",
+                "away_team",
+                "home_team",
+                "away_score",
+                "home_score",
+                "venue",
+                "tv",
+                "start_time_et",
+            ]
+        )
+    return df.sort_values(["game_date", "away_team", "home_team"]).reset_index(drop=True)