Spaces:

Syntrex
/

2026_MLB_Model

Sleeping

App Files Files

Syntrex committed on Mar 7

Commit

c369231

verified ·

1 Parent(s): 46e00bf

Update data/schedule.py

Browse files

Files changed (1) hide show

data/schedule.py +45 -100

data/schedule.py CHANGED Viewed

@@ -39,46 +39,9 @@ TEAM_MAP = {
 }
 TV_MARKERS = {"FS1", "FS2", "FOX", "Tubi"}
-IGNORE_PREFIXES = (
-    "Tickets",
-    "Schedule",
-    "Scores",
-    "Stats",
-    "Standings",
-    "Bracket",
-    "Teams",
-    "Watch",
-    "News",
-    "Venues",
-    "Experiences",
-    "History",
-    "Shop",
-    "MLB.com",
-    "Lang",
-    "Official Info",
-    "About MLB",
-    "Team Information",
-    "Official Rules",
-    "Replay Review Regulations",
-    "Umpires",
-    "Advertise with Us",
-    "Press Releases",
-    "Accessibility Information",
-    "Help/Contact Us",
-    "MLB App FAQs",
-    "MLB.TV Help Center",
-    "Shop Help",
-    "Careers Home",
-    "Terms of Use",
-    "Privacy Policy",
-    "Legal Notices",
-)
-ABBR_RE = re.compile(r"^[A-Z]{3}$")
-FINAL_RE = re.compile(r"^([A-Z]{3})\s+(\d+),\s+([A-Z]{3})\s+(\d+)$")
 TIME_RE = re.compile(r"^\d{1,2}:\d{2}\s+[AP]M\s+ET$")
-DATE_HEADING_RE = re.compile(r"^(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)$", re.I)
 def _strip_html_to_lines(html: str) -> list[str]:
@@ -89,23 +52,18 @@ def _strip_html_to_lines(html: str) -> list[str]:
     text = re.sub(r"\r", "\n", text)
     text = re.sub(r"\n+", "\n", text)
-    raw_lines = [line.strip() for line in text.split("\n")]
-    lines: list[str] = []
     for line in raw_lines:
-        if not line:
-            continue
         if line.startswith("Image:"):
             continue
         if line.startswith("calendar-") or line.startswith("schedule-tickets-") or line.startswith("search-"):
             continue
-        if any(line.startswith(prefix) for prefix in IGNORE_PREFIXES):
-            continue
-        lines.append(line)
-    # remove consecutive duplicates
     deduped: list[str] = []
-    for line in lines:
         if not deduped or deduped[-1] != line:
             deduped.append(line)
@@ -116,18 +74,36 @@ def _full_team(abbr: str) -> str:
     return TEAM_MAP.get(abbr, abbr)
-def _parse_schedule_lines(lines: list[str], date_str: str) -> pd.DataFrame:
     rows: list[dict[str, Any]] = []
     i = 0
     while i < len(lines):
         line = lines[i]
         if ABBR_RE.match(line):
             away_abbr = line
-            # skip duplicated team abbreviation if present
             j = i + 1
             while j < len(lines) and lines[j] == away_abbr:
                 j += 1
@@ -147,32 +123,21 @@ def _parse_schedule_lines(lines: list[str], date_str: str) -> pd.DataFrame:
                 j += 1
             status = ""
-            away_score = None
-            home_score = None
-            game_time = ""
             tv = ""
             if j < len(lines):
                 token = lines[j]
-                final_match = FINAL_RE.match(token)
-                if final_match:
-                    a1, s1, a2, s2 = final_match.groups()
-                    if {a1, a2} == {away_abbr, home_abbr}:
-                        status = "Final"
-                        if a1 == away_abbr:
-                            away_score = int(s1)
-                            home_score = int(s2)
-                        else:
-                            away_score = int(s2)
-                            home_score = int(s1)
-                        j += 1
-                elif token == "LIVE":
                     status = "Live"
                     j += 1
                 elif TIME_RE.match(token):
                     status = "Scheduled"
-                    game_time = token
                     j += 1
                 elif token.startswith("Preview"):
                     status = "Preview"
@@ -182,19 +147,27 @@ def _parse_schedule_lines(lines: list[str], date_str: str) -> pd.DataFrame:
                 tv = lines[j]
                 j += 1
             rows.append(
                 {
                     "fetched_at": datetime.utcnow(),
                     "game_id": f"{date_str}:{away_abbr}:{home_abbr}",
                     "game_date": date_str,
                     "status": status,
                     "away_team": _full_team(away_abbr),
                     "home_team": _full_team(home_abbr),
-                    "away_score": away_score,
-                    "home_score": home_score,
                     "venue": "",
                     "tv": tv,
-                    "start_time_et": game_time,
                 }
             )
@@ -203,32 +176,4 @@ def _parse_schedule_lines(lines: list[str], date_str: str) -> pd.DataFrame:
         i += 1
-    return pd.DataFrame(rows)
-def fetch_schedule_for_date(date_str: str) -> pd.DataFrame:
-    url = SCHEDULE_URL_TEMPLATE.format(date_str=date_str)
-    response = requests.get(url, headers=HEADERS, timeout=30)
-    response.raise_for_status()
-    lines = _strip_html_to_lines(response.text)
-    df = _parse_schedule_lines(lines, date_str=date_str)
-    if df.empty:
-        return pd.DataFrame(
-            columns=[
-                "fetched_at",
-                "game_id",
-                "game_date",
-                "status",
-                "away_team",
-                "home_team",
-                "away_score",
-                "home_score",
-                "venue",
-                "tv",
-                "start_time_et",
-            ]
-        )
-    return df.sort_values(["game_date", "away_team", "home_team"]).reset_index(drop=True)

 }
 TV_MARKERS = {"FS1", "FS2", "FOX", "Tubi"}
 TIME_RE = re.compile(r"^\d{1,2}:\d{2}\s+[AP]M\s+ET$")
+ABBR_RE = re.compile(r"^[A-Z]{3}$")
+GAME_PK_RE = re.compile(r"/gameday/(\d+)")
 def _strip_html_to_lines(html: str) -> list[str]:
     text = re.sub(r"\r", "\n", text)
     text = re.sub(r"\n+", "\n", text)
+    raw_lines = [line.strip() for line in text.split("\n") if line.strip()]
+    cleaned: list[str] = []
     for line in raw_lines:
         if line.startswith("Image:"):
             continue
         if line.startswith("calendar-") or line.startswith("schedule-tickets-") or line.startswith("search-"):
             continue
+        cleaned.append(line)
     deduped: list[str] = []
+    for line in cleaned:
         if not deduped or deduped[-1] != line:
             deduped.append(line)
     return TEAM_MAP.get(abbr, abbr)
+def _extract_game_pks(html: str) -> list[str]:
+    found = GAME_PK_RE.findall(html)
+    seen = []
+    for pk in found:
+        if pk not in seen:
+            seen.append(pk)
+    return seen
+def fetch_schedule_for_date(date_str: str) -> pd.DataFrame:
+    url = SCHEDULE_URL_TEMPLATE.format(date_str=date_str)
+    response = requests.get(url, headers=HEADERS, timeout=30)
+    response.raise_for_status()
+    html = response.text
+    lines = _strip_html_to_lines(html)
+    game_pks = _extract_game_pks(html)
     rows: list[dict[str, Any]] = []
     i = 0
+    game_pk_index = 0
     while i < len(lines):
         line = lines[i]
         if ABBR_RE.match(line):
             away_abbr = line
             j = i + 1
             while j < len(lines) and lines[j] == away_abbr:
                 j += 1
                 j += 1
             status = ""
+            start_time_et = ""
             tv = ""
             if j < len(lines):
                 token = lines[j]
+                if token == "LIVE":
                     status = "Live"
                     j += 1
+                elif token.startswith("Final"):
+                    status = token
+                    j += 1
                 elif TIME_RE.match(token):
                     status = "Scheduled"
+                    start_time_et = token
                     j += 1
                 elif token.startswith("Preview"):
                     status = "Preview"
                 tv = lines[j]
                 j += 1
+            game_pk = game_pks[game_pk_index] if game_pk_index < len(game_pks) else ""
+            game_pk_index += 1
             rows.append(
                 {
                     "fetched_at": datetime.utcnow(),
                     "game_id": f"{date_str}:{away_abbr}:{home_abbr}",
                     "game_date": date_str,
+                    "game_pk": game_pk,
                     "status": status,
                     "away_team": _full_team(away_abbr),
                     "home_team": _full_team(home_abbr),
+                    "away_score": None,
+                    "home_score": None,
+                    "away_hits": None,
+                    "home_hits": None,
+                    "away_errors": None,
+                    "home_errors": None,
                     "venue": "",
                     "tv": tv,
+                    "start_time_et": start_time_et,
                 }
             )
         i += 1
+    return pd.DataFrame(rows)