2026_MLB_Model / data /schedule.py
Syntrex's picture
Audit-confirmed fixes: matchup confidence blend + platoon unknown handling
95e27f5
raw
history blame
8.97 kB
from __future__ import annotations
import re
from datetime import datetime
from typing import Any
import pandas as pd
import requests
from utils.logger import logger
# World Baseball Classic schedule page; ``{date_str}`` is filled in by
# ``_fetch_wbc_schedule_for_date``.
WBC_SCHEDULE_URL_TEMPLATE = "https://www.mlb.com/world-baseball-classic/schedule/{date_str}"
# JSON schedule endpoint queried by ``_fetch_mlb_schedule_for_date``.
SCHEDULE_API_URL = "https://statsapi.mlb.com/api/v1/schedule"
# Request headers sent with every HTTP call made from this module.
HEADERS = {
"User-Agent": "Mozilla/5.0",
"Accept-Language": "en-US,en;q=0.9",
}
# Three-letter team abbreviation -> full team name (looked up by ``_full_team``).
TEAM_MAP = {
"AUS": "Australia",
"BRA": "Brazil",
"CAN": "Canada",
"CHN": "China",
"TPE": "Chinese Taipei",
"COL": "Colombia",
"CUB": "Cuba",
"CZE": "Czechia",
"DOM": "Dominican Republic",
"GBR": "Great Britain",
"ISR": "Israel",
"ITA": "Italy",
"JPN": "Japan",
"KOR": "Korea",
"MEX": "Mexico",
"NED": "Netherlands",
"NCA": "Nicaragua",
"PAN": "Panama",
"PUR": "Puerto Rico",
"USA": "United States",
"VEN": "Venezuela",
}
# Alternate team spellings -> the spelling used in ``TEAM_MAP`` values
# (looked up by ``_normalize_team_name``). The "Chinese Taipei" entry maps to
# itself, so it has no effect on the result.
TEAM_NORMALIZATION = {
"Chinese Taipei": "Chinese Taipei",
"Czech Republic": "Czechia",
"South Korea": "Korea",
"USA": "United States",
"U.S.A.": "United States",
}
# Text lines that the WBC page parser accepts as a TV broadcaster.
TV_MARKERS = {"FS1", "FS2", "FOX", "Tubi"}
# A whole line that is a start time such as "7:05 PM ET".
TIME_RE = re.compile(r"^\d{1,2}:\d{2}\s+[AP]M\s+ET$")
# A whole line consisting of exactly three uppercase ASCII letters.
ABBR_RE = re.compile(r"^[A-Z]{3}$")
# Captures the digits that follow "/gameday/" in the page HTML.
GAME_PK_RE = re.compile(r"/gameday/(\d+)")
def _normalize_team_name(name: Any) -> str:
text = str(name or "").strip()
if not text:
return ""
return TEAM_NORMALIZATION.get(text, text)
def _strip_html_to_lines(html: str) -> list[str]:
text = re.sub(r"<script.*?</script>", " ", html, flags=re.DOTALL | re.IGNORECASE)
text = re.sub(r"<style.*?</style>", " ", text, flags=re.DOTALL | re.IGNORECASE)
text = re.sub(r"<[^>]+>", "\n", text)
text = text.replace("&nbsp;", " ")
text = re.sub(r"\r", "\n", text)
text = re.sub(r"\n+", "\n", text)
raw_lines = [line.strip() for line in text.split("\n") if line.strip()]
cleaned: list[str] = []
for line in raw_lines:
if line.startswith("Image:"):
continue
if line.startswith("calendar-") or line.startswith("schedule-tickets-") or line.startswith("search-"):
continue
cleaned.append(line)
deduped: list[str] = []
for line in cleaned:
if not deduped or deduped[-1] != line:
deduped.append(line)
return deduped
def _full_team(abbr: str) -> str:
    """Expand a team abbreviation via ``TEAM_MAP``.

    Abbreviations without an entry are returned unchanged.
    """
    try:
        return TEAM_MAP[abbr]
    except KeyError:
        return abbr
def _extract_game_pks(html: str) -> list[str]:
    """Return the distinct ``/gameday/<digits>`` ids found in ``html``.

    Ids are returned as strings, in order of first appearance.
    """
    # dict keys keep insertion order, which gives an ordered de-duplication.
    return list(dict.fromkeys(GAME_PK_RE.findall(html)))
def _fetch_wbc_schedule_for_date(date_str: str) -> pd.DataFrame:
    """Scrape the World Baseball Classic schedule page for one date.

    The page is downloaded, flattened to text lines with
    ``_strip_html_to_lines`` and scanned for the sequence
    ``<AWAY abbr>``, ``@``, ``<HOME abbr>``. Directly after a matchup the
    parser optionally consumes one status line (``LIVE``, ``Final...``,
    a ``H:MM AM/PM ET`` start time, or ``Preview...``) and then one TV
    line listed in ``TV_MARKERS``.

    Args:
        date_str: Date inserted into ``WBC_SCHEDULE_URL_TEMPLATE``. It is
            also stored verbatim as ``game_date`` and used as the prefix
            of ``game_id``. (Presumably ``YYYY-MM-DD`` -- confirm against
            callers; this function does not validate it.)

    Returns:
        A DataFrame with one row per detected matchup. Score, hit and
        error columns are always ``None``; ``venue`` and
        ``game_datetime_utc`` are always ``""``; ``sport_id`` is always
        ``51``. If no matchup is found the DataFrame has no rows and no
        columns.

    Raises:
        requests.RequestException: If the request fails or the server
            answers with an error status (``raise_for_status``).
    """
    # Local import: the module header only imports ``datetime`` itself.
    from datetime import timezone

    url = WBC_SCHEDULE_URL_TEMPLATE.format(date_str=date_str)
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    html = response.text

    lines = _strip_html_to_lines(html)
    game_pks = _extract_game_pks(html)
    total = len(lines)

    # One timestamp per fetch. ``datetime.utcnow()`` is deprecated; dropping
    # the tzinfo keeps the value a naive UTC datetime exactly as before.
    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None)

    rows: list[dict[str, Any]] = []
    # NOTE(review): game ids are paired with matchups purely by order of
    # appearance. This assumes the page links each game once, in the same
    # order in which the matchups are listed -- confirm against the page.
    game_pk_index = 0
    i = 0
    while i < total:
        away_abbr = lines[i]
        if not ABBR_RE.match(away_abbr):
            i += 1
            continue

        # Skip repeated occurrences of the away abbreviation.
        j = i + 1
        while j < total and lines[j] == away_abbr:
            j += 1

        # A matchup needs the "@" separator followed by a home abbreviation;
        # otherwise resume scanning at the line after the candidate.
        if j >= total or lines[j] != "@":
            i += 1
            continue
        j += 1
        if j >= total or not ABBR_RE.match(lines[j]):
            i += 1
            continue

        home_abbr = lines[j]
        j += 1
        while j < total and lines[j] == home_abbr:
            j += 1

        status = ""
        start_time_et = ""
        tv = ""
        if j < total:
            token = lines[j]
            if token == "LIVE":
                status = "Live"
                j += 1
            elif token.startswith("Final"):
                # Keep the page's wording verbatim.
                status = token
                j += 1
            elif TIME_RE.match(token):
                status = "Scheduled"
                start_time_et = token
                j += 1
            elif token.startswith("Preview"):
                status = "Preview"
                j += 1
            # Any other token is left unconsumed and the status stays "".

        if j < total and lines[j] in TV_MARKERS:
            tv = lines[j]
            j += 1

        game_pk = game_pks[game_pk_index] if game_pk_index < len(game_pks) else ""
        game_pk_index += 1

        rows.append(
            {
                "fetched_at": fetched_at,
                "game_id": f"{date_str}:{away_abbr}:{home_abbr}",
                "game_date": date_str,
                "game_pk": game_pk,
                "status": status,
                "away_team": _full_team(away_abbr),
                "home_team": _full_team(home_abbr),
                "away_score": None,
                "home_score": None,
                "away_hits": None,
                "home_hits": None,
                "away_errors": None,
                "home_errors": None,
                "venue": "",
                "game_datetime_utc": "",
                "tv": tv,
                "start_time_et": start_time_et,
                "sport_id": 51,
            }
        )
        # Continue scanning after everything this matchup consumed.
        i = j

    return pd.DataFrame(rows)
def _fetch_mlb_schedule_for_date(date_str: str) -> pd.DataFrame:
    """Fetch one date's schedule from the MLB schedule API.

    Queries ``SCHEDULE_API_URL`` with ``sportId=1`` and the ``broadcasts``
    hydration, then builds one row per game that has both team names.

    Args:
        date_str: Value sent as the API's ``date`` parameter. It is also
            stored verbatim as ``game_date`` and used as the prefix of
            ``game_id``. (Format is not validated here -- confirm against
            callers.)

    Returns:
        A DataFrame with one row per game. ``status`` is ``"Live"``,
        ``"Final"`` or ``"Scheduled"`` when ``abstractGameState`` is
        live/final/preview, otherwise the API's ``detailedState``.
        ``start_time_et`` looks like ``"7:05 PM ET"`` (``""`` when the
        game time is missing or unparseable). ``tv`` is a comma-separated
        list of distinct broadcast names. Score, hit and error columns
        are always ``None``. If there are no games the DataFrame has no
        rows and no columns.

    Raises:
        requests.RequestException: If the request fails or the server
            answers with an error status (``raise_for_status``).
        ValueError: If the response body is not valid JSON.
    """
    # Local import: the module header only imports ``datetime`` itself.
    from datetime import timezone

    params = {
        "sportId": 1,
        "date": date_str,
        "hydrate": "broadcasts",
    }
    response = requests.get(SCHEDULE_API_URL, headers=HEADERS, params=params, timeout=30)
    response.raise_for_status()
    payload = response.json()

    status_by_abstract_state = {
        "live": "Live",
        "final": "Final",
        "preview": "Scheduled",
    }
    # One timestamp per fetch. ``datetime.utcnow()`` is deprecated; dropping
    # the tzinfo keeps the value a naive UTC datetime exactly as before.
    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None)

    rows: list[dict[str, Any]] = []
    for date_block in payload.get("dates", []) or []:
        for game in date_block.get("games", []) or []:
            teams = game.get("teams", {}) or {}
            away = teams.get("away", {}) or {}
            home = teams.get("home", {}) or {}
            away_team = _normalize_team_name((away.get("team", {}) or {}).get("name"))
            home_team = _normalize_team_name((home.get("team", {}) or {}).get("name"))
            if not (away_team and home_team):
                # A game without both team names cannot be identified.
                continue

            game_pk = game.get("gamePk")

            status_info = game.get("status", {}) or {}
            detailed_state = str(status_info.get("detailedState", "") or "").strip()
            abstract_state = str(status_info.get("abstractGameState", "") or "").strip().lower()
            status = status_by_abstract_state.get(abstract_state, detailed_state)

            game_datetime = game.get("gameDate", "")
            start_time_et = ""
            if game_datetime:
                # Best effort: an unparseable game time leaves the field blank
                # instead of discarding the whole schedule.
                try:
                    ts = pd.to_datetime(game_datetime, utc=True).tz_convert("America/New_York")
                    if not pd.isna(ts):
                        # Built by hand instead of strftime("%-I:%M %p ET"):
                        # the "-" flag is a platform-specific extension, and
                        # where it is unsupported the error was swallowed and
                        # the start time silently came out empty.
                        hour_12 = ts.hour % 12 or 12
                        meridiem = "AM" if ts.hour < 12 else "PM"
                        start_time_et = f"{hour_12}:{ts.minute:02d} {meridiem} ET"
                except Exception:
                    start_time_et = ""

            # Distinct, non-empty broadcast names in their original order.
            broadcast_names = (
                str((b.get("name") or "")).strip()
                for b in game.get("broadcasts", []) or []
            )
            tv = ", ".join(dict.fromkeys(n for n in broadcast_names if n))

            # NOTE(review): game_id carries no game number, so two games
            # between the same teams on one date would share an id (game_pk
            # still differs) -- confirm what downstream code keys on.
            rows.append(
                {
                    "fetched_at": fetched_at,
                    "game_id": f"{date_str}:{away_team}:{home_team}",
                    "game_date": date_str,
                    "game_pk": str(game_pk) if game_pk is not None else "",
                    "status": status,
                    "away_team": away_team,
                    "home_team": home_team,
                    "away_score": None,
                    "home_score": None,
                    "away_hits": None,
                    "home_hits": None,
                    "away_errors": None,
                    "home_errors": None,
                    "venue": str((game.get("venue", {}) or {}).get("name", "") or "").strip(),
                    "game_datetime_utc": str(game_datetime or "").strip(),
                    "tv": tv,
                    "start_time_et": start_time_et,
                    "sport_id": 1,
                }
            )

    return pd.DataFrame(rows)
def fetch_schedule_for_date(date_str: str) -> pd.DataFrame:
    """Return the schedule for ``date_str``, never raising on fetch errors.

    The MLB schedule is requested via ``_fetch_mlb_schedule_for_date``. A
    non-empty result is returned as is. If the fetch raises, the error is
    logged as a warning (with traceback) and an empty DataFrame is
    returned; an empty or ``None`` result also yields an empty DataFrame.
    """
    schedule = None
    try:
        schedule = _fetch_mlb_schedule_for_date(date_str)
        has_games = schedule is not None and not schedule.empty
    except Exception as e:
        logger.warning(f"[schedule_fetch] failure: {e}", exc_info=True)
        has_games = False

    if has_games:
        return schedule
    return pd.DataFrame()