2026_MLB_Model / data /scores.py
Syntrex's picture
Audit-confirmed fixes: matchup confidence blend + platoon unknown handling
95e27f5
raw
history blame
6.79 kB
from __future__ import annotations

import logging
from datetime import datetime, timezone
from typing import Any

import pandas as pd
import requests
# HTTP headers attached to every request made against SCORES_API_URL.
HEADERS: dict[str, str] = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "en-US,en;q=0.9",
}

# Schedule endpoint that the score fetchers in this module query.
SCORES_API_URL: str = "https://statsapi.mlb.com/api/v1/schedule"

# ``sportId`` values iterated by ``fetch_scores_for_date``.
SPORT_IDS: list[int] = [1]  # MLB only

# Team-name aliases: a name received from the API (key) is replaced by the
# canonical spelling (value); names that are not listed pass through as-is.
TEAM_NORMALIZATION: dict[str, str] = {
    "Chinese Taipei": "Chinese Taipei",
    "Czech Republic": "Czechia",
    "South Korea": "Korea",
    "USA": "United States",
    "U.S.A.": "United States",
}
def _normalize_team_name(name: Any) -> str:
text = str(name or "").strip()
if not text:
return ""
return TEAM_NORMALIZATION.get(text, text)
def _normalize_status(abstract_state: str, detailed_state: str, inning_state: str, current_inning: Any) -> str:
abstract = str(abstract_state or "").strip().lower()
detailed = str(detailed_state or "").strip()
inning = str(inning_state or "").strip()
inning_num = current_inning
if abstract == "final":
return "Final"
if abstract == "live":
if inning and inning_num not in (None, ""):
return f"{inning} {inning_num}"
return detailed or "Live"
if abstract == "preview":
return "Scheduled"
if str(detailed).strip():
return str(detailed).strip()
return ""
def _safe_int(value: Any) -> int | None:
try:
if value is None:
return None
text = str(value).strip().lower()
if text in {"", "nan", "none"}:
return None
return int(float(value))
except Exception:
return None
def _format_start_time_et(game_datetime: Any) -> str:
    """Render a ``gameDate`` value as e.g. ``"7:05 PM ET"``; ``""`` if unusable."""
    if not game_datetime:
        return ""
    try:
        ts = pd.to_datetime(game_datetime, utc=True).tz_convert("America/New_York")
        if pd.isna(ts):
            return ""
        # Built by hand instead of strftime("%-I:%M %p ET"): the "%-I" flag is
        # a platform extension, and where it is unsupported the call raised and
        # the start time was silently dropped.
        hour_12 = ts.hour % 12 or 12
        meridiem = "AM" if ts.hour < 12 else "PM"
        return f"{hour_12}:{ts.minute:02d} {meridiem} ET"
    except Exception:
        # Deliberately best-effort: an unparsable start time must not cost
        # the whole row.
        return ""


def _broadcast_names(broadcasts: Any) -> str:
    """Join the distinct, non-blank broadcast names in first-seen order."""
    stripped = (str(b.get("name") or "").strip() for b in broadcasts or [])
    # dict.fromkeys de-duplicates while keeping insertion order.
    return ", ".join(dict.fromkeys(name for name in stripped if name))


def _fetch_scores_for_sport_id(date_str: str, sport_id: int) -> pd.DataFrame:
    """Fetch one day's games for one sport id and flatten them into rows.

    Queries ``SCORES_API_URL`` with the ``linescore`` and ``broadcasts``
    hydrations and emits one row per game that has both team names.

    Args:
        date_str: Date passed through as the ``date`` query parameter and
            copied into the ``game_date`` column.
        sport_id: Value passed as the ``sportId`` query parameter and copied
            into the ``sport_id`` column.

    Returns:
        A DataFrame with the columns ``score_fetch_time``, ``game_date``,
        ``game_pk``, ``away_team``, ``home_team``, ``away_score``,
        ``home_score``, ``away_hits``, ``home_hits``, ``away_errors``,
        ``home_errors``, ``status``, ``start_time_et``, ``tv`` and
        ``sport_id``, de-duplicated on (game_pk, away_team, home_team,
        status). A completely empty ``pd.DataFrame()`` (no columns) is
        returned when no usable game was found.

    Raises:
        requests.RequestException: On transport errors or a non-2xx response
            (via ``raise_for_status``).
    """
    params = {
        "sportId": sport_id,
        "date": date_str,
        "hydrate": "linescore,broadcasts",
    }
    response = requests.get(SCORES_API_URL, headers=HEADERS, params=params, timeout=30)
    response.raise_for_status()
    payload = response.json()

    # One naive-UTC timestamp for the whole fetch. datetime.utcnow() is
    # deprecated since Python 3.12; stripping tzinfo keeps the column naive,
    # exactly as before.
    fetch_time = datetime.now(timezone.utc).replace(tzinfo=None)

    rows: list[dict[str, Any]] = []
    for date_block in payload.get("dates", []) or []:
        for game in date_block.get("games", []) or []:
            teams = game.get("teams", {}) or {}
            away = teams.get("away", {}) or {}
            home = teams.get("home", {}) or {}
            away_team = _normalize_team_name((away.get("team", {}) or {}).get("name"))
            home_team = _normalize_team_name((home.get("team", {}) or {}).get("name"))
            if not (away_team and home_team):
                # A game without both team names cannot be keyed; skip it.
                continue

            status_info = game.get("status", {}) or {}
            linescore = game.get("linescore", {}) or {}
            status = _normalize_status(
                abstract_state=status_info.get("abstractGameState", ""),
                detailed_state=status_info.get("detailedState", ""),
                inning_state=linescore.get("inningState", ""),
                current_inning=linescore.get("currentInning"),
            )

            linescore_teams = linescore.get("teams", {}) or {}
            away_ls = linescore_teams.get("away", {}) or {}
            home_ls = linescore_teams.get("home", {}) or {}

            # Prefer the score on the team entry; fall back to linescore runs.
            away_score = _safe_int(away.get("score"))
            if away_score is None:
                away_score = _safe_int(away_ls.get("runs"))
            home_score = _safe_int(home.get("score"))
            if home_score is None:
                home_score = _safe_int(home_ls.get("runs"))

            game_pk = game.get("gamePk")
            rows.append(
                {
                    "score_fetch_time": fetch_time,
                    "game_date": date_str,
                    "game_pk": str(game_pk) if game_pk is not None else "",
                    "away_team": away_team,
                    "home_team": home_team,
                    "away_score": away_score,
                    "home_score": home_score,
                    "away_hits": _safe_int(away_ls.get("hits")),
                    "home_hits": _safe_int(home_ls.get("hits")),
                    "away_errors": _safe_int(away_ls.get("errors")),
                    "home_errors": _safe_int(home_ls.get("errors")),
                    "status": status,
                    "start_time_et": _format_start_time_et(game.get("gameDate", "")),
                    "tv": _broadcast_names(game.get("broadcasts", [])),
                    "sport_id": sport_id,
                }
            )

    if not rows:
        return pd.DataFrame()

    df = pd.DataFrame(rows)
    df = df.drop_duplicates(subset=["game_pk", "away_team", "home_team", "status"], keep="last")
    return df.reset_index(drop=True)
def fetch_scores_for_date(date_str: str) -> pd.DataFrame:
    """Fetch scores for every id in ``SPORT_IDS`` and merge them into one frame.

    The fetch is best-effort per sport id: a failure for one id is logged as a
    warning (with traceback) and the remaining ids are still processed, so a
    failed fetch is no longer indistinguishable from a day without games.

    Args:
        date_str: Date forwarded unchanged to ``_fetch_scores_for_sport_id``.

    Returns:
        A DataFrame de-duplicated on (game_pk, away_team, home_team), keeping
        the last occurrence, with a fresh 0..n-1 index. When nothing could be
        fetched, an empty DataFrame that still carries the full column set.
    """
    columns = [
        "score_fetch_time",
        "game_date",
        "game_pk",
        "away_team",
        "home_team",
        "away_score",
        "home_score",
        "away_hits",
        "home_hits",
        "away_errors",
        "home_errors",
        "status",
        "start_time_et",
        "tv",
        "sport_id",
    ]

    parts: list[pd.DataFrame] = []
    for sport_id in SPORT_IDS:
        try:
            sport_df = _fetch_scores_for_sport_id(date_str, sport_id)
        except Exception:
            # Boundary handler: keep going, but leave a trace instead of
            # silently swallowing the error.
            logging.getLogger(__name__).warning(
                "Score fetch failed for sport_id=%s on %s",
                sport_id,
                date_str,
                exc_info=True,
            )
        else:
            if sport_df is not None and not sport_df.empty:
                parts.append(sport_df)

    if not parts:
        return pd.DataFrame(columns=columns)

    df = pd.concat(parts, ignore_index=True)
    df = df.drop_duplicates(subset=["game_pk", "away_team", "home_team"], keep="last")
    return df.reset_index(drop=True)