""" data/mlb_starters.py Fetches probable starting pitchers for upcoming MLB games from the public MLB Stats API. Used by the Props page to enrich HR props with matchup context. Returns a dict keyed by (away_team, home_team) canonical names → pitcher names. Both teams in the key are normalized to lowercase stripped strings for fuzzy matching. """ from __future__ import annotations import logging import re import time import unicodedata from datetime import timedelta from typing import Any import pandas as pd import requests from data.odds_name_map import normalize_pitcher_name from database.db import log_pitcher_resolution _log = logging.getLogger(__name__) _SCHEDULE_URL = "https://statsapi.mlb.com/api/v1/schedule" def _normalize_team(name: str) -> str: text = str(name or "").strip().lower() text = unicodedata.normalize("NFKD", text) text = "".join(ch for ch in text if not unicodedata.combining(ch)) text = re.sub(r"[^a-z0-9 ]", "", text) text = re.sub(r"\s+", " ", text).strip() return text # Maps every known team name variant (abbreviation or full name, post-_normalize_team) to a # stable canonical code. Bridges the gap between statcast abbreviations (e.g. "sfg") and # Odds API / MLB Stats API full names (e.g. "san francisco giants"). _MLB_TEAM_CANONICAL: dict[str, str] = { # Arizona Diamondbacks "ari": "ari", "arizona": "ari", "arizona diamondbacks": "ari", "diamondbacks": "ari", "dbacks": "ari", # Atlanta Braves "atl": "atl", "atlanta": "atl", "atlanta braves": "atl", "braves": "atl", # Baltimore Orioles "bal": "bal", "baltimore": "bal", "baltimore orioles": "bal", "orioles": "bal", # Boston Red Sox "bos": "bos", "boston": "bos", "boston red sox": "bos", "red sox": "bos", # Chicago Cubs "chc": "chc", "chicago cubs": "chc", "cubs": "chc", # Chicago White Sox "cws": "cws", "chw": "cws", "chicago white sox": "cws", "white sox": "cws", # Cincinnati Reds "cin": "cin", "cincinnati": "cin", "cincinnati reds": "cin", "reds": "cin", # Cleveland Guardians "cle": "cle", "clg": "cle", "cleveland": "cle", "cleveland guardians": "cle", "guardians": "cle", # Colorado Rockies "col": "col", "colorado": "col", "colorado rockies": "col", "rockies": "col", # Detroit Tigers "det": "det", "detroit": "det", "detroit tigers": "det", "tigers": "det", # Houston Astros "hou": "hou", "houston": "hou", "houston astros": "hou", "astros": "hou", # Kansas City Royals "kc": "kc", "kcr": "kc", "kansas city": "kc", "kansas city royals": "kc", "royals": "kc", # Los Angeles Angels "laa": "laa", "los angeles angels": "laa", "angels": "laa", # Los Angeles Dodgers "lad": "lad", "los angeles dodgers": "lad", "dodgers": "lad", # Miami Marlins "mia": "mia", "miami": "mia", "miami marlins": "mia", "marlins": "mia", # Milwaukee Brewers "mil": "mil", "milwaukee": "mil", "milwaukee brewers": "mil", "brewers": "mil", # Minnesota Twins "min": "min", "minnesota": "min", "minnesota twins": "min", "twins": "min", # New York Mets "nym": "nym", "new york mets": "nym", "mets": "nym", # New York Yankees "nyy": "nyy", "new york yankees": "nyy", "yankees": "nyy", # Oakland / Sacramento Athletics "oak": "oak", "ath": "oak", "as": "oak", "oakland": "oak", "oakland athletics": "oak", "sacramento athletics": "oak", "athletics": "oak", # Philadelphia Phillies "phi": "phi", "philadelphia": "phi", "philadelphia phillies": "phi", "phillies": "phi", # Pittsburgh Pirates "pit": "pit", "pittsburgh": "pit", "pittsburgh pirates": "pit", "pirates": "pit", # San Diego Padres "sd": "sd", "sdp": "sd", "san diego": "sd", "san diego padres": "sd", "padres": "sd", # San Francisco Giants "sf": "sf", "sfg": "sf", "san francisco": "sf", "san francisco giants": "sf", "giants": "sf", # Seattle Mariners "sea": "sea", "seattle": "sea", "seattle mariners": "sea", "mariners": "sea", # St. Louis Cardinals "stl": "stl", "st louis": "stl", "st louis cardinals": "stl", "cardinals": "stl", # Tampa Bay Rays "tb": "tb", "tbr": "tb", "tampa bay": "tb", "tampa bay rays": "tb", "rays": "tb", # Texas Rangers "tex": "tex", "texas": "tex", "texas rangers": "tex", "rangers": "tex", # Toronto Blue Jays "tor": "tor", "toronto": "tor", "toronto blue jays": "tor", "blue jays": "tor", # Washington Nationals "wsh": "wsh", "wsn": "wsh", "washington": "wsh", "washington nationals": "wsh", "nationals": "wsh", "nats": "wsh", } def _canonical_team(name: str) -> str: """Map any team name variant to a stable canonical code for cross-source comparison.""" return _MLB_TEAM_CANONICAL.get(_normalize_team(name), _normalize_team(name)) def _normalize_person(name: str) -> str: text = str(name or "").strip().lower() text = unicodedata.normalize("NFKD", text) text = "".join(ch for ch in text if not unicodedata.combining(ch)) text = re.sub(r"[^a-z0-9 ]", "", text) text = re.sub(r"\s+", " ", text).strip() return text def _names_match(left: str, right: str) -> bool: left_norm = _normalize_person(left) right_norm = _normalize_person(right) return bool(left_norm and right_norm and left_norm == right_norm) def fetch_probable_starters_for_props(conn: Any = None) -> dict[tuple[str, str], dict[str, str | None]]: """ Fetch probable starters for all MLB games in the next 7 days. Returns: { (away_team_normalized, home_team_normalized): { "home_pitcher": "Luis Castillo" | None, "away_pitcher": "Cole Irvin" | None, "away_team_raw": "Seattle Mariners", "home_team_raw": "Oakland Athletics", } } Keys are lowercased/normalized for fuzzy matching against props row team names. """ today = pd.Timestamp.utcnow().date() end_date = today + timedelta(days=7) params: dict[str, Any] = { "sportId": 1, "startDate": today.isoformat(), "endDate": end_date.isoformat(), "hydrate": "probablePitcher", "gameType": "R,F,D,L,W", } try: r = requests.get(_SCHEDULE_URL, params=params, timeout=15) r.raise_for_status() data = r.json() except Exception as exc: _log.warning("[mlb_starters] schedule fetch failed: %s", exc) return {} result: dict[tuple[str, str], dict[str, str | None]] = {} games_total = 0 games_with_starters = 0 for date_entry in data.get("dates", []): for game in date_entry.get("games", []): games_total += 1 teams = game.get("teams", {}) away_raw = str(teams.get("away", {}).get("team", {}).get("name", "") or "") home_raw = str(teams.get("home", {}).get("team", {}).get("name", "") or "") away_pitcher_obj = teams.get("away", {}).get("probablePitcher") or {} home_pitcher_obj = teams.get("home", {}).get("probablePitcher") or {} away_pitcher = str(away_pitcher_obj.get("fullName", "") or "").strip() or None home_pitcher = str(home_pitcher_obj.get("fullName", "") or "").strip() or None if not away_raw or not home_raw: continue game_pk_str = str(game.get("gamePk", "") or "").strip() game_date_str = str(date_entry.get("date", "") or "").strip() key = (_normalize_team(away_raw), _normalize_team(home_raw)) result[key] = { "home_pitcher": home_pitcher, "away_pitcher": away_pitcher, "away_team_raw": away_raw, "home_team_raw": home_raw, "away_pitcher_source": "statsapi_probable_pitcher" if away_pitcher else "unresolved", "home_pitcher_source": "statsapi_probable_pitcher" if home_pitcher else "unresolved", "starter_cache_source": "statsapi_probable_pitcher", "fallback_used": False, } if conn is not None: for raw_name in (away_pitcher, home_pitcher): if not raw_name: continue try: log_pitcher_resolution(conn, { "game_pk": game_pk_str, "game_date": game_date_str, "source": "mlb_starters", "input_name": raw_name, "normalized_name": normalize_pitcher_name(raw_name), "matched_canonical": None, "pitcher_id": None, "match_method": "api_fetch", "sample_size": 0, "p_throws": None, }) except Exception as exc: _log.debug("[mlb_starters] resolution log write failed: %s", exc) if home_pitcher or away_pitcher: games_with_starters += 1 _log.warning( "[mlb_starters] games_total=%d games_with_starters=%d", games_total, games_with_starters, ) return result def _infer_pitcher_team_from_rows( pitcher_name: str, pitcher_statcast_df: pd.DataFrame | None, away_team: str, home_team: str, ) -> str: if ( not pitcher_name or pitcher_statcast_df is None or pitcher_statcast_df.empty or "player_name" not in pitcher_statcast_df.columns ): return "" target = _normalize_person(pitcher_name) rows = pitcher_statcast_df[ pitcher_statcast_df["player_name"].astype(str).map(_normalize_person) == target ].copy() if rows.empty: return "" away_norm = _normalize_team(away_team) home_norm = _normalize_team(home_team) team_candidates: list[str] = [] if {"inning_topbot", "home_team", "away_team"}.issubset(rows.columns): inning_half = rows["inning_topbot"].fillna("").astype(str).str.lower() top_mask = inning_half.str.contains("top") bottom_mask = inning_half.str.contains("bot|bottom") if top_mask.any(): team_candidates.extend( rows.loc[top_mask, "home_team"].dropna().astype(str).tolist() ) if bottom_mask.any(): team_candidates.extend( rows.loc[bottom_mask, "away_team"].dropna().astype(str).tolist() ) for col in ("team", "pitcher_team", "team_name"): if col in rows.columns: team_candidates.extend(rows[col].dropna().astype(str).tolist()) normalized = [_normalize_team(value) for value in team_candidates if str(value).strip()] if not normalized: return "" mode = pd.Series(normalized).mode() inferred = str(mode.iloc[0]).strip() if not mode.empty else "" if _canonical_team(inferred) == _canonical_team(away_norm): return away_team if _canonical_team(inferred) == _canonical_team(home_norm): return home_team return "" def build_oddsapi_starter_fallback_map( props_feed: pd.DataFrame | None, primary_starters: dict[tuple[str, str], dict[str, Any]] | None = None, pitcher_statcast_df: pd.DataFrame | None = None, ) -> dict[tuple[str, str], dict[str, str | None]]: if props_feed is None or props_feed.empty: return {} working = props_feed.copy() market_series = working.get("market_family", working.get("market", pd.Series(dtype="object", index=working.index))) scope_series = working.get("selection_scope", pd.Series(dtype="object", index=working.index)) working = working[ market_series.fillna("").astype(str).str.strip().str.lower().eq("k") & scope_series.fillna("").astype(str).str.strip().str.lower().eq("pitcher") ].copy() if working.empty: return {} results: dict[tuple[str, str], dict[str, str | None]] = {} primary = dict(primary_starters or {}) group_cols = [col for col in ("event_id", "away_team", "home_team") if col in working.columns] if len(group_cols) < 3: return {} for _, event_df in working.groupby(group_cols, dropna=False): away_team = str(event_df["away_team"].iloc[0] or "").strip() home_team = str(event_df["home_team"].iloc[0] or "").strip() away_norm = _normalize_team(away_team) home_norm = _normalize_team(home_team) if not away_norm or not home_norm: continue primary_payload = dict(primary.get((away_norm, home_norm)) or {}) away_pitcher = str(primary_payload.get("away_pitcher") or "").strip() or None home_pitcher = str(primary_payload.get("home_pitcher") or "").strip() or None away_source = str(primary_payload.get("away_pitcher_source") or "").strip() or "unresolved" home_source = str(primary_payload.get("home_pitcher_source") or "").strip() or "unresolved" candidate_names = [ str(name).strip() for name in event_df.get("player_name_raw", pd.Series(dtype="object")).dropna().astype(str).tolist() if str(name).strip() ] unique_candidates: list[str] = [] for candidate in candidate_names: if not any(_names_match(candidate, existing) for existing in unique_candidates): unique_candidates.append(candidate) candidate_team_map: dict[str, str] = {} for candidate in unique_candidates: if away_pitcher and _names_match(candidate, away_pitcher): candidate_team_map[candidate] = away_team continue if home_pitcher and _names_match(candidate, home_pitcher): candidate_team_map[candidate] = home_team continue inferred_team = _infer_pitcher_team_from_rows( pitcher_name=candidate, pitcher_statcast_df=pitcher_statcast_df, away_team=away_team, home_team=home_team, ) if not inferred_team: inferred_team = lookup_batter_current_team(candidate, away_team, home_team) or "" if inferred_team: candidate_team_map[candidate] = inferred_team blank_sides = int(not away_pitcher) + int(not home_pitcher) assigned_from_odds = 0 if not away_pitcher: for candidate, team_name in candidate_team_map.items(): if _normalize_team(team_name) == away_norm: away_pitcher = candidate assigned_from_odds += 1 break if not home_pitcher: for candidate, team_name in candidate_team_map.items(): if _normalize_team(team_name) == home_norm and not _names_match(candidate, away_pitcher or ""): home_pitcher = candidate assigned_from_odds += 1 break unresolved_candidates = [ candidate for candidate in unique_candidates if not _names_match(candidate, away_pitcher or "") and not _names_match(candidate, home_pitcher or "") ] if len(unresolved_candidates) == 1: if not away_pitcher and home_pitcher: away_pitcher = unresolved_candidates[0] assigned_from_odds += 1 elif not home_pitcher and away_pitcher: home_pitcher = unresolved_candidates[0] assigned_from_odds += 1 elif len(unresolved_candidates) == 2 and not away_pitcher and not home_pitcher: # Last resort: 2 candidates, both sides blank, team inference failed for both. # Assign alphabetically — arbitrary but deterministic. sorted_candidates = sorted(unresolved_candidates) away_pitcher = sorted_candidates[0] home_pitcher = sorted_candidates[1] assigned_from_odds += 2 odds_source = "unresolved" if assigned_from_odds >= 2 or (blank_sides >= 2 and away_pitcher and home_pitcher): odds_source = "oddsapi_pitcher_strikeouts_two_candidate_match" elif assigned_from_odds == 1: odds_source = "oddsapi_pitcher_strikeouts_single_candidate_match" elif len(unique_candidates) > 2 or (len(unique_candidates) >= 2 and not away_pitcher and not home_pitcher): odds_source = "oddsapi_pitcher_strikeouts_ambiguous" if away_source == "unresolved" and away_pitcher: away_source = odds_source if odds_source != "unresolved" else "oddsapi_pitcher_strikeouts_single_candidate_match" if home_source == "unresolved" and home_pitcher: home_source = odds_source if odds_source != "unresolved" else "oddsapi_pitcher_strikeouts_single_candidate_match" if away_source.startswith("statsapi_") or home_source.startswith("statsapi_"): starter_cache_source = ( "statsapi_plus_oddsapi_fallback" if (away_source.startswith("oddsapi_") or home_source.startswith("oddsapi_")) else "statsapi_probable_pitcher" ) elif away_source.startswith("oddsapi_") or home_source.startswith("oddsapi_"): starter_cache_source = odds_source if odds_source != "unresolved" else "oddsapi_pitcher_strikeouts_single_candidate_match" else: starter_cache_source = odds_source results[(away_norm, home_norm)] = { "away_team_raw": away_team, "home_team_raw": home_team, "away_pitcher": away_pitcher, "home_pitcher": home_pitcher, "away_pitcher_source": away_source if away_pitcher else "unresolved", "home_pitcher_source": home_source if home_pitcher else "unresolved", "starter_cache_source": starter_cache_source if (away_pitcher or home_pitcher or odds_source != "unresolved") else "unresolved", "fallback_used": bool( str(away_source).startswith("oddsapi_") or str(home_source).startswith("oddsapi_") ), } return results def merge_probable_starters_with_odds_fallback( primary_starters: dict[tuple[str, str], dict[str, Any]] | None, odds_fallback_starters: dict[tuple[str, str], dict[str, Any]] | None, ) -> dict[tuple[str, str], dict[str, str | None]]: primary = dict(primary_starters or {}) fallback = dict(odds_fallback_starters or {}) merged: dict[tuple[str, str], dict[str, str | None]] = {} for key in sorted(set(primary.keys()) | set(fallback.keys())): primary_payload = dict(primary.get(key) or {}) fallback_payload = dict(fallback.get(key) or {}) away_pitcher = str(primary_payload.get("away_pitcher") or "").strip() or str(fallback_payload.get("away_pitcher") or "").strip() or None home_pitcher = str(primary_payload.get("home_pitcher") or "").strip() or str(fallback_payload.get("home_pitcher") or "").strip() or None away_source = ( str(primary_payload.get("away_pitcher_source") or "").strip() or str(fallback_payload.get("away_pitcher_source") or "").strip() or "unresolved" ) home_source = ( str(primary_payload.get("home_pitcher_source") or "").strip() or str(fallback_payload.get("home_pitcher_source") or "").strip() or "unresolved" ) fallback_used = away_source.startswith("oddsapi_") or home_source.startswith("oddsapi_") if away_source.startswith("statsapi_") or home_source.startswith("statsapi_"): starter_cache_source = "statsapi_probable_pitcher" if not fallback_used else "statsapi_plus_oddsapi_fallback" elif fallback_used: starter_cache_source = ( str(fallback_payload.get("starter_cache_source") or "").strip() or "oddsapi_pitcher_strikeouts_single_candidate_match" ) else: starter_cache_source = str(fallback_payload.get("starter_cache_source") or "").strip() or "unresolved" merged[key] = { "away_team_raw": str(primary_payload.get("away_team_raw") or fallback_payload.get("away_team_raw") or "").strip(), "home_team_raw": str(primary_payload.get("home_team_raw") or fallback_payload.get("home_team_raw") or "").strip(), "away_pitcher": away_pitcher, "home_pitcher": home_pitcher, "away_pitcher_source": away_source, "home_pitcher_source": home_source, "starter_cache_source": starter_cache_source, "fallback_used": fallback_used, } return merged def lookup_pitchers_for_game( away_team: str, home_team: str, starters_map: dict[tuple[str, str], dict[str, str | None]], ) -> dict[str, str | None]: """ Look up probable pitchers for a specific game matchup. Returns {"home_pitcher": name_or_None, "away_pitcher": name_or_None}. Uses normalized string matching — tolerates minor differences in team name format. """ away_norm = _normalize_team(away_team) home_norm = _normalize_team(home_team) # Exact normalized match entry = starters_map.get((away_norm, home_norm)) if entry: return entry # Canonical match: bridges abbreviations vs full names (e.g. "sfg" == "san francisco giants") away_canon = _canonical_team(away_norm) home_canon = _canonical_team(home_norm) for (k_away, k_home), v in starters_map.items(): if _canonical_team(k_away) == away_canon and _canonical_team(k_home) == home_canon: return v # Partial substring fallback for (k_away, k_home), v in starters_map.items(): away_match = away_norm in k_away or k_away in away_norm home_match = home_norm in k_home or k_home in home_norm if away_match and home_match: return v return {"home_pitcher": None, "away_pitcher": None} # --------------------------------------------------------------------------- # Current-season roster lookup (batter team resolution fallback) # --------------------------------------------------------------------------- _ROSTER_MAP_CACHE: dict[str, str] | None = None _ROSTER_MAP_EMPTY_UNTIL: float = 0.0 # monotonic timestamp after which retry is allowed def fetch_mlb_current_roster_map(season: int = 2026) -> dict[str, str]: """ Returns {normalized_player_name: canonical_team_code} for all active MLB players. Cached for the process lifetime. Uses MLB Stats API players endpoint. """ global _ROSTER_MAP_CACHE, _ROSTER_MAP_EMPTY_UNTIL if _ROSTER_MAP_CACHE is not None: return _ROSTER_MAP_CACHE # Rate-limit empty/error retries — don't hammer the API on every pitcher call if time.monotonic() < _ROSTER_MAP_EMPTY_UNTIL: return {} url = "https://statsapi.mlb.com/api/v1/sports/1/players" params: dict[str, Any] = {"season": season, "gameType": "R,S"} try: r = requests.get(url, params=params, timeout=15) r.raise_for_status() data = r.json() except Exception as exc: _log.warning("[mlb_roster] fetch failed: %s", exc) _ROSTER_MAP_EMPTY_UNTIL = time.monotonic() + 300 # retry in 5 min return {} people = data.get("people", []) if isinstance(data, dict) else [] roster: dict[str, str] = {} for person in people: full_name = str(person.get("fullName", "") or "") team_name = str((person.get("currentTeam") or {}).get("name", "") or "") if not full_name or not team_name: continue norm_name = _normalize_person(full_name) canon_team = _canonical_team(team_name) if norm_name and canon_team: roster[norm_name] = canon_team _log.warning("[mlb_roster] loaded %d players for season %d", len(roster), season) if not roster: _ROSTER_MAP_EMPTY_UNTIL = time.monotonic() + 300 # retry in 5 min return {} _ROSTER_MAP_CACHE = roster return _ROSTER_MAP_CACHE def lookup_batter_current_team( batter_name: str, away_team: str, home_team: str, season: int = 2026, ) -> str | None: """ Returns the display team name (away_team or home_team) for a batter based on the current MLB roster. Returns None if the player is not found or is not participating in this specific game. """ roster = fetch_mlb_current_roster_map(season) if not roster: return None norm_name = _normalize_person(batter_name) canon_team = roster.get(norm_name) if not canon_team: return None if canon_team == _canonical_team(away_team) and away_team: return away_team if canon_team == _canonical_team(home_team) and home_team: return home_team return None