2026_MLB_Model / data /mlb_starters.py
Syntrex's picture
Merge remote: resolve mlb_starters.py conflict — keep conn param + remote team canonical map
abe5c7e
raw
history blame
24.9 kB
"""
data/mlb_starters.py
Fetches probable starting pitchers for upcoming MLB games from the public
MLB Stats API. Used by the Props page to enrich HR props with matchup context.
The starter maps built here are dicts keyed by (away_team, home_team); each value
holds the pitcher names for that game. Both team names in the key are lowercased
and stripped of punctuation so lookups tolerate minor formatting differences.
"""
from __future__ import annotations
import logging
import re
import time
import unicodedata
from datetime import timedelta
from typing import Any
import pandas as pd
import requests
from data.odds_name_map import normalize_pitcher_name
from database.db import log_pitcher_resolution
# Module-level logger; messages from this file are prefixed "[mlb_starters]" or "[mlb_roster]".
_log = logging.getLogger(__name__)
# MLB Stats API schedule endpoint queried for probable pitchers.
_SCHEDULE_URL = "https://statsapi.mlb.com/api/v1/schedule"
def _normalize_team(name: str) -> str:
text = str(name or "").strip().lower()
text = unicodedata.normalize("NFKD", text)
text = "".join(ch for ch in text if not unicodedata.combining(ch))
text = re.sub(r"[^a-z0-9 ]", "", text)
text = re.sub(r"\s+", " ", text).strip()
return text
# Maps every known team name variant (abbreviation or full name, post-_normalize_team) to a
# stable canonical code. Bridges the gap between statcast abbreviations (e.g. "sfg") and
# Odds API / MLB Stats API full names (e.g. "san francisco giants").
_MLB_TEAM_CANONICAL: dict[str, str] = {
# Arizona Diamondbacks
"ari": "ari", "arizona": "ari", "arizona diamondbacks": "ari", "diamondbacks": "ari", "dbacks": "ari",
# Atlanta Braves
"atl": "atl", "atlanta": "atl", "atlanta braves": "atl", "braves": "atl",
# Baltimore Orioles
"bal": "bal", "baltimore": "bal", "baltimore orioles": "bal", "orioles": "bal",
# Boston Red Sox
"bos": "bos", "boston": "bos", "boston red sox": "bos", "red sox": "bos",
# Chicago Cubs
"chc": "chc", "chicago cubs": "chc", "cubs": "chc",
# Chicago White Sox
"cws": "cws", "chw": "cws", "chicago white sox": "cws", "white sox": "cws",
# Cincinnati Reds
"cin": "cin", "cincinnati": "cin", "cincinnati reds": "cin", "reds": "cin",
# Cleveland Guardians
"cle": "cle", "clg": "cle", "cleveland": "cle", "cleveland guardians": "cle", "guardians": "cle",
# Colorado Rockies
"col": "col", "colorado": "col", "colorado rockies": "col", "rockies": "col",
# Detroit Tigers
"det": "det", "detroit": "det", "detroit tigers": "det", "tigers": "det",
# Houston Astros
"hou": "hou", "houston": "hou", "houston astros": "hou", "astros": "hou",
# Kansas City Royals
"kc": "kc", "kcr": "kc", "kansas city": "kc", "kansas city royals": "kc", "royals": "kc",
# Los Angeles Angels
"laa": "laa", "los angeles angels": "laa", "angels": "laa",
# Los Angeles Dodgers
"lad": "lad", "los angeles dodgers": "lad", "dodgers": "lad",
# Miami Marlins
"mia": "mia", "miami": "mia", "miami marlins": "mia", "marlins": "mia",
# Milwaukee Brewers
"mil": "mil", "milwaukee": "mil", "milwaukee brewers": "mil", "brewers": "mil",
# Minnesota Twins
"min": "min", "minnesota": "min", "minnesota twins": "min", "twins": "min",
# New York Mets
"nym": "nym", "new york mets": "nym", "mets": "nym",
# New York Yankees
"nyy": "nyy", "new york yankees": "nyy", "yankees": "nyy",
# Oakland / Sacramento Athletics
"oak": "oak", "ath": "oak", "as": "oak", "oakland": "oak", "oakland athletics": "oak",
"sacramento athletics": "oak", "athletics": "oak",
# Philadelphia Phillies
"phi": "phi", "philadelphia": "phi", "philadelphia phillies": "phi", "phillies": "phi",
# Pittsburgh Pirates
"pit": "pit", "pittsburgh": "pit", "pittsburgh pirates": "pit", "pirates": "pit",
# San Diego Padres
"sd": "sd", "sdp": "sd", "san diego": "sd", "san diego padres": "sd", "padres": "sd",
# San Francisco Giants
"sf": "sf", "sfg": "sf", "san francisco": "sf", "san francisco giants": "sf", "giants": "sf",
# Seattle Mariners
"sea": "sea", "seattle": "sea", "seattle mariners": "sea", "mariners": "sea",
# St. Louis Cardinals
"stl": "stl", "st louis": "stl", "st louis cardinals": "stl", "cardinals": "stl",
# Tampa Bay Rays
"tb": "tb", "tbr": "tb", "tampa bay": "tb", "tampa bay rays": "tb", "rays": "tb",
# Texas Rangers
"tex": "tex", "texas": "tex", "texas rangers": "tex", "rangers": "tex",
# Toronto Blue Jays
"tor": "tor", "toronto": "tor", "toronto blue jays": "tor", "blue jays": "tor",
# Washington Nationals
"wsh": "wsh", "wsn": "wsh", "washington": "wsh", "washington nationals": "wsh",
"nationals": "wsh", "nats": "wsh",
}
def _canonical_team(name: str) -> str:
    """Map any team name variant to a stable canonical code for cross-source comparison.

    Names missing from ``_MLB_TEAM_CANONICAL`` fall back to their normalized form.
    """
    normalized = _normalize_team(name)
    return _MLB_TEAM_CANONICAL.get(normalized, normalized)
def _normalize_person(name: str) -> str:
text = str(name or "").strip().lower()
text = unicodedata.normalize("NFKD", text)
text = "".join(ch for ch in text if not unicodedata.combining(ch))
text = re.sub(r"[^a-z0-9 ]", "", text)
text = re.sub(r"\s+", " ", text).strip()
return text
def _names_match(left: str, right: str) -> bool:
    """Return True when both names are non-empty and equal after normalization."""
    first = _normalize_person(left)
    if not first:
        return False
    # A non-empty ``first`` can only equal a non-empty normalized ``right``.
    return first == _normalize_person(right)
def fetch_probable_starters_for_props(conn: Any = None) -> dict[tuple[str, str], dict[str, str | None]]:
    """
    Fetch probable starters for all MLB games in the next 7 days.

    Args:
        conn: Optional database handle. When given, every non-empty probable
            pitcher name is passed to ``log_pitcher_resolution``; failures of
            that write are logged at debug level and otherwise ignored.

    Returns:
        {
            (away_team_normalized, home_team_normalized): {
                "home_pitcher": "Luis Castillo" | None,
                "away_pitcher": "Cole Irvin" | None,
                "away_team_raw": "Seattle Mariners",
                "home_team_raw": "Oakland Athletics",
                "away_pitcher_source": "statsapi_probable_pitcher" | "unresolved",
                "home_pitcher_source": "statsapi_probable_pitcher" | "unresolved",
                "starter_cache_source": "statsapi_probable_pitcher",
                "fallback_used": False,
            }
        }
        Keys are lowercased/normalized for fuzzy matching against props row team names.
        An empty dict is returned when the request fails or the payload is not a JSON object.

    The 7-day window normally contains several games of the same (away, home)
    series. Only the first game listed for a matchup is kept, so a later game
    (which often has no probables announced yet) cannot overwrite the nearest one.
    NOTE(review): this assumes the API lists dates in chronological order.
    """
    # Timestamp.utcnow() is deprecated in recent pandas; this is the documented replacement.
    today = pd.Timestamp.now(tz="UTC").date()
    end_date = today + timedelta(days=7)
    params: dict[str, Any] = {
        "sportId": 1,
        "startDate": today.isoformat(),
        "endDate": end_date.isoformat(),
        "hydrate": "probablePitcher",
        "gameType": "R,F,D,L,W",
    }
    try:
        r = requests.get(_SCHEDULE_URL, params=params, timeout=15)
        r.raise_for_status()
        data = r.json()
    except Exception as exc:
        # Best-effort enrichment: a failed fetch must not break the caller.
        _log.warning("[mlb_starters] schedule fetch failed: %s", exc)
        return {}
    if not isinstance(data, dict):
        _log.warning("[mlb_starters] schedule fetch failed: %s", "unexpected payload type")
        return {}

    result: dict[tuple[str, str], dict[str, str | None]] = {}
    games_total = 0
    games_with_starters = 0
    for date_entry in data.get("dates") or []:
        for game in date_entry.get("games") or []:
            games_total += 1
            # ``or {}`` also covers keys that are present but null in the JSON.
            teams = game.get("teams") or {}
            away_side = teams.get("away") or {}
            home_side = teams.get("home") or {}
            away_raw = str((away_side.get("team") or {}).get("name", "") or "")
            home_raw = str((home_side.get("team") or {}).get("name", "") or "")
            away_pitcher_obj = away_side.get("probablePitcher") or {}
            home_pitcher_obj = home_side.get("probablePitcher") or {}
            away_pitcher = str(away_pitcher_obj.get("fullName", "") or "").strip() or None
            home_pitcher = str(home_pitcher_obj.get("fullName", "") or "").strip() or None
            if not away_raw or not home_raw:
                continue
            game_pk_str = str(game.get("gamePk", "") or "").strip()
            game_date_str = str(date_entry.get("date", "") or "").strip()
            key = (_normalize_team(away_raw), _normalize_team(home_raw))
            # First game wins: keep the nearest game of a series / doubleheader.
            if key not in result:
                result[key] = {
                    "home_pitcher": home_pitcher,
                    "away_pitcher": away_pitcher,
                    "away_team_raw": away_raw,
                    "home_team_raw": home_raw,
                    "away_pitcher_source": "statsapi_probable_pitcher" if away_pitcher else "unresolved",
                    "home_pitcher_source": "statsapi_probable_pitcher" if home_pitcher else "unresolved",
                    "starter_cache_source": "statsapi_probable_pitcher",
                    "fallback_used": False,
                }
            # The resolution log records every game, not only the one kept in ``result``.
            if conn is not None:
                for raw_name in (away_pitcher, home_pitcher):
                    if not raw_name:
                        continue
                    try:
                        log_pitcher_resolution(conn, {
                            "game_pk": game_pk_str,
                            "game_date": game_date_str,
                            "source": "mlb_starters",
                            "input_name": raw_name,
                            "normalized_name": normalize_pitcher_name(raw_name),
                            "matched_canonical": None,
                            "pitcher_id": None,
                            "match_method": "api_fetch",
                            "sample_size": 0,
                            "p_throws": None,
                        })
                    except Exception as exc:
                        _log.debug("[mlb_starters] resolution log write failed: %s", exc)
            if home_pitcher or away_pitcher:
                games_with_starters += 1
    _log.warning(
        "[mlb_starters] games_total=%d games_with_starters=%d",
        games_total,
        games_with_starters,
    )
    return result
def _infer_pitcher_team_from_rows(
pitcher_name: str,
pitcher_statcast_df: pd.DataFrame | None,
away_team: str,
home_team: str,
) -> str:
if (
not pitcher_name
or pitcher_statcast_df is None
or pitcher_statcast_df.empty
or "player_name" not in pitcher_statcast_df.columns
):
return ""
target = _normalize_person(pitcher_name)
rows = pitcher_statcast_df[
pitcher_statcast_df["player_name"].astype(str).map(_normalize_person) == target
].copy()
if rows.empty:
return ""
away_norm = _normalize_team(away_team)
home_norm = _normalize_team(home_team)
team_candidates: list[str] = []
if {"inning_topbot", "home_team", "away_team"}.issubset(rows.columns):
inning_half = rows["inning_topbot"].fillna("").astype(str).str.lower()
top_mask = inning_half.str.contains("top")
bottom_mask = inning_half.str.contains("bot|bottom")
if top_mask.any():
team_candidates.extend(
rows.loc[top_mask, "home_team"].dropna().astype(str).tolist()
)
if bottom_mask.any():
team_candidates.extend(
rows.loc[bottom_mask, "away_team"].dropna().astype(str).tolist()
)
for col in ("team", "pitcher_team", "team_name"):
if col in rows.columns:
team_candidates.extend(rows[col].dropna().astype(str).tolist())
normalized = [_normalize_team(value) for value in team_candidates if str(value).strip()]
if not normalized:
return ""
mode = pd.Series(normalized).mode()
inferred = str(mode.iloc[0]).strip() if not mode.empty else ""
if _canonical_team(inferred) == _canonical_team(away_norm):
return away_team
if _canonical_team(inferred) == _canonical_team(home_norm):
return home_team
return ""
def build_oddsapi_starter_fallback_map(
    props_feed: pd.DataFrame | None,
    primary_starters: dict[tuple[str, str], dict[str, Any]] | None = None,
    pitcher_statcast_df: pd.DataFrame | None = None,
) -> dict[tuple[str, str], dict[str, str | None]]:
    """
    Build a per-game starter map from pitcher strikeout prop rows, using them to
    fill sides that ``primary_starters`` leaves blank.

    Only rows whose market ("market_family", else "market") equals "k" and whose
    "selection_scope" equals "pitcher" are considered. Rows are grouped by
    (event_id, away_team, home_team); if any of those columns is missing the
    result is empty. For each event the names in "player_name_raw" are treated as
    starter candidates and assigned to a side in this order:

    1. names already known from ``primary_starters`` for that game,
    2. team inferred from ``pitcher_statcast_df`` or, failing that, the roster lookup,
    3. a single leftover candidate takes the single blank side,
    4. two leftover candidates with both sides blank are assigned alphabetically.

    Returns a dict keyed by (normalized away team, normalized home team) with the
    same payload keys as the primary starter map: pitcher names, per-side source
    labels, "starter_cache_source" and "fallback_used".
    """
    if props_feed is None or props_feed.empty:
        return {}
    working = props_feed.copy()
    # Missing columns fall back to an all-NaN series aligned to the frame's index.
    market_series = working.get("market_family", working.get("market", pd.Series(dtype="object", index=working.index)))
    scope_series = working.get("selection_scope", pd.Series(dtype="object", index=working.index))
    # Keep only pitcher strikeout props.
    working = working[
        market_series.fillna("").astype(str).str.strip().str.lower().eq("k")
        & scope_series.fillna("").astype(str).str.strip().str.lower().eq("pitcher")
    ].copy()
    if working.empty:
        return {}
    results: dict[tuple[str, str], dict[str, str | None]] = {}
    primary = dict(primary_starters or {})
    group_cols = [col for col in ("event_id", "away_team", "home_team") if col in working.columns]
    # All three grouping columns are required.
    if len(group_cols) < 3:
        return {}
    for _, event_df in working.groupby(group_cols, dropna=False):
        away_team = str(event_df["away_team"].iloc[0] or "").strip()
        home_team = str(event_df["home_team"].iloc[0] or "").strip()
        away_norm = _normalize_team(away_team)
        home_norm = _normalize_team(home_team)
        if not away_norm or not home_norm:
            continue
        # Start from whatever the primary source already knows (exact normalized key only).
        primary_payload = dict(primary.get((away_norm, home_norm)) or {})
        away_pitcher = str(primary_payload.get("away_pitcher") or "").strip() or None
        home_pitcher = str(primary_payload.get("home_pitcher") or "").strip() or None
        away_source = str(primary_payload.get("away_pitcher_source") or "").strip() or "unresolved"
        home_source = str(primary_payload.get("home_pitcher_source") or "").strip() or "unresolved"
        candidate_names = [
            str(name).strip()
            for name in event_df.get("player_name_raw", pd.Series(dtype="object")).dropna().astype(str).tolist()
            if str(name).strip()
        ]
        # De-duplicate by normalized name, keeping the first spelling seen.
        unique_candidates: list[str] = []
        for candidate in candidate_names:
            if not any(_names_match(candidate, existing) for existing in unique_candidates):
                unique_candidates.append(candidate)
        # candidate -> away_team / home_team display name, for candidates whose side is known.
        candidate_team_map: dict[str, str] = {}
        for candidate in unique_candidates:
            if away_pitcher and _names_match(candidate, away_pitcher):
                candidate_team_map[candidate] = away_team
                continue
            if home_pitcher and _names_match(candidate, home_pitcher):
                candidate_team_map[candidate] = home_team
                continue
            inferred_team = _infer_pitcher_team_from_rows(
                pitcher_name=candidate,
                pitcher_statcast_df=pitcher_statcast_df,
                away_team=away_team,
                home_team=home_team,
            )
            if not inferred_team:
                # Roster lookup as a second opinion (the helper is name-based, not batter-specific).
                inferred_team = lookup_batter_current_team(candidate, away_team, home_team) or ""
            if inferred_team:
                candidate_team_map[candidate] = inferred_team
        # Number of sides the primary source left empty, captured before any odds-based fill.
        blank_sides = int(not away_pitcher) + int(not home_pitcher)
        assigned_from_odds = 0
        # Pass 1: fill blank sides with the first candidate mapped to that team.
        if not away_pitcher:
            for candidate, team_name in candidate_team_map.items():
                if _normalize_team(team_name) == away_norm:
                    away_pitcher = candidate
                    assigned_from_odds += 1
                    break
        if not home_pitcher:
            for candidate, team_name in candidate_team_map.items():
                if _normalize_team(team_name) == home_norm and not _names_match(candidate, away_pitcher or ""):
                    home_pitcher = candidate
                    assigned_from_odds += 1
                    break
        # Pass 2: candidates not yet used on either side.
        unresolved_candidates = [
            candidate
            for candidate in unique_candidates
            if not _names_match(candidate, away_pitcher or "") and not _names_match(candidate, home_pitcher or "")
        ]
        if len(unresolved_candidates) == 1:
            # Exactly one name left and exactly one side blank: by elimination it goes there.
            if not away_pitcher and home_pitcher:
                away_pitcher = unresolved_candidates[0]
                assigned_from_odds += 1
            elif not home_pitcher and away_pitcher:
                home_pitcher = unresolved_candidates[0]
                assigned_from_odds += 1
        elif len(unresolved_candidates) == 2 and not away_pitcher and not home_pitcher:
            # Last resort: 2 candidates, both sides blank, team inference failed for both.
            # Assign alphabetically — arbitrary but deterministic, so the away/home labels
            # may be swapped relative to reality.
            sorted_candidates = sorted(unresolved_candidates)
            away_pitcher = sorted_candidates[0]
            home_pitcher = sorted_candidates[1]
            assigned_from_odds += 2
        # Label describing how the odds feed contributed to this event.
        odds_source = "unresolved"
        if assigned_from_odds >= 2 or (blank_sides >= 2 and away_pitcher and home_pitcher):
            odds_source = "oddsapi_pitcher_strikeouts_two_candidate_match"
        elif assigned_from_odds == 1:
            odds_source = "oddsapi_pitcher_strikeouts_single_candidate_match"
        elif len(unique_candidates) > 2 or (len(unique_candidates) >= 2 and not away_pitcher and not home_pitcher):
            odds_source = "oddsapi_pitcher_strikeouts_ambiguous"
        # A side that was unresolved in the primary map but now has a pitcher gets an oddsapi label.
        if away_source == "unresolved" and away_pitcher:
            away_source = odds_source if odds_source != "unresolved" else "oddsapi_pitcher_strikeouts_single_candidate_match"
        if home_source == "unresolved" and home_pitcher:
            home_source = odds_source if odds_source != "unresolved" else "oddsapi_pitcher_strikeouts_single_candidate_match"
        # Event-level label: statsapi only, statsapi mixed with odds, or odds only.
        if away_source.startswith("statsapi_") or home_source.startswith("statsapi_"):
            starter_cache_source = (
                "statsapi_plus_oddsapi_fallback"
                if (away_source.startswith("oddsapi_") or home_source.startswith("oddsapi_"))
                else "statsapi_probable_pitcher"
            )
        elif away_source.startswith("oddsapi_") or home_source.startswith("oddsapi_"):
            starter_cache_source = odds_source if odds_source != "unresolved" else "oddsapi_pitcher_strikeouts_single_candidate_match"
        else:
            starter_cache_source = odds_source
        results[(away_norm, home_norm)] = {
            "away_team_raw": away_team,
            "home_team_raw": home_team,
            "away_pitcher": away_pitcher,
            "home_pitcher": home_pitcher,
            # A side without a pitcher is always reported as unresolved.
            "away_pitcher_source": away_source if away_pitcher else "unresolved",
            "home_pitcher_source": home_source if home_pitcher else "unresolved",
            "starter_cache_source": starter_cache_source if (away_pitcher or home_pitcher or odds_source != "unresolved") else "unresolved",
            "fallback_used": bool(
                str(away_source).startswith("oddsapi_") or str(home_source).startswith("oddsapi_")
            ),
        }
    return results
def _pick_pitcher_and_source(
    side: str,
    primary_payload: dict[str, Any],
    fallback_payload: dict[str, Any],
) -> tuple[str | None, str]:
    """Return (pitcher, source) for ``side`` ("away"/"home"), primary payload first.

    The source label is read from the payload that supplied the pitcher name,
    so a fallback pitcher is never paired with the primary payload's
    "unresolved" label.
    """
    pitcher_key = f"{side}_pitcher"
    source_key = f"{side}_pitcher_source"
    primary_name = str(primary_payload.get(pitcher_key) or "").strip()
    fallback_name = str(fallback_payload.get(pitcher_key) or "").strip()
    if not primary_name and fallback_name:
        # The pitcher comes from the fallback, so its source label takes precedence.
        ordered = (fallback_payload, primary_payload)
    else:
        ordered = (primary_payload, fallback_payload)
    pitcher = primary_name or fallback_name or None
    for payload in ordered:
        source = str(payload.get(source_key) or "").strip()
        if source:
            return pitcher, source
    return pitcher, "unresolved"


def merge_probable_starters_with_odds_fallback(
    primary_starters: dict[tuple[str, str], dict[str, Any]] | None,
    odds_fallback_starters: dict[tuple[str, str], dict[str, Any]] | None,
) -> dict[tuple[str, str], dict[str, str | None]]:
    """
    Merge the primary starter map with the odds-derived fallback map.

    For every game key present in either map, each side takes the primary
    pitcher when there is one and the fallback pitcher otherwise. The per-side
    source label follows the payload that supplied the pitcher. ``fallback_used``
    is True when either side's source starts with "oddsapi_", and
    ``starter_cache_source`` summarizes the mix:

    * any "statsapi_" side  -> "statsapi_probable_pitcher", or
      "statsapi_plus_oddsapi_fallback" when the fallback was also used;
    * fallback only         -> the fallback payload's own "starter_cache_source";
    * neither               -> the fallback payload's label, else "unresolved".

    ``None`` inputs are treated as empty maps. Keys are emitted in sorted order.
    """
    primary = dict(primary_starters or {})
    fallback = dict(odds_fallback_starters or {})
    merged: dict[tuple[str, str], dict[str, str | None]] = {}
    for key in sorted(set(primary) | set(fallback)):
        primary_payload = dict(primary.get(key) or {})
        fallback_payload = dict(fallback.get(key) or {})
        away_pitcher, away_source = _pick_pitcher_and_source("away", primary_payload, fallback_payload)
        home_pitcher, home_source = _pick_pitcher_and_source("home", primary_payload, fallback_payload)
        fallback_used = away_source.startswith("oddsapi_") or home_source.startswith("oddsapi_")
        fallback_cache_source = str(fallback_payload.get("starter_cache_source") or "").strip()
        if away_source.startswith("statsapi_") or home_source.startswith("statsapi_"):
            starter_cache_source = "statsapi_plus_oddsapi_fallback" if fallback_used else "statsapi_probable_pitcher"
        elif fallback_used:
            starter_cache_source = fallback_cache_source or "oddsapi_pitcher_strikeouts_single_candidate_match"
        else:
            starter_cache_source = fallback_cache_source or "unresolved"
        merged[key] = {
            "away_team_raw": str(primary_payload.get("away_team_raw") or fallback_payload.get("away_team_raw") or "").strip(),
            "home_team_raw": str(primary_payload.get("home_team_raw") or fallback_payload.get("home_team_raw") or "").strip(),
            "away_pitcher": away_pitcher,
            "home_pitcher": home_pitcher,
            "away_pitcher_source": away_source,
            "home_pitcher_source": home_source,
            "starter_cache_source": starter_cache_source,
            "fallback_used": fallback_used,
        }
    return merged
def lookup_pitchers_for_game(
    away_team: str,
    home_team: str,
    starters_map: dict[tuple[str, str], dict[str, str | None]],
) -> dict[str, str | None]:
    """
    Look up probable pitchers for a specific game matchup.

    Matching is attempted in three passes, returning the first hit:
    exact normalized key, canonical team codes, then substring containment
    in either direction. The matched payload from ``starters_map`` is returned
    as-is (not copied).

    Returns {"home_pitcher": None, "away_pitcher": None} when nothing matches
    or when either team name normalizes to an empty string.
    """
    away_norm = _normalize_team(away_team)
    home_norm = _normalize_team(home_team)
    # Exact normalized match
    entry = starters_map.get((away_norm, home_norm))
    if entry:
        return entry
    # An empty name is a substring of every key, so the substring pass below
    # would otherwise return the first game in the map for blank input.
    if not away_norm or not home_norm:
        return {"home_pitcher": None, "away_pitcher": None}
    # Canonical match: bridges abbreviations vs full names (e.g. "sfg" == "san francisco giants")
    away_canon = _canonical_team(away_norm)
    home_canon = _canonical_team(home_norm)
    for (k_away, k_home), v in starters_map.items():
        if _canonical_team(k_away) == away_canon and _canonical_team(k_home) == home_canon:
            return v
    # Partial substring fallback
    for (k_away, k_home), v in starters_map.items():
        away_match = away_norm in k_away or k_away in away_norm
        home_match = home_norm in k_home or k_home in home_norm
        if away_match and home_match:
            return v
    return {"home_pitcher": None, "away_pitcher": None}
# ---------------------------------------------------------------------------
# Current-season roster lookup (batter team resolution fallback)
# ---------------------------------------------------------------------------
# Process-wide cache of {normalized player name: canonical team code};
# stays None until a fetch produces a non-empty roster.
_ROSTER_MAP_CACHE: dict[str, str] | None = None
# Set to "now + 300s" after a failed or empty fetch so callers do not retry immediately.
_ROSTER_MAP_EMPTY_UNTIL: float = 0.0 # monotonic timestamp after which retry is allowed
def fetch_mlb_current_roster_map(season: int = 2026) -> dict[str, str]:
    """
    Returns {normalized_player_name: canonical_team_code} built from the MLB
    Stats API players endpoint.

    The first non-empty result is cached in ``_ROSTER_MAP_CACHE`` for the process
    lifetime. The cache check happens before ``season`` is used, so later calls
    with a different season still receive the cached map. After a failed or empty
    fetch an empty dict is returned and no new request is made for 300 seconds.
    When two players normalize to the same name, the later entry wins.
    """
    global _ROSTER_MAP_CACHE, _ROSTER_MAP_EMPTY_UNTIL
    if _ROSTER_MAP_CACHE is not None:
        return _ROSTER_MAP_CACHE
    # Rate-limit empty/error retries so per-pitcher callers do not hammer the API.
    if time.monotonic() < _ROSTER_MAP_EMPTY_UNTIL:
        return {}
    try:
        response = requests.get(
            "https://statsapi.mlb.com/api/v1/sports/1/players",
            params={"season": season, "gameType": "R,S"},
            timeout=15,
        )
        response.raise_for_status()
        payload = response.json()
    except Exception as exc:
        _log.warning("[mlb_roster] fetch failed: %s", exc)
        _ROSTER_MAP_EMPTY_UNTIL = time.monotonic() + 300  # retry in 5 min
        return {}

    people = payload.get("people", []) if isinstance(payload, dict) else []
    roster: dict[str, str] = {}
    for person in people:
        full_name = str(person.get("fullName", "") or "")
        team_name = str((person.get("currentTeam") or {}).get("name", "") or "")
        if full_name and team_name:
            player_key = _normalize_person(full_name)
            team_code = _canonical_team(team_name)
            if player_key and team_code:
                roster[player_key] = team_code
    _log.warning("[mlb_roster] loaded %d players for season %d", len(roster), season)
    if roster:
        _ROSTER_MAP_CACHE = roster
        return _ROSTER_MAP_CACHE
    # An empty roster is treated like a failure: back off, and do not cache it.
    _ROSTER_MAP_EMPTY_UNTIL = time.monotonic() + 300  # retry in 5 min
    return {}
def lookup_batter_current_team(
    batter_name: str,
    away_team: str,
    home_team: str,
    season: int = 2026,
) -> str | None:
    """
    Resolve a player to one side of a game using the current MLB roster map.

    Returns ``away_team`` or ``home_team`` (the display string passed in) when the
    player's rostered team matches that side, checking the away side first.
    Returns None when the roster is unavailable, the player is not found, or the
    player's team is not one of the two teams in this game.
    """
    roster = fetch_mlb_current_roster_map(season)
    player_team = roster.get(_normalize_person(batter_name)) if roster else None
    if not player_team:
        return None
    for display_name in (away_team, home_team):
        # A blank display name can never be returned as a match.
        if display_name and player_team == _canonical_team(display_name):
            return display_name
    return None