Spaces:

Syntrex
/

2026_MLB_Model

Sleeping

Syntrex Claude Sonnet 4.6 committed on Mar 23

Commit

37fe240

1 Parent(s): 0ff601a

Props: multi-book concat-all, full pre-game model stack, pre-season statcast fallback

Issue A: Switch fetch_all_upcoming_hr_props from stop-at-first to concat-all
so Odds API partial data (e.g. Caesars only) no longer blocks the scraper.
Dedup by best odds per (player_name, sportsbook_key, market) after merge.

Issue B: Fall back to load_statcast_previous_season_full() (2025) when
load_statcast_recent() returns empty (pre-season). Model HR% now populates.

Issue C: Full pre-game model stack in props_mapper._get_full_pregame_adjustments():
- Pitcher quality ±0.025 via compute_pitcher_adjustment() [dominant signal]
- Zone matchup ±0.010 via batter_zone_store + pitcher_zone_model
- Arsenal matchup ±0.010 via batter/pitcher arsenal feature rows
- Rolling form ±0.012 via compute_upcoming_rolling_adjustment()
- Park factor ±0.006 via HOME_TEAM_TO_STADIUM + compute_park_adjustment()

New data sources:
- data/mlb_starters.py: probable starters from MLB Stats API, cached 1h
- data/statcast.py: fetch_statcast_range_pitcher() for pitcher-perspective data
- app.py: load_statcast_previous_season_full_pitcher() + load_probable_starters()

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (6) hide show

analytics/props_mapper.py +334 -123
app.py +32 -2
data/live_prop_odds.py +29 -2
data/mlb_starters.py +135 -0
data/statcast.py +10 -4
visualization/props_page.py +13 -2

analytics/props_mapper.py CHANGED Viewed

@@ -1,26 +1,28 @@
 """
 analytics/props_mapper.py
-Batch 14: Maps sportsbook HR prop rows to internal model HR probabilities
-and computes edge.
-Model HR probability resolution order (pre-game, no live lineup context):
-  1. internal_model_baseline — compute_batter_baseline() using batter statcast
-     features (EV90, barrel rate, hard hit rate, xwOBA, launch angle).
-     Preferred source when plate_appearances > 0.
-  2. unavailable — insufficient statcast coverage for this player.
-Note: XGBoost HR model (xgb_shadow.py) requires anchor probs from the live
-simulator and cannot be used pre-game. It remains the source for Dashboard
-live-game recommendations only.
-The prob_fn parameter in map_hr_props_to_model() is injectable so the
-probability source can be swapped later without touching odds ingestion or
-the Props page.
 """
 from __future__ import annotations
 from typing import Any, Callable
 import pandas as pd
@@ -28,79 +30,46 @@ import pandas as pd
 from analytics.no_vig_props import american_to_implied_prob, compute_edge
 from data.odds_name_map import map_odds_name_to_model_name
 from models.batter_baseline import build_batter_feature_row, compute_batter_baseline
-from models.pitcher_adjustment import build_pitcher_feature_row
-def _get_pregame_context_adjustments(
-    props_row: Any,
-    statcast_df: pd.DataFrame,
-) -> tuple[float, float, bool, str]:
-    """
-    Derive pitcher quality + park context adjustments for a pre-game props row.
-    Returns (pitcher_adj, park_adj, context_applied, source_detail_str).
-    All adjustments are no-op safe — any missing data yields 0.0.
-    """
-    pitcher_adj = 0.0
-    park_adj = 0.0
-    context_applied = False
-    source_parts: list[str] = ["baseline"]
-    # --- Pitcher context (only when pitcher_name is explicit in props row) ---
-    pitcher_name = None
-    for key in ("pitcher_name", "pitcher", "opposing_pitcher"):
-        val = props_row.get(key) if hasattr(props_row, "get") else None
-        if val and str(val).strip() not in ("", "nan", "None"):
-            pitcher_name = str(val).strip()
-            break
-    if pitcher_name and not statcast_df.empty:
-        try:
-            p_row = build_pitcher_feature_row(statcast_df, pitcher_name)
-            if p_row.get("sample_size", 0) > 0:
-                velo = p_row.get("avg_release_speed")
-                ev = p_row.get("ev_allowed")
-                barrel = p_row.get("barrel_rate_allowed")
-                quality_score = 0.0
-                if velo is not None:
-                    quality_score += (float(velo) - 93.0) * (-0.15)  # higher velo = better pitcher = negative for batter
-                if ev is not None:
-                    quality_score += (float(ev) - 89.0) * 0.08      # higher EV allowed = worse pitcher
-                if barrel is not None:
-                    quality_score += (float(barrel) - 0.07) * 1.0   # higher barrel = worse pitcher
-                pitcher_adj = max(-0.005, min(0.005, quality_score * 0.003))
-                if abs(pitcher_adj) > 0.0001:
-                    context_applied = True
-                    source_parts.append("pitcher_quality")
-        except Exception:
-            pass
-    # --- Park context (if venue available) ---
-    venue = None
-    for key in ("venue", "stadium", "venue_name", "park"):
-        val = props_row.get(key) if hasattr(props_row, "get") else None
-        if val and str(val).strip() not in ("", "nan", "None"):
-            venue = str(val).strip()
-            break
-    if venue:
-        try:
-            from models.environment_model import compute_environment_adjustment
-            env = compute_environment_adjustment(
-                game_row={"venue": venue, "stadium": venue},
-                weather_row=None,
-            )
-            raw_park = float(env.get("park_hr_boost", 0.0) or 0.0)
-            park_adj = max(-0.004, min(0.004, raw_park))
-            if abs(park_adj) > 0.0001:
-                context_applied = True
-                source_parts.append("park")
-        except Exception:
-            pass
-    source_detail = "+".join(source_parts)
-    return pitcher_adj, park_adj, context_applied, source_detail
 def _build_statcast_name_index(statcast_df: pd.DataFrame) -> dict[str, str]:
@@ -118,13 +87,252 @@ def _build_statcast_name_index(statcast_df: pd.DataFrame) -> dict[str, str]:
     return index
 def get_player_hr_prob(
     player_name_normalized: str,
     statcast_df: pd.DataFrame,
     _name_index: dict[str, str] | None = None,
 ) -> tuple[float | None, str]:
     """
-    Returns (prob, source) for a pre-game HR probability.
     source values:
       "internal_model_baseline" — compute_batter_baseline() with statcast features
@@ -153,19 +361,24 @@ def map_hr_props_to_model(
     statcast_df: pd.DataFrame,
     prob_fn: Callable[[str, pd.DataFrame, dict[str, str] | None], tuple[float | None, str]] | None = None,
     pitcher_stats_df: pd.DataFrame | None = None,
 ) -> pd.DataFrame:
     """
     Join HR prop rows to model HR probabilities and compute edge.
     Adds columns:
-      implied_prob         — book implied probability (vig-inclusive)
-      model_hr_prob        — pre-game model HR probability (or None)
-      model_hr_prob_source — source label for model_hr_prob
-      edge                 — model_hr_prob - implied_prob (or None)
     Filters to market == "hr".
     Sorts by edge descending (rows with no edge/model prob sort last).
-    prob_fn is injectable for future source swaps; defaults to get_player_hr_prob.
     """
     if props_df.empty:
         return pd.DataFrame()
@@ -176,52 +389,57 @@ def map_hr_props_to_model(
     if hr_df.empty:
         return pd.DataFrame()
-    # Build name index once for all players
     name_index = _build_statcast_name_index(statcast_df)
-    # Use pitcher_stats_df if provided, else fall back to statcast_df for pitcher lookups
-    _pitcher_df = pitcher_stats_df if pitcher_stats_df is not None else statcast_df
     implied_probs: list[float] = []
     model_probs: list[float | None] = []
     sources: list[str] = []
     edges: list[float | None] = []
-    pitcher_context_adjs: list[float | None] = []
-    park_context_adjs: list[float | None] = []
-    context_applied_flags: list[bool] = []
     source_details: list[str] = []
     for _, row in hr_df.iterrows():
         odds = row.get("odds_american")
         player_name = str(row.get("player_name") or "")
-        # Implied probability from book odds
         try:
             implied = american_to_implied_prob(odds) if odds is not None else None
         except Exception:
             implied = None
-        # Model HR probability (baseline only)
         if player_name:
             model_prob, source = _prob_fn(player_name, statcast_df, name_index)
         else:
             model_prob, source = None, "unavailable"
-        # Pregame context adjustments (pitcher quality + park)
-        try:
-            pitcher_adj, park_adj, ctx_applied, src_detail = _get_pregame_context_adjustments(
-                row, _pitcher_df
-            )
-        except Exception:
-            pitcher_adj, park_adj, ctx_applied, src_detail = 0.0, 0.0, False, "baseline"
-        # Apply context to model prob
-        if model_prob is not None and ctx_applied:
-            model_prob_adj: float | None = max(0.01, min(0.40, model_prob + pitcher_adj + park_adj))
         else:
-            model_prob_adj = model_prob
-        # Edge (uses context-adjusted prob)
         if model_prob_adj is not None and implied is not None:
             edge = compute_edge(model_prob_adj, implied)
         else:
@@ -231,9 +449,6 @@ def map_hr_props_to_model(
         model_probs.append(model_prob_adj)
         sources.append(source)
         edges.append(edge)
-        pitcher_context_adjs.append(pitcher_adj if ctx_applied else None)
-        park_context_adjs.append(park_adj if ctx_applied else None)
-        context_applied_flags.append(ctx_applied)
         source_details.append(src_detail)
     hr_df = hr_df.copy()
@@ -241,12 +456,8 @@ def map_hr_props_to_model(
     hr_df["model_hr_prob"] = model_probs
     hr_df["model_hr_prob_source"] = sources
     hr_df["edge"] = edges
-    hr_df["pregame_pitcher_context_adj"] = pitcher_context_adjs
-    hr_df["pregame_park_context_adj"] = park_context_adjs
-    hr_df["pregame_context_applied"] = context_applied_flags
     hr_df["model_hr_prob_source_detail"] = source_details
-    # Sort: rows with edge first (highest edge first), then no-edge rows
     has_edge = hr_df["edge"].notna()
     with_edge = hr_df[has_edge].sort_values("edge", ascending=False)
     without_edge = hr_df[~has_edge]

 """
 analytics/props_mapper.py
+Maps sportsbook HR prop rows to internal model HR probabilities and computes edge.
+Pre-game model stack (adjustments are summed onto the batter baseline; listed in application order):
+  1. Batter baseline       — compute_batter_baseline() (EV90, barrel, hard-hit, xwOBA, LA)
+  2. Pitcher quality       — compute_pitcher_adjustment() ±0.025  [requires probable starter]
+  3. Zone matchup          — compute_zone_matchup_adjustment() ±0.010
+  4. Arsenal matchup       — compute_arsenal_matchup_adjustment() ±0.010
+  5. Rolling form          — compute_upcoming_rolling_adjustment() ±0.012
+  6. Park factor           — compute_park_adjustment() ±0.006
+Pitcher is the dominant adjustment. Park is supporting context only.
+Pitcher data requires:
+  - pitcher_statcast_df (player_type=pitcher — player_name = pitcher name)
+  - probable_starters dict from data.mlb_starters.fetch_probable_starters_for_props()
+Both are optional; any missing data causes a graceful no-op for that component.
 """
 from __future__ import annotations
+from datetime import date
 from typing import Any, Callable
 import pandas as pd
 from analytics.no_vig_props import american_to_implied_prob, compute_edge
 from data.odds_name_map import map_odds_name_to_model_name
 from models.batter_baseline import build_batter_feature_row, compute_batter_baseline
+from models.pitcher_adjustment import build_pitcher_feature_row, compute_pitcher_adjustment
# ---------------------------------------------------------------------------
# Static home team → stadium name mapping (all 30 MLB teams)
# Keys match Odds API / sportsbook team name format.
# Values are canonical names accepted by models/stadium_lookup.resolve_stadium().
# "Athletics" is an alias of "Oakland Athletics": the club has gone by the
# city-less name since 2025, so a feed may send either spelling. Both map to
# the same stadium value so the park lookup does not silently miss.
# ---------------------------------------------------------------------------
HOME_TEAM_TO_STADIUM: dict[str, str] = {
    # AL East
    "Baltimore Orioles": "oriole park at camden yards",
    "Boston Red Sox": "fenway park",
    "New York Yankees": "yankee stadium",
    "Tampa Bay Rays": "tropicana field",
    "Toronto Blue Jays": "rogers centre",
    # AL Central
    "Chicago White Sox": "guaranteed rate field",
    "Cleveland Guardians": "progressive field",
    "Detroit Tigers": "comerica park",
    "Kansas City Royals": "kauffman stadium",
    "Minnesota Twins": "target field",
    # AL West
    "Houston Astros": "minute maid park",
    "Los Angeles Angels": "angel stadium",
    "Oakland Athletics": "athletics ballpark",
    "Athletics": "athletics ballpark",
    "Seattle Mariners": "t-mobile park",
    "Texas Rangers": "globe life field",
    # NL East
    "Atlanta Braves": "truist park",
    "Miami Marlins": "loandepot park",
    "New York Mets": "citi field",
    "Philadelphia Phillies": "citizens bank park",
    "Washington Nationals": "nationals park",
    # NL Central
    "Chicago Cubs": "wrigley field",
    "Cincinnati Reds": "great american ball park",
    "Milwaukee Brewers": "american family field",
    "Pittsburgh Pirates": "pnc park",
    "St. Louis Cardinals": "busch stadium",
    # NL West
    "Arizona Diamondbacks": "chase field",
    "Colorado Rockies": "coors field",
    "Los Angeles Dodgers": "dodger stadium",
    "San Diego Padres": "petco park",
    "San Francisco Giants": "oracle park",
}
 def _build_statcast_name_index(statcast_df: pd.DataFrame) -> dict[str, str]:
     return index
+def _lookup_batter_team(
+    statcast_name: str,
+    props_away_team: str,
+    props_home_team: str,
+    statcast_df: pd.DataFrame,
+) -> str | None:
+    """
+    Returns "home" or "away" indicating which team the batter plays on, or None if unknown.
+    Checks whether the batter's statcast rows most frequently list them as playing
+    against the opposite team (i.e. batter's home_team != props_away_team implies batter
+    is on home team).
+    """
+    if statcast_df.empty or "player_name" not in statcast_df.columns:
+        return None
+    if "home_team" not in statcast_df.columns or "away_team" not in statcast_df.columns:
+        return None
+    try:
+        player_rows = statcast_df[statcast_df["player_name"].astype(str) == statcast_name]
+        if player_rows.empty:
+            return None
+        # For batter-perspective statcast: if the batter is the home team's batter,
+        # their team should appear as home_team in most rows.
+        props_away = str(props_away_team or "").strip().lower()
+        props_home = str(props_home_team or "").strip().lower()
+        home_team_vals = player_rows["home_team"].astype(str).str.strip().str.lower()
+        away_team_vals = player_rows["away_team"].astype(str).str.strip().str.lower()
+        # Count rows where batter's home_team matches props game teams
+        home_count = int((home_team_vals == props_home).sum())
+        away_count = int((away_team_vals == props_away).sum())
+        if home_count > away_count:
+            return "home"
+        if away_count > home_count:
+            return "away"
+        # Fallback: count by whether batter's team appears as home in any game row
+        # using both teams from props row
+        props_team_home = int((home_team_vals.isin([props_home, props_away])).sum())
+        if props_team_home > 0:
+            # Most common home_team for this player among game rows with either team
+            relevant = player_rows[
+                home_team_vals.isin([props_home, props_away]) |
+                away_team_vals.isin([props_home, props_away])
+            ]
+            if not relevant.empty:
+                ht = relevant["home_team"].astype(str).str.strip().str.lower().mode()
+                if not ht.empty:
+                    return "home" if ht.iloc[0] == props_home else "away"
+        return None
+    except Exception:
+        return None
+def _get_full_pregame_adjustments(
+    props_row: Any,
+    statcast_name: str,
+    batter_features: dict[str, Any],
+    statcast_df: pd.DataFrame,
+    pitcher_statcast_df: pd.DataFrame,
+    probable_starters: dict[tuple[str, str], dict[str, str | None]],
+) -> tuple[float, str]:
+    """
+    Apply the full pre-game model stack to a single props row.
+    Returns (total_hr_adj, source_detail_str).
+    Weight order (highest → lowest):
+      Pitcher quality ±0.025 > Rolling form ±0.012 > Zone/Arsenal ±0.010 > Park ±0.006
+    """
+    total_adj = 0.0
+    source_parts: list[str] = []
+    away_team = str(props_row.get("away_team") or "")
+    home_team = str(props_row.get("home_team") or "")
+    commence_time = str(props_row.get("commence_time") or "")
+    # Parse reference date from commence_time for rolling form
+    ref_date: date | None = None
+    try:
+        import datetime as _dt
+        ref_date = _dt.datetime.fromisoformat(commence_time.replace("Z", "+00:00")).date()
+    except Exception:
+        ref_date = pd.Timestamp.utcnow().date()
+    # ------------------------------------------------------------------
+    # Probable pitcher lookup
+    # ------------------------------------------------------------------
+    pitcher_name: str | None = None
+    if probable_starters and away_team and home_team:
+        try:
+            from data.mlb_starters import lookup_pitchers_for_game
+            pitchers = lookup_pitchers_for_game(away_team, home_team, probable_starters)
+            batter_side = _lookup_batter_team(statcast_name, away_team, home_team, statcast_df)
+            if batter_side == "home":
+                pitcher_name = pitchers.get("away_pitcher")
+            elif batter_side == "away":
+                pitcher_name = pitchers.get("home_pitcher")
+            else:
+                # Can't determine side — use whichever pitcher is available (best effort)
+                pitcher_name = pitchers.get("home_pitcher") or pitchers.get("away_pitcher")
+        except Exception:
+            pass
+    # ------------------------------------------------------------------
+    # 1. Pitcher quality (dominant signal, ±0.025)
+    # ------------------------------------------------------------------
+    pitcher_row: dict[str, Any] = {}
+    if pitcher_name and not pitcher_statcast_df.empty:
+        try:
+            pitcher_row = build_pitcher_feature_row(pitcher_statcast_df, pitcher_name)
+            if pitcher_row.get("sample_size", 0) > 0:
+                p_adj = compute_pitcher_adjustment(batter_features, pitcher_row, context={})
+                hr_adj = float(p_adj.get("hr_adj", 0.0) or 0.0)
+                total_adj += hr_adj
+                if abs(hr_adj) > 0.001:
+                    source_parts.append("pitcher_quality")
+        except Exception:
+            pass
+    # ------------------------------------------------------------------
+    # 2. Zone matchup (±0.010)
+    # ------------------------------------------------------------------
+    try:
+        from models.batter_zone_model import build_batter_zone_feature_row
+        from models.pitcher_zone_model import build_pitcher_zone_feature_row
+        from models.zone_matchup_model import compute_zone_matchup_adjustment
+        batter_zone = build_batter_zone_feature_row(statcast_df=statcast_df, player_name=statcast_name)
+        pitcher_zone: dict[str, Any] = {}
+        if pitcher_name and not pitcher_statcast_df.empty:
+            pitcher_zone = build_pitcher_zone_feature_row(statcast_df=pitcher_statcast_df, pitcher_name=pitcher_name)
+        zone_adj = compute_zone_matchup_adjustment(batter_zone, pitcher_zone)
+        zone_hr_boost = float(zone_adj.get("hr_zone_boost", 0.0) or 0.0)
+        # hr_zone_boost is an absolute probability, not a delta — subtract batter baseline
+        baseline_hr = float(batter_features.get("hr_prob_base") or batter_features.get("ev90", 0) * 0.0015 or 0.04)
+        zone_delta = max(-0.010, min(0.010, zone_hr_boost - baseline_hr))
+        if zone_adj.get("sample_size", 0) > 0 and abs(zone_delta) > 0.001:
+            total_adj += zone_delta
+            source_parts.append("zone_matchup")
+    except Exception:
+        pass
+    # ------------------------------------------------------------------
+    # 3. Arsenal matchup (±0.010)
+    # ------------------------------------------------------------------
+    try:
+        from models.batter_arsenal_model import build_batter_arsenal_feature_row
+        from models.pitcher_arsenal_model import build_pitcher_arsenal_feature_row
+        from models.arsenal_matchup_model import compute_arsenal_matchup_adjustment
+        batter_arsenal = build_batter_arsenal_feature_row(statcast_df, statcast_name)
+        pitcher_arsenal: dict[str, Any] = {}
+        if pitcher_name and not pitcher_statcast_df.empty:
+            pitcher_arsenal = build_pitcher_arsenal_feature_row(pitcher_statcast_df, pitcher_name)
+        if pitcher_arsenal.get("arsenal_sample_size", 0) > 0:
+            arsenal_adj = compute_arsenal_matchup_adjustment(batter_arsenal, pitcher_arsenal)
+            arsenal_hr = float(arsenal_adj.get("arsenal_hr_boost", 0.0) or 0.0)
+            # arsenal_hr_boost is a weighted average of batter HR probs by pitch family —
+            # subtract batter baseline to get the delta
+            baseline_hr = float(batter_features.get("hr_prob_base") or 0.04)
+            arsenal_delta = max(-0.010, min(0.010, arsenal_hr - baseline_hr))
+            if abs(arsenal_delta) > 0.001:
+                total_adj += arsenal_delta
+                source_parts.append("arsenal_matchup")
+    except Exception:
+        pass
+    # ------------------------------------------------------------------
+    # 4. Rolling form (±0.012)
+    # ------------------------------------------------------------------
+    try:
+        from models.rolling_form_model import (
+            build_batter_rolling_form_row,
+            build_pitcher_rolling_form_row,
+            compute_upcoming_rolling_adjustment,
+        )
+        batter_roll = build_batter_rolling_form_row(
+            statcast_df, statcast_name, reference_date=ref_date
+        )
+        pitcher_roll: dict[str, Any] = {}
+        if pitcher_name and not pitcher_statcast_df.empty:
+            pitcher_roll = build_pitcher_rolling_form_row(
+                pitcher_statcast_df, pitcher_name, reference_date=ref_date
+            )
+        roll_adj = compute_upcoming_rolling_adjustment(
+            batter_roll, pitcher_roll, batter_features, pitcher_row or {}
+        )
+        rolling_hr = float(roll_adj.get("rolling_hr_adjustment", 0.0) or 0.0)
+        if abs(rolling_hr) > 0.001:
+            total_adj += rolling_hr
+            source_parts.append("rolling_form")
+    except Exception:
+        pass
+    # ------------------------------------------------------------------
+    # 5. Park factor — last, least weight (±0.006)
+    # ------------------------------------------------------------------
+    try:
+        from models.stadium_lookup import resolve_stadium
+        from models.environment_model import compute_park_adjustment
+        venue_name = HOME_TEAM_TO_STADIUM.get(home_team)
+        if not venue_name:
+            # try explicit venue in props row
+            for k in ("venue", "stadium", "venue_name", "park"):
+                v = props_row.get(k) if hasattr(props_row, "get") else None
+                if v and str(v).strip() not in ("", "nan", "None"):
+                    venue_name = str(v).strip()
+                    break
+        if venue_name:
+            stadium = resolve_stadium(venue_name)
+            if stadium:
+                park_out = compute_park_adjustment(stadium)
+                raw_park = float(park_out.get("park_hr_boost", 0.0) or 0.0)
+                park_adj = max(-0.006, min(0.006, raw_park))
+                if abs(park_adj) > 0.0001:
+                    total_adj += park_adj
+                    source_parts.append("park")
+    except Exception:
+        pass
+    source_detail = "baseline+" + "+".join(source_parts) if source_parts else "baseline"
+    return total_adj, source_detail
 def get_player_hr_prob(
     player_name_normalized: str,
     statcast_df: pd.DataFrame,
     _name_index: dict[str, str] | None = None,
 ) -> tuple[float | None, str]:
     """
+    Returns (prob, source) for a pre-game HR probability (baseline only).
     source values:
       "internal_model_baseline" — compute_batter_baseline() with statcast features
     statcast_df: pd.DataFrame,
     prob_fn: Callable[[str, pd.DataFrame, dict[str, str] | None], tuple[float | None, str]] | None = None,
     pitcher_stats_df: pd.DataFrame | None = None,
+    pitcher_statcast_df: pd.DataFrame | None = None,
+    probable_starters: dict | None = None,
 ) -> pd.DataFrame:
     """
     Join HR prop rows to model HR probabilities and compute edge.
     Adds columns:
+      implied_prob              — book implied probability (vig-inclusive)
+      model_hr_prob             — pre-game model HR probability (or None)
+      model_hr_prob_source      — source label ("internal_model_baseline" or "unavailable")
+      model_hr_prob_source_detail — "+"-joined list of model components applied (e.g. "baseline+pitcher_quality+park")
+      edge                      — model_hr_prob - implied_prob (or None)
     Filters to market == "hr".
     Sorts by edge descending (rows with no edge/model prob sort last).
+    pitcher_statcast_df: pitcher-perspective statcast (player_name = pitcher).
+    probable_starters: {(away_team_norm, home_team_norm): {home_pitcher, away_pitcher}}.
     """
     if props_df.empty:
         return pd.DataFrame()
     if hr_df.empty:
         return pd.DataFrame()
     name_index = _build_statcast_name_index(statcast_df)
+    _pitcher_df = pitcher_statcast_df if pitcher_statcast_df is not None else (
+        pitcher_stats_df if pitcher_stats_df is not None else pd.DataFrame()
+    )
+    _probable_starters = probable_starters or {}
     implied_probs: list[float] = []
     model_probs: list[float | None] = []
     sources: list[str] = []
     edges: list[float | None] = []
     source_details: list[str] = []
     for _, row in hr_df.iterrows():
         odds = row.get("odds_american")
         player_name = str(row.get("player_name") or "")
         try:
             implied = american_to_implied_prob(odds) if odds is not None else None
         except Exception:
             implied = None
         if player_name:
             model_prob, source = _prob_fn(player_name, statcast_df, name_index)
         else:
             model_prob, source = None, "unavailable"
+        # Apply full pre-game model stack if batter baseline succeeded
+        total_adj = 0.0
+        src_detail = "baseline"
+        if model_prob is not None:
+            statcast_name = name_index.get(player_name, "")
+            if statcast_name:
+                batter_features = build_batter_feature_row(statcast_df, statcast_name)
+                try:
+                    total_adj, src_detail = _get_full_pregame_adjustments(
+                        row,
+                        statcast_name,
+                        batter_features,
+                        statcast_df,
+                        _pitcher_df,
+                        _probable_starters,
+                    )
+                except Exception:
+                    pass
+        if model_prob is not None:
+            model_prob_adj: float | None = max(0.005, min(0.40, model_prob + total_adj))
         else:
+            model_prob_adj = None
         if model_prob_adj is not None and implied is not None:
             edge = compute_edge(model_prob_adj, implied)
         else:
         model_probs.append(model_prob_adj)
         sources.append(source)
         edges.append(edge)
         source_details.append(src_detail)
     hr_df = hr_df.copy()
     hr_df["model_hr_prob"] = model_probs
     hr_df["model_hr_prob_source"] = sources
     hr_df["edge"] = edges
     hr_df["model_hr_prob_source_detail"] = source_details
     has_edge = hr_df["edge"].notna()
     with_edge = hr_df[has_edge].sort_values("edge", ascending=False)
     without_edge = hr_df[~has_edge]

app.py CHANGED Viewed

@@ -91,7 +91,7 @@ from utils.dates import current_wbc_date_str
 from data.scores import fetch_scores_for_date
 from data.odds import fetch_featured_odds
 from data.schedule import fetch_schedule_for_date
-from data.statcast import fetch_statcast_range, normalize_statcast
 from data.weather import fetch_weather_for_venue
 from database.db import (
     get_connection,
@@ -570,6 +570,27 @@ def load_statcast_previous_season_full() -> pd.DataFrame:
     enriched = add_pitch_features(normalized)
     return enriched
 @st.cache_data(ttl=STATCAST_TTL_SECONDS)
 def load_statcast_recent() -> pd.DataFrame:
     end_date_str = current_dashboard_date_str()
@@ -3406,7 +3427,16 @@ def main() -> None:
     if page == "Dashboard":
         render_dashboard()
     elif page == "Props":
-        render_props(load_statcast_recent(), conn=conn, raw_props=load_upcoming_hr_props())
     elif page == "Card Lab":
         render_card_lab(conn=conn)
     elif page == "Betting":

 from data.scores import fetch_scores_for_date
 from data.odds import fetch_featured_odds
 from data.schedule import fetch_schedule_for_date
+from data.statcast import fetch_statcast_range, fetch_statcast_range_pitcher, normalize_statcast
 from data.weather import fetch_weather_for_venue
 from database.db import (
     get_connection,
     enriched = add_pitch_features(normalized)
     return enriched
@st.cache_data(ttl=60 * 60 * 12, show_spinner=False)
def load_statcast_previous_season_full_pitcher() -> pd.DataFrame:
    """
    Previous calendar year's pitcher-perspective statcast. player_name = pitcher name.

    The year is derived from the current UTC date (current year minus one), and the
    fetch spans Jan 1 through Dec 31 of that year. The raw frame is passed through
    normalize_statcast() and add_pitch_features() before being returned. The result
    is cached for 12 hours.
    """
    # Timestamp.now(tz="UTC") replaces the deprecated Timestamp.utcnow().
    previous_year = pd.Timestamp.now(tz="UTC").year - 1
    raw = fetch_statcast_range_pitcher(f"{previous_year}-01-01", f"{previous_year}-12-31")
    normalized = normalize_statcast(raw)
    return add_pitch_features(normalized)
@st.cache_data(ttl=60 * 60 * 1, show_spinner=False)
def load_probable_starters() -> dict:
    """
    Probable starting pitchers (next 7 days, MLB Stats API), cached for one hour.

    Thin cached wrapper: returns whatever
    data.mlb_starters.fetch_probable_starters_for_props() returns.
    """
    # Resolved at call time rather than at module import.
    import data.mlb_starters as _mlb_starters

    starters = _mlb_starters.fetch_probable_starters_for_props()
    return starters
 @st.cache_data(ttl=STATCAST_TTL_SECONDS)
 def load_statcast_recent() -> pd.DataFrame:
     end_date_str = current_dashboard_date_str()
     if page == "Dashboard":
         render_dashboard()
     elif page == "Props":
+        _statcast_for_props = load_statcast_recent()
+        if _statcast_for_props.empty:
+            _statcast_for_props = load_statcast_previous_season_full()
+        render_props(
+            _statcast_for_props,
+            conn=conn,
+            raw_props=load_upcoming_hr_props(),
+            pitcher_statcast_df=load_statcast_previous_season_full_pitcher(),
+            probable_starters=load_probable_starters(),
+        )
     elif page == "Card Lab":
         render_card_lab(conn=conn)
     elif page == "Betting":

data/live_prop_odds.py CHANGED Viewed

@@ -79,6 +79,7 @@ def fetch_all_upcoming_hr_props(
     providers.append(TheOddsAPIProvider())
     providers.append(ScrapeFallbackProvider())   # fallback if Odds API returns empty
     for provider in providers:
         try:
             fetch_fn = getattr(provider, "fetch_all_upcoming_hr_props", None)
@@ -86,12 +87,38 @@ def fetch_all_upcoming_hr_props(
                 continue
             df = fetch_fn(sportsbooks=sportsbooks)
             if not df.empty:
-                return normalize_prop_odds(df)   # stop at first provider that returns data
         except Exception as e:
             logger.warning(f"[odds_provider_fetch] failure: {e}", exc_info=True)
             continue
-    return pd.DataFrame()
 def fetch_live_prop_odds(

     providers.append(TheOddsAPIProvider())
     providers.append(ScrapeFallbackProvider())   # fallback if Odds API returns empty
+    frames = []
     for provider in providers:
         try:
             fetch_fn = getattr(provider, "fetch_all_upcoming_hr_props", None)
                 continue
             df = fetch_fn(sportsbooks=sportsbooks)
             if not df.empty:
+                frames.append(df)
         except Exception as e:
             logger.warning(f"[odds_provider_fetch] failure: {e}", exc_info=True)
             continue
+    if not frames:
+        return pd.DataFrame()
+    merged = pd.concat(frames, ignore_index=True)
+    merged = normalize_prop_odds(merged)
+    # Dedup: keep one row per (player_name, sportsbook_key, market) — best odds wins
+    if not merged.empty and "sportsbook_key" in merged.columns:
+        merged["_odds_score"] = merged["odds_american"].apply(
+            lambda x: int(x) if pd.notna(x) else -9999
+        )
+        merged = (
+            merged
+            .sort_values("_odds_score", ascending=False)
+            .drop_duplicates(subset=["player_name", "sportsbook_key", "market"], keep="first")
+            .drop(columns=["_odds_score"])
+            .reset_index(drop=True)
+        )
+    logger.warning(
+        "[fetch_all_upcoming_hr_props] providers=%d frames=%d merged_rows=%d unique_books=%s",
+        len(providers),
+        len(frames),
+        len(merged),
+        sorted(merged["sportsbook"].dropna().unique().tolist()) if not merged.empty else [],
+    )
+    return merged
 def fetch_live_prop_odds(

data/mlb_starters.py ADDED Viewed

	@@ -0,0 +1,135 @@

+"""
+data/mlb_starters.py
+Fetches probable starting pitchers for upcoming MLB games from the public
+MLB Stats API. Used by the Props page to enrich HR props with matchup context.
+Returns a dict keyed by (away_team, home_team) canonical names → pitcher names.
+Both teams in the key are normalized to lowercase stripped strings for fuzzy matching.
+"""
+from __future__ import annotations
+import logging
+import re
+import unicodedata
+from datetime import timedelta
+from typing import Any
+import pandas as pd
+import requests
+_log = logging.getLogger(__name__)
+_SCHEDULE_URL = "https://statsapi.mlb.com/api/v1/schedule"
+def _normalize_team(name: str) -> str:
+    text = str(name or "").strip().lower()
+    text = unicodedata.normalize("NFKD", text)
+    text = "".join(ch for ch in text if not unicodedata.combining(ch))
+    text = re.sub(r"[^a-z0-9 ]", "", text)
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+def fetch_probable_starters_for_props() -> dict[tuple[str, str], dict[str, str | None]]:
+    """
+    Fetch probable starters for all MLB games in the next 7 days.
+    Returns:
+        {
+            (away_team_normalized, home_team_normalized): {
+                "home_pitcher": "Luis Castillo" | None,
+                "away_pitcher": "Cole Irvin" | None,
+                "away_team_raw": "Seattle Mariners",
+                "home_team_raw": "Oakland Athletics",
+            }
+        }
+    Keys are lowercased/normalized for fuzzy matching against props row team names.
+    """
+    today = pd.Timestamp.utcnow().date()
+    end_date = today + timedelta(days=7)
+    params: dict[str, Any] = {
+        "sportId": 1,
+        "startDate": today.isoformat(),
+        "endDate": end_date.isoformat(),
+        "hydrate": "probablePitcher",
+        "gameType": "R,F,D,L,W",
+    }
+    try:
+        r = requests.get(_SCHEDULE_URL, params=params, timeout=15)
+        r.raise_for_status()
+        data = r.json()
+    except Exception as exc:
+        _log.warning("[mlb_starters] schedule fetch failed: %s", exc)
+        return {}
+    result: dict[tuple[str, str], dict[str, str | None]] = {}
+    games_total = 0
+    games_with_starters = 0
+    for date_entry in data.get("dates", []):
+        for game in date_entry.get("games", []):
+            games_total += 1
+            teams = game.get("teams", {})
+            away_raw = str(teams.get("away", {}).get("team", {}).get("name", "") or "")
+            home_raw = str(teams.get("home", {}).get("team", {}).get("name", "") or "")
+            away_pitcher_obj = teams.get("away", {}).get("probablePitcher") or {}
+            home_pitcher_obj = teams.get("home", {}).get("probablePitcher") or {}
+            away_pitcher = str(away_pitcher_obj.get("fullName", "") or "").strip() or None
+            home_pitcher = str(home_pitcher_obj.get("fullName", "") or "").strip() or None
+            if not away_raw or not home_raw:
+                continue
+            key = (_normalize_team(away_raw), _normalize_team(home_raw))
+            result[key] = {
+                "home_pitcher": home_pitcher,
+                "away_pitcher": away_pitcher,
+                "away_team_raw": away_raw,
+                "home_team_raw": home_raw,
+            }
+            if home_pitcher or away_pitcher:
+                games_with_starters += 1
+    _log.warning(
+        "[mlb_starters] games_total=%d games_with_starters=%d",
+        games_total,
+        games_with_starters,
+    )
+    return result
+def lookup_pitchers_for_game(
+    away_team: str,
+    home_team: str,
+    starters_map: dict[tuple[str, str], dict[str, str | None]],
+) -> dict[str, str | None]:
+    """
+    Look up probable pitchers for a specific game matchup.
+    Returns {"home_pitcher": name_or_None, "away_pitcher": name_or_None}.
+    Uses normalized string matching — tolerates minor differences in team name format.
+    """
+    away_norm = _normalize_team(away_team)
+    home_norm = _normalize_team(home_team)
+    # Exact normalized match
+    entry = starters_map.get((away_norm, home_norm))
+    if entry:
+        return entry
+    # Partial match fallback: any key where both normalized parts are substrings
+    for (k_away, k_home), v in starters_map.items():
+        away_match = away_norm in k_away or k_away in away_norm
+        home_match = home_norm in k_home or k_home in home_norm
+        if away_match and home_match:
+            return v
+    return {"home_pitcher": None, "away_pitcher": None}

data/statcast.py CHANGED Viewed

@@ -14,7 +14,7 @@ HEADERS = {
 }
-def _query_statcast(start_date: str, end_date: str, season: str) -> pd.DataFrame:
     params = {
         "all": "true",
         "hfPT": "",
@@ -29,7 +29,7 @@ def _query_statcast(start_date: str, end_date: str, season: str) -> pd.DataFrame
         "hfC": "",
         "hfSea": f"{season}|",
         "hfSit": "",
-        "player_type": "batter",
         "hfOuts": "",
         "opponent": "",
         "pitcher_throws": "",
@@ -73,9 +73,15 @@ def _query_statcast(start_date: str, end_date: str, season: str) -> pd.DataFrame
 def fetch_statcast_range(start_date: str, end_date: str) -> pd.DataFrame:
-    """Fetch Statcast data for the given date range (MLB only)."""
     season = str(datetime.strptime(start_date, "%Y-%m-%d").year)
-    return _query_statcast(start_date, end_date, season=season)
 def normalize_statcast(df: pd.DataFrame) -> pd.DataFrame:

 }
+def _query_statcast(start_date: str, end_date: str, season: str, player_type: str = "batter") -> pd.DataFrame:
     params = {
         "all": "true",
         "hfPT": "",
         "hfC": "",
         "hfSea": f"{season}|",
         "hfSit": "",
+        "player_type": player_type,
         "hfOuts": "",
         "opponent": "",
         "pitcher_throws": "",
 def fetch_statcast_range(start_date: str, end_date: str) -> pd.DataFrame:
+    """Fetch Statcast data for the given date range (MLB only). player_name = batter.
+    start_date must be a "YYYY-MM-DD" string; its calendar year is passed to the
+    query as the season. The query runs with player_type="batter".
+    """
     season = str(datetime.strptime(start_date, "%Y-%m-%d").year)
+    return _query_statcast(start_date, end_date, season=season, player_type="batter")
+def fetch_statcast_range_pitcher(start_date: str, end_date: str) -> pd.DataFrame:
+    """Fetch pitcher-perspective Statcast for the given date range. player_name = pitcher."""
+    season = str(datetime.strptime(start_date, "%Y-%m-%d").year)
+    return _query_statcast(start_date, end_date, season=season, player_type="pitcher")
 def normalize_statcast(df: pd.DataFrame) -> pd.DataFrame:

visualization/props_page.py CHANGED Viewed

@@ -52,7 +52,13 @@ def _format_edge(val: float | None) -> str:
     return f"{val * 100:+.1f}%"
-def render_props(statcast_df: pd.DataFrame, conn=None, raw_props: pd.DataFrame | None = None) -> None:
     st.subheader("Props")
     # Use pre-fetched (cached) props when available.
@@ -104,7 +110,12 @@ def render_props(statcast_df: pd.DataFrame, conn=None, raw_props: pd.DataFrame |
     # Model mapping (HR only) + DB logging
     # ---------------------------------------------------------------------------
     if market_type == "hr":
-        mapped = map_hr_props_to_model(filtered_raw, statcast_df)
         if mapped.empty:
             st.info("No mappable HR prop rows.")
             return

     return f"{val * 100:+.1f}%"
+def render_props(
+    statcast_df: pd.DataFrame,
+    conn=None,
+    raw_props: pd.DataFrame | None = None,
+    pitcher_statcast_df: pd.DataFrame | None = None,
+    probable_starters: dict | None = None,
+) -> None:
     st.subheader("Props")
     # Use pre-fetched (cached) props when available.
     # Model mapping (HR only) + DB logging
     # ---------------------------------------------------------------------------
     if market_type == "hr":
+        mapped = map_hr_props_to_model(
+            filtered_raw,
+            statcast_df,
+            pitcher_statcast_df=pitcher_statcast_df,
+            probable_starters=probable_starters,
+        )
         if mapped.empty:
             st.info("No mappable HR prop rows.")
             return