Syntrex Claude Sonnet 4.6 committed on
Commit
37fe240
·
1 Parent(s): 0ff601a

Props: multi-book concat-all, full pre-game model stack, pre-season statcast fallback

Browse files

Issue A: Switch fetch_all_upcoming_hr_props from stop-at-first to concat-all
so Odds API partial data (e.g. Caesars only) no longer blocks the scraper.
Dedup by best odds per (player_name, sportsbook_key, market) after merge.

Issue B: Fall back to load_statcast_previous_season_full() (2025) when
load_statcast_recent() returns empty (pre-season). Model HR% now populates.

Issue C: Full pre-game model stack in props_mapper._get_full_pregame_adjustments():
- Pitcher quality ±0.025 via compute_pitcher_adjustment() [dominant signal]
- Zone matchup ±0.010 via batter_zone_store + pitcher_zone_model
- Arsenal matchup ±0.010 via batter/pitcher arsenal feature rows
- Rolling form ±0.012 via compute_upcoming_rolling_adjustment()
- Park factor ±0.006 via HOME_TEAM_TO_STADIUM + compute_park_adjustment()

New data sources:
- data/mlb_starters.py: probable starters from MLB Stats API, cached 1h
- data/statcast.py: fetch_statcast_range_pitcher() for pitcher-perspective data
- app.py: load_statcast_previous_season_full_pitcher() + load_probable_starters()

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

analytics/props_mapper.py CHANGED
@@ -1,26 +1,28 @@
1
  """
2
  analytics/props_mapper.py
3
 
4
- Batch 14: Maps sportsbook HR prop rows to internal model HR probabilities
5
- and computes edge.
6
-
7
- Model HR probability resolution order (pre-game, no live lineup context):
8
- 1. internal_model_baseline — compute_batter_baseline() using batter statcast
9
- features (EV90, barrel rate, hard hit rate, xwOBA, launch angle).
10
- Preferred source when plate_appearances > 0.
11
- 2. unavailable — insufficient statcast coverage for this player.
12
-
13
- Note: XGBoost HR model (xgb_shadow.py) requires anchor probs from the live
14
- simulator and cannot be used pre-game. It remains the source for Dashboard
15
- live-game recommendations only.
16
-
17
- The prob_fn parameter in map_hr_props_to_model() is injectable so the
18
- probability source can be swapped later without touching odds ingestion or
19
- the Props page.
 
20
  """
21
 
22
  from __future__ import annotations
23
 
 
24
  from typing import Any, Callable
25
 
26
  import pandas as pd
@@ -28,79 +30,46 @@ import pandas as pd
28
  from analytics.no_vig_props import american_to_implied_prob, compute_edge
29
  from data.odds_name_map import map_odds_name_to_model_name
30
  from models.batter_baseline import build_batter_feature_row, compute_batter_baseline
31
- from models.pitcher_adjustment import build_pitcher_feature_row
32
-
33
-
34
- def _get_pregame_context_adjustments(
35
- props_row: Any,
36
- statcast_df: pd.DataFrame,
37
- ) -> tuple[float, float, bool, str]:
38
- """
39
- Derive pitcher quality + park context adjustments for a pre-game props row.
40
- Returns (pitcher_adj, park_adj, context_applied, source_detail_str).
41
- All adjustments are no-op safe — any missing data yields 0.0.
42
- """
43
- pitcher_adj = 0.0
44
- park_adj = 0.0
45
- context_applied = False
46
- source_parts: list[str] = ["baseline"]
47
-
48
- # --- Pitcher context (only when pitcher_name is explicit in props row) ---
49
- pitcher_name = None
50
- for key in ("pitcher_name", "pitcher", "opposing_pitcher"):
51
- val = props_row.get(key) if hasattr(props_row, "get") else None
52
- if val and str(val).strip() not in ("", "nan", "None"):
53
- pitcher_name = str(val).strip()
54
- break
55
-
56
- if pitcher_name and not statcast_df.empty:
57
- try:
58
- p_row = build_pitcher_feature_row(statcast_df, pitcher_name)
59
- if p_row.get("sample_size", 0) > 0:
60
- velo = p_row.get("avg_release_speed")
61
- ev = p_row.get("ev_allowed")
62
- barrel = p_row.get("barrel_rate_allowed")
63
-
64
- quality_score = 0.0
65
- if velo is not None:
66
- quality_score += (float(velo) - 93.0) * (-0.15) # higher velo = better pitcher = negative for batter
67
- if ev is not None:
68
- quality_score += (float(ev) - 89.0) * 0.08 # higher EV allowed = worse pitcher
69
- if barrel is not None:
70
- quality_score += (float(barrel) - 0.07) * 1.0 # higher barrel = worse pitcher
71
-
72
- pitcher_adj = max(-0.005, min(0.005, quality_score * 0.003))
73
- if abs(pitcher_adj) > 0.0001:
74
- context_applied = True
75
- source_parts.append("pitcher_quality")
76
- except Exception:
77
- pass
78
-
79
- # --- Park context (if venue available) ---
80
- venue = None
81
- for key in ("venue", "stadium", "venue_name", "park"):
82
- val = props_row.get(key) if hasattr(props_row, "get") else None
83
- if val and str(val).strip() not in ("", "nan", "None"):
84
- venue = str(val).strip()
85
- break
86
-
87
- if venue:
88
- try:
89
- from models.environment_model import compute_environment_adjustment
90
- env = compute_environment_adjustment(
91
- game_row={"venue": venue, "stadium": venue},
92
- weather_row=None,
93
- )
94
- raw_park = float(env.get("park_hr_boost", 0.0) or 0.0)
95
- park_adj = max(-0.004, min(0.004, raw_park))
96
- if abs(park_adj) > 0.0001:
97
- context_applied = True
98
- source_parts.append("park")
99
- except Exception:
100
- pass
101
-
102
- source_detail = "+".join(source_parts)
103
- return pitcher_adj, park_adj, context_applied, source_detail
104
 
105
 
106
  def _build_statcast_name_index(statcast_df: pd.DataFrame) -> dict[str, str]:
@@ -118,13 +87,252 @@ def _build_statcast_name_index(statcast_df: pd.DataFrame) -> dict[str, str]:
118
  return index
119
 
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  def get_player_hr_prob(
122
  player_name_normalized: str,
123
  statcast_df: pd.DataFrame,
124
  _name_index: dict[str, str] | None = None,
125
  ) -> tuple[float | None, str]:
126
  """
127
- Returns (prob, source) for a pre-game HR probability.
128
 
129
  source values:
130
  "internal_model_baseline" — compute_batter_baseline() with statcast features
@@ -153,19 +361,24 @@ def map_hr_props_to_model(
153
  statcast_df: pd.DataFrame,
154
  prob_fn: Callable[[str, pd.DataFrame, dict[str, str] | None], tuple[float | None, str]] | None = None,
155
  pitcher_stats_df: pd.DataFrame | None = None,
 
 
156
  ) -> pd.DataFrame:
157
  """
158
  Join HR prop rows to model HR probabilities and compute edge.
159
 
160
  Adds columns:
161
- implied_prob — book implied probability (vig-inclusive)
162
- model_hr_prob — pre-game model HR probability (or None)
163
- model_hr_prob_source — source label for model_hr_prob
164
- edge — model_hr_prob - implied_prob (or None)
 
165
 
166
  Filters to market == "hr".
167
  Sorts by edge descending (rows with no edge/model prob sort last).
168
- prob_fn is injectable for future source swaps; defaults to get_player_hr_prob.
 
 
169
  """
170
  if props_df.empty:
171
  return pd.DataFrame()
@@ -176,52 +389,57 @@ def map_hr_props_to_model(
176
  if hr_df.empty:
177
  return pd.DataFrame()
178
 
179
- # Build name index once for all players
180
  name_index = _build_statcast_name_index(statcast_df)
181
 
182
- # Use pitcher_stats_df if provided, else fall back to statcast_df for pitcher lookups
183
- _pitcher_df = pitcher_stats_df if pitcher_stats_df is not None else statcast_df
 
 
184
 
185
  implied_probs: list[float] = []
186
  model_probs: list[float | None] = []
187
  sources: list[str] = []
188
  edges: list[float | None] = []
189
- pitcher_context_adjs: list[float | None] = []
190
- park_context_adjs: list[float | None] = []
191
- context_applied_flags: list[bool] = []
192
  source_details: list[str] = []
193
 
194
  for _, row in hr_df.iterrows():
195
  odds = row.get("odds_american")
196
  player_name = str(row.get("player_name") or "")
197
 
198
- # Implied probability from book odds
199
  try:
200
  implied = american_to_implied_prob(odds) if odds is not None else None
201
  except Exception:
202
  implied = None
203
 
204
- # Model HR probability (baseline only)
205
  if player_name:
206
  model_prob, source = _prob_fn(player_name, statcast_df, name_index)
207
  else:
208
  model_prob, source = None, "unavailable"
209
 
210
- # Pregame context adjustments (pitcher quality + park)
211
- try:
212
- pitcher_adj, park_adj, ctx_applied, src_detail = _get_pregame_context_adjustments(
213
- row, _pitcher_df
214
- )
215
- except Exception:
216
- pitcher_adj, park_adj, ctx_applied, src_detail = 0.0, 0.0, False, "baseline"
217
-
218
- # Apply context to model prob
219
- if model_prob is not None and ctx_applied:
220
- model_prob_adj: float | None = max(0.01, min(0.40, model_prob + pitcher_adj + park_adj))
 
 
 
 
 
 
 
 
 
 
221
  else:
222
- model_prob_adj = model_prob
223
 
224
- # Edge (uses context-adjusted prob)
225
  if model_prob_adj is not None and implied is not None:
226
  edge = compute_edge(model_prob_adj, implied)
227
  else:
@@ -231,9 +449,6 @@ def map_hr_props_to_model(
231
  model_probs.append(model_prob_adj)
232
  sources.append(source)
233
  edges.append(edge)
234
- pitcher_context_adjs.append(pitcher_adj if ctx_applied else None)
235
- park_context_adjs.append(park_adj if ctx_applied else None)
236
- context_applied_flags.append(ctx_applied)
237
  source_details.append(src_detail)
238
 
239
  hr_df = hr_df.copy()
@@ -241,12 +456,8 @@ def map_hr_props_to_model(
241
  hr_df["model_hr_prob"] = model_probs
242
  hr_df["model_hr_prob_source"] = sources
243
  hr_df["edge"] = edges
244
- hr_df["pregame_pitcher_context_adj"] = pitcher_context_adjs
245
- hr_df["pregame_park_context_adj"] = park_context_adjs
246
- hr_df["pregame_context_applied"] = context_applied_flags
247
  hr_df["model_hr_prob_source_detail"] = source_details
248
 
249
- # Sort: rows with edge first (highest edge first), then no-edge rows
250
  has_edge = hr_df["edge"].notna()
251
  with_edge = hr_df[has_edge].sort_values("edge", ascending=False)
252
  without_edge = hr_df[~has_edge]
 
1
  """
2
  analytics/props_mapper.py
3
 
4
+ Maps sportsbook HR prop rows to internal model HR probabilities and computes edge.
5
+
6
+ Pre-game model stack (applied additively in weight order):
7
+ 1. Batter baseline — compute_batter_baseline() (EV90, barrel, hard-hit, xwOBA, LA)
8
+ 2. Pitcher quality — compute_pitcher_adjustment() ±0.025 [requires probable starter]
9
+ 3. Zone matchup — compute_zone_matchup_adjustment() ±0.010
10
+ 4. Arsenal matchup — compute_arsenal_matchup_adjustment() ±0.010
11
+ 5. Rolling form — compute_upcoming_rolling_adjustment() ±0.012
12
+ 6. Park factor — compute_park_adjustment() ±0.006
13
+
14
+ Pitcher is the dominant adjustment. Park is supporting context only.
15
+
16
+ Pitcher data requires:
17
+ - pitcher_statcast_df (player_type=pitcher → player_name = pitcher name)
18
+ - probable_starters dict from data.mlb_starters.fetch_probable_starters_for_props()
19
+
20
+ Both are optional; any missing data causes a graceful no-op for that component.
21
  """
22
 
23
  from __future__ import annotations
24
 
25
+ from datetime import date
26
  from typing import Any, Callable
27
 
28
  import pandas as pd
 
30
  from analytics.no_vig_props import american_to_implied_prob, compute_edge
31
  from data.odds_name_map import map_odds_name_to_model_name
32
  from models.batter_baseline import build_batter_feature_row, compute_batter_baseline
33
+ from models.pitcher_adjustment import build_pitcher_feature_row, compute_pitcher_adjustment
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Static home team → stadium name mapping (all 30 MLB teams)
37
+ # Keys match Odds API / sportsbook team name format.
38
+ # Values are canonical names accepted by models/stadium_lookup.resolve_stadium().
39
+ # ---------------------------------------------------------------------------
40
+
41
+ HOME_TEAM_TO_STADIUM: dict[str, str] = {
42
+ "Baltimore Orioles": "oriole park at camden yards",
43
+ "Boston Red Sox": "fenway park",
44
+ "New York Yankees": "yankee stadium",
45
+ "Tampa Bay Rays": "tropicana field",
46
+ "Toronto Blue Jays": "rogers centre",
47
+ "Chicago White Sox": "guaranteed rate field",
48
+ "Cleveland Guardians": "progressive field",
49
+ "Detroit Tigers": "comerica park",
50
+ "Kansas City Royals": "kauffman stadium",
51
+ "Minnesota Twins": "target field",
52
+ "Houston Astros": "minute maid park",
53
+ "Los Angeles Angels": "angel stadium",
54
+ "Oakland Athletics": "athletics ballpark",
55
+ "Seattle Mariners": "t-mobile park",
56
+ "Texas Rangers": "globe life field",
57
+ "Atlanta Braves": "truist park",
58
+ "Miami Marlins": "loandepot park",
59
+ "New York Mets": "citi field",
60
+ "Philadelphia Phillies": "citizens bank park",
61
+ "Washington Nationals": "nationals park",
62
+ "Chicago Cubs": "wrigley field",
63
+ "Cincinnati Reds": "great american ball park",
64
+ "Milwaukee Brewers": "american family field",
65
+ "Pittsburgh Pirates": "pnc park",
66
+ "St. Louis Cardinals": "busch stadium",
67
+ "Arizona Diamondbacks": "chase field",
68
+ "Colorado Rockies": "coors field",
69
+ "Los Angeles Dodgers": "dodger stadium",
70
+ "San Diego Padres": "petco park",
71
+ "San Francisco Giants": "oracle park",
72
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
 
75
  def _build_statcast_name_index(statcast_df: pd.DataFrame) -> dict[str, str]:
 
87
  return index
88
 
89
 
90
def _lookup_batter_team(
    statcast_name: str,
    props_away_team: str,
    props_home_team: str,
    statcast_df: pd.DataFrame,
) -> str | None:
    """
    Infer whether a batter plays for the home or the away team of a props game.

    A batter's own team is one of the two teams in every game row they appear
    in, whereas the opponent only shows up in head-to-head games. So we count
    the batter's rows that involve each props team (in either the ``home_team``
    or ``away_team`` column) and pick the team that appears more often.

    Args:
        statcast_name: Exact ``player_name`` value to look up in ``statcast_df``.
        props_away_team: Away team of the props game.
        props_home_team: Home team of the props game.
        statcast_df: Batter-perspective statcast rows; must carry
            ``player_name``, ``home_team`` and ``away_team`` columns.

    Returns:
        ``"home"`` or ``"away"``, or ``None`` when the side cannot be decided
        (missing columns, unknown player, blank team names, or a tie such as a
        sample containing only head-to-head games).

    NOTE(review): team names are compared case-insensitively but otherwise
    verbatim — this assumes props rows and statcast use the same team naming
    (full names vs. abbreviations); confirm against the callers' data.
    """
    required_columns = {"player_name", "home_team", "away_team"}
    if statcast_df.empty or not required_columns.issubset(statcast_df.columns):
        return None

    props_home = str(props_home_team or "").strip().lower()
    props_away = str(props_away_team or "").strip().lower()
    if not props_home or not props_away or props_home == props_away:
        # Without two distinct team names there is nothing to disambiguate.
        return None

    try:
        player_rows = statcast_df[statcast_df["player_name"].astype(str) == statcast_name]
        if player_rows.empty:
            return None

        home_vals = player_rows["home_team"].astype(str).str.strip().str.lower()
        away_vals = player_rows["away_team"].astype(str).str.strip().str.lower()

        # Rows in which each props team took part, regardless of venue.
        home_side_rows = int(((home_vals == props_home) | (away_vals == props_home)).sum())
        away_side_rows = int(((home_vals == props_away) | (away_vals == props_away)).sum())
    except Exception:
        # Best-effort lookup: malformed statcast data must never break the props page.
        return None

    if home_side_rows > away_side_rows:
        return "home"
    if away_side_rows > home_side_rows:
        return "away"
    # Tie (including zero matches): the sample cannot tell the teams apart.
    return None
147
+
148
+
149
def _coerce_finite(value: Any, default: float = 0.0) -> float:
    """Return ``value`` as a finite float; ``default`` for None, NaN, inf or unparseable input."""
    import math

    try:
        number = float(value)
    except (TypeError, ValueError):
        return default
    return number if math.isfinite(number) else default


def _get_full_pregame_adjustments(
    props_row: Any,
    statcast_name: str,
    batter_features: dict[str, Any],
    statcast_df: pd.DataFrame,
    pitcher_statcast_df: pd.DataFrame,
    probable_starters: dict[tuple[str, str], dict[str, str | None]],
) -> tuple[float, str]:
    """
    Apply the full pre-game model stack to a single props row.

    Components are evaluated independently and summed; a component that fails
    (missing module, missing data, bad value) is logged at DEBUG level and
    contributes nothing, so the function degrades to the plain baseline.

    Components, in evaluation order:
        1. Pitcher quality  — compute_pitcher_adjustment()["hr_adj"]
        2. Zone matchup     — hr_zone_boost minus batter baseline, clamped to ±0.010
        3. Arsenal matchup  — arsenal_hr_boost minus batter baseline, clamped to ±0.010
        4. Rolling form     — compute_upcoming_rolling_adjustment()["rolling_hr_adjustment"]
        5. Park factor      — park_hr_boost, clamped to ±0.006

    The pitcher (±0.025) and rolling-form (±0.012) bounds quoted in the module
    docs are NOT enforced here; this function relies on those models to bound
    their own output.

    Args:
        props_row: Mapping-like props row; read via ``.get`` for ``away_team``,
            ``home_team``, ``commence_time`` and the venue fallback keys.
        statcast_name: Batter name as it appears in ``statcast_df``.
        batter_features: Batter feature row (``hr_prob_base`` / ``ev90`` are
            read here; the rest is passed through to the models).
        statcast_df: Batter-perspective statcast rows.
        pitcher_statcast_df: Pitcher-perspective statcast rows (may be empty).
        probable_starters: Probable-starter lookup passed to
            ``lookup_pitchers_for_game`` (may be empty).

    Returns:
        ``(total_hr_adj, source_detail)`` where ``source_detail`` is
        ``"baseline"`` or ``"baseline+<component>+..."`` listing every
        component that moved the probability.
    """
    import datetime as _dt
    import logging

    log = logging.getLogger(__name__)

    total_adj = 0.0
    source_parts: list[str] = []

    away_team = str(props_row.get("away_team") or "")
    home_team = str(props_row.get("home_team") or "")
    commence_time = str(props_row.get("commence_time") or "")

    # Reference date for rolling form: the game's date, else today (UTC).
    # NOTE(review): this is the UTC calendar date of commence_time, which can be
    # one day ahead of the local game date for evening games — confirm intended.
    ref_date: date | None = None
    try:
        ref_date = _dt.datetime.fromisoformat(commence_time.replace("Z", "+00:00")).date()
    except ValueError:
        ref_date = pd.Timestamp.now(tz="UTC").date()

    # ------------------------------------------------------------------
    # Probable pitcher lookup
    # ------------------------------------------------------------------
    pitcher_name: str | None = None

    if probable_starters and away_team and home_team:
        try:
            from data.mlb_starters import lookup_pitchers_for_game

            pitchers = lookup_pitchers_for_game(away_team, home_team, probable_starters)
            batter_side = _lookup_batter_team(statcast_name, away_team, home_team, statcast_df)

            if batter_side == "home":
                # A home batter faces the away team's starter, and vice versa.
                pitcher_name = pitchers.get("away_pitcher")
            elif batter_side == "away":
                pitcher_name = pitchers.get("home_pitcher")
            else:
                # Side unknown — best effort: use whichever starter is listed.
                # NOTE(review): this may pick the batter's own teammate.
                pitcher_name = pitchers.get("home_pitcher") or pitchers.get("away_pitcher")
        except Exception:
            log.debug("probable pitcher lookup failed for %r", statcast_name, exc_info=True)

    has_pitcher_data = bool(pitcher_name) and not pitcher_statcast_df.empty

    # ------------------------------------------------------------------
    # 1. Pitcher quality (dominant signal)
    # ------------------------------------------------------------------
    pitcher_row: dict[str, Any] = {}
    if has_pitcher_data:
        try:
            pitcher_row = build_pitcher_feature_row(pitcher_statcast_df, pitcher_name)
            if _coerce_finite(pitcher_row.get("sample_size")) > 0:
                p_adj = compute_pitcher_adjustment(batter_features, pitcher_row, context={})
                hr_adj = _coerce_finite(p_adj.get("hr_adj"))
                total_adj += hr_adj
                if abs(hr_adj) > 0.001:
                    source_parts.append("pitcher_quality")
        except Exception:
            log.debug("pitcher quality adjustment skipped", exc_info=True)

    # ------------------------------------------------------------------
    # 2. Zone matchup (±0.010)
    # ------------------------------------------------------------------
    try:
        from models.batter_zone_model import build_batter_zone_feature_row
        from models.pitcher_zone_model import build_pitcher_zone_feature_row
        from models.zone_matchup_model import compute_zone_matchup_adjustment

        batter_zone = build_batter_zone_feature_row(statcast_df=statcast_df, player_name=statcast_name)
        pitcher_zone: dict[str, Any] = {}
        if has_pitcher_data:
            pitcher_zone = build_pitcher_zone_feature_row(
                statcast_df=pitcher_statcast_df, pitcher_name=pitcher_name
            )

        zone_adj = compute_zone_matchup_adjustment(batter_zone, pitcher_zone)
        zone_hr_boost = _coerce_finite(zone_adj.get("hr_zone_boost"))
        # hr_zone_boost is an absolute probability, not a delta — subtract the
        # batter baseline: hr_prob_base, else an EV90-derived estimate, else 0.04.
        baseline_hr = _coerce_finite(batter_features.get("hr_prob_base"))
        if not baseline_hr:
            baseline_hr = _coerce_finite(batter_features.get("ev90")) * 0.0015
        if not baseline_hr:
            baseline_hr = 0.04
        zone_delta = max(-0.010, min(0.010, zone_hr_boost - baseline_hr))
        if _coerce_finite(zone_adj.get("sample_size")) > 0 and abs(zone_delta) > 0.001:
            total_adj += zone_delta
            source_parts.append("zone_matchup")
    except Exception:
        log.debug("zone matchup adjustment skipped", exc_info=True)

    # ------------------------------------------------------------------
    # 3. Arsenal matchup (±0.010)
    # ------------------------------------------------------------------
    try:
        from models.batter_arsenal_model import build_batter_arsenal_feature_row
        from models.pitcher_arsenal_model import build_pitcher_arsenal_feature_row
        from models.arsenal_matchup_model import compute_arsenal_matchup_adjustment

        batter_arsenal = build_batter_arsenal_feature_row(statcast_df, statcast_name)
        pitcher_arsenal: dict[str, Any] = {}
        if has_pitcher_data:
            pitcher_arsenal = build_pitcher_arsenal_feature_row(pitcher_statcast_df, pitcher_name)

        if _coerce_finite(pitcher_arsenal.get("arsenal_sample_size")) > 0:
            arsenal_adj = compute_arsenal_matchup_adjustment(batter_arsenal, pitcher_arsenal)
            arsenal_hr = _coerce_finite(arsenal_adj.get("arsenal_hr_boost"))
            # arsenal_hr_boost is a weighted average of batter HR probs by pitch
            # family — subtract the batter baseline to get the delta.
            baseline_hr = _coerce_finite(batter_features.get("hr_prob_base")) or 0.04
            arsenal_delta = max(-0.010, min(0.010, arsenal_hr - baseline_hr))
            if abs(arsenal_delta) > 0.001:
                total_adj += arsenal_delta
                source_parts.append("arsenal_matchup")
    except Exception:
        log.debug("arsenal matchup adjustment skipped", exc_info=True)

    # ------------------------------------------------------------------
    # 4. Rolling form
    # ------------------------------------------------------------------
    try:
        from models.rolling_form_model import (
            build_batter_rolling_form_row,
            build_pitcher_rolling_form_row,
            compute_upcoming_rolling_adjustment,
        )

        batter_roll = build_batter_rolling_form_row(
            statcast_df, statcast_name, reference_date=ref_date
        )
        pitcher_roll: dict[str, Any] = {}
        if has_pitcher_data:
            pitcher_roll = build_pitcher_rolling_form_row(
                pitcher_statcast_df, pitcher_name, reference_date=ref_date
            )

        roll_adj = compute_upcoming_rolling_adjustment(
            batter_roll, pitcher_roll, batter_features, pitcher_row or {}
        )
        rolling_hr = _coerce_finite(roll_adj.get("rolling_hr_adjustment"))
        if abs(rolling_hr) > 0.001:
            total_adj += rolling_hr
            source_parts.append("rolling_form")
    except Exception:
        log.debug("rolling form adjustment skipped", exc_info=True)

    # ------------------------------------------------------------------
    # 5. Park factor — last, least weight (±0.006)
    # ------------------------------------------------------------------
    try:
        from models.stadium_lookup import resolve_stadium
        from models.environment_model import compute_park_adjustment

        # Exact-match lookup on the props row's home team name.
        venue_name = HOME_TEAM_TO_STADIUM.get(home_team)
        if not venue_name:
            # Fall back to an explicit venue field on the props row.
            for key in ("venue", "stadium", "venue_name", "park"):
                candidate = props_row.get(key)
                if candidate and str(candidate).strip() not in ("", "nan", "None"):
                    venue_name = str(candidate).strip()
                    break

        if venue_name:
            stadium = resolve_stadium(venue_name)
            if stadium:
                park_out = compute_park_adjustment(stadium)
                raw_park = _coerce_finite(park_out.get("park_hr_boost"))
                park_adj = max(-0.006, min(0.006, raw_park))
                if abs(park_adj) > 0.0001:
                    total_adj += park_adj
                    source_parts.append("park")
    except Exception:
        log.debug("park adjustment skipped", exc_info=True)

    source_detail = "baseline+" + "+".join(source_parts) if source_parts else "baseline"
    return total_adj, source_detail
327
+
328
+
329
  def get_player_hr_prob(
330
  player_name_normalized: str,
331
  statcast_df: pd.DataFrame,
332
  _name_index: dict[str, str] | None = None,
333
  ) -> tuple[float | None, str]:
334
  """
335
+ Returns (prob, source) for a pre-game HR probability (baseline only).
336
 
337
  source values:
338
  "internal_model_baseline" — compute_batter_baseline() with statcast features
 
361
  statcast_df: pd.DataFrame,
362
  prob_fn: Callable[[str, pd.DataFrame, dict[str, str] | None], tuple[float | None, str]] | None = None,
363
  pitcher_stats_df: pd.DataFrame | None = None,
364
+ pitcher_statcast_df: pd.DataFrame | None = None,
365
+ probable_starters: dict | None = None,
366
  ) -> pd.DataFrame:
367
  """
368
  Join HR prop rows to model HR probabilities and compute edge.
369
 
370
  Adds columns:
371
+ implied_prob — book implied probability (vig-inclusive)
372
+ model_hr_prob — pre-game model HR probability (or None)
373
+ model_hr_prob_source — source label ("internal_model_baseline" or "unavailable")
374
+ model_hr_prob_source_detail — "+"-joined list of model components applied (e.g. "baseline+pitcher_quality+park")
375
+ edge — model_hr_prob - implied_prob (or None)
376
 
377
  Filters to market == "hr".
378
  Sorts by edge descending (rows with no edge/model prob sort last).
379
+
380
+ pitcher_statcast_df: pitcher-perspective statcast (player_name = pitcher).
381
+ probable_starters: {(away_team_norm, home_team_norm): {home_pitcher, away_pitcher}}.
382
  """
383
  if props_df.empty:
384
  return pd.DataFrame()
 
389
  if hr_df.empty:
390
  return pd.DataFrame()
391
 
 
392
  name_index = _build_statcast_name_index(statcast_df)
393
 
394
+ _pitcher_df = pitcher_statcast_df if pitcher_statcast_df is not None else (
395
+ pitcher_stats_df if pitcher_stats_df is not None else pd.DataFrame()
396
+ )
397
+ _probable_starters = probable_starters or {}
398
 
399
  implied_probs: list[float] = []
400
  model_probs: list[float | None] = []
401
  sources: list[str] = []
402
  edges: list[float | None] = []
 
 
 
403
  source_details: list[str] = []
404
 
405
  for _, row in hr_df.iterrows():
406
  odds = row.get("odds_american")
407
  player_name = str(row.get("player_name") or "")
408
 
 
409
  try:
410
  implied = american_to_implied_prob(odds) if odds is not None else None
411
  except Exception:
412
  implied = None
413
 
 
414
  if player_name:
415
  model_prob, source = _prob_fn(player_name, statcast_df, name_index)
416
  else:
417
  model_prob, source = None, "unavailable"
418
 
419
+ # Apply full pre-game model stack if batter baseline succeeded
420
+ total_adj = 0.0
421
+ src_detail = "baseline"
422
+ if model_prob is not None:
423
+ statcast_name = name_index.get(player_name, "")
424
+ if statcast_name:
425
+ batter_features = build_batter_feature_row(statcast_df, statcast_name)
426
+ try:
427
+ total_adj, src_detail = _get_full_pregame_adjustments(
428
+ row,
429
+ statcast_name,
430
+ batter_features,
431
+ statcast_df,
432
+ _pitcher_df,
433
+ _probable_starters,
434
+ )
435
+ except Exception:
436
+ pass
437
+
438
+ if model_prob is not None:
439
+ model_prob_adj: float | None = max(0.005, min(0.40, model_prob + total_adj))
440
  else:
441
+ model_prob_adj = None
442
 
 
443
  if model_prob_adj is not None and implied is not None:
444
  edge = compute_edge(model_prob_adj, implied)
445
  else:
 
449
  model_probs.append(model_prob_adj)
450
  sources.append(source)
451
  edges.append(edge)
 
 
 
452
  source_details.append(src_detail)
453
 
454
  hr_df = hr_df.copy()
 
456
  hr_df["model_hr_prob"] = model_probs
457
  hr_df["model_hr_prob_source"] = sources
458
  hr_df["edge"] = edges
 
 
 
459
  hr_df["model_hr_prob_source_detail"] = source_details
460
 
 
461
  has_edge = hr_df["edge"].notna()
462
  with_edge = hr_df[has_edge].sort_values("edge", ascending=False)
463
  without_edge = hr_df[~has_edge]
app.py CHANGED
@@ -91,7 +91,7 @@ from utils.dates import current_wbc_date_str
91
  from data.scores import fetch_scores_for_date
92
  from data.odds import fetch_featured_odds
93
  from data.schedule import fetch_schedule_for_date
94
- from data.statcast import fetch_statcast_range, normalize_statcast
95
  from data.weather import fetch_weather_for_venue
96
  from database.db import (
97
  get_connection,
@@ -570,6 +570,27 @@ def load_statcast_previous_season_full() -> pd.DataFrame:
570
  enriched = add_pitch_features(normalized)
571
  return enriched
572
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
573
  @st.cache_data(ttl=STATCAST_TTL_SECONDS)
574
  def load_statcast_recent() -> pd.DataFrame:
575
  end_date_str = current_dashboard_date_str()
@@ -3406,7 +3427,16 @@ def main() -> None:
3406
  if page == "Dashboard":
3407
  render_dashboard()
3408
  elif page == "Props":
3409
- render_props(load_statcast_recent(), conn=conn, raw_props=load_upcoming_hr_props())
 
 
 
 
 
 
 
 
 
3410
  elif page == "Card Lab":
3411
  render_card_lab(conn=conn)
3412
  elif page == "Betting":
 
91
  from data.scores import fetch_scores_for_date
92
  from data.odds import fetch_featured_odds
93
  from data.schedule import fetch_schedule_for_date
94
+ from data.statcast import fetch_statcast_range, fetch_statcast_range_pitcher, normalize_statcast
95
  from data.weather import fetch_weather_for_venue
96
  from database.db import (
97
  get_connection,
 
570
  enriched = add_pitch_features(normalized)
571
  return enriched
572
 
573
+
574
@st.cache_data(ttl=60 * 60 * 12, show_spinner=False)
def load_statcast_previous_season_full_pitcher() -> pd.DataFrame:
    """
    Load the previous calendar year's pitcher-perspective statcast data.

    "Previous" is relative to today's UTC date, so the season rolls over
    automatically each January. The raw pull covers Jan 1 – Dec 31 of that
    year and is passed through ``normalize_statcast`` and
    ``add_pitch_features``. In this frame ``player_name`` is the pitcher's name.

    The result is cached by Streamlit for 12 hours.
    """
    previous_year = pd.Timestamp.now(tz="UTC").year - 1
    start_date = pd.Timestamp(year=previous_year, month=1, day=1).date()
    end_date = pd.Timestamp(year=previous_year, month=12, day=31).date()

    raw = fetch_statcast_range_pitcher(start_date.isoformat(), end_date.isoformat())
    normalized = normalize_statcast(raw)
    return add_pitch_features(normalized)
585
+
586
+
587
@st.cache_data(ttl=60 * 60 * 1, show_spinner=False)
def load_probable_starters() -> dict:
    """
    Return probable starting pitchers for upcoming games (MLB Stats API).

    Thin cached wrapper around
    ``data.mlb_starters.fetch_probable_starters_for_props``; the result is
    cached by Streamlit for one hour. The import is deferred to call time.
    """
    from data import mlb_starters

    starters = mlb_starters.fetch_probable_starters_for_props()
    return starters
592
+
593
+
594
  @st.cache_data(ttl=STATCAST_TTL_SECONDS)
595
  def load_statcast_recent() -> pd.DataFrame:
596
  end_date_str = current_dashboard_date_str()
 
3427
  if page == "Dashboard":
3428
  render_dashboard()
3429
  elif page == "Props":
3430
+ _statcast_for_props = load_statcast_recent()
3431
+ if _statcast_for_props.empty:
3432
+ _statcast_for_props = load_statcast_previous_season_full()
3433
+ render_props(
3434
+ _statcast_for_props,
3435
+ conn=conn,
3436
+ raw_props=load_upcoming_hr_props(),
3437
+ pitcher_statcast_df=load_statcast_previous_season_full_pitcher(),
3438
+ probable_starters=load_probable_starters(),
3439
+ )
3440
  elif page == "Card Lab":
3441
  render_card_lab(conn=conn)
3442
  elif page == "Betting":
data/live_prop_odds.py CHANGED
@@ -79,6 +79,7 @@ def fetch_all_upcoming_hr_props(
79
  providers.append(TheOddsAPIProvider())
80
  providers.append(ScrapeFallbackProvider()) # fallback if Odds API returns empty
81
 
 
82
  for provider in providers:
83
  try:
84
  fetch_fn = getattr(provider, "fetch_all_upcoming_hr_props", None)
@@ -86,12 +87,38 @@ def fetch_all_upcoming_hr_props(
86
  continue
87
  df = fetch_fn(sportsbooks=sportsbooks)
88
  if not df.empty:
89
- return normalize_prop_odds(df) # stop at first provider that returns data
90
  except Exception as e:
91
  logger.warning(f"[odds_provider_fetch] failure: {e}", exc_info=True)
92
  continue
93
 
94
- return pd.DataFrame()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
 
97
  def fetch_live_prop_odds(
 
79
  providers.append(TheOddsAPIProvider())
80
  providers.append(ScrapeFallbackProvider()) # fallback if Odds API returns empty
81
 
82
+ frames = []
83
  for provider in providers:
84
  try:
85
  fetch_fn = getattr(provider, "fetch_all_upcoming_hr_props", None)
 
87
  continue
88
  df = fetch_fn(sportsbooks=sportsbooks)
89
  if not df.empty:
90
+ frames.append(df)
91
  except Exception as e:
92
  logger.warning(f"[odds_provider_fetch] failure: {e}", exc_info=True)
93
  continue
94
 
95
+ if not frames:
96
+ return pd.DataFrame()
97
+
98
+ merged = pd.concat(frames, ignore_index=True)
99
+ merged = normalize_prop_odds(merged)
100
+
101
+ # Dedup: keep one row per (player_name, sportsbook_key, market) — best odds wins
102
+ if not merged.empty and "sportsbook_key" in merged.columns:
103
+ merged["_odds_score"] = merged["odds_american"].apply(
104
+ lambda x: int(x) if pd.notna(x) else -9999
105
+ )
106
+ merged = (
107
+ merged
108
+ .sort_values("_odds_score", ascending=False)
109
+ .drop_duplicates(subset=["player_name", "sportsbook_key", "market"], keep="first")
110
+ .drop(columns=["_odds_score"])
111
+ .reset_index(drop=True)
112
+ )
113
+
114
+ logger.warning(
115
+ "[fetch_all_upcoming_hr_props] providers=%d frames=%d merged_rows=%d unique_books=%s",
116
+ len(providers),
117
+ len(frames),
118
+ len(merged),
119
+ sorted(merged["sportsbook"].dropna().unique().tolist()) if not merged.empty else [],
120
+ )
121
+ return merged
122
 
123
 
124
  def fetch_live_prop_odds(
data/mlb_starters.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ data/mlb_starters.py
3
+
4
+ Fetches probable starting pitchers for upcoming MLB games from the public
5
+ MLB Stats API. Used by the Props page to enrich HR props with matchup context.
6
+
7
+ Returns a dict keyed by (away_team, home_team) canonical names → pitcher names.
8
+ Both teams in the key are normalized to lowercase stripped strings for fuzzy matching.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ import re
14
+ import unicodedata
15
+ from datetime import timedelta
16
+ from typing import Any
17
+
18
+ import pandas as pd
19
+ import requests
20
+
21
+ _log = logging.getLogger(__name__)
22
+
23
+ _SCHEDULE_URL = "https://statsapi.mlb.com/api/v1/schedule"
24
+
25
+
26
+ def _normalize_team(name: str) -> str:
27
+ text = str(name or "").strip().lower()
28
+ text = unicodedata.normalize("NFKD", text)
29
+ text = "".join(ch for ch in text if not unicodedata.combining(ch))
30
+ text = re.sub(r"[^a-z0-9 ]", "", text)
31
+ text = re.sub(r"\s+", " ", text).strip()
32
+ return text
33
+
34
+
35
def fetch_probable_starters_for_props() -> dict[tuple[str, str], dict[str, str | None]]:
    """
    Fetch probable starters for all MLB games in the next 7 days.

    Returns:
        {
            (away_team_normalized, home_team_normalized): {
                "home_pitcher": "Luis Castillo" | None,
                "away_pitcher": "Cole Irvin" | None,
                "away_team_raw": "Seattle Mariners",
                "home_team_raw": "Oakland Athletics",
            }
        }

    Keys are lowercased/normalized for fuzzy matching against props row team names.

    The key carries no date, so every game of a series (or doubleheader) in the
    window maps to the same key. The first game seen for a pairing is kept; it
    is replaced only when it has no starter at all and a later game for the
    same pairing has at least one. This stops a later game with no announced
    starters from erasing the starters of an earlier game.

    Returns an empty dict when the request fails or the payload is not a JSON
    object; the failure is logged, not raised.
    """
    # Timestamp.now(tz="UTC") is the non-deprecated spelling of Timestamp.utcnow().
    # NOTE(review): the window starts on the UTC calendar date; confirm this is
    # the intended "today" relative to how the schedule API dates its games.
    today = pd.Timestamp.now(tz="UTC").date()
    end_date = today + timedelta(days=7)
    params: dict[str, Any] = {
        "sportId": 1,
        "startDate": today.isoformat(),
        "endDate": end_date.isoformat(),
        "hydrate": "probablePitcher",
        "gameType": "R,F,D,L,W",
    }

    try:
        r = requests.get(_SCHEDULE_URL, params=params, timeout=15)
        r.raise_for_status()
        data = r.json()
    except Exception as exc:
        # Best-effort enrichment: callers get an empty map instead of an error.
        _log.warning("[mlb_starters] schedule fetch failed: %s", exc)
        return {}

    if not isinstance(data, dict):
        _log.warning("[mlb_starters] schedule fetch failed: %s", "unexpected payload type")
        return {}

    result: dict[tuple[str, str], dict[str, str | None]] = {}
    games_total = 0
    games_with_starters = 0

    # Assumes the payload lists dates (and games) chronologically, so the first
    # game seen for a pairing is the earliest one — TODO confirm against the API.
    for date_entry in data.get("dates") or []:
        for game in date_entry.get("games") or []:
            games_total += 1
            # `or {}` (not a .get default) so an explicit JSON null is tolerated.
            teams = game.get("teams") or {}
            away_side = teams.get("away") or {}
            home_side = teams.get("home") or {}

            away_raw = str((away_side.get("team") or {}).get("name") or "")
            home_raw = str((home_side.get("team") or {}).get("name") or "")

            away_pitcher_obj = away_side.get("probablePitcher") or {}
            home_pitcher_obj = home_side.get("probablePitcher") or {}

            away_pitcher = str(away_pitcher_obj.get("fullName") or "").strip() or None
            home_pitcher = str(home_pitcher_obj.get("fullName") or "").strip() or None

            if not away_raw or not home_raw:
                continue

            has_starter = bool(home_pitcher or away_pitcher)
            if has_starter:
                games_with_starters += 1

            key = (_normalize_team(away_raw), _normalize_team(home_raw))
            existing = result.get(key)
            if existing is not None:
                existing_has_starter = bool(
                    existing["home_pitcher"] or existing["away_pitcher"]
                )
                # Keep the earlier game unless it is empty and this one is not.
                if existing_has_starter or not has_starter:
                    continue

            result[key] = {
                "home_pitcher": home_pitcher,
                "away_pitcher": away_pitcher,
                "away_team_raw": away_raw,
                "home_team_raw": home_raw,
            }

    _log.warning(
        "[mlb_starters] games_total=%d games_with_starters=%d",
        games_total,
        games_with_starters,
    )
    return result
107
+
108
+
109
def lookup_pitchers_for_game(
    away_team: str,
    home_team: str,
    starters_map: dict[tuple[str, str], dict[str, str | None]],
) -> dict[str, str | None]:
    """
    Look up probable pitchers for a specific game matchup.

    Both team names are normalized with `_normalize_team`. An exact key match
    is tried first; failing that, the first entry whose away and home key parts
    each contain, or are contained in, the corresponding normalized name is
    returned.

    Returns the matching `starters_map` entry, or
    {"home_pitcher": None, "away_pitcher": None} when nothing matches or when
    either team name normalizes to an empty string.
    """
    no_match: dict[str, str | None] = {"home_pitcher": None, "away_pitcher": None}

    away_norm = _normalize_team(away_team)
    home_norm = _normalize_team(home_team)

    # "" is a substring of every string, so a blank name would otherwise
    # "match" the first game in the map during the partial-match fallback.
    if not away_norm or not home_norm:
        return no_match

    # Exact normalized match
    entry = starters_map.get((away_norm, home_norm))
    if entry:
        return entry

    # Partial match fallback: any key where both normalized parts are substrings
    for (k_away, k_home), v in starters_map.items():
        if not k_away or not k_home:
            # Same empty-substring hazard, from the key side.
            continue
        away_match = away_norm in k_away or k_away in away_norm
        home_match = home_norm in k_home or k_home in home_norm
        if away_match and home_match:
            return v

    return no_match
data/statcast.py CHANGED
@@ -14,7 +14,7 @@ HEADERS = {
14
  }
15
 
16
 
17
- def _query_statcast(start_date: str, end_date: str, season: str) -> pd.DataFrame:
18
  params = {
19
  "all": "true",
20
  "hfPT": "",
@@ -29,7 +29,7 @@ def _query_statcast(start_date: str, end_date: str, season: str) -> pd.DataFrame
29
  "hfC": "",
30
  "hfSea": f"{season}|",
31
  "hfSit": "",
32
- "player_type": "batter",
33
  "hfOuts": "",
34
  "opponent": "",
35
  "pitcher_throws": "",
@@ -73,9 +73,15 @@ def _query_statcast(start_date: str, end_date: str, season: str) -> pd.DataFrame
73
 
74
 
75
  def fetch_statcast_range(start_date: str, end_date: str) -> pd.DataFrame:
76
- """Fetch Statcast data for the given date range (MLB only)."""
77
  season = str(datetime.strptime(start_date, "%Y-%m-%d").year)
78
- return _query_statcast(start_date, end_date, season=season)
 
 
 
 
 
 
79
 
80
 
81
  def normalize_statcast(df: pd.DataFrame) -> pd.DataFrame:
 
14
  }
15
 
16
 
17
+ def _query_statcast(start_date: str, end_date: str, season: str, player_type: str = "batter") -> pd.DataFrame:
18
  params = {
19
  "all": "true",
20
  "hfPT": "",
 
29
  "hfC": "",
30
  "hfSea": f"{season}|",
31
  "hfSit": "",
32
+ "player_type": player_type,
33
  "hfOuts": "",
34
  "opponent": "",
35
  "pitcher_throws": "",
 
73
 
74
 
75
  def fetch_statcast_range(start_date: str, end_date: str) -> pd.DataFrame:
76
+ """Fetch Statcast data for the given date range (MLB only). player_name = batter."""
77
  season = str(datetime.strptime(start_date, "%Y-%m-%d").year)
78
+ return _query_statcast(start_date, end_date, season=season, player_type="batter")
79
+
80
+
81
def fetch_statcast_range_pitcher(start_date: str, end_date: str) -> pd.DataFrame:
    """Fetch pitcher-perspective Statcast for the given date range. player_name = pitcher.

    The season passed to the query is the calendar year of `start_date`,
    which must be formatted as YYYY-MM-DD.
    """
    season_year = datetime.strptime(start_date, "%Y-%m-%d").year
    return _query_statcast(
        start_date,
        end_date,
        season=str(season_year),
        player_type="pitcher",
    )
85
 
86
 
87
  def normalize_statcast(df: pd.DataFrame) -> pd.DataFrame:
visualization/props_page.py CHANGED
@@ -52,7 +52,13 @@ def _format_edge(val: float | None) -> str:
52
  return f"{val * 100:+.1f}%"
53
 
54
 
55
- def render_props(statcast_df: pd.DataFrame, conn=None, raw_props: pd.DataFrame | None = None) -> None:
 
 
 
 
 
 
56
  st.subheader("Props")
57
 
58
  # Use pre-fetched (cached) props when available.
@@ -104,7 +110,12 @@ def render_props(statcast_df: pd.DataFrame, conn=None, raw_props: pd.DataFrame |
104
  # Model mapping (HR only) + DB logging
105
  # ---------------------------------------------------------------------------
106
  if market_type == "hr":
107
- mapped = map_hr_props_to_model(filtered_raw, statcast_df)
 
 
 
 
 
108
  if mapped.empty:
109
  st.info("No mappable HR prop rows.")
110
  return
 
52
  return f"{val * 100:+.1f}%"
53
 
54
 
55
+ def render_props(
56
+ statcast_df: pd.DataFrame,
57
+ conn=None,
58
+ raw_props: pd.DataFrame | None = None,
59
+ pitcher_statcast_df: pd.DataFrame | None = None,
60
+ probable_starters: dict | None = None,
61
+ ) -> None:
62
  st.subheader("Props")
63
 
64
  # Use pre-fetched (cached) props when available.
 
110
  # Model mapping (HR only) + DB logging
111
  # ---------------------------------------------------------------------------
112
  if market_type == "hr":
113
+ mapped = map_hr_props_to_model(
114
+ filtered_raw,
115
+ statcast_df,
116
+ pitcher_statcast_df=pitcher_statcast_df,
117
+ probable_starters=probable_starters,
118
+ )
119
  if mapped.empty:
120
  st.info("No mappable HR prop rows.")
121
  return