# NOTE: the lines below are repository file-viewer residue, preserved as comments.
# 2026_MLB_Model / models /strikeout_probability_engine_v2.py
# Syntrex's picture
# Optimize props load path and reuse modeled state
# 2885bcc
# raw
# history blame
# 22.2 kB
from __future__ import annotations

import logging
import math
from typing import Any

import pandas as pd

from models.opportunity_model import build_projected_strikeout_opportunity
from models.pitcher_adjustment import build_pitcher_feature_row
from models.shared_matchup_engine import compose_shared_matchup_context
def _safe_float(value: Any, default: float | None = None) -> float | None:
    """Coerce *value* to ``float``, falling back to *default*.

    ``None``, and anything whose stripped, lower-cased string form is empty,
    ``"nan"`` or ``"none"``, yields *default*. So does any value that raises
    while being stringified or converted.
    """
    if value is None:
        return default
    try:
        normalized = str(value).strip().lower()
        if normalized in {"", "nan", "none"}:
            return default
        return float(value)
    except Exception:
        # Best-effort coercion: any conversion failure maps to the default.
        return default
def _clamp(value: float, lo: float, hi: float) -> float:
    """Bound *value* to the interval [*lo*, *hi*]."""
    # Apply the upper bound first, then the lower bound, so that ties and
    # unordered values (NaN) resolve the same way as max(lo, min(hi, value)).
    capped = value if value < hi else hi
    return capped if capped > lo else lo
def _reliability(sample_size: Any, k: float = 120.0) -> float:
    """Return the shrinkage weight ``n / (n + k)`` for a sample of size *n*.

    *sample_size* is coerced with ``_safe_float`` (unparseable values count as
    zero) and floored at 0. *k* is floored at 1 so the denominator is never
    zero. The result is bounded to [0, 1].
    """
    parsed = _safe_float(sample_size, 0.0) or 0.0
    observed = max(0.0, float(parsed))
    prior_weight = max(1.0, k)
    weight = observed / (observed + prior_weight)
    return _clamp(weight, 0.0, 1.0)
def _confidence_component(label: str, value: float, direction: str) -> dict[str, Any]:
    """Package one confidence adjustment as a ``label``/``value``/``direction`` dict.

    *value* is converted to ``float`` and rounded to one decimal place.
    """
    rounded_value = round(float(value), 1)
    return dict(label=label, value=rounded_value, direction=direction)
def _poisson_prob_over(expected_value: float, line: float) -> float:
    """Return P(X > floor(line)) for X ~ Poisson(expected_value).

    A non-positive *expected_value* returns 0.0. The result is bounded to
    [0, 1].
    """
    if expected_value <= 0:
        return 0.0

    highest_losing_count = int(math.floor(line))
    decay = math.exp(-expected_value)

    # Accumulate the Poisson CDF up to and including floor(line).
    cdf = 0.0
    for count in range(highest_losing_count + 1):
        cdf += decay * (expected_value ** count) / math.factorial(count)

    return max(0.0, min(1.0, 1.0 - cdf))
def _poisson_prob_under(expected_value: float, line: float) -> float:
    """Return P(X < line) for X ~ Poisson(expected_value).

    An under only wins when the count finishes strictly below the line. For a
    half-point line (e.g. 5.5) this is P(X <= 5). For a whole-number line
    (e.g. 6.0) it is also P(X <= 5): landing exactly on the line is a push,
    not an under win, so it is excluded.

    A non-positive *expected_value* returns 1.0, the complement of the 0.0
    that ``_poisson_prob_over`` returns for the same input. The result is
    bounded to [0, 1].
    """
    if expected_value <= 0:
        return 1.0

    # Largest count that still finishes strictly below the line.
    highest_winning_count = math.ceil(line) - 1
    if highest_winning_count < 0:
        # No non-negative count is below a line of zero or less.
        return 0.0

    decay = math.exp(-expected_value)
    cdf = 0.0
    for count in range(highest_winning_count + 1):
        cdf += decay * (expected_value ** count) / math.factorial(count)

    return max(0.0, min(1.0, cdf))
def _calibrate(probability: float) -> float:
    """Shrink *probability* toward 0.50 by a factor of 0.92, bounded to [0.02, 0.98]."""
    offset_from_even = probability - 0.50
    shrunk = 0.50 + (offset_from_even * 0.92)
    return max(0.02, min(0.98, shrunk))
def _empty_strikeout_result() -> dict[str, Any]:
    """Return the result payload with every modelled field at its unset default."""
    return {
        "formula_version": "strikeout_v2_live",
        "raw_k_prob": None,
        "calibrated_k_prob": None,
        "fair_prob": None,
        "expected_strikeouts": None,
        "raw_k_prob_v2": None,
        "calibrated_k_prob_v2": None,
        "fair_prob_v2": None,
        "expected_strikeouts_v2": None,
        "projected_pitch_count": None,
        "projected_batters_faced": None,
        "projected_innings": None,
        "projected_k_rate": None,
        "pitches_per_bf": None,
        "opportunity_confidence": None,
        "opportunity_reasons": [],
        "k_rate_pitch_signal": None,
        "k_rate_anchor": None,
        "bb_rate_anchor": None,
        "command_efficiency_signal": None,
        "pitcher_swstr_rate": None,
        "pitcher_csw_rate": None,
        "pitcher_ball_rate": None,
        "swing_miss_subscore": None,
        "called_strike_subscore": None,
        "command_efficiency_subscore": None,
        "lineup_whiff_subscore": None,
        "zone_matchup_subscore": None,
        "family_zone_matchup_subscore": None,
        "arsenal_fit_subscore": None,
        "tunneling_subscore": None,
        "release_consistency_subscore": None,
        "sequencing_subscore": None,
        "count_leverage_subscore": None,
        "leash_risk_subscore": None,
        "role_certainty_score": None,
        "times_through_order_penalty": None,
        "telemetry_path_status": "baseline_only",
        "model_tier": "baseline_only_degraded",
        "variance_band_low": None,
        "variance_band_high": None,
        "matchup_coverage_confidence": None,
        "component_source_map": {},
        "predicted_whiff_regions": [],
        "predicted_attack_regions": [],
        "predicted_damage_regions": [],
        "tunnel_pair_scores": [],
        "applied_layers": "",
        "skipped_layers": "",
        "confidence_score": None,
        "confidence_score_raw": None,
        "confidence_score_display": None,
        "confidence_source": "strikeout_v2_live",
        "confidence_bucket": None,
        "confidence_reasons": [],
        "confidence_component_bonuses": [],
        "confidence_component_penalties": [],
        "confidence_primary_driver": None,
        "confidence_summary_label": None,
        "reason_tags_for": [],
        "reason_tags_against": [],
        "applied_layers_v2": "",
        "skipped_layers_v2": "",
        "confidence_score_v2": None,
        "confidence_score_raw_v2": None,
        "confidence_score_display_v2": None,
        "confidence_source_v2": "strikeout_v2_live",
        "confidence_bucket_v2": None,
        "confidence_reasons_v2": [],
        "confidence_component_bonuses_v2": [],
        "confidence_component_penalties_v2": [],
        "confidence_primary_driver_v2": None,
        "confidence_summary_label_v2": None,
    }


def _collect_lineup_matchups(
    pitcher_statcast_df: pd.DataFrame,
    pitcher_name: str,
    pitcher_row: Any,
    batter_statcast_df: pd.DataFrame | None,
    opponent_batters: list[str] | None,
    opponent_team: str | None,
    game_row: dict[str, Any] | None,
    runtime_cache: dict[str, Any] | None,
) -> list[dict[str, Any]]:
    """Build one shared matchup context per opposing batter, reusing cached rows."""
    if batter_statcast_df is None or batter_statcast_df.empty or not opponent_batters:
        return []

    # Only the first nine listed batters are modelled.
    lineup = opponent_batters[:9]

    cache_bucket = None
    cache_key = None
    if runtime_cache is not None:
        cache_bucket = runtime_cache.setdefault("strikeout_lineup_matchups", {})
        game = game_row or {}
        # NOTE(review): the key uses id() of both frames, which presumably
        # assumes the frames stay alive for as long as runtime_cache does;
        # a recycled id would produce a stale hit - confirm with callers.
        cache_key = (
            id(batter_statcast_df),
            id(pitcher_statcast_df),
            str(pitcher_name or "").strip().lower(),
            tuple(str(name or "").strip().lower() for name in lineup),
            str(game.get("away_team") or "").strip().lower(),
            str(game.get("home_team") or "").strip().lower(),
            str(game.get("projected_starter_match_status") or "").strip().lower(),
            str(opponent_team or "").strip().lower(),
        )
        if cache_key in cache_bucket:
            # Hand back a copy so callers cannot mutate the cached list.
            return list(cache_bucket[cache_key])

    rows: list[dict[str, Any]] = []
    for batter_name in lineup:
        try:
            rows.append(
                compose_shared_matchup_context(
                    batter_name=batter_name,
                    pitcher_name=pitcher_name,
                    batter_statcast_df=batter_statcast_df,
                    pitcher_statcast_df=pitcher_statcast_df,
                    pitcher_row=pitcher_row,
                    game_row=game_row,
                    # NOTE(review): every batter is passed as left-handed;
                    # looks like a placeholder - confirm against the
                    # shared matchup engine.
                    batter_features={"batter_stand": "L"},
                    runtime_cache=runtime_cache,
                )
            )
        except Exception:
            # Best effort: one failing batter must not sink the whole lineup,
            # but leave a trace so silent degradation can be diagnosed.
            logging.getLogger(__name__).debug(
                "Skipping matchup context for batter %r vs pitcher %r",
                batter_name,
                pitcher_name,
                exc_info=True,
            )

    if cache_bucket is not None:
        cache_bucket[cache_key] = list(rows)
    return rows


def _average_nested(
    rows: list[dict[str, Any]],
    path: tuple[str, ...],
    default: float = 0.0,
) -> float:
    """Average the numeric value found at the nested key *path* across *rows*."""
    values: list[float] = []
    for row in rows:
        current: Any = row
        for key in path:
            if not isinstance(current, dict):
                # A broken path contributes the default to the average.
                current = default
                break
            current = current.get(key)
        # A missing leaf (None) is skipped rather than averaged in.
        if current is not None:
            values.append(float(_safe_float(current, default) or default))
    return sum(values) / len(values) if values else default


def _summarize_lineup_matchup(matchup_rows: list[dict[str, Any]]) -> dict[str, Any]:
    """Collapse per-batter matchup rows into a single lineup-level matchup dict."""
    if not matchup_rows:
        return {
            "predicted_whiff_regions": [],
            "predicted_attack_regions": [],
            "predicted_damage_regions": [],
            "tunnel_pair_scores": [],
            "matchup_coverage_confidence": 0.0,
            "component_source_map": {},
            "zone_matchup": {},
            "family_zone_matchup": {},
            "arsenal_matchup": {},
            "trajectory": {},
            "count_context_profile": {},
        }

    # Region, tunnel, trajectory and count-profile fields are taken from the
    # first row only; the numeric risk scores are averaged over all rows.
    first_row = matchup_rows[0]
    return {
        "predicted_whiff_regions": first_row.get("predicted_whiff_regions") or [],
        "predicted_attack_regions": first_row.get("predicted_attack_regions") or [],
        "predicted_damage_regions": first_row.get("predicted_damage_regions") or [],
        "tunnel_pair_scores": first_row.get("tunnel_pair_scores") or [],
        "matchup_coverage_confidence": _average_nested(
            matchup_rows, ("matchup_coverage_confidence",), 0.0
        ),
        "component_source_map": first_row.get("component_source_map") or {},
        "zone_matchup": {
            "hit_zone_boost": _average_nested(
                matchup_rows, ("zone_matchup", "hit_zone_boost"), 0.0
            )
        },
        "family_zone_matchup": {
            "family_zone_whiff_risk": _average_nested(
                matchup_rows, ("family_zone_matchup", "family_zone_whiff_risk"), 0.0
            )
        },
        "arsenal_matchup": {
            "arsenal_whiff_risk": _average_nested(
                matchup_rows, ("arsenal_matchup", "arsenal_whiff_risk"), 0.0
            )
        },
        "trajectory": first_row.get("trajectory") or {},
        "count_context_profile": first_row.get("count_context_profile") or {},
    }


def _score_confidence(
    sample_size: int,
    coverage_confidence: float,
    lineup_size: int,
    leash_risk_score: float,
    role_certainty_score: float,
) -> tuple[float, list[str], list[dict[str, Any]], list[dict[str, Any]]]:
    """Score model confidence and return (score, reasons, bonuses, penalties)."""
    reasons: list[str] = []
    bonuses: list[dict[str, Any]] = []
    penalties: list[dict[str, Any]] = []

    def bonus(label: str, points: float) -> float:
        bonuses.append(_confidence_component(label, points, "bonus"))
        return points

    def penalty(label: str, points: float) -> float:
        # Every penalty is also surfaced as a human-readable reason.
        reasons.append(label)
        penalties.append(_confidence_component(label, points, "penalty"))
        return -points

    confidence = 56.0

    if sample_size >= 400:
        confidence += bonus("Strong pitcher sample", 10.0)
    elif sample_size < 150:
        confidence += penalty("Limited pitcher pitch sample", 10.0)

    if coverage_confidence >= 0.45:
        confidence += bonus("Strong telemetry and zone coverage", 8.0)
    else:
        confidence += penalty("Thin telemetry and zone-coverage sample", 6.0)

    if lineup_size >= 7:
        confidence += bonus("Projected lineup mostly complete", 5.0)
    else:
        confidence += penalty("Projected opponent lineup is incomplete", 5.0)

    if leash_risk_score >= 0.55:
        confidence += penalty("Pitch-count and leash risk remain elevated", 7.0)

    if role_certainty_score >= 0.92:
        confidence += bonus("Public projected starter confirmed", 4.0)
    elif role_certainty_score <= 0.65:
        confidence += penalty("Starter role and leash certainty remain soft", 6.0)

    return _clamp(confidence, 1.0, 100.0), reasons, bonuses, penalties


def build_strikeout_probability_result_v2(
    pitcher_statcast_df: pd.DataFrame,
    pitcher_name: str,
    batter_statcast_df: pd.DataFrame | None = None,
    opponent_batters: list[str] | None = None,
    opponent_team: str | None = None,
    line: float | None = None,
    selection_side: str | None = None,
    game_row: dict[str, Any] | None = None,
    runtime_cache: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Estimate the probability that a pitcher goes over or under a strikeout line.

    The function projects a per-batter strikeout rate from pitcher features
    and lineup matchup signals, multiplies it by the projected batters faced,
    and converts the expected strikeout count into an over/under probability
    with a Poisson model. It also scores how much confidence the inputs
    support.

    Args:
        pitcher_statcast_df: Pitcher data frame, forwarded to
            ``build_pitcher_feature_row`` and the matchup engine.
        pitcher_name: Pitcher to model.
        batter_statcast_df: Optional batter data frame. Without it (or when it
            is empty) no lineup matchup rows are built.
        opponent_batters: Opposing lineup names; at most the first nine are
            used for matchup rows.
        opponent_team: Opponent label; only used in the matchup cache key.
        line: Strikeout line. Must parse to a finite number.
        selection_side: ``"over"`` or ``"under"``; surrounding whitespace and
            letter case are ignored.
        game_row: Optional game context forwarded to the opportunity model
            and the matchup engine.
        runtime_cache: Optional dict. Lineup matchup rows are stored in it
            under ``"strikeout_lineup_matchups"`` and it is forwarded to the
            matchup engine.

    Returns:
        A dict carrying every key of the default payload. Each modelled value
        is written under both its legacy key and its ``_v2`` mirror. When the
        pitcher data, pitcher name, line or side is missing or invalid, only
        ``skipped_layers`` and ``skipped_layers_v2`` are set (to
        ``"missing_pitcher_or_line"``) and all modelled fields keep their
        defaults.
    """
    result = _empty_strikeout_result()

    def skipped() -> dict[str, Any]:
        # Keep the legacy key and its v2 mirror in agreement, exactly as the
        # success path does.
        result["skipped_layers"] = "missing_pitcher_or_line"
        result["skipped_layers_v2"] = "missing_pitcher_or_line"
        return result

    side = str(selection_side or "").strip().lower()
    if (
        pitcher_statcast_df is None
        or pitcher_statcast_df.empty
        or not pitcher_name
        or line is None
        or side not in {"over", "under"}
    ):
        return skipped()

    # A line that is not a finite number cannot be priced; treat it as missing
    # instead of failing later inside the Poisson helpers.
    line_value = _safe_float(line)
    if line_value is None or not math.isfinite(line_value):
        return skipped()

    # --- Pitcher features ---------------------------------------------------
    pitcher_row = build_pitcher_feature_row(pitcher_statcast_df, pitcher_name)
    # Tolerate NaN or otherwise unparseable sample sizes (int(nan) raises).
    sample_value = _safe_float(pitcher_row.get("sample_size"), 0.0) or 0.0
    sample_size = int(sample_value) if math.isfinite(sample_value) else 0
    reliability = _reliability(sample_size, k=180.0)

    swstr = _safe_float(pitcher_row.get("swstr_rate"))
    csw = _safe_float(pitcher_row.get("csw_rate"))
    ball = _safe_float(pitcher_row.get("ball_rate"))

    strike_anchor = None
    walk_anchor = None
    if swstr is not None:
        strike_anchor = _clamp(0.12 + ((swstr - 0.11) * 1.15), 0.12, 0.42)
    if ball is not None:
        walk_anchor = _clamp(0.05 + ((ball - 0.36) * 0.75), 0.02, 0.14)

    # --- Lineup matchup signals ---------------------------------------------
    matchup_rows = _collect_lineup_matchups(
        pitcher_statcast_df=pitcher_statcast_df,
        pitcher_name=pitcher_name,
        pitcher_row=pitcher_row,
        batter_statcast_df=batter_statcast_df,
        opponent_batters=opponent_batters,
        opponent_team=opponent_team,
        game_row=game_row,
        runtime_cache=runtime_cache,
    )
    matchup = _summarize_lineup_matchup(matchup_rows)
    coverage_confidence = float(matchup.get("matchup_coverage_confidence") or 0.0)

    if not matchup_rows:
        tier = "core_baseline_plus_projected_pitcher"
    elif coverage_confidence < 0.45:
        tier = "partial_telemetry"
    else:
        tier = "full_telemetry"
    telemetry_path_status = tier
    model_tier = tier

    zone_matchup_subscore = (
        _safe_float((matchup.get("zone_matchup") or {}).get("hit_zone_boost"), 0.0) or 0.0
    )
    family_zone_matchup_subscore = (
        _safe_float(
            (matchup.get("family_zone_matchup") or {}).get("family_zone_whiff_risk"), 0.0
        )
        or 0.0
    )
    arsenal_fit_subscore = (
        _safe_float((matchup.get("arsenal_matchup") or {}).get("arsenal_whiff_risk"), 0.0)
        or 0.0
    )

    trajectory = matchup.get("trajectory") or {}
    tunneling_subscore = _safe_float(trajectory.get("tunnel_score"), 0.5) or 0.5
    release_consistency_subscore = (
        _safe_float(trajectory.get("release_consistency_score"), 0.5) or 0.5
    )

    # Count-profile keys ending in "-2" raise the leverage score; presumably
    # these are two-strike counts such as "0-2" - TODO confirm key format.
    sequencing_profiles = matchup.get("count_context_profile") or {}
    has_putaway_state = any(str(key).endswith("-2") for key in sequencing_profiles)
    count_leverage_subscore = 0.58 if has_putaway_state else 0.50
    sequencing_subscore = _clamp(0.5 + ((count_leverage_subscore - 0.5) * 0.6), 0.0, 1.0)

    # Missing (or zero) rates fall back to fixed baseline values.
    swing_miss_subscore = _clamp((swstr or 0.11) / 0.18, 0.0, 1.0)
    called_strike_subscore = _clamp((csw or 0.28) / 0.36, 0.0, 1.0)
    command_efficiency_signal = _clamp(1.0 - ((ball or 0.36) - 0.30) / 0.12, 0.0, 1.0)
    command_efficiency_subscore = command_efficiency_signal

    if family_zone_matchup_subscore or arsenal_fit_subscore:
        lineup_whiff_raw = (
            family_zone_matchup_subscore * 0.55 + arsenal_fit_subscore * 0.45
        ) / 0.35
    else:
        # No lineup whiff evidence at all: sit at the neutral midpoint.
        lineup_whiff_raw = 0.5
    lineup_whiff_subscore = _clamp(lineup_whiff_raw, 0.0, 1.0)

    # --- Opportunity (workload) projection ----------------------------------
    opportunity = build_projected_strikeout_opportunity(
        pitcher_row=pitcher_row,
        opponent_batters=opponent_batters,
        projected_starter_available=bool((game_row or {}).get("projected_starter_available")),
        projected_starter_match_status=str(
            (game_row or {}).get("projected_starter_match_status") or ""
        ),
        game_row=game_row,
    )
    pitch_count = float(opportunity.get("projected_pitch_count") or 88.0)
    projected_batters_faced = float(opportunity.get("projected_batters_faced") or 22.5)
    projected_innings = float(opportunity.get("projected_innings") or 5.2)
    pitches_per_bf = float(opportunity.get("pitches_per_bf") or 3.85)
    times_through_order_penalty = float(opportunity.get("times_through_order_penalty") or 0.0)
    leash_risk_score = float(opportunity.get("leash_risk_score") or 0.0)
    role_certainty_score = float(opportunity.get("role_certainty_score") or 0.7)
    opportunity_confidence = float(opportunity.get("opportunity_confidence") or 0.0)
    opportunity_reasons = list(opportunity.get("opportunity_reasons") or [])

    # --- Strikeout rate and probability -------------------------------------
    pitch_signal = (
        swing_miss_subscore * 0.34
        + called_strike_subscore * 0.18
        + command_efficiency_subscore * 0.14
        + lineup_whiff_subscore * 0.10
        + _clamp(zone_matchup_subscore / 0.40, 0.0, 1.0) * 0.08
        + _clamp(family_zone_matchup_subscore / 0.40, 0.0, 1.0) * 0.06
        + _clamp(arsenal_fit_subscore / 0.35, 0.0, 1.0) * 0.05
        + tunneling_subscore * 0.03
        + release_consistency_subscore * 0.01
        + sequencing_subscore * 0.01
    )
    k_rate_pitch_signal = _clamp(0.12 + (pitch_signal * 0.26), 0.14, 0.42)
    k_rate_anchor = strike_anchor if strike_anchor is not None else k_rate_pitch_signal

    # Blend the signal-based rate with the swinging-strike anchor; the larger
    # the pitch sample, the more weight the anchor receives.
    projected_k_rate = (
        k_rate_pitch_signal * (1.0 - reliability) + k_rate_anchor * reliability
    )
    projected_k_rate = _clamp(projected_k_rate - times_through_order_penalty, 0.12, 0.40)
    expected_strikeouts = _clamp(projected_batters_faced * projected_k_rate, 1.5, 11.5)

    if side == "over":
        raw_prob = _poisson_prob_over(expected_strikeouts, line_value)
    else:
        raw_prob = _poisson_prob_under(expected_strikeouts, line_value)
    calibrated_prob = _calibrate(raw_prob)

    variance = _clamp(
        0.35 + leash_risk_score * 0.9 + (1.0 - reliability) * 0.8, 0.35, 1.8
    )

    # --- Confidence and reason tags -----------------------------------------
    lineup_size = len(opponent_batters) if opponent_batters else 0
    confidence_raw, reasons, bonuses, penalties = _score_confidence(
        sample_size=sample_size,
        coverage_confidence=matchup.get("matchup_coverage_confidence", 0.0),
        lineup_size=lineup_size,
        leash_risk_score=leash_risk_score,
        role_certainty_score=role_certainty_score,
    )

    reason_tags_for: list[str] = []
    reason_tags_against: list[str] = []
    if swing_miss_subscore >= 0.62:
        reason_tags_for.append("Misses bats consistently")
    if called_strike_subscore >= 0.72:
        reason_tags_for.append("Strong called plus whiff strike mix")
    if projected_k_rate >= 0.27:
        reason_tags_for.append("Projected strikeout rate supports the line")
    if projected_batters_faced >= 24.0:
        reason_tags_for.append("Projected workload supports deep strikeout opportunity")
    if leash_risk_score >= 0.48:
        reason_tags_against.append("Pitch-count and leash risk limit the strikeout path")
    if projected_batters_faced <= 21.5:
        reason_tags_against.append("Projected batters faced are lighter than ideal")
    if lineup_size < 7:
        reason_tags_against.append("Projected opponent lineup is incomplete")

    bucket = "high" if confidence_raw >= 75 else "medium" if confidence_raw >= 55 else "low"

    def component_value(item: dict[str, Any]) -> float:
        return float(item.get("value") or 0.0)

    primary_penalty = max(
        [item for item in penalties if component_value(item) > 0.0],
        key=component_value,
        default=None,
    )
    primary_bonus = max(
        [item for item in bonuses if component_value(item) > 0.0],
        key=component_value,
        default=None,
    )
    # The largest penalty, when there is one, outranks any bonus as the headline.
    primary_driver = primary_penalty or primary_bonus
    summary_label = str((primary_driver or {}).get("label") or "").strip() or None

    result.update(
        {
            "raw_k_prob": raw_prob,
            "calibrated_k_prob": calibrated_prob,
            "fair_prob": calibrated_prob,
            "expected_strikeouts": expected_strikeouts,
            "raw_k_prob_v2": raw_prob,
            "calibrated_k_prob_v2": calibrated_prob,
            "fair_prob_v2": calibrated_prob,
            "expected_strikeouts_v2": expected_strikeouts,
            "projected_pitch_count": round(pitch_count, 2),
            "projected_batters_faced": round(projected_batters_faced, 2),
            "projected_innings": round(projected_innings, 2),
            "projected_k_rate": round(projected_k_rate, 4),
            "pitches_per_bf": round(pitches_per_bf, 3),
            "opportunity_confidence": round(opportunity_confidence, 4),
            "opportunity_reasons": opportunity_reasons,
            "k_rate_pitch_signal": round(k_rate_pitch_signal, 4),
            "k_rate_anchor": round(k_rate_anchor, 4),
            "bb_rate_anchor": round(walk_anchor, 4) if walk_anchor is not None else None,
            "command_efficiency_signal": round(command_efficiency_signal, 4),
            "pitcher_swstr_rate": round(swstr, 4) if swstr is not None else None,
            "pitcher_csw_rate": round(csw, 4) if csw is not None else None,
            "pitcher_ball_rate": round(ball, 4) if ball is not None else None,
            "swing_miss_subscore": round(swing_miss_subscore, 4),
            "called_strike_subscore": round(called_strike_subscore, 4),
            "command_efficiency_subscore": round(command_efficiency_subscore, 4),
            "lineup_whiff_subscore": round(lineup_whiff_subscore, 4),
            "zone_matchup_subscore": round(zone_matchup_subscore, 4),
            "family_zone_matchup_subscore": round(family_zone_matchup_subscore, 4),
            "arsenal_fit_subscore": round(arsenal_fit_subscore, 4),
            "tunneling_subscore": round(tunneling_subscore, 4),
            "release_consistency_subscore": round(release_consistency_subscore, 4),
            "sequencing_subscore": round(sequencing_subscore, 4),
            "count_leverage_subscore": round(count_leverage_subscore, 4),
            "leash_risk_subscore": round(leash_risk_score, 4),
            "role_certainty_score": round(role_certainty_score, 4),
            "times_through_order_penalty": round(times_through_order_penalty, 4),
            "telemetry_path_status": telemetry_path_status,
            "model_tier": model_tier,
            "variance_band_low": round(_clamp(expected_strikeouts - variance, 0.5, 12.0), 2),
            "variance_band_high": round(_clamp(expected_strikeouts + variance, 0.5, 12.5), 2),
            "matchup_coverage_confidence": matchup.get("matchup_coverage_confidence"),
            "component_source_map": matchup.get("component_source_map") or {},
            "predicted_whiff_regions": matchup.get("predicted_whiff_regions") or [],
            "predicted_attack_regions": matchup.get("predicted_attack_regions") or [],
            "predicted_damage_regions": matchup.get("predicted_damage_regions") or [],
            "tunnel_pair_scores": matchup.get("tunnel_pair_scores") or [],
            "applied_layers": "opportunity|pitch_win|probability|uncertainty",
            "skipped_layers": "",
            "confidence_score": round(confidence_raw, 1),
            "confidence_score_raw": round(confidence_raw, 1),
            "confidence_score_display": round(confidence_raw, 1),
            "confidence_bucket": bucket,
            "confidence_reasons": reasons[:5],
            "confidence_component_bonuses": bonuses,
            "confidence_component_penalties": penalties,
            "confidence_primary_driver": primary_driver,
            "confidence_summary_label": summary_label,
            "reason_tags_for": reason_tags_for,
            "reason_tags_against": reason_tags_against,
            "applied_layers_v2": "opportunity|pitch_win|probability|uncertainty",
            "skipped_layers_v2": "",
            "confidence_score_v2": round(confidence_raw, 1),
            "confidence_score_raw_v2": round(confidence_raw, 1),
            "confidence_score_display_v2": round(confidence_raw, 1),
            "confidence_bucket_v2": bucket,
            "confidence_reasons_v2": reasons[:5],
            "confidence_component_bonuses_v2": bonuses,
            "confidence_component_penalties_v2": penalties,
            "confidence_primary_driver_v2": primary_driver,
            "confidence_summary_label_v2": summary_label,
        }
    )
    return result