Spaces:

Syntrex
/

2026_MLB_Model

Sleeping

App Files Files

2026_MLB_Model / models /strikeout_probability_engine_v2.py

Syntrex

Optimize props load path and reuse modeled state

2885bcc 2 months ago

raw

history blame

22.2 kB

	from __future__ import annotations

	import math
	from typing import Any

	import pandas as pd

	from models.opportunity_model import build_projected_strikeout_opportunity
	from models.pitcher_adjustment import build_pitcher_feature_row
	from models.shared_matchup_engine import compose_shared_matchup_context


	def _safe_float(value: Any, default: float \| None = None) -> float \| None:
	try:
	if value is None:
	return default
	text = str(value).strip().lower()
	if text in {"", "nan", "none"}:
	return default
	return float(value)
	except Exception:
	return default


	def _clamp(value: float, lo: float, hi: float) -> float:
	return max(lo, min(hi, value))


	def _reliability(sample_size: Any, k: float = 120.0) -> float:
	    """Return a 0..1 shrinkage weight ``n / (n + k)`` for a sample size.

	    *sample_size* is parsed leniently (unusable or negative values count as
	    zero) and *k* is floored at 1.0 so the denominator is never zero.
	    """
	    parsed = _safe_float(sample_size, 0.0) or 0.0
	    sample = max(0.0, float(parsed))
	    damping = max(1.0, k)
	    return _clamp(sample / (sample + damping), 0.0, 1.0)


	def _confidence_component(label: str, value: float, direction: str) -> dict[str, Any]:
	return {
	"label": label,
	"value": round(float(value), 1),
	"direction": direction,
	}


	def _poisson_prob_over(expected_value: float, line: float) -> float:
	if expected_value <= 0:
	return 0.0
	target = int(math.floor(line))
	cumulative = 0.0
	for k in range(0, target + 1):
	cumulative += math.exp(-expected_value) * (expected_value ** k) / math.factorial(k)
	return _clamp(1.0 - cumulative, 0.0, 1.0)


	def _poisson_prob_under(expected_value: float, line: float) -> float:
	return _clamp(1.0 - _poisson_prob_over(expected_value, line), 0.0, 1.0)


	def _calibrate(probability: float) -> float:
	centered = probability - 0.50
	return _clamp(0.50 + (centered * 0.92), 0.02, 0.98)


	def build_strikeout_probability_result_v2(
	    pitcher_statcast_df: pd.DataFrame,
	    pitcher_name: str,
	    batter_statcast_df: pd.DataFrame | None = None,
	    opponent_batters: list[str] | None = None,
	    opponent_team: str | None = None,
	    line: float | None = None,
	    selection_side: str | None = None,
	    game_row: dict[str, Any] | None = None,
	    runtime_cache: dict[str, Any] | None = None,
	) -> dict[str, Any]:
	    """Build the strikeout-prop probability result for one pitcher, line and side.

	    The result starts as a dict of defaults (mostly ``None``). When the inputs
	    are usable it is filled with the projected strikeout rate, expected
	    strikeouts, the Poisson over/under probability and its calibrated version,
	    a variance band, the matchup sub-scores and a confidence score with its
	    bonus/penalty breakdown. Headline fields are mirrored under ``_v2``-suffixed
	    keys.

	    Args:
	        pitcher_statcast_df: Pitch-level frame handed to
	            ``build_pitcher_feature_row``; ``None`` or empty short-circuits.
	        pitcher_name: Pitcher to model; a falsy value short-circuits.
	        batter_statcast_df: Optional batter frame; matchup rows are only
	            composed when it is non-empty and ``opponent_batters`` is given.
	        opponent_batters: Opposing lineup; only the first nine names are used
	            for matchup composition.
	        opponent_team: Only used as part of the matchup cache key.
	        line: Strikeout line; ``None`` short-circuits.
	        selection_side: Must be exactly ``"over"`` or ``"under"``.
	        game_row: Optional game context forwarded to the matchup and
	            opportunity helpers.
	        runtime_cache: Optional mutable dict. Composed matchup rows are stored
	            in it under ``"strikeout_lineup_matchups"`` and it is forwarded to
	            ``compose_shared_matchup_context``.

	    Returns:
	        The populated result dict. On unusable input, the defaults are returned
	        with ``skipped_layers`` and ``skipped_layers_v2`` set to
	        ``"missing_pitcher_or_line"``.
	    """
	    result: dict[str, Any] = {
	        "formula_version": "strikeout_v2_live",
	        "raw_k_prob": None,
	        "calibrated_k_prob": None,
	        "fair_prob": None,
	        "expected_strikeouts": None,
	        "raw_k_prob_v2": None,
	        "calibrated_k_prob_v2": None,
	        "fair_prob_v2": None,
	        "expected_strikeouts_v2": None,
	        "projected_pitch_count": None,
	        "projected_batters_faced": None,
	        "projected_innings": None,
	        "projected_k_rate": None,
	        "pitches_per_bf": None,
	        "opportunity_confidence": None,
	        "opportunity_reasons": [],
	        "k_rate_pitch_signal": None,
	        "k_rate_anchor": None,
	        "bb_rate_anchor": None,
	        "command_efficiency_signal": None,
	        "pitcher_swstr_rate": None,
	        "pitcher_csw_rate": None,
	        "pitcher_ball_rate": None,
	        "swing_miss_subscore": None,
	        "called_strike_subscore": None,
	        "command_efficiency_subscore": None,
	        "lineup_whiff_subscore": None,
	        "zone_matchup_subscore": None,
	        "family_zone_matchup_subscore": None,
	        "arsenal_fit_subscore": None,
	        "tunneling_subscore": None,
	        "release_consistency_subscore": None,
	        "sequencing_subscore": None,
	        "count_leverage_subscore": None,
	        "leash_risk_subscore": None,
	        "role_certainty_score": None,
	        "times_through_order_penalty": None,
	        "telemetry_path_status": "baseline_only",
	        "model_tier": "baseline_only_degraded",
	        "variance_band_low": None,
	        "variance_band_high": None,
	        "matchup_coverage_confidence": None,
	        "component_source_map": {},
	        "predicted_whiff_regions": [],
	        "predicted_attack_regions": [],
	        "predicted_damage_regions": [],
	        "tunnel_pair_scores": [],
	        "applied_layers": "",
	        "skipped_layers": "",
	        "confidence_score": None,
	        "confidence_score_raw": None,
	        "confidence_score_display": None,
	        "confidence_source": "strikeout_v2_live",
	        "confidence_bucket": None,
	        "confidence_reasons": [],
	        "confidence_component_bonuses": [],
	        "confidence_component_penalties": [],
	        "confidence_primary_driver": None,
	        "confidence_summary_label": None,
	        "reason_tags_for": [],
	        "reason_tags_against": [],
	        "applied_layers_v2": "",
	        "skipped_layers_v2": "",
	        "confidence_score_v2": None,
	        "confidence_score_raw_v2": None,
	        "confidence_score_display_v2": None,
	        "confidence_source_v2": "strikeout_v2_live",
	        "confidence_bucket_v2": None,
	        "confidence_reasons_v2": [],
	        "confidence_component_bonuses_v2": [],
	        "confidence_component_penalties_v2": [],
	        "confidence_primary_driver_v2": None,
	        "confidence_summary_label_v2": None,
	    }

	    # --- Input guard -------------------------------------------------------
	    if (
	        pitcher_statcast_df is None
	        or pitcher_statcast_df.empty
	        or not pitcher_name
	        or line is None
	        or selection_side not in {"over", "under"}
	    ):
	        # Keep the mirrored "_v2" key in sync with the base key, as the
	        # populated path does for every other base/_v2 pair.
	        result["skipped_layers"] = "missing_pitcher_or_line"
	        result["skipped_layers_v2"] = "missing_pitcher_or_line"
	        return result

	    game_ctx = game_row or {}

	    # --- Pitcher baseline --------------------------------------------------
	    pitcher_row = build_pitcher_feature_row(pitcher_statcast_df, pitcher_name)
	    sample_size = int(pitcher_row.get("sample_size") or 0)
	    reliability = _reliability(sample_size, k=180.0)
	    swstr = _safe_float(pitcher_row.get("swstr_rate"))
	    csw = _safe_float(pitcher_row.get("csw_rate"))
	    ball = _safe_float(pitcher_row.get("ball_rate"))

	    strike_anchor = None
	    walk_anchor = None
	    if swstr is not None:
	        strike_anchor = _clamp(0.12 + ((swstr - 0.11) * 1.15), 0.12, 0.42)
	    if ball is not None:
	        walk_anchor = _clamp(0.05 + ((ball - 0.36) * 0.75), 0.02, 0.14)

	    # --- Per-batter matchup rows (optionally cached) -----------------------
	    matchup_rows: list[dict[str, Any]] = []
	    if batter_statcast_df is not None and not batter_statcast_df.empty and opponent_batters:
	        matchup_cache_bucket = None
	        lineup_cache_key = None
	        if runtime_cache is not None:
	            matchup_cache_bucket = runtime_cache.setdefault("strikeout_lineup_matchups", {})
	            # NOTE(review): the key relies on id() of the two frames, which is
	            # only unique while those objects are alive - confirm the cache
	            # never outlives the frames it was built from.
	            lineup_cache_key = (
	                id(batter_statcast_df),
	                id(pitcher_statcast_df),
	                str(pitcher_name or "").strip().lower(),
	                tuple(str(name or "").strip().lower() for name in (opponent_batters or [])[:9]),
	                str(game_ctx.get("away_team") or "").strip().lower(),
	                str(game_ctx.get("home_team") or "").strip().lower(),
	                str(game_ctx.get("projected_starter_match_status") or "").strip().lower(),
	                str(opponent_team or "").strip().lower(),
	            )
	        if matchup_cache_bucket is not None and lineup_cache_key in matchup_cache_bucket:
	            matchup_rows = list(matchup_cache_bucket[lineup_cache_key])
	        else:
	            for batter_name in opponent_batters[:9]:
	                try:
	                    matchup_rows.append(
	                        compose_shared_matchup_context(
	                            batter_name=batter_name,
	                            pitcher_name=pitcher_name,
	                            batter_statcast_df=batter_statcast_df,
	                            pitcher_statcast_df=pitcher_statcast_df,
	                            pitcher_row=pitcher_row,
	                            game_row=game_row,
	                            # NOTE(review): every batter is passed as "L";
	                            # confirm whether real handedness should be used.
	                            batter_features={"batter_stand": "L"},
	                            runtime_cache=runtime_cache,
	                        )
	                    )
	                except Exception:
	                    # Best effort: a batter whose matchup cannot be composed
	                    # is left out rather than failing the whole projection.
	                    continue
	            if matchup_cache_bucket is not None and lineup_cache_key is not None:
	                matchup_cache_bucket[lineup_cache_key] = list(matchup_rows)

	    # --- Lineup-level matchup summary --------------------------------------
	    if matchup_rows:
	        def _avg(path: tuple[str, ...], default: float = 0.0) -> float:
	            """Average the value found at nested *path* across the matchup rows."""
	            vals: list[float] = []
	            for row in matchup_rows:
	                cur: Any = row
	                for key in path:
	                    if not isinstance(cur, dict):
	                        cur = default
	                        break
	                    cur = cur.get(key)
	                if cur is not None:
	                    vals.append(float(_safe_float(cur, default) or default))
	            return sum(vals) / len(vals) if vals else default

	        # Numeric signals are averaged over the lineup; structured fields are
	        # taken from the first composed row.
	        first_row = matchup_rows[0]
	        matchup = {
	            "predicted_whiff_regions": first_row.get("predicted_whiff_regions") or [],
	            "predicted_attack_regions": first_row.get("predicted_attack_regions") or [],
	            "predicted_damage_regions": first_row.get("predicted_damage_regions") or [],
	            "tunnel_pair_scores": first_row.get("tunnel_pair_scores") or [],
	            "matchup_coverage_confidence": _avg(("matchup_coverage_confidence",), 0.0),
	            "component_source_map": first_row.get("component_source_map") or {},
	            "zone_matchup": {"hit_zone_boost": _avg(("zone_matchup", "hit_zone_boost"), 0.0)},
	            "family_zone_matchup": {"family_zone_whiff_risk": _avg(("family_zone_matchup", "family_zone_whiff_risk"), 0.0)},
	            "arsenal_matchup": {"arsenal_whiff_risk": _avg(("arsenal_matchup", "arsenal_whiff_risk"), 0.0)},
	            "trajectory": first_row.get("trajectory") or {},
	            "count_context_profile": first_row.get("count_context_profile") or {},
	        }
	    else:
	        matchup = {
	            "predicted_whiff_regions": [],
	            "predicted_attack_regions": [],
	            "predicted_damage_regions": [],
	            "tunnel_pair_scores": [],
	            "matchup_coverage_confidence": 0.0,
	            "component_source_map": {},
	            "zone_matchup": {},
	            "family_zone_matchup": {},
	            "arsenal_matchup": {},
	            "trajectory": {},
	            "count_context_profile": {},
	        }
	    telemetry_path_status = "full_telemetry" if matchup_rows else "core_baseline_plus_projected_pitcher"
	    model_tier = "full_telemetry" if matchup_rows else "core_baseline_plus_projected_pitcher"
	    if matchup_rows and float(matchup.get("matchup_coverage_confidence") or 0.0) < 0.45:
	        telemetry_path_status = "partial_telemetry"
	        model_tier = "partial_telemetry"

	    # --- Sub-scores --------------------------------------------------------
	    zone_matchup_subscore = _safe_float((matchup.get("zone_matchup") or {}).get("hit_zone_boost"), 0.0) or 0.0
	    family_zone_matchup_subscore = _safe_float((matchup.get("family_zone_matchup") or {}).get("family_zone_whiff_risk"), 0.0) or 0.0
	    arsenal_fit_subscore = _safe_float((matchup.get("arsenal_matchup") or {}).get("arsenal_whiff_risk"), 0.0) or 0.0
	    trajectory = matchup.get("trajectory") or {}
	    tunneling_subscore = _safe_float(trajectory.get("tunnel_score"), 0.5) or 0.5
	    release_consistency_subscore = _safe_float(trajectory.get("release_consistency_score"), 0.5) or 0.5
	    sequencing_profiles = matchup.get("count_context_profile") or {}
	    # Only the presence of a count key ending in "-2" matters, not its value.
	    has_putaway_state = any(str(k).endswith("-2") for k, _ in sequencing_profiles.items())
	    count_leverage_subscore = 0.58 if has_putaway_state else 0.50
	    sequencing_subscore = _clamp(0.5 + ((count_leverage_subscore - 0.5) * 0.6), 0.0, 1.0)

	    swing_miss_subscore = _clamp((swstr or 0.11) / 0.18, 0.0, 1.0)
	    called_strike_subscore = _clamp((csw or 0.28) / 0.36, 0.0, 1.0)
	    command_efficiency_signal = _clamp(1.0 - ((ball or 0.36) - 0.30) / 0.12, 0.0, 1.0)
	    command_efficiency_subscore = command_efficiency_signal
	    if family_zone_matchup_subscore or arsenal_fit_subscore:
	        lineup_whiff_raw = (
	            family_zone_matchup_subscore * 0.55
	            + arsenal_fit_subscore * 0.45
	        ) / 0.35
	    else:
	        # No lineup whiff signal at all: fall back to a neutral 0.5.
	        lineup_whiff_raw = 0.5
	    lineup_whiff_subscore = _clamp(lineup_whiff_raw, 0.0, 1.0)

	    # --- Workload / opportunity --------------------------------------------
	    opportunity = build_projected_strikeout_opportunity(
	        pitcher_row=pitcher_row,
	        opponent_batters=opponent_batters,
	        projected_starter_available=bool(game_ctx.get("projected_starter_available")),
	        projected_starter_match_status=str(game_ctx.get("projected_starter_match_status") or ""),
	        game_row=game_row,
	    )
	    pitch_count = float(opportunity.get("projected_pitch_count") or 88.0)
	    projected_batters_faced = float(opportunity.get("projected_batters_faced") or 22.5)
	    projected_innings = float(opportunity.get("projected_innings") or 5.2)
	    pitches_per_bf = float(opportunity.get("pitches_per_bf") or 3.85)
	    times_through_order_penalty = float(opportunity.get("times_through_order_penalty") or 0.0)
	    leash_risk_score = float(opportunity.get("leash_risk_score") or 0.0)
	    role_certainty_score = float(opportunity.get("role_certainty_score") or 0.7)
	    opportunity_confidence = float(opportunity.get("opportunity_confidence") or 0.0)
	    opportunity_reasons = list(opportunity.get("opportunity_reasons") or [])

	    # --- Projected strikeout rate ------------------------------------------
	    pitch_signal = (
	        swing_miss_subscore * 0.34
	        + called_strike_subscore * 0.18
	        + command_efficiency_subscore * 0.14
	        + lineup_whiff_subscore * 0.10
	        + _clamp(zone_matchup_subscore / 0.40, 0.0, 1.0) * 0.08
	        + _clamp(family_zone_matchup_subscore / 0.40, 0.0, 1.0) * 0.06
	        + _clamp(arsenal_fit_subscore / 0.35, 0.0, 1.0) * 0.05
	        + tunneling_subscore * 0.03
	        + release_consistency_subscore * 0.01
	        + sequencing_subscore * 0.01
	    )
	    k_rate_pitch_signal = _clamp(0.12 + (pitch_signal * 0.26), 0.14, 0.42)
	    k_rate_anchor = strike_anchor if strike_anchor is not None else k_rate_pitch_signal
	    # Blend the composite signal with the swinging-strike anchor; the anchor's
	    # weight grows with the pitcher's sample size.
	    projected_k_rate = (
	        k_rate_pitch_signal * (1.0 - reliability)
	        + k_rate_anchor * reliability
	    )
	    projected_k_rate = _clamp(projected_k_rate - times_through_order_penalty, 0.12, 0.40)

	    # --- Probability -------------------------------------------------------
	    expected_strikeouts = _clamp(projected_batters_faced * projected_k_rate, 1.5, 11.5)
	    if selection_side == "over":
	        raw_prob = _poisson_prob_over(expected_strikeouts, float(line))
	    else:
	        raw_prob = _poisson_prob_under(expected_strikeouts, float(line))
	    calibrated_prob = _calibrate(raw_prob)
	    variance = _clamp(0.35 + leash_risk_score * 0.9 + (1.0 - reliability) * 0.8, 0.35, 1.8)

	    # --- Confidence score with bonus / penalty breakdown -------------------
	    confidence = 56.0
	    reasons: list[str] = []
	    bonuses: list[dict[str, Any]] = []
	    penalties: list[dict[str, Any]] = []
	    if sample_size >= 400:
	        confidence += 10.0
	        bonuses.append(_confidence_component("Strong pitcher sample", 10.0, "bonus"))
	    elif sample_size < 150:
	        confidence -= 10.0
	        reasons.append("Limited pitcher pitch sample")
	        penalties.append(_confidence_component("Limited pitcher pitch sample", 10.0, "penalty"))
	    if matchup.get("matchup_coverage_confidence", 0.0) >= 0.45:
	        confidence += 8.0
	        bonuses.append(_confidence_component("Strong telemetry and zone coverage", 8.0, "bonus"))
	    else:
	        confidence -= 6.0
	        reasons.append("Thin telemetry and zone-coverage sample")
	        penalties.append(_confidence_component("Thin telemetry and zone-coverage sample", 6.0, "penalty"))
	    if opponent_batters and len(opponent_batters) >= 7:
	        confidence += 5.0
	        bonuses.append(_confidence_component("Projected lineup mostly complete", 5.0, "bonus"))
	    else:
	        confidence -= 5.0
	        reasons.append("Projected opponent lineup is incomplete")
	        penalties.append(_confidence_component("Projected opponent lineup is incomplete", 5.0, "penalty"))
	    if leash_risk_score >= 0.55:
	        confidence -= 7.0
	        reasons.append("Pitch-count and leash risk remain elevated")
	        penalties.append(_confidence_component("Pitch-count and leash risk remain elevated", 7.0, "penalty"))
	    if role_certainty_score >= 0.92:
	        confidence += 4.0
	        bonuses.append(_confidence_component("Public projected starter confirmed", 4.0, "bonus"))
	    elif role_certainty_score <= 0.65:
	        confidence -= 6.0
	        reasons.append("Starter role and leash certainty remain soft")
	        penalties.append(_confidence_component("Starter role and leash certainty remain soft", 6.0, "penalty"))

	    # --- Human-readable reason tags ----------------------------------------
	    reason_tags_for: list[str] = []
	    reason_tags_against: list[str] = []
	    if swing_miss_subscore >= 0.62:
	        reason_tags_for.append("Misses bats consistently")
	    if called_strike_subscore >= 0.72:
	        reason_tags_for.append("Strong called plus whiff strike mix")
	    if projected_k_rate >= 0.27:
	        reason_tags_for.append("Projected strikeout rate supports the line")
	    if projected_batters_faced >= 24.0:
	        reason_tags_for.append("Projected workload supports deep strikeout opportunity")
	    if leash_risk_score >= 0.48:
	        reason_tags_against.append("Pitch-count and leash risk limit the strikeout path")
	    if projected_batters_faced <= 21.5:
	        reason_tags_against.append("Projected batters faced are lighter than ideal")
	    if not opponent_batters or len(opponent_batters) < 7:
	        reason_tags_against.append("Projected opponent lineup is incomplete")

	    confidence_raw = _clamp(confidence, 1.0, 100.0)
	    bucket = "high" if confidence_raw >= 75 else "medium" if confidence_raw >= 55 else "low"
	    primary_penalty = max(
	        [item for item in penalties if float(item.get("value") or 0.0) > 0.0],
	        key=lambda item: float(item.get("value") or 0.0),
	        default=None,
	    )
	    primary_bonus = max(
	        [item for item in bonuses if float(item.get("value") or 0.0) > 0.0],
	        key=lambda item: float(item.get("value") or 0.0),
	        default=None,
	    )
	    # The largest penalty, when there is one, takes precedence over any bonus.
	    primary_driver = primary_penalty or primary_bonus
	    summary_label = str((primary_driver or {}).get("label") or "").strip() or None

	    # --- Publish -----------------------------------------------------------
	    result.update(
	        {
	            "raw_k_prob": raw_prob,
	            "calibrated_k_prob": calibrated_prob,
	            "fair_prob": calibrated_prob,
	            "expected_strikeouts": expected_strikeouts,
	            "raw_k_prob_v2": raw_prob,
	            "calibrated_k_prob_v2": calibrated_prob,
	            "fair_prob_v2": calibrated_prob,
	            "expected_strikeouts_v2": expected_strikeouts,
	            "projected_pitch_count": round(pitch_count, 2),
	            "projected_batters_faced": round(projected_batters_faced, 2),
	            "projected_innings": round(projected_innings, 2),
	            "projected_k_rate": round(projected_k_rate, 4),
	            "pitches_per_bf": round(pitches_per_bf, 3),
	            "opportunity_confidence": round(opportunity_confidence, 4),
	            "opportunity_reasons": opportunity_reasons,
	            "k_rate_pitch_signal": round(k_rate_pitch_signal, 4),
	            "k_rate_anchor": round(k_rate_anchor, 4) if k_rate_anchor is not None else None,
	            "bb_rate_anchor": round(walk_anchor, 4) if walk_anchor is not None else None,
	            "command_efficiency_signal": round(command_efficiency_signal, 4),
	            "pitcher_swstr_rate": round(swstr, 4) if swstr is not None else None,
	            "pitcher_csw_rate": round(csw, 4) if csw is not None else None,
	            "pitcher_ball_rate": round(ball, 4) if ball is not None else None,
	            "swing_miss_subscore": round(swing_miss_subscore, 4),
	            "called_strike_subscore": round(called_strike_subscore, 4),
	            "command_efficiency_subscore": round(command_efficiency_subscore, 4),
	            "lineup_whiff_subscore": round(lineup_whiff_subscore, 4),
	            "zone_matchup_subscore": round(zone_matchup_subscore, 4),
	            "family_zone_matchup_subscore": round(family_zone_matchup_subscore, 4),
	            "arsenal_fit_subscore": round(arsenal_fit_subscore, 4),
	            "tunneling_subscore": round(tunneling_subscore, 4),
	            "release_consistency_subscore": round(release_consistency_subscore, 4),
	            "sequencing_subscore": round(sequencing_subscore, 4),
	            "count_leverage_subscore": round(count_leverage_subscore, 4),
	            "leash_risk_subscore": round(leash_risk_score, 4),
	            "role_certainty_score": round(role_certainty_score, 4),
	            "times_through_order_penalty": round(times_through_order_penalty, 4),
	            "telemetry_path_status": telemetry_path_status,
	            "model_tier": model_tier,
	            "variance_band_low": round(_clamp(expected_strikeouts - variance, 0.5, 12.0), 2),
	            "variance_band_high": round(_clamp(expected_strikeouts + variance, 0.5, 12.5), 2),
	            "matchup_coverage_confidence": matchup.get("matchup_coverage_confidence"),
	            "component_source_map": matchup.get("component_source_map") or {},
	            "predicted_whiff_regions": matchup.get("predicted_whiff_regions") or [],
	            "predicted_attack_regions": matchup.get("predicted_attack_regions") or [],
	            "predicted_damage_regions": matchup.get("predicted_damage_regions") or [],
	            "tunnel_pair_scores": matchup.get("tunnel_pair_scores") or [],
	            # Plain pipe-delimited list: "\|" is not a valid escape sequence
	            # and would leave stray backslashes in the value.
	            "applied_layers": "opportunity|pitch_win|probability|uncertainty",
	            "skipped_layers": "",
	            "confidence_score": round(confidence_raw, 1),
	            "confidence_score_raw": round(confidence_raw, 1),
	            "confidence_score_display": round(confidence_raw, 1),
	            "confidence_bucket": bucket,
	            "confidence_reasons": reasons[:5],
	            "confidence_component_bonuses": bonuses,
	            "confidence_component_penalties": penalties,
	            "confidence_primary_driver": primary_driver,
	            "confidence_summary_label": summary_label,
	            "reason_tags_for": reason_tags_for,
	            "reason_tags_against": reason_tags_against,
	            "applied_layers_v2": "opportunity|pitch_win|probability|uncertainty",
	            "skipped_layers_v2": "",
	            "confidence_score_v2": round(confidence_raw, 1),
	            "confidence_score_raw_v2": round(confidence_raw, 1),
	            "confidence_score_display_v2": round(confidence_raw, 1),
	            "confidence_bucket_v2": bucket,
	            "confidence_reasons_v2": reasons[:5],
	            "confidence_component_bonuses_v2": bonuses,
	            "confidence_component_penalties_v2": penalties,
	            "confidence_primary_driver_v2": primary_driver,
	            "confidence_summary_label_v2": summary_label,
	        }
	    )
	    return result