# Spaces: Syntrex / 2026_MLB_Model (Sleeping)
# 2026_MLB_Model
# File size: 22,217 Bytes

from __future__ import annotations

import math
from typing import Any

import pandas as pd

from models.opportunity_model import build_projected_strikeout_opportunity
from models.pitcher_adjustment import build_pitcher_feature_row
from models.shared_matchup_engine import compose_shared_matchup_context


def _safe_float(value: Any, default: float | None = None) -> float | None:
    try:
        if value is None:
            return default
        text = str(value).strip().lower()
        if text in {"", "nan", "none"}:
            return default
        return float(value)
    except Exception:
        return default


def _clamp(value: float, lo: float, hi: float) -> float:
    """Limit *value* to the range ``[lo, hi]``.

    The upper bound is applied first and the lower bound second, so *lo*
    wins if the bounds are ever passed inverted.
    """
    capped = value if value < hi else hi
    return capped if capped > lo else lo


def _reliability(sample_size: Any, k: float = 120.0) -> float:
    """Map a sample size onto a 0..1 weight using ``n / (n + k)``.

    *sample_size* is parsed with ``_safe_float``; missing or negative input
    counts as zero observations. *k* is floored at 1.0, which keeps the
    denominator positive.
    """
    parsed = _safe_float(sample_size, 0.0)
    observations = max(0.0, float(parsed or 0.0))
    prior_weight = max(1.0, k)
    return _clamp(observations / (observations + prior_weight), 0.0, 1.0)


def _confidence_component(label: str, value: float, direction: str) -> dict[str, Any]:
    """Build one confidence bonus/penalty entry.

    Returns a dict with the keys ``label``, ``value`` (coerced to float and
    rounded to one decimal place) and ``direction``, in that order.
    """
    rounded_value = round(float(value), 1)
    return dict(label=label, value=rounded_value, direction=direction)


def _poisson_prob_over(expected_value: float, line: float) -> float:
    """Return ``P(X > line)`` for ``X ~ Poisson(expected_value)``.

    The line is floored first, so ``5.5`` and ``5`` both yield ``P(X >= 6)``.

    Args:
        expected_value: Poisson mean. Non-positive means return ``0.0``.
        line: Threshold the count has to exceed.

    Returns:
        A probability in ``[0.0, 1.0]``.
    """
    if expected_value <= 0:
        return 0.0
    target = int(math.floor(line))
    if target < 0:
        # A count is never negative, so it always exceeds a negative line.
        return 1.0
    # Accumulate the CDF with the recurrence pmf(k) = pmf(k-1) * mean / k.
    # This evaluates exp(-mean) once and never forms mean**k or k!, both of
    # which raise OverflowError for large k when converted to float.
    term = math.exp(-expected_value)
    cumulative = term
    for k in range(1, target + 1):
        term *= expected_value / k
        cumulative += term
    # Rounding can leave the tail a hair outside [0, 1]; bound it.
    return max(0.0, min(1.0, 1.0 - cumulative))


def _poisson_prob_under(expected_value: float, line: float) -> float:
    """Return ``P(X < line)`` for ``X ~ Poisson(expected_value)``.

    On a whole-number line the count equal to the line is a push, not an
    under win, so it is excluded here. For fractional lines the result is
    the complement of ``_poisson_prob_over``.

    Args:
        expected_value: Poisson mean.
        line: Threshold the count has to stay below.

    Returns:
        A probability in ``[0.0, 1.0]``.
    """
    if line <= 0:
        # No count is below a non-positive line.
        return 0.0
    # The largest count that still wins the under: 5 for both 5.5 and 6.
    highest_winning_count = math.ceil(line) - 1
    # P(X <= c) is the complement of P(X > c).
    return _clamp(1.0 - _poisson_prob_over(expected_value, highest_winning_count), 0.0, 1.0)


def _calibrate(probability: float) -> float:
    """Pull a probability toward 0.50 and bound it to ``[0.02, 0.98]``.

    The distance from 0.50 is scaled by 0.92 before the bounds are applied.
    """
    distance_from_even = probability - 0.50
    shrunk = 0.50 + (distance_from_even * 0.92)
    return _clamp(shrunk, 0.02, 0.98)


def build_strikeout_probability_result_v2(
    pitcher_statcast_df: pd.DataFrame,
    pitcher_name: str,
    batter_statcast_df: pd.DataFrame | None = None,
    opponent_batters: list[str] | None = None,
    opponent_team: str | None = None,
    line: float | None = None,
    selection_side: str | None = None,
    game_row: dict[str, Any] | None = None,
    runtime_cache: dict[str, Any] | None = None,
) -> dict[str, Any]:
    result: dict[str, Any] = {
        "formula_version": "strikeout_v2_live",
        "raw_k_prob": None,
        "calibrated_k_prob": None,
        "fair_prob": None,
        "expected_strikeouts": None,
        "raw_k_prob_v2": None,
        "calibrated_k_prob_v2": None,
        "fair_prob_v2": None,
        "expected_strikeouts_v2": None,
        "projected_pitch_count": None,
        "projected_batters_faced": None,
        "projected_innings": None,
        "projected_k_rate": None,
        "pitches_per_bf": None,
        "opportunity_confidence": None,
        "opportunity_reasons": [],
        "k_rate_pitch_signal": None,
        "k_rate_anchor": None,
        "bb_rate_anchor": None,
        "command_efficiency_signal": None,
        "pitcher_swstr_rate": None,
        "pitcher_csw_rate": None,
        "pitcher_ball_rate": None,
        "swing_miss_subscore": None,
        "called_strike_subscore": None,
        "command_efficiency_subscore": None,
        "lineup_whiff_subscore": None,
        "zone_matchup_subscore": None,
        "family_zone_matchup_subscore": None,
        "arsenal_fit_subscore": None,
        "tunneling_subscore": None,
        "release_consistency_subscore": None,
        "sequencing_subscore": None,
        "count_leverage_subscore": None,
        "leash_risk_subscore": None,
        "role_certainty_score": None,
        "times_through_order_penalty": None,
        "telemetry_path_status": "baseline_only",
        "model_tier": "baseline_only_degraded",
        "variance_band_low": None,
        "variance_band_high": None,
        "matchup_coverage_confidence": None,
        "component_source_map": {},
        "predicted_whiff_regions": [],
        "predicted_attack_regions": [],
        "predicted_damage_regions": [],
        "tunnel_pair_scores": [],
        "applied_layers": "",
        "skipped_layers": "",
        "confidence_score": None,
        "confidence_score_raw": None,
        "confidence_score_display": None,
        "confidence_source": "strikeout_v2_live",
        "confidence_bucket": None,
        "confidence_reasons": [],
        "confidence_component_bonuses": [],
        "confidence_component_penalties": [],
        "confidence_primary_driver": None,
        "confidence_summary_label": None,
        "reason_tags_for": [],
        "reason_tags_against": [],
        "applied_layers_v2": "",
        "skipped_layers_v2": "",
        "confidence_score_v2": None,
        "confidence_score_raw_v2": None,
        "confidence_score_display_v2": None,
        "confidence_source_v2": "strikeout_v2_live",
        "confidence_bucket_v2": None,
        "confidence_reasons_v2": [],
        "confidence_component_bonuses_v2": [],
        "confidence_component_penalties_v2": [],
        "confidence_primary_driver_v2": None,
        "confidence_summary_label_v2": None,
    }

    if (
        pitcher_statcast_df is None
        or pitcher_statcast_df.empty
        or not pitcher_name
        or line is None
        or selection_side not in {"over", "under"}
    ):
        result["skipped_layers"] = "missing_pitcher_or_line"
        return result

    pitcher_row = build_pitcher_feature_row(pitcher_statcast_df, pitcher_name)
    sample_size = int(pitcher_row.get("sample_size") or 0)
    reliability = _reliability(sample_size, k=180.0)
    swstr = _safe_float(pitcher_row.get("swstr_rate"))
    csw = _safe_float(pitcher_row.get("csw_rate"))
    ball = _safe_float(pitcher_row.get("ball_rate"))

    strike_anchor = None
    walk_anchor = None
    if swstr is not None:
        strike_anchor = _clamp(0.12 + ((swstr - 0.11) * 1.15), 0.12, 0.42)
    if ball is not None:
        walk_anchor = _clamp(0.05 + ((ball - 0.36) * 0.75), 0.02, 0.14)

    matchup_rows: list[dict[str, Any]] = []
    if batter_statcast_df is not None and not batter_statcast_df.empty and opponent_batters:
        matchup_cache_bucket = None
        lineup_cache_key = None
        if runtime_cache is not None:
            matchup_cache_bucket = runtime_cache.setdefault("strikeout_lineup_matchups", {})
            lineup_cache_key = (
                id(batter_statcast_df),
                id(pitcher_statcast_df),
                str(pitcher_name or "").strip().lower(),
                tuple(str(name or "").strip().lower() for name in (opponent_batters or [])[:9]),
                str((game_row or {}).get("away_team") or "").strip().lower(),
                str((game_row or {}).get("home_team") or "").strip().lower(),
                str((game_row or {}).get("projected_starter_match_status") or "").strip().lower(),
                str(opponent_team or "").strip().lower(),
            )
        if matchup_cache_bucket is not None and lineup_cache_key in matchup_cache_bucket:
            matchup_rows = list(matchup_cache_bucket[lineup_cache_key])
        else:
            for batter_name in opponent_batters[:9]:
                try:
                    matchup_rows.append(
                        compose_shared_matchup_context(
                            batter_name=batter_name,
                            pitcher_name=pitcher_name,
                            batter_statcast_df=batter_statcast_df,
                            pitcher_statcast_df=pitcher_statcast_df,
                            pitcher_row=pitcher_row,
                            game_row=game_row,
                            batter_features={"batter_stand": "L"},
                            runtime_cache=runtime_cache,
                        )
                    )
                except Exception:
                    continue
            if matchup_cache_bucket is not None and lineup_cache_key is not None:
                matchup_cache_bucket[lineup_cache_key] = list(matchup_rows)

    if matchup_rows:
        def _avg(path: tuple[str, ...], default: float = 0.0) -> float:
            vals: list[float] = []
            for row in matchup_rows:
                cur: Any = row
                for key in path:
                    if not isinstance(cur, dict):
                        cur = default
                        break
                    cur = cur.get(key)
                if cur is not None:
                    vals.append(float(_safe_float(cur, default) or default))
            return sum(vals) / len(vals) if vals else default

        matchup = {
            "predicted_whiff_regions": matchup_rows[0].get("predicted_whiff_regions") or [],
            "predicted_attack_regions": matchup_rows[0].get("predicted_attack_regions") or [],
            "predicted_damage_regions": matchup_rows[0].get("predicted_damage_regions") or [],
            "tunnel_pair_scores": matchup_rows[0].get("tunnel_pair_scores") or [],
            "matchup_coverage_confidence": _avg(("matchup_coverage_confidence",), 0.0),
            "component_source_map": matchup_rows[0].get("component_source_map") or {},
            "zone_matchup": {"hit_zone_boost": _avg(("zone_matchup", "hit_zone_boost"), 0.0)},
            "family_zone_matchup": {"family_zone_whiff_risk": _avg(("family_zone_matchup", "family_zone_whiff_risk"), 0.0)},
            "arsenal_matchup": {"arsenal_whiff_risk": _avg(("arsenal_matchup", "arsenal_whiff_risk"), 0.0)},
            "trajectory": matchup_rows[0].get("trajectory") or {},
            "count_context_profile": matchup_rows[0].get("count_context_profile") or {},
        }
    else:
        matchup = {
        "predicted_whiff_regions": [],
        "predicted_attack_regions": [],
        "predicted_damage_regions": [],
        "tunnel_pair_scores": [],
        "matchup_coverage_confidence": 0.0,
        "component_source_map": {},
        "zone_matchup": {},
        "family_zone_matchup": {},
        "arsenal_matchup": {},
        "trajectory": {},
        "count_context_profile": {},
    }
    telemetry_path_status = "full_telemetry" if matchup_rows else "core_baseline_plus_projected_pitcher"
    model_tier = "full_telemetry" if matchup_rows else "core_baseline_plus_projected_pitcher"
    if matchup_rows and float(matchup.get("matchup_coverage_confidence") or 0.0) < 0.45:
        telemetry_path_status = "partial_telemetry"
        model_tier = "partial_telemetry"

    zone_matchup_subscore = _safe_float((matchup.get("zone_matchup") or {}).get("hit_zone_boost"), 0.0) or 0.0
    family_zone_matchup_subscore = _safe_float((matchup.get("family_zone_matchup") or {}).get("family_zone_whiff_risk"), 0.0) or 0.0
    arsenal_fit_subscore = _safe_float((matchup.get("arsenal_matchup") or {}).get("arsenal_whiff_risk"), 0.0) or 0.0
    trajectory = matchup.get("trajectory") or {}
    tunneling_subscore = _safe_float(trajectory.get("tunnel_score"), 0.5) or 0.5
    release_consistency_subscore = _safe_float(trajectory.get("release_consistency_score"), 0.5) or 0.5
    sequencing_profiles = matchup.get("count_context_profile") or {}
    putaway_states = [v for k, v in sequencing_profiles.items() if str(k).endswith("-2")]
    count_leverage_subscore = 0.58 if putaway_states else 0.50
    sequencing_subscore = _clamp(0.5 + ((count_leverage_subscore - 0.5) * 0.6), 0.0, 1.0)

    swing_miss_subscore = _clamp((swstr or 0.11) / 0.18, 0.0, 1.0)
    called_strike_subscore = _clamp((csw or 0.28) / 0.36, 0.0, 1.0)
    command_efficiency_signal = _clamp(1.0 - ((ball or 0.36) - 0.30) / 0.12, 0.0, 1.0)
    command_efficiency_subscore = command_efficiency_signal
    lineup_whiff_subscore = _clamp(
        (
            family_zone_matchup_subscore * 0.55
            + arsenal_fit_subscore * 0.45
        ) / 0.35 if (family_zone_matchup_subscore or arsenal_fit_subscore) else 0.5,
        0.0,
        1.0,
    )

    opportunity = build_projected_strikeout_opportunity(
        pitcher_row=pitcher_row,
        opponent_batters=opponent_batters,
        projected_starter_available=bool((game_row or {}).get("projected_starter_available")),
        projected_starter_match_status=str((game_row or {}).get("projected_starter_match_status") or ""),
        game_row=game_row,
    )
    pitch_count = float(opportunity.get("projected_pitch_count") or 88.0)
    projected_batters_faced = float(opportunity.get("projected_batters_faced") or 22.5)
    projected_innings = float(opportunity.get("projected_innings") or 5.2)
    pitches_per_bf = float(opportunity.get("pitches_per_bf") or 3.85)
    times_through_order_penalty = float(opportunity.get("times_through_order_penalty") or 0.0)
    leash_risk_score = float(opportunity.get("leash_risk_score") or 0.0)
    role_certainty_score = float(opportunity.get("role_certainty_score") or 0.7)
    opportunity_confidence = float(opportunity.get("opportunity_confidence") or 0.0)
    opportunity_reasons = list(opportunity.get("opportunity_reasons") or [])

    pitch_signal = (
        swing_miss_subscore * 0.34
        + called_strike_subscore * 0.18
        + command_efficiency_subscore * 0.14
        + lineup_whiff_subscore * 0.10
        + _clamp(zone_matchup_subscore / 0.40, 0.0, 1.0) * 0.08
        + _clamp(family_zone_matchup_subscore / 0.40, 0.0, 1.0) * 0.06
        + _clamp(arsenal_fit_subscore / 0.35, 0.0, 1.0) * 0.05
        + tunneling_subscore * 0.03
        + release_consistency_subscore * 0.01
        + sequencing_subscore * 0.01
    )
    k_rate_pitch_signal = _clamp(0.12 + (pitch_signal * 0.26), 0.14, 0.42)
    k_rate_anchor = strike_anchor if strike_anchor is not None else k_rate_pitch_signal
    projected_k_rate = (
        k_rate_pitch_signal * (1.0 - reliability)
        + k_rate_anchor * reliability
    )
    projected_k_rate = _clamp(projected_k_rate - times_through_order_penalty, 0.12, 0.40)

    expected_strikeouts = _clamp(projected_batters_faced * projected_k_rate, 1.5, 11.5)
    raw_prob = _poisson_prob_over(expected_strikeouts, float(line)) if selection_side == "over" else _poisson_prob_under(expected_strikeouts, float(line))
    calibrated_prob = _calibrate(raw_prob)
    variance = _clamp(0.35 + leash_risk_score * 0.9 + (1.0 - reliability) * 0.8, 0.35, 1.8)

    confidence = 56.0
    reasons: list[str] = []
    bonuses: list[dict[str, Any]] = []
    penalties: list[dict[str, Any]] = []
    if sample_size >= 400:
        confidence += 10.0
        bonuses.append(_confidence_component("Strong pitcher sample", 10.0, "bonus"))
    elif sample_size < 150:
        confidence -= 10.0
        reasons.append("Limited pitcher pitch sample")
        penalties.append(_confidence_component("Limited pitcher pitch sample", 10.0, "penalty"))
    if matchup.get("matchup_coverage_confidence", 0.0) >= 0.45:
        confidence += 8.0
        bonuses.append(_confidence_component("Strong telemetry and zone coverage", 8.0, "bonus"))
    else:
        confidence -= 6.0
        reasons.append("Thin telemetry and zone-coverage sample")
        penalties.append(_confidence_component("Thin telemetry and zone-coverage sample", 6.0, "penalty"))
    if opponent_batters and len(opponent_batters) >= 7:
        confidence += 5.0
        bonuses.append(_confidence_component("Projected lineup mostly complete", 5.0, "bonus"))
    else:
        confidence -= 5.0
        reasons.append("Projected opponent lineup is incomplete")
        penalties.append(_confidence_component("Projected opponent lineup is incomplete", 5.0, "penalty"))
    if leash_risk_score >= 0.55:
        confidence -= 7.0
        reasons.append("Pitch-count and leash risk remain elevated")
        penalties.append(_confidence_component("Pitch-count and leash risk remain elevated", 7.0, "penalty"))
    if role_certainty_score >= 0.92:
        confidence += 4.0
        bonuses.append(_confidence_component("Public projected starter confirmed", 4.0, "bonus"))
    elif role_certainty_score <= 0.65:
        confidence -= 6.0
        reasons.append("Starter role and leash certainty remain soft")
        penalties.append(_confidence_component("Starter role and leash certainty remain soft", 6.0, "penalty"))

    reason_tags_for: list[str] = []
    reason_tags_against: list[str] = []
    if swing_miss_subscore >= 0.62:
        reason_tags_for.append("Misses bats consistently")
    if called_strike_subscore >= 0.72:
        reason_tags_for.append("Strong called plus whiff strike mix")
    if projected_k_rate >= 0.27:
        reason_tags_for.append("Projected strikeout rate supports the line")
    if projected_batters_faced >= 24.0:
        reason_tags_for.append("Projected workload supports deep strikeout opportunity")
    if leash_risk_score >= 0.48:
        reason_tags_against.append("Pitch-count and leash risk limit the strikeout path")
    if projected_batters_faced <= 21.5:
        reason_tags_against.append("Projected batters faced are lighter than ideal")
    if not opponent_batters or len(opponent_batters) < 7:
        reason_tags_against.append("Projected opponent lineup is incomplete")

    confidence_raw = _clamp(confidence, 1.0, 100.0)
    bucket = "high" if confidence_raw >= 75 else "medium" if confidence_raw >= 55 else "low"
    primary_penalty = max(
        [item for item in penalties if float(item.get("value") or 0.0) > 0.0],
        key=lambda item: float(item.get("value") or 0.0),
        default=None,
    )
    primary_bonus = max(
        [item for item in bonuses if float(item.get("value") or 0.0) > 0.0],
        key=lambda item: float(item.get("value") or 0.0),
        default=None,
    )
    primary_driver = primary_penalty or primary_bonus
    summary_label = str((primary_driver or {}).get("label") or "").strip() or None

    result.update(
        {
            "raw_k_prob": raw_prob,
            "calibrated_k_prob": calibrated_prob,
            "fair_prob": calibrated_prob,
            "expected_strikeouts": expected_strikeouts,
            "raw_k_prob_v2": raw_prob,
            "calibrated_k_prob_v2": calibrated_prob,
            "fair_prob_v2": calibrated_prob,
            "expected_strikeouts_v2": expected_strikeouts,
            "projected_pitch_count": round(pitch_count, 2),
            "projected_batters_faced": round(projected_batters_faced, 2),
            "projected_innings": round(projected_innings, 2),
            "projected_k_rate": round(projected_k_rate, 4),
            "pitches_per_bf": round(pitches_per_bf, 3),
            "opportunity_confidence": round(opportunity_confidence, 4),
            "opportunity_reasons": opportunity_reasons,
            "k_rate_pitch_signal": round(k_rate_pitch_signal, 4),
            "k_rate_anchor": round(k_rate_anchor, 4) if k_rate_anchor is not None else None,
            "bb_rate_anchor": round(walk_anchor, 4) if walk_anchor is not None else None,
            "command_efficiency_signal": round(command_efficiency_signal, 4),
            "pitcher_swstr_rate": round(swstr, 4) if swstr is not None else None,
            "pitcher_csw_rate": round(csw, 4) if csw is not None else None,
            "pitcher_ball_rate": round(ball, 4) if ball is not None else None,
            "swing_miss_subscore": round(swing_miss_subscore, 4),
            "called_strike_subscore": round(called_strike_subscore, 4),
            "command_efficiency_subscore": round(command_efficiency_subscore, 4),
            "lineup_whiff_subscore": round(lineup_whiff_subscore, 4),
            "zone_matchup_subscore": round(zone_matchup_subscore, 4),
            "family_zone_matchup_subscore": round(family_zone_matchup_subscore, 4),
            "arsenal_fit_subscore": round(arsenal_fit_subscore, 4),
            "tunneling_subscore": round(tunneling_subscore, 4),
            "release_consistency_subscore": round(release_consistency_subscore, 4),
            "sequencing_subscore": round(sequencing_subscore, 4),
            "count_leverage_subscore": round(count_leverage_subscore, 4),
            "leash_risk_subscore": round(leash_risk_score, 4),
            "role_certainty_score": round(role_certainty_score, 4),
            "times_through_order_penalty": round(times_through_order_penalty, 4),
            "telemetry_path_status": telemetry_path_status,
            "model_tier": model_tier,
            "variance_band_low": round(_clamp(expected_strikeouts - variance, 0.5, 12.0), 2),
            "variance_band_high": round(_clamp(expected_strikeouts + variance, 0.5, 12.5), 2),
            "matchup_coverage_confidence": matchup.get("matchup_coverage_confidence"),
            "component_source_map": matchup.get("component_source_map") or {},
            "predicted_whiff_regions": matchup.get("predicted_whiff_regions") or [],
            "predicted_attack_regions": matchup.get("predicted_attack_regions") or [],
            "predicted_damage_regions": matchup.get("predicted_damage_regions") or [],
            "tunnel_pair_scores": matchup.get("tunnel_pair_scores") or [],
            "applied_layers": "opportunity|pitch_win|probability|uncertainty",
            "skipped_layers": "",
            "confidence_score": round(confidence_raw, 1),
            "confidence_score_raw": round(confidence_raw, 1),
            "confidence_score_display": round(confidence_raw, 1),
            "confidence_bucket": bucket,
            "confidence_reasons": reasons[:5],
            "confidence_component_bonuses": bonuses,
            "confidence_component_penalties": penalties,
            "confidence_primary_driver": primary_driver,
            "confidence_summary_label": summary_label,
            "reason_tags_for": reason_tags_for,
            "reason_tags_against": reason_tags_against,
            "applied_layers_v2": "opportunity|pitch_win|probability|uncertainty",
            "skipped_layers_v2": "",
            "confidence_score_v2": round(confidence_raw, 1),
            "confidence_score_raw_v2": round(confidence_raw, 1),
            "confidence_score_display_v2": round(confidence_raw, 1),
            "confidence_bucket_v2": bucket,
            "confidence_reasons_v2": reasons[:5],
            "confidence_component_bonuses_v2": bonuses,
            "confidence_component_penalties_v2": penalties,
            "confidence_primary_driver_v2": primary_driver,
            "confidence_summary_label_v2": summary_label,
        }
    )
    return result