2026_MLB_Model / simulation /monte_carlo.py
Syntrex's picture
Accuracy overhaul: pitcher resolution logging, baseline recalibration, vig fix, XGBoost blend
21151ce
raw
history blame
2.47 kB
from __future__ import annotations
import numpy as np
import pandas as pd
def _coerce_float(value: object) -> float | None:
    """Return *value* as a float, or None when it is missing or non-numeric."""
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        # Best-effort: an unusable feature simply means "no adjustment".
        return None


def _clamp_probability(name: str, value: float) -> float:
    """Clamp *value* into [0, 1]; raise ValueError when it is NaN."""
    prob = float(value)
    if np.isnan(prob):
        raise ValueError(f"{name} must be a number, got NaN")
    return min(1.0, max(0.0, prob))


def _hit_type_probabilities(batter_row: dict | None) -> np.ndarray:
    """Return normalized P(single), P(double), P(triple) for a non-HR hit."""
    # A4: Batter-specific hit type distribution (empirical MLB 2024 baseline)
    single_p = 0.62
    double_p = 0.33
    triple_p = 0.05

    if batter_row is not None:
        pull_rate = _coerce_float(batter_row.get("pull_rate"))
        air_ball_rate = _coerce_float(batter_row.get("air_ball_rate"))
        avg_launch_angle = _coerce_float(batter_row.get("avg_launch_angle"))

        if (
            pull_rate is not None
            and air_ball_rate is not None
            and pull_rate >= 0.45
            and air_ball_rate >= 0.45
        ):
            # High pull + high air ball -> more doubles
            single_p -= 0.04
            double_p += 0.04

        if avg_launch_angle is not None and avg_launch_angle >= 18:
            # Fly ball tendency -> slight 2B/3B boost
            single_p -= 0.02
            double_p += 0.01
            triple_p += 0.01

    probs = np.array([single_p, double_p, triple_p])
    # Renormalize so floating-point drift never trips rng.choice's sum check.
    return probs / probs.sum()


def simulate_batter_outcomes(
    hit_prob: float,
    hr_prob: float,
    n_sims: int = 10000,
    batter_row: dict | None = None,
    seed: int | np.random.Generator | None = None,
) -> pd.DataFrame:
    """Simulate per-trial hit / home-run / total-base outcomes for one batter.

    Each of the ``n_sims`` trials is independently a home run, a non-HR hit
    (single, double or triple) or no hit. ``hit_prob`` is treated as the
    overall hit probability *including* home runs, so the simulated hit rate
    converges to ``hit_prob`` and the HR rate to ``hr_prob``.

    Args:
        hit_prob: Probability of any hit (HR included). Clamped to [0, 1].
        hr_prob: Probability of a home run. Clamped to [0, 1]. When it
            exceeds ``hit_prob`` every simulated hit is a home run.
        n_sims: Number of independent trials to draw.
        batter_row: Optional mapping exposing ``.get``; the keys
            ``"pull_rate"``, ``"air_ball_rate"`` and ``"avg_launch_angle"``
            shift the single/double/triple mix. Missing or non-numeric
            values are ignored.
        seed: Optional seed or ``numpy.random.Generator`` for reproducible
            draws. ``None`` (the default) uses fresh entropy on every call.

    Returns:
        DataFrame with ``n_sims`` rows and integer columns ``"hit"`` (0/1),
        ``"hr"`` (0/1) and ``"total_bases"`` (0-4).

    Raises:
        ValueError: If ``hit_prob`` or ``hr_prob`` is NaN.
    """
    rng = np.random.default_rng(seed)  # A2: fresh seed each run unless seeded

    hr_p = _clamp_probability("hr_prob", hr_prob)
    hit_p = _clamp_probability("hit_prob", hit_prob)
    hit_type_probs = _hit_type_probabilities(batter_row)

    # A3: Hierarchical sampling - HR first, then hit (mutually exclusive outcomes)
    is_hr = rng.binomial(1, hr_p, size=n_sims).astype(bool)

    # The second draw only counts on trials that were not home runs, so it has
    # to use the conditional probability P(hit | no HR) = (hit - hr) / (1 - hr).
    # Using the raw difference would shrink the overall hit rate by a further
    # factor of (1 - hr).
    if hr_p < 1.0:
        non_hr_hit_p = min(1.0, max(0.0, (hit_p - hr_p) / (1.0 - hr_p)))
    else:
        non_hr_hit_p = 0.0
    is_hit = rng.binomial(1, non_hr_hit_p, size=n_sims).astype(bool)

    hits = np.zeros(n_sims, dtype=int)
    hrs = np.zeros(n_sims, dtype=int)
    total_bases = np.zeros(n_sims, dtype=int)

    # HRs take priority
    hits[is_hr] = 1
    hrs[is_hr] = 1
    total_bases[is_hr] = 4

    # Non-HR hits distributed by batter-specific type distribution
    hit_mask = is_hit & ~is_hr
    hits[hit_mask] = 1
    hit_count = int(hit_mask.sum())
    if hit_count > 0:
        total_bases[hit_mask] = rng.choice(
            [1, 2, 3], size=hit_count, p=hit_type_probs
        )

    return pd.DataFrame(
        {
            "hit": hits,
            "hr": hrs,
            "total_bases": total_bases,
        }
    )