# Source: 2026_MLB_Model / models / pitcher_adjustment.py (17.3 kB)
# Last commit 21151ce by Syntrex: "Accuracy overhaul: pitcher resolution logging,
# baseline recalibration, vig fix, XGBoost blend"
from __future__ import annotations

import logging
import math
from typing import Any

import pandas as pd

from data.odds_name_map import normalize_pitcher_name
from database.db import log_pitcher_resolution
from models.pitcher_live_state_v2 import build_pitcher_live_state_v2

logger = logging.getLogger(__name__)
def _to_last_first_variants(name: str) -> set[str]:
    """Return the normalized spellings of *name* to try when matching pitchers.

    The result always contains the normalized name itself. When the normalized
    name has at least two whitespace-separated tokens it also contains the
    "last first" ordering, and, if there are tokens in between, the
    "last first middle" ordering. An empty set is returned when normalization
    yields nothing.
    """
    canonical = normalize_pitcher_name(name)
    if not canonical:
        return set()

    tokens = canonical.split()
    if len(tokens) < 2:
        # Single-token names have no alternative ordering.
        return {canonical}

    given, *inner, family = tokens
    swapped = f"{family} {given}"
    candidates = {canonical, swapped}
    if inner:
        candidates.add(f"{swapped} {' '.join(inner)}")
    return candidates
def _safe_mean(series: pd.Series) -> float | None:
numeric = pd.to_numeric(series, errors="coerce").dropna()
if numeric.empty:
return None
return float(numeric.mean())
def _numeric_column(df: pd.DataFrame, column: str) -> pd.Series:
    """Return *column* of *df* coerced to numbers, or an empty float Series if absent.

    ``df.get(column)`` yields None for a missing column and
    ``pd.to_numeric(None, errors="coerce")`` then returns a scalar NaN, which
    breaks every later ``.dropna()`` call; always handing back a Series avoids that.
    """
    if column not in df.columns:
        return pd.Series(dtype="float64")
    return pd.to_numeric(df[column], errors="coerce")


def _dominant_p_throws(df: pd.DataFrame) -> str | None:
    """Return the most frequent non-null ``p_throws`` value in *df*, or None."""
    if "p_throws" not in df.columns:
        return None
    mode_vals = df["p_throws"].dropna().mode()
    if mode_vals.empty:
        return None
    return str(mode_vals.iloc[0]).strip()


def build_pitcher_feature_row(
    statcast_df: pd.DataFrame,
    pitcher_name: str,
    pitcher_id: int | None = None,
    conn: Any = None,
    game_pk: str | None = None,
    game_date: str | None = None,
    source: str = "pitcher_adjustment",
) -> dict[str, Any]:
    """Aggregate one pitcher's rows of *statcast_df* into a flat feature dict.

    Rows are resolved in priority order:

    1. ``pitcher`` column equal to *pitcher_id* (when both are available);
    2. normalized ``player_name`` equal to one of the name variants of
       *pitcher_name*;
    3. normalized ``player_name`` containing both the first and the last token
       of the normalized *pitcher_name* (substring match).

    Args:
        statcast_df: Frame with at least a ``player_name`` column. Every other
            column is optional; features whose column is missing keep their
            default.
        pitcher_name: Name to resolve; echoed back under ``"pitcher_name"``.
        pitcher_id: Optional numeric id compared against the ``pitcher`` column.
        conn: Optional handle passed to ``log_pitcher_resolution``. When None,
            no resolution record is written.
        game_pk: Stored in the resolution record only.
        game_date: Stored in the resolution record only.
        source: Stored in the resolution record only.

    Returns:
        A dict of contact-quality, release, movement, batted-ball-shape and
        pitch-outcome features plus ``sample_size`` (matched row count) and
        ``p_throws``. When nothing matches (or the frame is None, empty or has
        no ``player_name`` column) the defaults are returned with
        ``sample_size == 0``.
    """
    _empty_row: dict[str, Any] = {
        "pitcher_name": pitcher_name,
        "ev_allowed": None,
        "hard_hit_rate_allowed": None,
        "barrel_rate_allowed": None,
        "avg_release_speed": None,
        "avg_release_spin_rate": None,
        "avg_release_extension": None,
        "avg_pfx_x": None,
        "avg_pfx_z": None,
        "sample_size": 0,
        "p_throws": "R",
        "gb_rate_allowed": 0.0,
        "ld_rate_allowed": 0.0,
        "fb_rate_allowed": 0.0,
        "popup_rate_allowed": 0.0,
        "la_sweet_spot_allowed_rate": 0.0,
        "la_optimal_hr_allowed_rate": 0.0,
        "avg_launch_angle_allowed": None,
        "swstr_rate": None,
        "csw_rate": None,
        "ball_rate": None,
    }

    def _log(
        match_method: str,
        matched_canonical: str | None,
        sample_size: int,
        p_throws: str | None,
    ) -> None:
        """Write one resolution record; never lets a logging failure propagate."""
        if conn is None:
            return
        try:
            log_pitcher_resolution(conn, {
                "game_pk": game_pk,
                "game_date": game_date,
                "source": source,
                "input_name": pitcher_name,
                "normalized_name": normalize_pitcher_name(pitcher_name),
                "matched_canonical": matched_canonical,
                "pitcher_id": pitcher_id,
                "match_method": match_method,
                "sample_size": sample_size,
                "p_throws": p_throws,
            })
        except Exception as exc:
            # Resolution logging is diagnostic only; feature building must go on.
            logger.debug("[pitcher_adjustment] resolution log write failed: %s", exc)

    def _log_match(match_method: str, rows: pd.DataFrame) -> None:
        """Log a successful resolution using the first matched row's name."""
        canonical = str(rows["player_name"].iloc[0]) if "player_name" in rows.columns else None
        _log(match_method, canonical, int(len(rows)), _dominant_p_throws(rows))

    if statcast_df is None or statcast_df.empty or "player_name" not in statcast_df.columns:
        _log("failed", None, 0, None)
        return _empty_row

    # Priority 1: pitcher ID match
    matched = pd.DataFrame()
    if pitcher_id is not None and "pitcher" in statcast_df.columns:
        try:
            pitcher_ids = pd.to_numeric(statcast_df["pitcher"], errors="coerce")
            id_rows = statcast_df[pitcher_ids == int(pitcher_id)].copy()
        except Exception as exc:
            # Best effort: any failure here falls through to name matching.
            logger.debug("[pitcher_adjustment] pitcher ID match failed: %s", exc)
        else:
            if not id_rows.empty:
                matched = id_rows
                _log_match("id", matched)

    if matched.empty:
        # Names are normalized only when the ID match did not resolve, so the
        # per-row normalization cost is skipped on the common path.
        raw_names = statcast_df["player_name"]
        # Blank out nulls *after* the string cast: astype(str) alone would turn
        # NaN into the literal "nan", which fillna("") can no longer see.
        player_names = raw_names.astype(str).where(raw_names.notna(), "")
        normalized_series = player_names.map(normalize_pitcher_name)

        # Priority 2: exact normalized name / last-first variant match
        exact_mask = normalized_series.isin(_to_last_first_variants(pitcher_name))
        matched = statcast_df[exact_mask].copy()
        if not matched.empty:
            _log_match("exact", matched)
        else:
            # Priority 3: loose contains-style fallback on first + last token
            name_parts = (normalize_pitcher_name(pitcher_name) or "").split()
            if len(name_parts) >= 2:
                first, last = name_parts[0], name_parts[-1]
                loose_mask = normalized_series.apply(
                    lambda n: isinstance(n, str) and first in n and last in n
                )
                matched = statcast_df[loose_mask].copy()
                if not matched.empty:
                    _log_match("loose", matched)

    if matched.empty:
        _log("failed", None, 0, None)
        return _empty_row

    launch_speed = _numeric_column(matched, "launch_speed")
    launch_angle = _numeric_column(matched, "launch_angle")

    ev_allowed = _safe_mean(launch_speed)
    avg_release_speed = _safe_mean(_numeric_column(matched, "release_speed"))
    avg_release_spin_rate = _safe_mean(_numeric_column(matched, "release_spin_rate"))
    avg_release_extension = _safe_mean(_numeric_column(matched, "release_extension"))
    avg_pfx_x = _safe_mean(_numeric_column(matched, "pfx_x"))
    avg_pfx_z = _safe_mean(_numeric_column(matched, "pfx_z"))

    # Hard-hit rate over rows that actually have an exit velocity. Rows without
    # one compare False against the threshold and would otherwise dilute the
    # rate; this also matches the barrel-rate denominator below.
    tracked_speed = launch_speed.dropna()
    hard_hit_rate_allowed = None
    if not tracked_speed.empty:
        hard_hit_rate_allowed = float((tracked_speed >= 95).mean())

    barrel_rate_allowed = None
    if not tracked_speed.empty and not launch_angle.dropna().empty:
        valid = pd.DataFrame({"launch_speed": launch_speed, "launch_angle": launch_angle}).dropna()
        if not valid.empty:
            # A batted ball counts as a barrel when it falls in either box.
            barrel_mask = (
                (valid["launch_speed"] >= 98)
                & (valid["launch_angle"] >= 26)
                & (valid["launch_angle"] <= 30)
            ) | (
                (valid["launch_speed"] >= 100)
                & (valid["launch_angle"] >= 23)
                & (valid["launch_angle"] <= 33)
            )
            barrel_rate_allowed = float(barrel_mask.mean())

    # E1b: Extract pitcher throwing hand (defaults to "R" when unknown/blank)
    p_throws = _dominant_p_throws(matched) or "R"

    # ----------------------------
    # 12G: Pitcher contact-shape taxonomy (allowed)
    # ----------------------------
    gb_rate_allowed = 0.0
    ld_rate_allowed = 0.0
    fb_rate_allowed = 0.0
    popup_rate_allowed = 0.0
    la_sweet_spot_allowed_rate = 0.0
    la_optimal_hr_allowed_rate = 0.0
    avg_launch_angle_allowed = None

    if "bb_type" in matched.columns:
        bb_raw = matched["bb_type"]
        contact_df = matched[bb_raw.notna() & (bb_raw.astype(str).str.strip() != "")]
        # Require a minimum of 5 classified batted balls before reporting shares.
        if len(contact_df) >= 5:
            bb = contact_df["bb_type"].astype(str).str.strip().str.lower()
            n = float(len(bb))
            gb_rate_allowed = float((bb == "ground_ball").sum()) / n
            ld_rate_allowed = float((bb == "line_drive").sum()) / n
            fb_rate_allowed = float((bb == "fly_ball").sum()) / n
            popup_rate_allowed = float((bb == "popup").sum()) / n

    la_valid = launch_angle.dropna()
    if len(la_valid) >= 5:
        la_sweet_spot_allowed_rate = float(((la_valid >= 8) & (la_valid <= 32)).mean())
        la_optimal_hr_allowed_rate = float(((la_valid >= 25) & (la_valid <= 35)).mean())
        avg_launch_angle_allowed = _safe_mean(launch_angle)

    # Batch 13: Pitch-level command rates from description column
    swstr_rate = None
    csw_rate = None
    ball_rate = None
    if "description" in matched.columns and len(matched) >= 10:
        desc = matched["description"].astype(str).str.strip().str.lower()
        total = len(desc)
        swstr_mask = desc.isin({"swinging_strike", "swinging_strike_blocked"})
        cs_mask = desc == "called_strike"
        ball_mask = desc.isin({"ball", "blocked_ball", "intent_ball", "pitchout"})
        swstr_rate = float(swstr_mask.sum() / total)
        csw_rate = float((swstr_mask | cs_mask).sum() / total)
        ball_rate = float(ball_mask.sum() / total)

    return {
        "pitcher_name": pitcher_name,
        "ev_allowed": ev_allowed,
        "hard_hit_rate_allowed": hard_hit_rate_allowed,
        "barrel_rate_allowed": barrel_rate_allowed,
        "avg_release_speed": avg_release_speed,
        "avg_release_spin_rate": avg_release_spin_rate,
        "avg_release_extension": avg_release_extension,
        "avg_pfx_x": avg_pfx_x,
        "avg_pfx_z": avg_pfx_z,
        "sample_size": int(len(matched)),
        "p_throws": p_throws,
        "gb_rate_allowed": gb_rate_allowed,
        "ld_rate_allowed": ld_rate_allowed,
        "fb_rate_allowed": fb_rate_allowed,
        "popup_rate_allowed": popup_rate_allowed,
        "la_sweet_spot_allowed_rate": la_sweet_spot_allowed_rate,
        "la_optimal_hr_allowed_rate": la_optimal_hr_allowed_rate,
        "avg_launch_angle_allowed": avg_launch_angle_allowed,
        "swstr_rate": swstr_rate,
        "csw_rate": csw_rate,
        "ball_rate": ball_rate,
    }
def _finite_float(value: Any) -> float | None:
    """Return *value* as a finite float, or None if missing, non-numeric, NaN or infinite."""
    if value is None:
        return None
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    return number if math.isfinite(number) else None


def compute_pitcher_adjustment(
    batter_row: dict[str, Any],
    pitcher_row: dict[str, Any],
    context: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Turn a pitcher feature row into additive hit / HR / 2+ total-base adjustments.

    Each available pitcher feature contributes a small bounded shift. A
    live-state overlay from ``build_pitcher_live_state_v2`` is then added, and
    the three totals are clamped.

    Args:
        batter_row: Accepted for interface compatibility; not read by this function.
        pitcher_row: Feature dict (e.g. from ``build_pitcher_feature_row``).
            Features that are None, NaN, infinite or non-numeric are treated as
            missing and contribute nothing.
        context: Optional dict; its ``"game_row"`` entry and the dict itself are
            forwarded to ``build_pitcher_live_state_v2``.

    Returns:
        Dict with the clamped ``hit_adj`` / ``hr_adj`` / ``tb2p_adj``, up to eight
        ``reason_tags``, the live-state scores and passthrough live-state
        fields, a movement debug string and the pre-clamp adjustment values.
    """
    hit_adj = 0.0
    hr_adj = 0.0
    tb2p_adj = 0.0
    reason_tags: list[str] = []
    context = context or {}

    # NaN must count as "missing": min()/max() do not propagate NaN, so a NaN
    # feature would otherwise be clamped to the *maximum* positive shift.
    ev_allowed = _finite_float(pitcher_row.get("ev_allowed"))
    hard_hit_rate_allowed = _finite_float(pitcher_row.get("hard_hit_rate_allowed"))
    barrel_rate_allowed = _finite_float(pitcher_row.get("barrel_rate_allowed"))
    avg_release_speed = _finite_float(pitcher_row.get("avg_release_speed"))
    avg_release_spin_rate = _finite_float(pitcher_row.get("avg_release_spin_rate"))
    avg_release_extension = _finite_float(pitcher_row.get("avg_release_extension"))
    avg_pfx_x = _finite_float(pitcher_row.get("avg_pfx_x"))
    avg_pfx_z = _finite_float(pitcher_row.get("avg_pfx_z"))

    # Shared core profile effects
    if ev_allowed is not None:
        shift = max(-0.03, min(0.04, (ev_allowed - 89.0) * 0.004))
        hit_adj += shift
        tb2p_adj += shift * 0.9
        hr_adj += shift * 0.7
        if shift > 0.01:
            reason_tags.append("Pitcher allows loud contact")

    if hard_hit_rate_allowed is not None:
        shift = max(-0.02, min(0.03, (hard_hit_rate_allowed - 0.38) * 0.12))
        hit_adj += shift
        tb2p_adj += shift
        if shift > 0.01:
            reason_tags.append("Elevated hard-hit allowed")

    if barrel_rate_allowed is not None:
        shift = max(-0.015, min(0.04, (barrel_rate_allowed - 0.07) * 0.55))
        hr_adj += shift
        tb2p_adj += shift * 0.75
        if shift > 0.01:
            reason_tags.append("Barrel-prone pitcher")

    if avg_release_spin_rate is not None:
        if avg_release_spin_rate >= 2400:
            hit_adj -= 0.003
            hr_adj -= 0.003
            tb2p_adj -= 0.002
            reason_tags.append("Strong pitch spin")
        elif avg_release_spin_rate <= 2050:
            hit_adj += 0.004
            hr_adj += 0.005
            tb2p_adj += 0.003
            reason_tags.append("Below-average spin")

    if avg_release_extension is not None:
        if avg_release_extension >= 6.4:
            hit_adj -= 0.003
            hr_adj -= 0.002
            tb2p_adj -= 0.002
            reason_tags.append("Long release extension")
        elif avg_release_extension <= 5.8:
            hit_adj += 0.003
            hr_adj += 0.003
            tb2p_adj += 0.002
            reason_tags.append("Short release extension")

    # Movement signals. Values were validated above, so neither the formatting
    # nor the threshold checks can raise on malformed input.
    movement_fired = False
    _pfx_x_str = f"{avg_pfx_x:.3f}" if avg_pfx_x is not None else "N/A"
    _pfx_z_str = f"{avg_pfx_z:.3f}" if avg_pfx_z is not None else "N/A"

    if avg_pfx_x is not None and abs(avg_pfx_x) >= 0.75:  # 0.75 ft = 9 in (assumes pfx in feet)
        hit_adj -= 0.003
        tb2p_adj -= 0.003
        reason_tags.append("strong_horizontal_break")
        movement_fired = True

    if avg_pfx_z is not None and abs(avg_pfx_z) >= 1.17:  # 1.17 ft ~= 14 in (assumes pfx in feet)
        hit_adj -= 0.003
        hr_adj -= 0.003
        tb2p_adj -= 0.003
        reason_tags.append("strong_vertical_break")
        movement_fired = True

    movement_signal_debug = f"pfx_x={_pfx_x_str} pfx_z={_pfx_z_str} fired={'Y' if movement_fired else 'N'}"

    # G1: Velocity-band precision segmentation
    if avg_release_speed is not None:
        if avg_release_speed >= 97:
            hit_adj -= 0.008
            hr_adj -= 0.007
            reason_tags.append("Elite velo (97+)")
        elif avg_release_speed >= 95:
            hit_adj -= 0.004
            hr_adj -= 0.004
            reason_tags.append("Plus velo (95-96)")
        elif avg_release_speed <= 90:
            hit_adj += 0.009
            hr_adj += 0.008
            reason_tags.append("Soft velo (≤90)")
        elif avg_release_speed <= 92:
            hit_adj += 0.005
            hr_adj += 0.006
            reason_tags.append("Below-avg velo (91-92)")

    # Strong Phase 6 adaptive live-state overlay
    game_row = context.get("game_row", {}) or {}
    live_state = build_pitcher_live_state_v2(
        pitcher_row=pitcher_row,
        game_row=game_row,
        context=context,
    )

    # Missing / NaN scores count as 0 so they cannot saturate the final clamp.
    fatigue_score = _finite_float(live_state.get("fatigue_score")) or 0.0
    degradation_score = _finite_float(live_state.get("degradation_score")) or 0.0
    trust_live_score = _finite_float(live_state.get("trust_live_score")) or 0.0

    # Live degradation affects contact / HR more than hit generally
    hit_adj += fatigue_score * 0.010
    hr_adj += fatigue_score * 0.015
    tb2p_adj += fatigue_score * 0.012

    hit_adj += degradation_score * 0.008
    hr_adj += degradation_score * 0.014
    tb2p_adj += degradation_score * 0.011

    # Trust-live score increases the magnitude slightly when evidence is strong
    hit_adj += trust_live_score * 0.003
    hr_adj += trust_live_score * 0.004
    tb2p_adj += trust_live_score * 0.003

    # `or []` guards against an explicit None under "reason_tags".
    reason_tags.extend(live_state.get("reason_tags") or [])

    # Capture pre-clamp values for debug
    _hit_adj_pre = hit_adj
    _hr_adj_pre = hr_adj
    _tb2p_adj_pre = tb2p_adj

    # Final net clamp — prevents extreme multi-signal stacking
    hit_adj = max(-0.030, min(0.030, hit_adj))
    hr_adj = max(-0.025, min(0.025, hr_adj))
    tb2p_adj = max(-0.025, min(0.025, tb2p_adj))

    return {
        "hit_adj": hit_adj,
        "hr_adj": hr_adj,
        "tb2p_adj": tb2p_adj,
        "reason_tags": reason_tags[:8],
        "fatigue_score": fatigue_score,
        "degradation_score": degradation_score,
        "trust_live_score": trust_live_score,
        "baseline_weight": live_state.get("baseline_weight"),
        "live_weight": live_state.get("live_weight"),
        "velo_delta": live_state.get("velo_delta"),
        "spin_delta": live_state.get("spin_delta"),
        "extension_delta": live_state.get("extension_delta"),
        "pitch_count": live_state.get("pitch_count"),
        "times_through_order": live_state.get("times_through_order"),
        "rolling_pitch_sample_size": live_state.get("rolling_pitch_sample_size"),
        "rolling_pitch_velocity_sample_size": live_state.get("rolling_pitch_velocity_sample_size"),
        "rolling_pitch_spin_sample_size": live_state.get("rolling_pitch_spin_sample_size"),
        "rolling_pitch_extension_sample_size": live_state.get("rolling_pitch_extension_sample_size"),
        "rolling_pitch_pfx_x_sample_size": live_state.get("rolling_pitch_pfx_x_sample_size"),
        "rolling_pitch_pfx_z_sample_size": live_state.get("rolling_pitch_pfx_z_sample_size"),
        # Task 5 — movement signal debug
        "movement_signal_debug": movement_signal_debug,
        # Task 6 — pre-clamp values for transparency
        "pitcher_net_adj_pre_clamp_hit": _hit_adj_pre,
        "pitcher_net_adj_pre_clamp_hr": _hr_adj_pre,
        "pitcher_net_adj_pre_clamp_tb2p": _tb2p_adj_pre,
    }