Spaces:

Syntrex
/

2026_MLB_Model

Sleeping

2026_MLB_Model

File size: 9,528 Bytes

"""
Batch 10: Physics-aware trajectory modeling from raw Statcast kinematic fields.

Uses: release_pos_x/y/z, vx0/vy0/vz0, ax/ay/az, plate_x/plate_z, pitch_type

Provides:
  - Pitch trajectory reconstruction (early-flight x/z at t=0.167s)
  - Tunneling metric: pitch types look similar early, diverge at plate
  - Release consistency: spread (std dev) of release_pos_x/z over all of a pitcher's pitches
  - Arsenal deception score: weighted combination of tunneling + consistency
"""
from __future__ import annotations

import logging
import math
from typing import Any

import numpy as np
import pandas as pd

from data.odds_name_map import normalize_pitcher_name as _normalize_name_text
from models.pitcher_adjustment import _to_last_first_variants

logger = logging.getLogger(__name__)

# Early-flight sample time for the batter's initial read window. At a typical
# ~90 mph (~132 ft/s) pitch this is roughly 22 ft of travel from the release point.
_EARLY_FLIGHT_T = 0.167  # seconds

_TRAJ_COLS = ["release_pos_x", "release_pos_z", "vx0", "vz0", "ax", "az"]
_PLATE_COLS = ["plate_x", "plate_z"]
_RELEASE_COLS = ["release_pos_x", "release_pos_y", "release_pos_z"]


def _filter_pitcher_df(
    statcast_df: pd.DataFrame,
    pitcher_name: str,
    pitcher_id: int | None = None,
) -> pd.DataFrame:
    if statcast_df.empty or "player_name" not in statcast_df.columns:
        return pd.DataFrame()

    if pitcher_id is not None and "pitcher" in statcast_df.columns:
        try:
            ids = pd.to_numeric(statcast_df["pitcher"], errors="coerce")
            df = statcast_df[ids == int(pitcher_id)].copy()
            if not df.empty:
                return df
        except Exception:
            pass

    normalized_series = statcast_df["player_name"].astype(str).map(_normalize_name_text)
    variants = _to_last_first_variants(pitcher_name)
    df = statcast_df[normalized_series.isin(variants)].copy()

    if df.empty:
        parts = _normalize_name_text(pitcher_name).split()
        if len(parts) >= 2:
            first, last = parts[0], parts[-1]
            loose = normalized_series.apply(
                lambda n: isinstance(n, str) and first in n and last in n
            )
            df = statcast_df[loose].copy()

    return df


def _compute_release_consistency(df: pd.DataFrame) -> float | None:
    """
    Score how tightly a pitcher's release point clusters, in [0, 1].

    The sample standard deviations of release_pos_x and release_pos_z over
    all rows are combined as sqrt(std_x^2 + std_z^2) and mapped linearly:
      combined std = 0.0   -> score 1.0 (perfectly repeatable release)
      combined std >= 3.0  -> score 0.0

    Returns None when any column in _RELEASE_COLS is absent, or when fewer
    than 10 rows have numeric values in every one of those columns. Note that
    release_pos_y must be present and numeric for a row to count even though
    it does not enter the spread calculation.

    NOTE(review): the original calibration notes quote the 0.5 / 3.0
    thresholds in inches; confirm the units of release_pos_x/z in the
    incoming data match (public Statcast exports document them in feet).
    """
    if any(col not in df.columns for col in _RELEASE_COLS):
        return None

    numeric = df[_RELEASE_COLS].apply(pd.to_numeric, errors="coerce")
    complete_rows = numeric.dropna()
    if len(complete_rows) < 10:
        return None

    spread_x = float(complete_rows["release_pos_x"].std())
    spread_z = float(complete_rows["release_pos_z"].std())
    radial_spread = math.sqrt(spread_x ** 2 + spread_z ** 2)

    raw_score = 1.0 - radial_spread / 3.0
    return max(0.0, min(1.0, raw_score))


def _compute_tunneling(df: pd.DataFrame) -> float | None:
    """
    Tunneling score in [0, 1], or None when there is not enough usable data.

    Algorithm:
      1. Reconstruct each pitch's (x, z) at t=_EARLY_FLIGHT_T using
         constant-acceleration kinematics:
           x(t) = release_pos_x + vx0*t + 0.5*ax*t^2
           z(t) = release_pos_z + vz0*t + 0.5*az*t^2
      2. Average the early-flight point and the plate point per pitch type,
         keeping only pitch types with at least 10 usable pitches.
      3. For each unordered pair of pitch types compute:
           early_dist   = Euclidean dist between mean early-flight (x, z)
           plate_dist   = Euclidean dist between mean (plate_x, plate_z)
           tunnel_ratio = min(5.0, plate_dist / max(early_dist, 0.01))
      4. High ratio = pitches look similar early, diverge at plate.
         The unweighted mean ratio is mapped linearly to [0, 1]:
         ratio <= 1.0 -> 0.0, ratio >= 4.0 -> 1.0.

    Returns None when a required column is missing, fewer than 20 pitches
    have finite values in every kinematic/plate column, or fewer than two
    pitch types reach the 10-pitch minimum.

    NOTE(review): public Statcast docs describe vx0/vz0/ax/az as measured at
    y=50 ft rather than at the release point; they are combined with
    release_pos_x/z here as an approximation - confirm this is intended.
    """
    from itertools import combinations

    needed = _TRAJ_COLS + _PLATE_COLS
    if "pitch_type" not in df.columns or any(c not in df.columns for c in needed):
        return None

    work = df[needed + ["pitch_type"]].copy()
    for col in needed:
        # +/-inf is not removed by dropna(); left in place it produces inf/NaN
        # distances that min()/max() clamping would silently turn into the
        # best possible ratio, so it is treated as missing instead.
        work[col] = pd.to_numeric(work[col], errors="coerce").replace(
            [np.inf, -np.inf], np.nan
        )
    work = work.dropna(subset=needed)

    if len(work) < 20:
        return None

    t = _EARLY_FLIGHT_T
    work["x_early"] = work["release_pos_x"] + work["vx0"] * t + 0.5 * work["ax"] * t ** 2
    work["z_early"] = work["release_pos_z"] + work["vz0"] * t + 0.5 * work["az"] * t ** 2

    grouped = work.groupby("pitch_type").agg(
        x_early_mean=("x_early", "mean"),
        z_early_mean=("z_early", "mean"),
        plate_x_mean=("plate_x", "mean"),
        plate_z_mean=("plate_z", "mean"),
        count=("plate_x", "count"),
    )
    # Small per-type samples give unstable centroids; ignore them.
    grouped = grouped[grouped["count"] >= 10]

    if len(grouped) < 2:
        return None

    centroid_cols = ["x_early_mean", "z_early_mean", "plate_x_mean", "plate_z_mean"]
    centroids = list(grouped[centroid_cols].itertuples(index=False, name=None))

    ratios: list[float] = []
    for (ex_a, ez_a, px_a, pz_a), (ex_b, ez_b, px_b, pz_b) in combinations(centroids, 2):
        early_dist = math.sqrt((ex_a - ex_b) ** 2 + (ez_a - ez_b) ** 2)
        plate_dist = math.sqrt((px_a - px_b) ** 2 + (pz_a - pz_b) ** 2)
        # Avoid div-by-zero: pitches that share the same early path get
        # full credit for any plate divergence (ratio is capped at 5.0).
        ratios.append(min(5.0, plate_dist / max(early_dist, 0.01)))

    mean_ratio = float(np.mean(ratios))
    # ratio=1.0 -> 0.0 (no separation gain), ratio=4.0 -> 1.0 (elite)
    return max(0.0, min(1.0, (mean_ratio - 1.0) / 3.0))


def build_trajectory_features(
    statcast_df: pd.DataFrame,
    pitcher_name: str,
    pitcher_id: int | None = None,
) -> dict[str, Any]:
    """
    Assemble the trajectory/deception feature row for a single pitcher.

    The pitcher's rows are selected with _filter_pitcher_df, then scored with
    _compute_release_consistency and _compute_tunneling.

    Returns a dict with:
      pitcher_name               : the name passed in
      release_consistency_score  : float [0,1] or None
      tunnel_score               : float [0,1] or None
      deception_score            : float [0,1] or None (weighted combo)
      trajectory_sample_size     : int, number of rows matched for the pitcher
                                   (0 when nothing matched)
    """
    features: dict[str, Any] = {
        "pitcher_name": pitcher_name,
        "release_consistency_score": None,
        "tunnel_score": None,
        "deception_score": None,
        "trajectory_sample_size": 0,
    }

    pitcher_rows = _filter_pitcher_df(statcast_df, pitcher_name, pitcher_id)
    if pitcher_rows.empty:
        return features

    consistency = _compute_release_consistency(pitcher_rows)
    tunneling = _compute_tunneling(pitcher_rows)

    # Blend is 40% release consistency + 60% tunneling when both exist.
    # With a single metric the score is discounted (x0.60 for consistency
    # alone, x0.70 for tunneling alone) so partial data earns partial credit.
    deception: float | None
    if consistency is None and tunneling is None:
        deception = None
    elif tunneling is None:
        deception = consistency * 0.60
    elif consistency is None:
        deception = tunneling * 0.70
    else:
        deception = 0.40 * consistency + 0.60 * tunneling

    features.update(
        release_consistency_score=consistency,
        tunnel_score=tunneling,
        deception_score=deception,
        trajectory_sample_size=int(len(pitcher_rows)),
    )
    return features


def _finite_score_or_none(value: Any) -> float | None:
    """Return *value* as a float, or None when it is None or not finite (NaN/inf)."""
    if value is None:
        return None
    score = float(value)
    return score if math.isfinite(score) else None


def compute_trajectory_adjustment(trajectory_row: dict[str, Any]) -> dict[str, Any]:
    """
    Convert trajectory/deception metrics into batter-outcome adjustments.

    Direction:
      High deception (tunneling + consistent release) → pitcher advantage
        → negative batter hit/hr/tb2p adjustments
      Low deception (poor tunneling or wild release) → batter advantage
        → positive adjustments

    Scale design (per sub-signal, for scores in [0, 1]):
      release_consistency: max ±0.008 on hit, ±0.006 on tb2p
      tunneling:           max ±0.007 on hit, ±0.004 on hr, ±0.005 on tb2p

    Totals clamped: hit ±0.015, hr ±0.010, tb2p ±0.012.

    A score that is None or non-finite (NaN/inf, e.g. a None that was
    round-tripped through pandas) is treated as missing and contributes no
    adjustment and no reason tag.

    Args:
      trajectory_row: mapping read for the keys "release_consistency_score",
        "tunnel_score" and "deception_score"; absent keys count as None.

    Returns:
      dict with hit_adj / hr_adj / tb2p_adj (floats), the three input scores
      passed through unchanged, and reason_tags (list of str).
    """
    hit_adj = 0.0
    hr_adj = 0.0
    tb2p_adj = 0.0
    reason_tags: list[str] = []

    release_consistency = trajectory_row.get("release_consistency_score")
    tunnel_score = trajectory_row.get("tunnel_score")

    # NaN must be filtered here: max(-c, min(c, nan)) evaluates to +c, which
    # would hand the batter the maximum boost for a metric that is missing.
    rc = _finite_score_or_none(release_consistency)
    if rc is not None:
        # rc=1.0 → shift=-0.008 (elite consistency suppresses contact)
        # rc=0.0 → shift=+0.008 (erratic release helps batter)
        shift = (0.5 - rc) * 0.016
        hit_adj += shift
        tb2p_adj += shift * 0.75
        if rc >= 0.75:
            reason_tags.append("Consistent release point")
        elif rc <= 0.35:
            reason_tags.append("Inconsistent release point")

    ts = _finite_score_or_none(tunnel_score)
    if ts is not None:
        # ts=1.0 → shift=-0.007 (elite tunneling suppresses reads)
        # ts=0.0 → shift=+0.007 (poor tunneling, pitches easy to track)
        shift = (0.5 - ts) * 0.014
        hit_adj += shift
        hr_adj += shift * 0.55
        tb2p_adj += shift * 0.70
        if ts >= 0.70:
            reason_tags.append("Strong pitch tunneling")
        elif ts <= 0.30:
            reason_tags.append("Poor pitch tunneling")

    hit_adj = max(-0.015, min(0.015, hit_adj))
    hr_adj = max(-0.010, min(0.010, hr_adj))
    tb2p_adj = max(-0.012, min(0.012, tb2p_adj))

    return {
        "hit_adj": hit_adj,
        "hr_adj": hr_adj,
        "tb2p_adj": tb2p_adj,
        "release_consistency_score": release_consistency,
        "tunnel_score": tunnel_score,
        "deception_score": trajectory_row.get("deception_score"),
        "reason_tags": reason_tags,
    }