2026_MLB_Model / data /statcast.py
Syntrex's picture
Improve load-time caching and baseline snapshots
c4ae8c3
raw
history blame
5.02 kB
from __future__ import annotations

import logging
from datetime import datetime
from io import StringIO

import pandas as pd
import requests

from config.settings import STATCAST_SEARCH_URL
# Browser-like request headers attached to every Statcast search request.
HEADERS = {"User-Agent": "Mozilla/5.0", "Accept-Language": "en-US,en;q=0.9"}
def _query_statcast(
    start_date: str,
    end_date: str,
    season: str,
    player_type: str = "batter",
) -> pd.DataFrame:
    """Query the Statcast search endpoint and return the CSV payload as a DataFrame.

    Args:
        start_date: Value sent as the ``game_date_gt`` filter.
        end_date: Value sent as the ``game_date_lt`` filter.
        season: Season value; sent as ``hfSea`` with a trailing ``|``.
        player_type: Value sent as the ``player_type`` filter
            (callers in this module pass ``"batter"`` or ``"pitcher"``).

    Returns:
        The parsed CSV response, or an empty DataFrame when the response
        body is empty, is an HTML page, or cannot be parsed as CSV.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    params = {
        "all": "true",
        "hfPT": "",
        "hfAB": "",
        "hfBBT": "",
        "hfPR": "",
        "hfZ": "",
        "stadium": "",
        "hfBBL": "",
        "hfNewZones": "",
        # NOTE(review): presumably the game-type filter (regular season) — confirm.
        "hfGT": "R|",
        "hfC": "",
        "hfSea": f"{season}|",
        "hfSit": "",
        "player_type": player_type,
        "hfOuts": "",
        "opponent": "",
        "pitcher_throws": "",
        "batter_stands": "",
        "hfSA": "",
        "game_date_gt": start_date,
        "game_date_lt": end_date,
        "team": "",
        "position": "",
        "hfRO": "",
        "home_road": "",
        "hfFlag": "",
        "metric_1": "",
        "hfInn": "",
        "min_pitches": "0",
        "min_results": "0",
        "group_by": "name",
        "sort_col": "pitches",
        "player_event_sort": "api_h_launch_speed",
        "sort_order": "desc",
        "min_abs": "0",
        "type": "details",
    }
    response = requests.get(
        STATCAST_SEARCH_URL,
        params=params,
        headers=HEADERS,
        timeout=180,
    )
    response.raise_for_status()

    # Drop a leading byte-order mark (if any) so it can neither hide an HTML
    # page from the check below nor end up glued to the first CSV column name.
    text = response.text.lstrip("\ufeff").strip()

    # An HTML document here means the service returned a page instead of CSV.
    # Doctype casing varies between servers, so compare case-insensitively
    # and also accept a bare <html> root.
    if not text or text[:32].lower().startswith(("<!doctype html", "<html")):
        return pd.DataFrame()

    try:
        return pd.read_csv(StringIO(text))
    except Exception:
        # Best-effort contract: callers get an empty frame rather than a
        # crash, but the failure is recorded instead of vanishing silently.
        logging.getLogger(__name__).warning(
            "Could not parse Statcast response as CSV (%s to %s, player_type=%s)",
            start_date,
            end_date,
            player_type,
            exc_info=True,
        )
        return pd.DataFrame()
def fetch_statcast_range(start_date: str, end_date: str) -> pd.DataFrame:
    """Return batter-perspective Statcast rows for a date range (MLB only).

    The season filter is the calendar year of ``start_date``, which must be
    formatted as ``YYYY-MM-DD`` (``ValueError`` otherwise). In the result,
    ``player_name`` refers to the batter.
    """
    parsed_start = datetime.strptime(start_date, "%Y-%m-%d")
    season_label = f"{parsed_start.year}"
    return _query_statcast(
        start_date,
        end_date,
        season=season_label,
        player_type="batter",
    )
def fetch_statcast_range_pitcher(start_date: str, end_date: str) -> pd.DataFrame:
    """Return pitcher-perspective Statcast rows for a date range.

    The season filter is the calendar year of ``start_date``, which must be
    formatted as ``YYYY-MM-DD`` (``ValueError`` otherwise). In the result,
    ``player_name`` refers to the pitcher.
    """
    parsed_start = datetime.strptime(start_date, "%Y-%m-%d")
    season_label = f"{parsed_start.year}"
    return _query_statcast(
        start_date,
        end_date,
        season=season_label,
        player_type="pitcher",
    )
def normalize_statcast(df: pd.DataFrame) -> pd.DataFrame:
    """Select, rename and type-coerce the Statcast columns this project uses.

    An empty input is returned as-is (the same object). Otherwise a new
    frame is built that:

    * keeps only the recognised columns that are present, in a fixed order;
    * renames the expected-stat columns to ``xba`` / ``xwoba`` and the
      handedness columns to ``batter_stand`` / ``pitcher_hand``;
    * re-adds ``p_throws`` and ``stand`` as copies of the renamed
      handedness columns so both spellings are available;
    * converts measurement/count columns to numbers and ``game_date`` to
      datetimes, turning unparseable values into NaN / NaT.
    """
    if df.empty:
        return df

    # Recognised source columns; this order becomes the output column order.
    source_columns = (
        "player_name",
        "batter",
        "pitcher",
        "pitch_type",
        "pitch_name",
        "release_speed",
        "release_spin_rate",
        "release_pos_x",
        "release_pos_z",
        "plate_x",
        "plate_z",
        "pfx_x",
        "pfx_z",
        "launch_speed",
        "launch_angle",
        "estimated_ba_using_speedangle",
        "estimated_woba_using_speedangle",
        "spray_angle",
        "hc_x",
        "hc_y",
        "bb_type",
        "events",
        "description",
        "stand",
        "p_throws",
        "home_team",
        "away_team",
        "inning_topbot",
        "team",
        "batter_team",
        "team_name",
        "game_date",
        "game_pk",
        "inning",
        "outs_when_up",
        "balls",
        "strikes",
        "bat_score",
        "fld_score",
        "post_bat_score",
        "post_fld_score",
    )
    # Only these columns change name; every other column keeps its own.
    aliases = {
        "estimated_ba_using_speedangle": "xba",
        "estimated_woba_using_speedangle": "xwoba",
        "stand": "batter_stand",
        "p_throws": "pitcher_hand",
    }

    present = [name for name in source_columns if name in df.columns]
    result = df[present].copy().rename(columns=aliases)

    # Mirror the renamed handedness columns back under their original names.
    for renamed, legacy in (("pitcher_hand", "p_throws"), ("batter_stand", "stand")):
        if renamed in result.columns and legacy not in result.columns:
            result[legacy] = result[renamed]

    numeric_columns = (
        "release_speed",
        "release_spin_rate",
        "release_pos_x",
        "release_pos_z",
        "plate_x",
        "plate_z",
        "pfx_x",
        "pfx_z",
        "launch_speed",
        "launch_angle",
        "xba",
        "xwoba",
        "inning",
        "outs_when_up",
        "balls",
        "strikes",
        "bat_score",
        "fld_score",
        "post_bat_score",
        "post_fld_score",
    )
    for name in numeric_columns:
        if name not in result.columns:
            continue
        result[name] = pd.to_numeric(result[name], errors="coerce")

    if "game_date" in result.columns:
        result["game_date"] = pd.to_datetime(result["game_date"], errors="coerce")

    return result