Syntrex's picture
Add live 2026 pitch/PA tables and migrate Card Lab to new sources
9846940
raw
history blame
22.7 kB
from __future__ import annotations
import re
import datetime
import pandas as pd
from utils.logger import logger
# Hex colour tokens keyed by role: backgrounds/panels, accent colours, text shades.
PALETTE = dict(
    # Surfaces
    bg="#080B14",
    panel="#0F1629",
    panel_alt="#141C35",
    # Accents
    accent_blue="#38BDF8",
    accent_green="#22C55E",
    accent_yellow="#F59E0B",
    accent_red="#EF4444",
    accent_indigo="#818CF8",
    # Text
    text_primary="#F1F5F9",
    text_secondary="#94A3B8",
    text_dim="#475569",
)
def _clamp(val, lo=0.0, hi=100.0):
    """Restrict ``val`` to the closed interval [lo, hi].

    ``None`` is passed through unchanged; anything else is converted with
    ``float()`` first, so numeric strings are accepted and non-numeric input
    raises.

    NOTE: NaN is not ordered by ``min``/``max``; with ``lo <= hi`` a NaN
    input comes back as ``hi``.
    """
    if val is not None:
        capped = min(hi, float(val))
        return max(lo, capped)
    return None
def _safe_score(val, lo, hi, default=50.0) -> float:
    """Map ``val`` linearly from the range [lo, hi] onto a 0-100 score.

    The result is clamped to [0, 100]. The clamped ``default`` is returned
    instead when the input carries no information: ``val`` is ``None``,
    ``val`` is NaN, or the range has zero width.

    Args:
        val: Raw value to score; converted with ``float()``.
        lo: Raw value that maps to 0.
        hi: Raw value that maps to 100.
        default: Score used for missing input.

    Returns:
        A float in [0, 100].
    """
    import math

    if val is None:
        return _clamp(default)
    span = hi - lo
    if span == 0:
        return _clamp(default)
    number = float(val)
    # NaN is the usual "missing" marker for pandas-derived features. It must
    # not reach _clamp: min()/max() do not order NaN, so it would come back
    # as the top score of 100 rather than the neutral default.
    if math.isnan(number):
        return _clamp(default)
    return _clamp((number - lo) / span * 100)
def _fmt_val(val, fmt: str = "float") -> str:
    """Format a metric value for display, with a consistent placeholder for gaps.

    Args:
        val: Value to format. ``None``, NaN and infinities render as "—".
            Anything ``float()`` cannot convert is returned as ``str(val)``.
        fmt: One of
            "pct"        - fraction shown as a percentage (0.256 -> "25.6%"),
            "pct_direct" - value already in percent units (25.6 -> "25.6%"),
            "int"        - rounded to the nearest integer,
            anything else - one decimal place.

    Returns:
        The formatted string.
    """
    import math

    placeholder = "—"
    if val is None:
        return placeholder
    try:
        number = float(val)
    except (TypeError, ValueError, OverflowError):
        return str(val)
    # NaN is how pandas reports a missing value; printing "nan%" is noise and
    # int(round(nan)) / int(round(inf)) would raise in the "int" branch.
    if not math.isfinite(number):
        return placeholder
    if fmt == "pct":
        return f"{number * 100:.1f}%"
    if fmt == "pct_direct":
        return f"{number:.1f}%"
    if fmt == "int":
        return str(int(round(number)))
    return f"{number:.1f}"
def _sanitize_id(s: str) -> str:
    """Turn arbitrary text into an identifier-safe slug.

    The input is stringified and lower-cased, spaces become underscores, and
    every character other than ``a-z``, ``0-9``, ``_`` and ``-`` is removed.
    """
    lowered = str(s).lower()
    underscored = lowered.replace(" ", "_")
    return re.sub(r"[^a-z0-9_\-]", "", underscored)
# ---------------------------------------------------------------------------
# Player list helpers
# ---------------------------------------------------------------------------
def get_available_hitters(statcast_df: pd.DataFrame) -> list[str]:
    """List the distinct player names that have hitting activity, sorted.

    A row counts as hitting activity when ``launch_speed`` or ``events`` is
    non-null. If the frame has neither column, every non-null player name is
    returned. An empty frame, or one without ``player_name``, yields ``[]``.
    """
    if statcast_df.empty or "player_name" not in statcast_df.columns:
        return []

    signal_cols = [
        name for name in ("launch_speed", "events") if name in statcast_df.columns
    ]
    if signal_cols:
        # True for rows where at least one of the signal columns is populated.
        has_signal = statcast_df[signal_cols].notna().any(axis=1)
        names = statcast_df.loc[has_signal, "player_name"]
    else:
        names = statcast_df["player_name"]
    return sorted(names.dropna().unique().tolist())
def get_available_pitchers(statcast_df: pd.DataFrame) -> list[str]:
    """List the distinct player names that have pitching activity, sorted.

    A row counts as pitching activity when ``release_speed`` is non-null.
    Returns ``[]`` when the frame is empty or lacks ``player_name`` or
    ``release_speed``.
    """
    required = ("player_name", "release_speed")
    if statcast_df.empty or any(col not in statcast_df.columns for col in required):
        return []

    threw_pitch = statcast_df["release_speed"].notna()
    names = statcast_df[threw_pitch]["player_name"].dropna().unique()
    return sorted(names.tolist())
def get_available_dates_for_player(statcast_df: pd.DataFrame, player_name: str) -> list[str]:
    """Return the player's most recent game dates, newest first.

    At most 20 distinct ``game_date`` values are returned, each cut to its
    first ten characters (``YYYY-MM-DD`` for ISO-style values). Names are
    compared as strings. Returns ``[]`` when the frame is empty or lacks
    ``player_name`` or ``game_date``.
    """
    max_dates = 20
    if statcast_df.empty or "player_name" not in statcast_df.columns:
        return []

    is_player = statcast_df["player_name"].astype(str) == str(player_name)
    player_rows = statcast_df[is_player]
    if "game_date" not in player_rows.columns:
        return []

    newest_first = player_rows["game_date"].dropna().sort_values(ascending=False)
    distinct = newest_first.unique()
    return [str(value)[:10] for value in distinct[:max_dates]]
def _get_player_team(statcast_df: pd.DataFrame, player_name: str) -> str:
    """Best-effort team lookup for a player.

    Checks the ``team``, ``batter_team`` and ``home_team`` columns in that
    order and returns the last non-null value, as a string, from the first
    column that has one for this player. Returns "—" when nothing is found.
    """
    unknown = "—"
    if statcast_df.empty or "player_name" not in statcast_df.columns:
        return unknown

    is_player = statcast_df["player_name"].astype(str) == str(player_name)
    player_rows = statcast_df[is_player]
    if player_rows.empty:
        return unknown

    for candidate in ("team", "batter_team", "home_team"):
        if candidate not in player_rows.columns:
            continue
        known = player_rows[candidate].dropna()
        if len(known) > 0:
            return str(known.iloc[-1])
    return unknown
# ---------------------------------------------------------------------------
# Data quality
# ---------------------------------------------------------------------------
def _data_quality_hitter(features: dict) -> str:
    """Grade a hitter sample by its ``plate_appearances`` count.

    Returns "full" at 80 or more, "partial" at 25 or more, otherwise
    "limited". A missing or falsy count is treated as 0.
    """
    plate_appearances = features.get("plate_appearances", 0) or 0
    # Tiers are checked from the highest floor down; first match wins.
    for floor, grade in ((80, "full"), (25, "partial")):
        if plate_appearances >= floor:
            return grade
    return "limited"
def _data_quality_pitcher(features: dict) -> str:
    """Grade a pitcher sample by its ``sample_size`` count.

    Returns "full" at 100 or more, "partial" at 30 or more, otherwise
    "limited". A missing or falsy count is treated as 0.
    """
    sample_size = features.get("sample_size", 0) or 0
    # Tiers are checked from the highest floor down; first match wins.
    for floor, grade in ((100, "full"), (30, "partial")):
        if sample_size >= floor:
            return grade
    return "limited"
# ---------------------------------------------------------------------------
# Timeframe filtering
# ---------------------------------------------------------------------------
def _filter_statcast_by_window(
    statcast_df: pd.DataFrame,
    player_name: str,
    mode: str,
    year: int | None = None,
    date: str | None = None,
    start_date: str | None = None,
    end_date: str | None = None,
) -> tuple[pd.DataFrame, bool]:
    """Select one player's rows and narrow them to the requested time window.

    Args:
        statcast_df: Source rows; filtered on ``player_name`` (string compare).
        player_name: Player whose rows are wanted.
        mode: "single_date" (uses ``date``), "date_range" (uses ``start_date``
            and ``end_date``, both inclusive) or "season" (uses ``year``).
            Any other mode, or a mode whose arguments are missing, applies no
            window.
        year, date, start_date, end_date: Window arguments for the modes above.

    Returns:
        ``(filtered_df, used_fallback)``. ``filtered_df`` is a copy with
        ``game_date`` parsed to datetimes (unparseable values become NaT).
        ``used_fallback`` is True when a window was requested but all of the
        player's rows are returned instead, either because the window matched
        nothing or because it could not be applied (e.g. a malformed date).
        When the player has no rows, or there is no ``player_name`` /
        ``game_date`` column, the (possibly empty) player rows are returned
        with ``used_fallback`` False.
    """
    if "player_name" not in statcast_df.columns:
        # Nothing to select on: return an empty frame instead of raising.
        return statcast_df.iloc[0:0].copy(), False

    df = statcast_df[statcast_df["player_name"].astype(str) == str(player_name)].copy()
    if df.empty or "game_date" not in df.columns:
        return df, False

    df["game_date"] = pd.to_datetime(df["game_date"], errors="coerce")

    try:
        if mode == "single_date" and date:
            target_day = pd.to_datetime(date).date()
            result = df[df["game_date"].dt.date == target_day]
        elif mode == "date_range" and start_date and end_date:
            lo = pd.to_datetime(start_date)
            hi = pd.to_datetime(end_date)
            result = df[(df["game_date"] >= lo) & (df["game_date"] <= hi)]
        elif mode == "season" and year:
            result = df[df["game_date"].dt.year == int(year)]
        else:
            result = df
    except Exception:
        # Deliberately best-effort: a bad date/year must not break card
        # generation. The caller still has to know that the rows it gets are
        # NOT the window it asked for, so this is reported as a fallback.
        return df, True

    if result.empty:
        # Window matched nothing: fall back to everything for this player.
        return df, True
    return result, False
# ---------------------------------------------------------------------------
# Metric computation (v1 proxies)
# ---------------------------------------------------------------------------
def _compute_hitter_metrics(features: dict, baseline: dict) -> dict:
    """Compute the four hitter card scores (v1 proxies), each in [0, 100].

    Every input feature is first scaled to 0-100 with ``_safe_score`` (using
    the per-feature default when the feature is missing), then blended:

    * ``contact_plus``: 40% ev90, 35% hard-hit rate, 25% barrel rate
    * ``hr_shape_plus``: 40% optimal-launch-angle rate, 35% fly-ball rate,
      25% barrel rate
    * ``damage_zone_plus``: 60% pulled-barrel rate, 40% pull-air rate
    * ``ball_flight_confidence``: plate appearances scaled over 0-200

    ``baseline`` is accepted but not read by this function.
    """
    plate_appearances = features.get("plate_appearances", 0) or 0

    # Per-feature 0-100 scores. Barrel rate feeds two composites with the
    # same scaling, so it is scored once.
    ev90_score = _safe_score(features.get("ev90"), 85, 103, default=40)
    hard_hit_score = _safe_score(features.get("hard_hit_rate"), 0, 0.50, default=40)
    barrel_score = _safe_score(features.get("barrel_rate"), 0, 0.15, default=30)
    la_score = _safe_score(features.get("la_optimal_hr_rate"), 0, 0.15, default=30)
    fb_score = _safe_score(features.get("fb_rate"), 0, 0.50, default=30)
    pull_barrel_score = _safe_score(features.get("pulled_barrel_rate"), 0, 0.10, default=25)
    pull_air_score = _safe_score(features.get("pull_air_rate"), 0, 0.25, default=25)
    sample_score = _safe_score(plate_appearances, 0, 200, default=20)

    return {
        "contact_plus": _clamp(
            0.40 * ev90_score + 0.35 * hard_hit_score + 0.25 * barrel_score
        ),
        "hr_shape_plus": _clamp(
            0.40 * la_score + 0.35 * fb_score + 0.25 * barrel_score
        ),
        "damage_zone_plus": _clamp(
            0.60 * pull_barrel_score + 0.40 * pull_air_score
        ),
        "ball_flight_confidence": _clamp(sample_score),
    }
def _compute_pitcher_metrics(features: dict) -> dict:
    """Compute the four pitcher card scores (v1 proxies), each in [0, 100].

    * ``stuff_plus``: 55% average release speed (scaled over 85-99) and 45%
      magnitude of average vertical movement ``avg_pfx_z`` (scaled over 0-1.2).
    * ``command_plus``: 50% inverted ball rate (0.15-0.42) and 50% CSW rate
      (0.18-0.38); a flat 50.0 when both inputs are missing.
    * ``damage_zone_plus``: 50% inverted barrel rate allowed (0-0.15) and 50%
      inverted exit velocity allowed (85-95).
    * ``bullpen_fatigue``: constant 50.0; no feature feeds it here.

    Missing features fall back to the per-feature default score.
    """
    velo = features.get("avg_release_speed")
    pfx_z = features.get("avg_pfx_z")
    ball_rate = features.get("ball_rate")
    csw_rate = features.get("csw_rate")
    barrel_all = features.get("barrel_rate_allowed")
    ev_all = features.get("ev_allowed")

    # Only the magnitude of the movement is scored. A missing value must stay
    # None so _safe_score applies its default of 30; coercing it to 0 would
    # score "unknown" as the worst possible movement.
    movement = abs(pfx_z) if pfx_z is not None else None
    stuff_plus = _clamp(
        0.55 * _safe_score(velo, 85, 99, default=40)
        + 0.45 * _safe_score(movement, 0, 1.2, default=30)
    )

    if ball_rate is None and csw_rate is None:
        command_plus = 50.0
    else:
        # Ball rate is inverted: fewer balls means better command.
        command_plus = _clamp(
            0.50 * (100 - _safe_score(ball_rate, 0.15, 0.42, default=50))
            + 0.50 * _safe_score(csw_rate, 0.18, 0.38, default=40)
        )

    # Both inputs are inverted: less damage allowed means a higher score.
    damage_zone_plus = _clamp(
        0.50 * (100 - _safe_score(barrel_all, 0, 0.15, default=50))
        + 0.50 * (100 - _safe_score(ev_all, 85, 95, default=50))
    )

    bullpen_fatigue = 50.0

    return {
        "stuff_plus": stuff_plus,
        "command_plus": command_plus,
        "damage_zone_plus": damage_zone_plus,
        "bullpen_fatigue": bullpen_fatigue,
    }
# ---------------------------------------------------------------------------
# Readout builders
# ---------------------------------------------------------------------------
def build_hitter_readout(metrics: dict, features: dict, trend: dict) -> list[str]:
    """Compose up to four short readout sentences for a hitter card.

    In order: one contact-quality line (always present), an optional
    home-run-shape line, an optional hot/cold trend line, and a small-sample
    warning when there are fewer than 30 plate appearances. ``trend`` may be
    ``None`` or empty.
    """
    contact = metrics.get("contact_plus", 50)
    hr_shape = metrics.get("hr_shape_plus", 50)
    plate_appearances = features.get("plate_appearances", 0) or 0
    trend_info = trend or {}

    # Contact quality: exactly one of these always applies.
    if contact >= 80:
        contact_note = "Elite contact profile — high exit velocity, strong barrel frequency."
    elif contact >= 65:
        contact_note = "Above-average contact quality with solid hard-hit rates."
    elif contact < 40:
        contact_note = "Below-average contact profile — limited hard-hit frequency."
    else:
        contact_note = "Moderate contact quality in selected window."

    # Home-run shape: only mentioned from 55 upwards.
    shape_note = None
    if hr_shape >= 75:
        shape_note = "Strong HR trajectory shape — pull power with optimal launch angle."
    elif hr_shape >= 55:
        shape_note = "Moderate home run shape. Fly ball profile developing."

    # Trend: the hot line needs both the flag and a 7-day EV90 value.
    recent_ev90 = trend_info.get("ev90_7d")
    trend_note = None
    if trend_info.get("batter_hot_flag") and recent_ev90:
        trend_note = f"TRENDING HOT — EV90 up in last 7 days ({float(recent_ev90):.1f} mph)."
    elif trend_info.get("batter_cold_flag"):
        trend_note = "COOLING — exit velocity down over last 7 days."

    sample_note = None
    if plate_appearances < 30:
        sample_note = f"Limited sample ({plate_appearances} PA). Metrics are early-window estimates."

    candidates = (contact_note, shape_note, trend_note, sample_note)
    return [note for note in candidates if note is not None][:4]
def build_pitcher_readout(metrics: dict, features: dict) -> list[str]:
    """Compose up to four short readout sentences for a pitcher card.

    In order: one stuff line (always present), an optional command line, an
    optional contact-suppression line, and a small-sample warning when
    ``sample_size`` is below 30.
    """
    stuff = metrics.get("stuff_plus", 50)
    command = metrics.get("command_plus", 50)
    damage = metrics.get("damage_zone_plus", 50)
    sample_size = features.get("sample_size", 0) or 0

    # Stuff: exactly one of these always applies.
    if stuff >= 80:
        stuff_note = "Elite velocity and movement combo — difficult to square up."
    elif stuff >= 65:
        stuff_note = "Above-average stuff. Good movement on primary pitches."
    elif stuff < 40:
        stuff_note = "Below-average stuff. Hitter-friendly pitch profile."
    else:
        stuff_note = "Average stuff in selected window."

    # Command and damage are only called out at the extremes.
    command_note = None
    if command >= 70:
        command_note = "High command profile — limits free passes and works ahead in count."
    elif command < 40:
        command_note = "Command issues — elevated ball rate, struggles to work ahead."

    damage_note = None
    if damage >= 70:
        damage_note = "Suppresses hard contact well — low barrel and EV allowed."
    elif damage < 40:
        damage_note = "Vulnerable to barrel damage — high hard-hit contact allowed."

    sample_note = None
    if sample_size < 30:
        sample_note = f"Limited sample ({sample_size} events). Metrics are early-window estimates."

    candidates = (stuff_note, command_note, damage_note, sample_note)
    return [note for note in candidates if note is not None][:4]
# ---------------------------------------------------------------------------
# Timeframe label
# ---------------------------------------------------------------------------
def _build_timeframe_label(mode, year, date, start_date, end_date) -> str:
    """Build the short label that names the selected timeframe.

    * "single_date" with a date: the sanitized first ten characters of it
    * "date_range" with both bounds: sanitized ``<start>_to_<end>``
    * "season" with a year: ``<year>_season``
    * anything else, or a mode whose arguments are missing: "recent"
    """
    if mode == "single_date" and date:
        day = str(date)[:10]
        return _sanitize_id(day)

    if mode == "date_range" and start_date and end_date:
        first_day = str(start_date)[:10]
        last_day = str(end_date)[:10]
        return _sanitize_id(f"{first_day}_to_{last_day}")

    if mode == "season" and year:
        return f"{year}_season"

    return "recent"
# ---------------------------------------------------------------------------
# Public data builders
# ---------------------------------------------------------------------------
def build_hitter_card_data(
    player_name: str,
    statcast_df: pd.DataFrame,
    mode: str = "season",
    year: int | None = None,
    date: str | None = None,
    start_date: str | None = None,
    end_date: str | None = None,
) -> dict:
    """Assemble the payload dict for a hitter card.

    The player's rows are narrowed to the requested timeframe (see
    ``_filter_statcast_by_window``); features, baseline, trend and rolling
    form are then built from the project model helpers. Each of those steps
    is best-effort: a failure is logged and leaves that piece empty instead
    of aborting the card.

    Args:
        player_name: Player to build the card for.
        statcast_df: Source rows for all players.
        mode, year, date, start_date, end_date: Timeframe selection, passed
            to ``_filter_statcast_by_window`` and ``_build_timeframe_label``.

    Returns:
        A dict with the card identity (``card_type``, ``player_name``,
        ``team``, ``timeframe``, ``data_quality``, ``card_id``), the computed
        ``metrics``, a ``summary`` of raw features, ``baseline``, ``trend``,
        ``rolling``, ``readout`` lines, ``plate_appearances`` and the
        ``windowed_df`` the card was built from. When the window fell back to
        the player's full data, ``timeframe`` gets a ``_fallback`` suffix and
        ``data_quality`` is "limited".
    """
    from models.batter_baseline import build_batter_feature_row, compute_batter_baseline
    from models.batter_trend_model import build_batter_trend_row
    from models.rolling_form_model import build_batter_rolling_form_row

    windowed_df, used_fallback = _filter_statcast_by_window(
        statcast_df, player_name, mode, year, date, start_date, end_date
    )

    # Reference date for trend/rolling: an explicit end/single date wins,
    # otherwise the latest game date present in the window.
    latest_in_window = None
    if not windowed_df.empty and "game_date" in windowed_df.columns:
        latest_in_window = str(windowed_df["game_date"].max())[:10]
    ref_date = end_date or date or latest_in_window

    features = {}
    baseline = {}
    trend = {}
    rolling = {}

    # Features from windowed data, so they respect the selected timeframe;
    # if that fails, retry on the unfiltered frame.
    try:
        features = build_batter_feature_row(windowed_df, player_name)
    except Exception as exc:
        logger.warning("[card_data] batter features windowed: %s", exc)
        try:
            features = build_batter_feature_row(statcast_df, player_name)
        except Exception as exc2:
            logger.warning("[card_data] batter features full fallback: %s", exc2)

    # Baseline is derived from whatever features were obtained above.
    try:
        baseline = compute_batter_baseline(features) if features else {}
    except Exception as exc:
        logger.warning("[card_data] batter baseline: %s", exc)

    # Trend + rolling use the windowed rows and the reference date. Failures
    # are logged like the steps above rather than silently dropped.
    try:
        trend = build_batter_trend_row(windowed_df, player_name, reference_date=ref_date)
    except Exception as exc:
        logger.warning("[card_data] batter trend: %s", exc)
        trend = {}
    try:
        rolling = build_batter_rolling_form_row(windowed_df, player_name, reference_date=ref_date)
    except Exception as exc:
        logger.warning("[card_data] batter rolling form: %s", exc)
        rolling = {}

    metrics = _compute_hitter_metrics(features, baseline)
    readout = build_hitter_readout(metrics, features, trend)
    dq = "limited" if used_fallback else _data_quality_hitter(features)
    team = _get_player_team(statcast_df, player_name)

    tf_label = _build_timeframe_label(mode, year, date, start_date, end_date)
    if used_fallback:
        tf_label = f"{tf_label}_fallback"

    # Epoch seconds as a uniqueness suffix. An aware "now" is required:
    # utcnow() is naive, and timestamp() would read it as local time.
    created_at = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
    card_id = _sanitize_id(f"{player_name}_{tf_label}_hitter_{created_at}")

    payload = {
        "card_type": "hitter",
        "player_name": player_name,
        "team": team,
        "timeframe": tf_label,
        "data_quality": dq,
        "card_id": card_id,
        "metrics": metrics,
        "summary": {
            "ev90": features.get("ev90"),
            "barrel_rate": features.get("barrel_rate"),
            "hard_hit_rate": features.get("hard_hit_rate"),
            "xwoba": features.get("xwoba"),
            "avg_launch_angle": features.get("avg_launch_angle"),
        },
        "baseline": baseline,
        "trend": trend,
        "rolling": rolling,
        "readout": readout,
        "plate_appearances": features.get("plate_appearances", 0),
        "windowed_df": windowed_df,
    }
    logger.info(
        "[card_generated] player=%s type=hitter range=%s quality=%s",
        player_name, tf_label, dq,
    )
    return payload
def build_pitcher_card_data(
    player_name: str,
    statcast_df: pd.DataFrame,
    mode: str = "season",
    year: int | None = None,
    date: str | None = None,
    start_date: str | None = None,
    end_date: str | None = None,
) -> dict:
    """Assemble the payload dict for a pitcher card.

    The player's rows are narrowed to the requested timeframe (see
    ``_filter_statcast_by_window``); features and rolling form are then built
    from the project model helpers. Both steps are best-effort: a failure is
    logged and leaves that piece empty instead of aborting the card.

    Args:
        player_name: Player to build the card for.
        statcast_df: Source rows for all players.
        mode, year, date, start_date, end_date: Timeframe selection, passed
            to ``_filter_statcast_by_window`` and ``_build_timeframe_label``.

    Returns:
        A dict with the card identity (``card_type``, ``player_name``,
        ``team``, ``timeframe``, ``data_quality``, ``card_id``), the computed
        ``metrics``, a ``summary`` of raw features, batted-ball ``damage``
        rates, ``rolling``, ``readout`` lines, ``sample_size`` and the
        ``windowed_df`` the card was built from. When the window fell back to
        the player's full data, ``timeframe`` gets a ``_fallback`` suffix and
        ``data_quality`` is "limited".
    """
    from models.pitcher_adjustment import build_pitcher_feature_row
    from models.rolling_form_model import build_pitcher_rolling_form_row

    windowed_df, used_fallback = _filter_statcast_by_window(
        statcast_df, player_name, mode, year, date, start_date, end_date
    )

    # Reference date for rolling form: an explicit end/single date wins,
    # otherwise the latest game date present in the window.
    latest_in_window = None
    if not windowed_df.empty and "game_date" in windowed_df.columns:
        latest_in_window = str(windowed_df["game_date"].max())[:10]
    ref_date = end_date or date or latest_in_window

    features = {}
    rolling = {}

    # Features from windowed data first; if that fails, retry on the
    # unfiltered frame.
    try:
        features = build_pitcher_feature_row(windowed_df, player_name)
    except Exception as exc:
        logger.warning("[card_data] pitcher features windowed: %s", exc)
        try:
            features = build_pitcher_feature_row(statcast_df, player_name)
        except Exception as exc2:
            logger.warning("[card_data] pitcher features full fallback: %s", exc2)

    # Failures are logged like the step above rather than silently dropped.
    try:
        rolling = build_pitcher_rolling_form_row(windowed_df, player_name, reference_date=ref_date)
    except Exception as exc:
        logger.warning("[card_data] pitcher rolling form: %s", exc)
        rolling = {}

    metrics = _compute_pitcher_metrics(features)
    readout = build_pitcher_readout(metrics, features)
    dq = "limited" if used_fallback else _data_quality_pitcher(features)
    team = _get_player_team(statcast_df, player_name)

    tf_label = _build_timeframe_label(mode, year, date, start_date, end_date)
    if used_fallback:
        tf_label = f"{tf_label}_fallback"

    # Epoch seconds as a uniqueness suffix. An aware "now" is required:
    # utcnow() is naive, and timestamp() would read it as local time.
    created_at = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
    card_id = _sanitize_id(f"{player_name}_{tf_label}_pitcher_{created_at}")

    payload = {
        "card_type": "pitcher",
        "player_name": player_name,
        "team": team,
        "timeframe": tf_label,
        "data_quality": dq,
        "card_id": card_id,
        "metrics": metrics,
        "summary": {
            "avg_release_speed": features.get("avg_release_speed"),
            "avg_release_spin_rate": features.get("avg_release_spin_rate"),
            "ev_allowed": features.get("ev_allowed"),
            "barrel_rate_allowed": features.get("barrel_rate_allowed"),
            "swstr_rate": features.get("swstr_rate"),
        },
        "damage": {
            "gb_rate_allowed": features.get("gb_rate_allowed"),
            "fb_rate_allowed": features.get("fb_rate_allowed"),
            "ld_rate_allowed": features.get("ld_rate_allowed"),
            "popup_rate_allowed": features.get("popup_rate_allowed"),
        },
        "rolling": rolling,
        "readout": readout,
        "sample_size": features.get("sample_size", 0),
        "windowed_df": windowed_df,
    }
    logger.info(
        "[card_generated] player=%s type=pitcher range=%s quality=%s",
        player_name, tf_label, dq,
    )
    return payload
def build_game_summary_card_data(
    game_pk: str | int,
    statcast_df: pd.DataFrame,
    game_row: dict,
    player_name: str | None = None,
    selected_hitters: list[dict] | None = None,
    selected_pitchers: list[dict] | None = None,
    batter_log_df: pd.DataFrame | None = None,
) -> dict:
    """Assemble the payload dict for a single-game summary card.

    Per-hitter lines (HR, hits, EV90, barrels) come from ``batter_log_df``
    when it is provided, non-empty and has a ``batter_name`` column (the
    PA-level path); otherwise they are derived from the pitch-level rows of
    ``statcast_df`` for this game. Columns missing from either source
    contribute zero / ``None`` instead of raising.

    Args:
        game_pk: Game identifier; compared as a string.
        statcast_df: Pitch-level rows; filtered to this game and, if given,
            to ``player_name``.
        game_row: Game metadata; ``game_date``, ``away_team``, ``home_team``,
            ``away_score`` and ``home_score`` are read from it.
        player_name: Optional filter applied to the pitch-level rows.
        selected_hitters, selected_pitchers: Passed through to the payload
            (empty list when ``None``).
        batter_log_df: Optional PA-level table with one row per plate
            appearance.

    Returns:
        The payload dict. ``hitters`` holds at most eight rows, ordered by
        home runs and then EV90, both descending. ``data_quality`` is "full"
        when pitch-level rows exist for the game, otherwise "limited".
    """

    def _numeric(frame: pd.DataFrame, column: str) -> pd.Series:
        """Numeric view of ``frame[column]``; empty Series if the column is absent."""
        if column not in frame.columns:
            return pd.Series(dtype="float64")
        return pd.to_numeric(frame[column], errors="coerce")

    def _ev90(speeds: pd.Series):
        """90th-percentile exit velocity; None with fewer than three readings."""
        return float(speeds.quantile(0.90)) if len(speeds) >= 3 else None

    gdf = pd.DataFrame()
    if not statcast_df.empty and "game_pk" in statcast_df.columns:
        gdf = statcast_df[statcast_df["game_pk"].astype(str) == str(game_pk)].copy()
    if player_name and not gdf.empty and "player_name" in gdf.columns:
        gdf = gdf[gdf["player_name"].astype(str) == str(player_name)]

    hitter_rows = []
    has_batter_log = (
        batter_log_df is not None
        and not batter_log_df.empty
        and "batter_name" in batter_log_df.columns
    )
    if has_batter_log:
        # PA-level path: live_batter_game_log_2026 (one row per PA, proper batter identity)
        bgl = batter_log_df.copy()
        if "game_pk" in bgl.columns:
            bgl = bgl[bgl["game_pk"].astype(str) == str(game_pk)]
        for bname, bgrp in bgl.groupby("batter_name"):
            hitter_rows.append({
                "player_name": bname,
                "hr": int(_numeric(bgrp, "hr_flag").fillna(0).sum()),
                "hits": int(_numeric(bgrp, "hit_flag").fillna(0).sum()),
                "ev90": _ev90(_numeric(bgrp, "launch_speed").dropna()),
                "barrels": int(_numeric(bgrp, "barrel").fillna(0).sum()),
            })
    elif not gdf.empty and "events" in gdf.columns and "player_name" in gdf.columns:
        # Fallback: pitch-level grouping (used when batter_log_df unavailable)
        hit_events = ["single", "double", "triple", "home_run"]
        for pname, pgrp in gdf.groupby("player_name"):
            speeds = _numeric(pgrp, "launch_speed").dropna()
            hitter_rows.append({
                "player_name": pname,
                "hr": int((pgrp["events"] == "home_run").sum()),
                "hits": int(pgrp["events"].isin(hit_events).sum()),
                "ev90": _ev90(speeds),
                # Proxy on this path: balls in play at 98+ launch speed.
                "barrels": int((speeds >= 98).sum()),
            })

    # Most home runs first, ties broken by EV90 (missing EV90 sorts as 0).
    hitter_rows.sort(key=lambda row: (row["hr"], row["ev90"] or 0), reverse=True)

    game_date_str = str(game_row.get("game_date", "—"))[:10]
    # Epoch seconds as a uniqueness suffix. An aware "now" is required:
    # utcnow() is naive, and timestamp() would read it as local time.
    created_at = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
    card_id = _sanitize_id(f"game_{game_pk}_{created_at}")
    away = game_row.get("away_team", "—")
    home = game_row.get("home_team", "—")

    payload = {
        "card_type": "game_summary",
        "player_name": f"{away} @ {home}",
        "team": "—",
        "game_pk": str(game_pk),
        "away_team": away,
        "home_team": home,
        "away_score": game_row.get("away_score"),
        "home_score": game_row.get("home_score"),
        "game_date": game_date_str,
        "data_quality": "full" if not gdf.empty else "limited",
        "card_id": card_id,
        "hitters": hitter_rows[:8],
        "player_filter": player_name,
        "windowed_df": gdf,
        "timeframe": game_date_str,
        "selected_hitters": selected_hitters or [],
        "selected_pitchers": selected_pitchers or [],
    }
    logger.info(
        "[game_summary_generated] game_pk=%s date=%s quality=%s",
        game_pk, game_date_str, payload["data_quality"],
    )
    return payload