2026_MLB_Model / analytics /batter_prop_grader.py
Syntrex's picture
Fix batter_prop_outcomes explosion: dedup audit view + scoped grading
22080c8
raw
history blame
2.14 kB
from __future__ import annotations
from typing import Any
import pandas as pd
def _clean_text(value: Any) -> str:
    """Return *value* as a stripped string, mapping missing/falsy values to ``""``."""
    if value is None:
        return ""
    try:
        # NaN / NaT / pd.NA are truthy (or refuse truthiness), so a plain
        # ``value or ""`` would let them through and yield "nan" / "NaT".
        if bool(pd.isna(value)):
            return ""
    except (TypeError, ValueError):
        # Non-scalar cell (e.g. a list): pd.isna is element-wise there, so
        # treat the cell as present and fall through.
        pass
    if not value:
        return ""
    if isinstance(value, float) and value.is_integer():
        # An id column that contains a NaN is stored as float64; render
        # 745123.0 as "745123" so keys stay comparable across frames.
        value = int(value)
    return str(value).strip()


def build_batter_prop_outcome_rows_from_audit(
    audit_df: pd.DataFrame,
    graded_at: str,
) -> pd.DataFrame:
    """
    Safe scaffold for batter-prop outcome grading.

    This does NOT yet infer real hit/HR/TB outcomes from play-by-play.
    It creates a persistent batter-level grading table structure from
    recommendation audit rows so later phases can fill realized values.

    Args:
        audit_df: Recommendation audit rows. Columns are read by name and
            any column that is absent is treated as missing.
        graded_at: Value written verbatim to the ``graded_at`` column of
            every output row.

    Returns:
        One ``"pending"`` outcome row per (game_pk, batter_name, market),
        keeping the row with the greatest ``created_at``; ties keep the row
        that appears first in ``audit_df``. Text columns are stripped
        strings with missing values rendered as ``""``. An empty,
        column-less DataFrame is returned when ``audit_df`` is ``None`` or
        empty.
    """
    if audit_df is None or audit_df.empty:
        return pd.DataFrame()

    # to_dict("records") keeps each column's own dtype; iterrows() upcasts a
    # row to a common dtype, which can turn an integer game_pk into a float.
    rows: list[dict[str, Any]] = [
        {
            "created_at": row.get("created_at"),
            "graded_at": graded_at,
            "game_pk": _clean_text(row.get("game_pk")),
            "away_team": _clean_text(row.get("away_team")),
            "home_team": _clean_text(row.get("home_team")),
            "slot": _clean_text(row.get("slot")),
            "batter_name": _clean_text(row.get("batter_name")),
            "pitcher_name": _clean_text(row.get("pitcher_name")),
            "market": "hr",
            "fair_hr_odds": row.get("fair_hr_odds"),
            "book_hr_odds": row.get("book_hr_odds"),
            "adjusted_edge": row.get("adjusted_edge"),
            "confidence": row.get("confidence"),
            "recommendation_tier": row.get("recommendation_tier"),
            "realized_hit": None,
            "realized_hr": None,
            "realized_tb2p": None,
            "grade_status": "pending",
            "outcome_source": "pending_batter_prop_grade",
        }
        for row in audit_df.to_dict("records")
    ]

    # One outcome row per (game_pk, batter_name, market) — keep latest by
    # created_at. A stable sort makes the winner among equal timestamps
    # deterministic (first occurrence in the input).
    return (
        pd.DataFrame(rows)
        .sort_values(
            "created_at", ascending=False, na_position="last", kind="stable"
        )
        .drop_duplicates(subset=["game_pk", "batter_name", "market"], keep="first")
        .reset_index(drop=True)
    )