Spaces:

Syntrex
/

2026_MLB_Model

Sleeping

2026_MLB_Model

File size: 2,954 Bytes

abeeae7

from __future__ import annotations

from typing import Any

import pandas as pd


def _safe_text(value: Any) -> str:
    """Return *value* as whitespace-stripped text, or ``""`` when it is missing.

    Missing scalars (``None``, ``NaN``, ``pd.NA``, ``pd.NaT``) yield an empty
    string. Any other falsy value (``0``, ``False``, ``""``, empty containers)
    also yields an empty string; everything else is passed through ``str()``
    and stripped.
    """
    # NaN is truthy, so a bare `value or ""` would turn it into the text "nan",
    # and `pd.NA or ""` raises TypeError. Screen missing scalars out first.
    # The is_scalar guard keeps pd.isna from returning an array for list-likes.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value or "").strip()


def _event_is_hit(event: str) -> int:
    """Return 1 when *event* names a base hit, otherwise 0.

    The comparison ignores surrounding whitespace and letter case; falsy
    input is treated as an empty event label.
    """
    label = str(event if event else "").strip().lower()
    if label in ("single", "double", "triple", "home_run"):
        return 1
    return 0


def _event_is_hr(event: str) -> int:
    """Return 1 when *event* is a home run, otherwise 0.

    The comparison ignores surrounding whitespace and letter case; falsy
    input is treated as an empty event label.
    """
    label = str(event if event else "").strip().lower()
    return 1 if label == "home_run" else 0


def _event_total_bases(event: str) -> int:
    """Return the total bases credited for *event* (0 for anything but a hit).

    The lookup ignores surrounding whitespace and letter case; falsy input is
    treated as an empty event label.
    """
    bases_by_event = {
        "single": 1,
        "double": 2,
        "triple": 3,
        "home_run": 4,
    }
    label = str(event if event else "").strip().lower()
    return bases_by_event.get(label, 0)


def build_batter_realization_rows(
    batter_prop_outcomes_df: pd.DataFrame,
    statcast_df: pd.DataFrame,
    graded_at: str,
) -> pd.DataFrame:
    """
    First-pass realized batter outcome scaffold.

    For every outcome row with a non-empty ``batter_name``, looks up the
    Statcast rows whose stripped ``player_name`` equals that name and records
    whether the batter:
    - recorded at least one hit (``realized_hit``)
    - recorded at least one HR (``realized_hr``)
    - recorded 2+ total bases (``realized_tb2p``)

    Matched rows get ``grade_status="graded"`` and
    ``outcome_source="statcast_loaded_window"``. Unmatched rows keep ``None``
    for the realized fields with ``grade_status="pending"`` and
    ``outcome_source="statcast_no_match"``. Every emitted row also carries
    ``graded_at``. Outcome rows without a batter name are dropped.

    This is not yet game-perfect by game_pk: events are aggregated over every
    matching row in ``statcast_df``, not per game.

    Returns an empty DataFrame when either input is ``None``/empty or when
    ``statcast_df`` lacks the ``player_name`` or ``events`` column.
    """
    if batter_prop_outcomes_df is None or batter_prop_outcomes_df.empty:
        return pd.DataFrame()

    if statcast_df is None or statcast_df.empty:
        return pd.DataFrame()

    if "player_name" not in statcast_df.columns or "events" not in statcast_df.columns:
        return pd.DataFrame()

    # Rows with a missing player_name would stringify to "nan"/"None" and could
    # then be matched by name; drop them before normalising.
    has_name = statcast_df["player_name"].notna().to_numpy()
    named = statcast_df.loc[has_name, ["player_name", "events"]]

    # Normalise names and events once and bucket events per player, instead of
    # rescanning the whole Statcast frame for every outcome row.
    names = named["player_name"].astype(str).str.strip()
    events = named["events"].fillna("").astype(str)
    # Group by a plain array so grouping is positional and not index-aligned.
    events_by_batter = {
        name: batter_events
        for name, batter_events in events.groupby(names.to_numpy(), sort=False)
    }

    rows: list[dict[str, Any]] = []

    for _, outcome_row in batter_prop_outcomes_df.iterrows():
        batter_name = _safe_text(outcome_row.get("batter_name"))
        if not batter_name:
            continue

        batter_events = events_by_batter.get(batter_name)

        if batter_events is None:
            realized_hit = None
            realized_hr = None
            realized_tb2p = None
            grade_status = "pending"
            outcome_source = "statcast_no_match"
        else:
            hit_count = sum(_event_is_hit(e) for e in batter_events)
            hr_count = sum(_event_is_hr(e) for e in batter_events)
            total_bases = sum(_event_total_bases(e) for e in batter_events)

            realized_hit = int(hit_count > 0)
            realized_hr = int(hr_count > 0)
            realized_tb2p = int(total_bases >= 2)
            grade_status = "graded"
            outcome_source = "statcast_loaded_window"

        row_dict = outcome_row.to_dict()
        row_dict["graded_at"] = graded_at
        row_dict["realized_hit"] = realized_hit
        row_dict["realized_hr"] = realized_hr
        row_dict["realized_tb2p"] = realized_tb2p
        row_dict["grade_status"] = grade_status
        row_dict["outcome_source"] = outcome_source

        rows.append(row_dict)

    return pd.DataFrame(rows)