Spaces:

Syntrex
/

2026_MLB_Model

Running

2026_MLB_Model

File size: 7,567 Bytes

8f839e6

"""
Build validation script — must pass before committing Card Lab Parquet artifacts.

Usage:
    python scripts/validate_card_lab_build.py

Run from project root after build_player_identity_map.py and build_card_lab_season_summaries.py.
Exits with code 1 if any FAIL condition is detected.
"""
from __future__ import annotations

import sys
from collections import Counter
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

import pandas as pd

# Directory holding the Parquet artifacts: "data" under the parent of this
# file's own directory.
_ROOT = Path(__file__).parent.parent / "data"
# Artifact files that validate() looks for and reads.
_IDENTITY_MAP_PATH    = _ROOT / "player_identity_map.parquet"
_PITCHER_OUTPUT_PATH  = _ROOT / "card_lab_pitcher_seasons.parquet"
_HITTER_OUTPUT_PATH   = _ROOT / "card_lab_hitter_seasons.parquet"

# Season whose rows validate() uses for the name spot checks below.
SPOT_CHECK_YEAR = 2025

# Spot-check names are compared case-insensitively, as substrings of
# display_name, against SPOT_CHECK_YEAR rows only.
# Must be found among pitcher rows with IP > 0.
PITCHER_SPOT_CHECKS_PRESENT = ["Tarik Skubal", "Sonny Gray", "Aaron Nola"]
# Must be found among hitter rows with AB > 0.
HITTER_SPOT_CHECKS_PRESENT  = ["Lars Nootbaar", "Alec Burleson"]
# Must NOT be found among hitter rows with AB > 0
# (presumably pitchers that should never surface as hitters — confirm).
HITTER_SPOT_CHECKS_ABSENT   = ["Freddy Peralta", "Tarik Skubal"]


def _check_file(path: Path, name: str) -> bool:
    if not path.exists():
        print(f"\n[FAIL] {name} not found: {path}")
        return False
    return True


def _count_duplicate_names(names: pd.Series) -> int:
    """Return how many distinct non-null values occur more than once in *names*."""
    return sum(1 for count in Counter(names.dropna()).values() if count > 1)


def _check_yearly_duplicates(
    df: pd.DataFrame, years: list[int], volume_col: str, label: str
) -> bool:
    """Print one PASS/FAIL line per season for duplicated ``display_name`` values.

    Only rows whose ``volume_col`` is > 0 are considered. Returns True when no
    season contains a duplicated display name.
    """
    ok = True
    for yr in years:
        yr_df = df[(df["Season"] == yr) & (df[volume_col] > 0)]
        dup = _count_duplicate_names(yr_df["display_name"])
        if dup:
            print(f"  [FAIL] Duplicate display_names in {label} {yr}: {dup}")
            ok = False
        else:
            print(f"  [PASS] No duplicate display_names in {label} {yr}")
    return ok


def _spot_check_names(df: pd.DataFrame, volume_col: str) -> set[str]:
    """Return lower-cased display names for SPOT_CHECK_YEAR rows with ``volume_col`` > 0.

    Null (and any other non-string) display names are skipped so that the
    substring tests performed by the spot checks cannot raise ``TypeError``.
    """
    mask = (df["Season"] == SPOT_CHECK_YEAR) & (df[volume_col] > 0)
    return {
        name.lower()
        for name in df.loc[mask, "display_name"].dropna()
        if isinstance(name, str)
    }


def _run_spot_checks(
    names_lc: set[str], checks: list[str], label: str, *, expect_present: bool
) -> bool:
    """Print a PASS/FAIL line per name in *checks*; return True if all passed.

    A name counts as found when its lower-cased form is a substring of any
    entry in *names_lc*. With ``expect_present=True`` a found name passes;
    with ``expect_present=False`` a found name fails.
    """
    ok = True
    for check in checks:
        needle = check.lower()
        found = any(needle in n for n in names_lc)
        passed = found == expect_present
        status = "[PASS]" if passed else "[FAIL]"
        if expect_present:
            print(f"  {status} Spot check — {check!r} in {label} {SPOT_CHECK_YEAR}: {'FOUND' if found else 'NOT FOUND'}")
        else:
            print(f"  {status} Spot check — {check!r} NOT in {label} {SPOT_CHECK_YEAR}: {'ABSENT' if not found else 'PRESENT (FAIL)'}")
        if not passed:
            ok = False
    return ok


def validate() -> bool:
    """Validate the Card Lab Parquet artifacts and print a report to stdout.

    Checks performed:
      * identity map: summary counts are printed; duplicate ``display_name``
        values are a failure.
      * pitcher seasons: per-year row counts, duplicate ``display_name`` per
        year (rows with IP > 0), and "must be present" spot checks.
      * hitter seasons: no null ``statcast_name``, duplicate ``display_name``
        per year (rows with AB > 0), and present/absent spot checks.

    A missing identity map aborts immediately with False (no final verdict
    banner is printed). A missing pitcher or hitter file is recorded as a
    failure and the remaining sections still run.

    Returns:
        True when every check passed, False otherwise.

    Raises:
        KeyError: if an artifact lacks one of the columns read here.
    """
    all_pass = True

    # -------------------------------------------------------------------
    # player_identity_map.parquet
    # -------------------------------------------------------------------
    print("\n=== player_identity_map.parquet ===")
    if not _check_file(_IDENTITY_MAP_PATH, "player_identity_map"):
        return False

    idf = pd.read_parquet(_IDENTITY_MAP_PATH)
    total = len(idf)
    with_mlbam    = int(idf["player_id"].notna().sum())
    miss_mlbam    = total - with_mlbam
    with_statcast = int(idf["statcast_name"].notna().sum())
    miss_statcast = total - with_statcast
    collisions    = int(idf["source_note"].str.contains("collision", na=False).sum())

    print(f"  Total records:          {total}")
    print(f"  With MLBAM player_id:   {with_mlbam}")
    print(f"  Missing player_id:      {miss_mlbam}")
    print(f"  With statcast_name:     {with_statcast}")
    print(f"  Missing statcast_name:  {miss_statcast}")
    print(f"  Collision records:      {collisions}")

    if miss_statcast:
        missing_names = idf[idf["statcast_name"].isna()]["canonical_name"].head(20).tolist()
        print(f"  First 20 missing statcast_name: {missing_names}")

    # Check for duplicate display_names in identity map
    dup_display = _count_duplicate_names(idf["display_name"])
    if dup_display:
        print(f"  [FAIL] Duplicate display_names in identity map: {dup_display}")
        all_pass = False
    else:
        # Mirrors the PASS lines emitted by the per-year duplicate checks.
        print("  [PASS] No duplicate display_names in identity map")

    # -------------------------------------------------------------------
    # card_lab_pitcher_seasons.parquet
    # -------------------------------------------------------------------
    print("\n=== card_lab_pitcher_seasons.parquet ===")
    if not _check_file(_PITCHER_OUTPUT_PATH, "pitcher seasons"):
        all_pass = False
    else:
        pdf = pd.read_parquet(_PITCHER_OUTPUT_PATH)
        years = sorted(pdf["Season"].dropna().unique().astype(int).tolist())
        print(f"  Years present: {years}")
        for yr in years:
            yr_df = pdf[pdf["Season"] == yr]
            with_pid = int(yr_df["player_id"].notna().sum())
            null_pid = len(yr_df) - with_pid
            print(f"  {yr}: {len(yr_df)} rows  ({with_pid} with player_id, {null_pid} without)")

        total_null_pid = int(pdf["player_id"].isna().sum())
        print(f"  Total null player_id: {total_null_pid}")

        # Duplicate display_name check per year
        if not _check_yearly_duplicates(pdf, years, "IP", "pitcher"):
            all_pass = False

        # Spot checks (pitcher present checks)
        if SPOT_CHECK_YEAR in years:
            # Null display_names are dropped here; previously they reached the
            # substring test as float NaN and raised TypeError.
            pitcher_names = _spot_check_names(pdf, "IP")
            if not _run_spot_checks(
                pitcher_names, PITCHER_SPOT_CHECKS_PRESENT, "pitcher", expect_present=True
            ):
                all_pass = False
        else:
            # Make the skipped checks visible instead of passing silently.
            print(f"  [WARN] {SPOT_CHECK_YEAR} not in pitcher seasons — spot checks skipped")

    # -------------------------------------------------------------------
    # card_lab_hitter_seasons.parquet
    # -------------------------------------------------------------------
    print("\n=== card_lab_hitter_seasons.parquet ===")
    if not _check_file(_HITTER_OUTPUT_PATH, "hitter seasons"):
        all_pass = False
    else:
        hdf = pd.read_parquet(_HITTER_OUTPUT_PATH)
        years = sorted(hdf["Season"].dropna().unique().astype(int).tolist())
        print(f"  Years present: {years}")
        for yr in years:
            yr_df = hdf[hdf["Season"] == yr]
            print(f"  {yr}: {len(yr_df)} rows (all have statcast_name — null rows excluded at build)")

        # Confirm no null statcast_name in output (build script must have excluded them)
        null_sc = int(hdf["statcast_name"].isna().sum())
        if null_sc:
            print(f"  [FAIL] {null_sc} hitter rows with null statcast_name in output (should be 0)")
            all_pass = False
        else:
            print(f"  [PASS] All hitter rows have statcast_name")

        # Duplicate display_name check per year
        if not _check_yearly_duplicates(hdf, years, "AB", "hitter"):
            all_pass = False

        # Spot checks
        if SPOT_CHECK_YEAR in years:
            hitter_names = _spot_check_names(hdf, "AB")
            if not _run_spot_checks(
                hitter_names, HITTER_SPOT_CHECKS_PRESENT, "hitter", expect_present=True
            ):
                all_pass = False
            if not _run_spot_checks(
                hitter_names, HITTER_SPOT_CHECKS_ABSENT, "hitter", expect_present=False
            ):
                all_pass = False
        else:
            # Make the skipped checks visible instead of passing silently.
            print(f"  [WARN] {SPOT_CHECK_YEAR} not in hitter seasons — spot checks skipped")

    # -------------------------------------------------------------------
    # Final verdict
    # -------------------------------------------------------------------
    print("\n" + ("=" * 50))
    if all_pass:
        print("RESULT: ALL PASS — safe to commit Parquet artifacts")
    else:
        print("RESULT: FAIL — fix issues above before committing")
    print("=" * 50)

    return all_pass


if __name__ == "__main__":
    # Exit status 0 when every validation check passed, 1 otherwise.
    exit_code = 0 if validate() else 1
    sys.exit(exit_code)