# 2026_MLB_Model/scripts/validate_card_lab_build.py
# Card Lab Pass 5: player identity layer + pybaseball season-summary selectors
# (upstream revision 8f839e6)
"""
Build validation script — must pass before committing Card Lab Parquet artifacts.
Usage:
python scripts/validate_card_lab_build.py
Run from project root after build_player_identity_map.py and build_card_lab_season_summaries.py.
Exits with code 1 if any FAIL condition is detected.
"""
from __future__ import annotations
import sys
from collections import Counter
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import pandas as pd
# Directory holding all Card Lab Parquet artifacts, resolved relative to the
# project root (this script lives in <root>/scripts/).
_ROOT = Path(__file__).parent.parent / "data"
# Inputs/outputs produced by the two build scripts named in the module docstring.
_IDENTITY_MAP_PATH = _ROOT / "player_identity_map.parquet"
_PITCHER_OUTPUT_PATH = _ROOT / "card_lab_pitcher_seasons.parquet"
_HITTER_OUTPUT_PATH = _ROOT / "card_lab_hitter_seasons.parquet"
# Season used for the name spot checks below.
SPOT_CHECK_YEAR = 2025
# Names expected to appear among pitchers with IP > 0 in SPOT_CHECK_YEAR.
PITCHER_SPOT_CHECKS_PRESENT = ["Tarik Skubal", "Sonny Gray", "Aaron Nola"]
# Names expected to appear among hitters with AB > 0 in SPOT_CHECK_YEAR.
HITTER_SPOT_CHECKS_PRESENT = ["Lars Nootbaar", "Alec Burleson"]
# Pitchers that must NOT leak into the hitter table (guards against join errors).
HITTER_SPOT_CHECKS_ABSENT = ["Freddy Peralta", "Tarik Skubal"]
def _check_file(path: Path, name: str) -> bool:
if not path.exists():
print(f"\n[FAIL] {name} not found: {path}")
return False
return True
def _dup_name_count(names) -> int:
    """Count distinct names that occur more than once in *names* (NaNs ignored).

    Shared by the identity-map, pitcher, and hitter duplicate checks so the
    three sections cannot drift apart.
    """
    return sum(1 for c in Counter(names.dropna()).values() if c > 1)


def _validate_identity_map():
    """Validate player_identity_map.parquet.

    Returns:
        None  — file missing (caller must abort the whole run immediately);
        True  — all identity-map checks passed;
        False — duplicate display_names were found.
    """
    print("\n=== player_identity_map.parquet ===")
    if not _check_file(_IDENTITY_MAP_PATH, "player_identity_map"):
        return None
    idf = pd.read_parquet(_IDENTITY_MAP_PATH)
    total = len(idf)
    with_mlbam = int(idf["player_id"].notna().sum())
    miss_mlbam = total - with_mlbam
    with_statcast = int(idf["statcast_name"].notna().sum())
    miss_statcast = total - with_statcast
    collisions = int(idf["source_note"].str.contains("collision", na=False).sum())
    print(f" Total records: {total}")
    print(f" With MLBAM player_id: {with_mlbam}")
    print(f" Missing player_id: {miss_mlbam}")
    print(f" With statcast_name: {with_statcast}")
    print(f" Missing statcast_name: {miss_statcast}")
    print(f" Collision records: {collisions}")
    if miss_statcast:
        # Informational only — missing statcast_name does not fail the build.
        missing_names = idf[idf["statcast_name"].isna()]["canonical_name"].head(20).tolist()
        print(f" First 20 missing statcast_name: {missing_names}")
    # Duplicate display_names would make card lookups ambiguous — hard FAIL.
    dup_display = _dup_name_count(idf["display_name"])
    if dup_display:
        print(f" [FAIL] Duplicate display_names in identity map: {dup_display}")
        return False
    return True


def _validate_pitcher_seasons() -> bool:
    """Validate card_lab_pitcher_seasons.parquet; return True iff all checks pass."""
    print("\n=== card_lab_pitcher_seasons.parquet ===")
    if not _check_file(_PITCHER_OUTPUT_PATH, "pitcher seasons"):
        return False
    ok = True
    pdf = pd.read_parquet(_PITCHER_OUTPUT_PATH)
    years = sorted(pdf["Season"].dropna().unique().astype(int).tolist())
    print(f" Years present: {years}")
    # Per-year player_id coverage (informational; null ids do not fail).
    for yr in years:
        yr_df = pdf[pdf["Season"] == yr]
        with_pid = int(yr_df["player_id"].notna().sum())
        null_pid = len(yr_df) - with_pid
        print(f" {yr}: {len(yr_df)} rows ({with_pid} with player_id, {null_pid} without)")
    total_null_pid = int(pdf["player_id"].isna().sum())
    print(f" Total null player_id: {total_null_pid}")
    # Duplicate display_name check per year, restricted to pitchers with IP > 0.
    for yr in years:
        yr_df = pdf[(pdf["Season"] == yr) & (pdf["IP"] > 0)]
        dup = _dup_name_count(yr_df["display_name"])
        if dup:
            print(f" [FAIL] Duplicate display_names in pitcher {yr}: {dup}")
            ok = False
        else:
            print(f" [PASS] No duplicate display_names in pitcher {yr}")
    # Spot checks: expected pitchers must appear in SPOT_CHECK_YEAR.
    if SPOT_CHECK_YEAR in years:
        # BUGFIX: dropna() before lowering — a null display_name stays NaN
        # through .str.lower() and `check in NaN` raises TypeError. The hitter
        # section already dropped nulls; this makes the pitcher path match.
        yr_names = (
            pdf[(pdf["Season"] == SPOT_CHECK_YEAR) & (pdf["IP"] > 0)]["display_name"]
            .dropna()
            .str.lower()
            .tolist()
        )
        for check in PITCHER_SPOT_CHECKS_PRESENT:
            found = any(check.lower() in n for n in yr_names)
            status = "[PASS]" if found else "[FAIL]"
            if not found:
                ok = False
            print(f" {status} Spot check — {check!r} in pitcher {SPOT_CHECK_YEAR}: {'FOUND' if found else 'NOT FOUND'}")
    return ok


def _validate_hitter_seasons() -> bool:
    """Validate card_lab_hitter_seasons.parquet; return True iff all checks pass."""
    print("\n=== card_lab_hitter_seasons.parquet ===")
    if not _check_file(_HITTER_OUTPUT_PATH, "hitter seasons"):
        return False
    ok = True
    hdf = pd.read_parquet(_HITTER_OUTPUT_PATH)
    years = sorted(hdf["Season"].dropna().unique().astype(int).tolist())
    print(f" Years present: {years}")
    for yr in years:
        yr_df = hdf[hdf["Season"] == yr]
        print(f" {yr}: {len(yr_df)} rows (all have statcast_name — null rows excluded at build)")
    # The build script must have excluded null statcast_name rows; any survivor fails.
    null_sc = int(hdf["statcast_name"].isna().sum())
    if null_sc:
        print(f" [FAIL] {null_sc} hitter rows with null statcast_name in output (should be 0)")
        ok = False
    else:
        print(f" [PASS] All hitter rows have statcast_name")
    # Duplicate display_name check per year, restricted to hitters with AB > 0.
    for yr in years:
        yr_df = hdf[(hdf["Season"] == yr) & (hdf["AB"] > 0)]
        dup = _dup_name_count(yr_df["display_name"])
        if dup:
            print(f" [FAIL] Duplicate display_names in hitter {yr}: {dup}")
            ok = False
        else:
            print(f" [PASS] No duplicate display_names in hitter {yr}")
    # Spot checks: expected hitters present, known pitchers absent.
    if SPOT_CHECK_YEAR in years:
        yr_names_lc = {
            n.lower()
            for n in hdf[(hdf["Season"] == SPOT_CHECK_YEAR) & (hdf["AB"] > 0)]["display_name"].dropna()
        }
        for check in HITTER_SPOT_CHECKS_PRESENT:
            found = any(check.lower() in n for n in yr_names_lc)
            status = "[PASS]" if found else "[FAIL]"
            if not found:
                ok = False
            print(f" {status} Spot check — {check!r} in hitter {SPOT_CHECK_YEAR}: {'FOUND' if found else 'NOT FOUND'}")
        for check in HITTER_SPOT_CHECKS_ABSENT:
            found = any(check.lower() in n for n in yr_names_lc)
            status = "[PASS]" if not found else "[FAIL]"
            if found:
                ok = False
            print(f" {status} Spot check — {check!r} NOT in hitter {SPOT_CHECK_YEAR}: {'ABSENT' if not found else 'PRESENT (FAIL)'}")
    return ok


def validate() -> bool:
    """Run every Card Lab build validation; return True only if all checks pass.

    Mirrors the original control flow exactly: a missing identity map aborts
    immediately (no final verdict banner); a missing/failing season file is
    recorded but the remaining sections still run.
    """
    identity_ok = _validate_identity_map()
    if identity_ok is None:
        # Identity map file missing — nothing downstream can be trusted.
        return False
    all_pass = identity_ok
    # Helper call on the left of `and` so it always executes even after a failure.
    all_pass = _validate_pitcher_seasons() and all_pass
    all_pass = _validate_hitter_seasons() and all_pass
    print("\n" + ("=" * 50))
    if all_pass:
        print("RESULT: ALL PASS — safe to commit Parquet artifacts")
    else:
        print("RESULT: FAIL — fix issues above before committing")
    print("=" * 50)
    return all_pass
if __name__ == "__main__":
passed = validate()
sys.exit(0 if passed else 1)