""" Build validation script — must pass before committing Card Lab Parquet artifacts. Usage: python scripts/validate_card_lab_build.py Run from project root after build_player_identity_map.py and build_card_lab_season_summaries.py. Exits with code 1 if any FAIL condition is detected. """ from __future__ import annotations import sys from collections import Counter from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent)) import pandas as pd _ROOT = Path(__file__).parent.parent / "data" _IDENTITY_MAP_PATH = _ROOT / "player_identity_map.parquet" _PITCHER_OUTPUT_PATH = _ROOT / "card_lab_pitcher_seasons.parquet" _HITTER_OUTPUT_PATH = _ROOT / "card_lab_hitter_seasons.parquet" SPOT_CHECK_YEAR = 2025 PITCHER_SPOT_CHECKS_PRESENT = ["Tarik Skubal", "Sonny Gray", "Aaron Nola"] HITTER_SPOT_CHECKS_PRESENT = ["Lars Nootbaar", "Alec Burleson"] HITTER_SPOT_CHECKS_ABSENT = ["Freddy Peralta", "Tarik Skubal"] def _check_file(path: Path, name: str) -> bool: if not path.exists(): print(f"\n[FAIL] {name} not found: {path}") return False return True def validate() -> bool: all_pass = True # ------------------------------------------------------------------- # player_identity_map.parquet # ------------------------------------------------------------------- print("\n=== player_identity_map.parquet ===") if not _check_file(_IDENTITY_MAP_PATH, "player_identity_map"): return False idf = pd.read_parquet(_IDENTITY_MAP_PATH) total = len(idf) with_mlbam = int(idf["player_id"].notna().sum()) miss_mlbam = total - with_mlbam with_statcast = int(idf["statcast_name"].notna().sum()) miss_statcast = total - with_statcast collisions = int(idf["source_note"].str.contains("collision", na=False).sum()) print(f" Total records: {total}") print(f" With MLBAM player_id: {with_mlbam}") print(f" Missing player_id: {miss_mlbam}") print(f" With statcast_name: {with_statcast}") print(f" Missing statcast_name: {miss_statcast}") print(f" Collision records: {collisions}") if miss_statcast: missing_names = idf[idf["statcast_name"].isna()]["canonical_name"].head(20).tolist() print(f" First 20 missing statcast_name: {missing_names}") # Check for duplicate display_names in identity map dup_display = sum(1 for c in Counter(idf["display_name"].dropna()).values() if c > 1) if dup_display: print(f" [FAIL] Duplicate display_names in identity map: {dup_display}") all_pass = False # ------------------------------------------------------------------- # card_lab_pitcher_seasons.parquet # ------------------------------------------------------------------- print("\n=== card_lab_pitcher_seasons.parquet ===") if not _check_file(_PITCHER_OUTPUT_PATH, "pitcher seasons"): all_pass = False else: pdf = pd.read_parquet(_PITCHER_OUTPUT_PATH) years = sorted(pdf["Season"].dropna().unique().astype(int).tolist()) print(f" Years present: {years}") for yr in years: yr_df = pdf[pdf["Season"] == yr] with_pid = int(yr_df["player_id"].notna().sum()) null_pid = len(yr_df) - with_pid print(f" {yr}: {len(yr_df)} rows ({with_pid} with player_id, {null_pid} without)") total_null_pid = int(pdf["player_id"].isna().sum()) print(f" Total null player_id: {total_null_pid}") # Duplicate display_name check per year for yr in years: yr_df = pdf[(pdf["Season"] == yr) & (pdf["IP"] > 0)] dup = sum(1 for c in Counter(yr_df["display_name"].dropna()).values() if c > 1) if dup: print(f" [FAIL] Duplicate display_names in pitcher {yr}: {dup}") all_pass = False else: print(f" [PASS] No duplicate display_names in pitcher {yr}") # Spot checks (pitcher present checks) if SPOT_CHECK_YEAR in years: yr_names = pdf[(pdf["Season"] == SPOT_CHECK_YEAR) & (pdf["IP"] > 0)]["display_name"].str.lower().tolist() for check in PITCHER_SPOT_CHECKS_PRESENT: found = any(check.lower() in n for n in yr_names) status = "[PASS]" if found else "[FAIL]" if not found: all_pass = False print(f" {status} Spot check — {check!r} in pitcher {SPOT_CHECK_YEAR}: {'FOUND' if found else 'NOT FOUND'}") # ------------------------------------------------------------------- # card_lab_hitter_seasons.parquet # ------------------------------------------------------------------- print("\n=== card_lab_hitter_seasons.parquet ===") if not _check_file(_HITTER_OUTPUT_PATH, "hitter seasons"): all_pass = False else: hdf = pd.read_parquet(_HITTER_OUTPUT_PATH) years = sorted(hdf["Season"].dropna().unique().astype(int).tolist()) print(f" Years present: {years}") for yr in years: yr_df = hdf[hdf["Season"] == yr] print(f" {yr}: {len(yr_df)} rows (all have statcast_name — null rows excluded at build)") # Confirm no null statcast_name in output (build script must have excluded them) null_sc = int(hdf["statcast_name"].isna().sum()) if null_sc: print(f" [FAIL] {null_sc} hitter rows with null statcast_name in output (should be 0)") all_pass = False else: print(f" [PASS] All hitter rows have statcast_name") # Duplicate display_name check per year for yr in years: yr_df = hdf[(hdf["Season"] == yr) & (hdf["AB"] > 0)] dup = sum(1 for c in Counter(yr_df["display_name"].dropna()).values() if c > 1) if dup: print(f" [FAIL] Duplicate display_names in hitter {yr}: {dup}") all_pass = False else: print(f" [PASS] No duplicate display_names in hitter {yr}") # Spot checks if SPOT_CHECK_YEAR in years: yr_names_lc = { n.lower() for n in hdf[(hdf["Season"] == SPOT_CHECK_YEAR) & (hdf["AB"] > 0)]["display_name"].dropna() } for check in HITTER_SPOT_CHECKS_PRESENT: found = any(check.lower() in n for n in yr_names_lc) status = "[PASS]" if found else "[FAIL]" if not found: all_pass = False print(f" {status} Spot check — {check!r} in hitter {SPOT_CHECK_YEAR}: {'FOUND' if found else 'NOT FOUND'}") for check in HITTER_SPOT_CHECKS_ABSENT: found = any(check.lower() in n for n in yr_names_lc) status = "[PASS]" if not found else "[FAIL]" if found: all_pass = False print(f" {status} Spot check — {check!r} NOT in hitter {SPOT_CHECK_YEAR}: {'ABSENT' if not found else 'PRESENT (FAIL)'}") # ------------------------------------------------------------------- # Final verdict # ------------------------------------------------------------------- print("\n" + ("=" * 50)) if all_pass: print("RESULT: ALL PASS — safe to commit Parquet artifacts") else: print("RESULT: FAIL — fix issues above before committing") print("=" * 50) return all_pass if __name__ == "__main__": passed = validate() sys.exit(0 if passed else 1)