# NOTE(review): the lines "Spaces:" / "Running" / "Running" here were web-page
# chrome captured during extraction (a hosting-platform status banner), not
# source code; preserved as this comment so the file stays valid Python.
| """ | |
| Build validation script — must pass before committing Card Lab Parquet artifacts. | |
| Usage: | |
| python scripts/validate_card_lab_build.py | |
| Run from project root after build_player_identity_map.py and build_card_lab_season_summaries.py. | |
| Exits with code 1 if any FAIL condition is detected. | |
| """ | |
| from __future__ import annotations | |
| import sys | |
| from collections import Counter | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| import pandas as pd | |
| _ROOT = Path(__file__).parent.parent / "data" | |
| _IDENTITY_MAP_PATH = _ROOT / "player_identity_map.parquet" | |
| _PITCHER_OUTPUT_PATH = _ROOT / "card_lab_pitcher_seasons.parquet" | |
| _HITTER_OUTPUT_PATH = _ROOT / "card_lab_hitter_seasons.parquet" | |
| SPOT_CHECK_YEAR = 2025 | |
| PITCHER_SPOT_CHECKS_PRESENT = ["Tarik Skubal", "Sonny Gray", "Aaron Nola"] | |
| HITTER_SPOT_CHECKS_PRESENT = ["Lars Nootbaar", "Alec Burleson"] | |
| HITTER_SPOT_CHECKS_ABSENT = ["Freddy Peralta", "Tarik Skubal"] | |
def _check_file(path: Path, name: str) -> bool:
    """Return True if *path* exists; otherwise print a [FAIL] line and return False."""
    if path.exists():
        return True
    print(f"\n[FAIL] {name} not found: {path}")
    return False
def _dup_count(names: "pd.Series") -> int:
    """Count distinct values that occur more than once in *names* (NaN ignored)."""
    return sum(1 for c in Counter(names.dropna()).values() if c > 1)


def _lowered_names(df: "pd.DataFrame", year: int, stat_col: str) -> set[str]:
    """Lower-cased display names for *year* rows where *stat_col* > 0, NaN dropped."""
    mask = (df["Season"] == year) & (df[stat_col] > 0)
    return {n.lower() for n in df.loc[mask, "display_name"].dropna()}


def validate() -> bool:
    """Validate the three Card Lab Parquet artifacts and print a report.

    Returns:
        True only if every check passes. A missing identity map aborts
        immediately; a missing pitcher/hitter file is recorded as a failure
        but the remaining artifacts are still checked.
    """
    all_pass = True
    # -------------------------------------------------------------------
    # player_identity_map.parquet
    # -------------------------------------------------------------------
    print("\n=== player_identity_map.parquet ===")
    if not _check_file(_IDENTITY_MAP_PATH, "player_identity_map"):
        return False
    idf = pd.read_parquet(_IDENTITY_MAP_PATH)
    total = len(idf)
    with_mlbam = int(idf["player_id"].notna().sum())
    miss_mlbam = total - with_mlbam
    with_statcast = int(idf["statcast_name"].notna().sum())
    miss_statcast = total - with_statcast
    # na=False so rows with a null source_note don't poison the boolean sum.
    collisions = int(idf["source_note"].str.contains("collision", na=False).sum())
    print(f" Total records: {total}")
    print(f" With MLBAM player_id: {with_mlbam}")
    print(f" Missing player_id: {miss_mlbam}")
    print(f" With statcast_name: {with_statcast}")
    print(f" Missing statcast_name: {miss_statcast}")
    print(f" Collision records: {collisions}")
    if miss_statcast:
        missing_names = idf[idf["statcast_name"].isna()]["canonical_name"].head(20).tolist()
        print(f" First 20 missing statcast_name: {missing_names}")
    # display_name must be unique across the whole identity map.
    dup_display = _dup_count(idf["display_name"])
    if dup_display:
        print(f" [FAIL] Duplicate display_names in identity map: {dup_display}")
        all_pass = False
    # -------------------------------------------------------------------
    # card_lab_pitcher_seasons.parquet
    # -------------------------------------------------------------------
    print("\n=== card_lab_pitcher_seasons.parquet ===")
    if not _check_file(_PITCHER_OUTPUT_PATH, "pitcher seasons"):
        all_pass = False
    else:
        pdf = pd.read_parquet(_PITCHER_OUTPUT_PATH)
        years = sorted(pdf["Season"].dropna().unique().astype(int).tolist())
        print(f" Years present: {years}")
        for yr in years:
            yr_df = pdf[pdf["Season"] == yr]
            with_pid = int(yr_df["player_id"].notna().sum())
            null_pid = len(yr_df) - with_pid
            print(f" {yr}: {len(yr_df)} rows ({with_pid} with player_id, {null_pid} without)")
        total_null_pid = int(pdf["player_id"].isna().sum())
        print(f" Total null player_id: {total_null_pid}")
        # Duplicate display_name check per year (only pitchers with innings).
        for yr in years:
            dup = _dup_count(pdf[(pdf["Season"] == yr) & (pdf["IP"] > 0)]["display_name"])
            if dup:
                print(f" [FAIL] Duplicate display_names in pitcher {yr}: {dup}")
                all_pass = False
            else:
                print(f" [PASS] No duplicate display_names in pitcher {yr}")
        # Spot checks (pitcher present checks).
        if SPOT_CHECK_YEAR in years:
            # BUG FIX: the original kept NaN display_names (floats) in the
            # list, so `check.lower() in n` could raise TypeError. Drop NaN
            # first, matching the hitter branch below.
            yr_names = _lowered_names(pdf, SPOT_CHECK_YEAR, "IP")
            for check in PITCHER_SPOT_CHECKS_PRESENT:
                found = any(check.lower() in n for n in yr_names)
                status = "[PASS]" if found else "[FAIL]"
                if not found:
                    all_pass = False
                print(f" {status} Spot check — {check!r} in pitcher {SPOT_CHECK_YEAR}: {'FOUND' if found else 'NOT FOUND'}")
    # -------------------------------------------------------------------
    # card_lab_hitter_seasons.parquet
    # -------------------------------------------------------------------
    print("\n=== card_lab_hitter_seasons.parquet ===")
    if not _check_file(_HITTER_OUTPUT_PATH, "hitter seasons"):
        all_pass = False
    else:
        hdf = pd.read_parquet(_HITTER_OUTPUT_PATH)
        years = sorted(hdf["Season"].dropna().unique().astype(int).tolist())
        print(f" Years present: {years}")
        for yr in years:
            yr_df = hdf[hdf["Season"] == yr]
            print(f" {yr}: {len(yr_df)} rows (all have statcast_name — null rows excluded at build)")
        # Confirm no null statcast_name in output (build script must have excluded them).
        null_sc = int(hdf["statcast_name"].isna().sum())
        if null_sc:
            print(f" [FAIL] {null_sc} hitter rows with null statcast_name in output (should be 0)")
            all_pass = False
        else:
            print(f" [PASS] All hitter rows have statcast_name")
        # Duplicate display_name check per year (only hitters with at-bats).
        for yr in years:
            dup = _dup_count(hdf[(hdf["Season"] == yr) & (hdf["AB"] > 0)]["display_name"])
            if dup:
                print(f" [FAIL] Duplicate display_names in hitter {yr}: {dup}")
                all_pass = False
            else:
                print(f" [PASS] No duplicate display_names in hitter {yr}")
        # Spot checks: expected hitters present, known pitchers absent.
        if SPOT_CHECK_YEAR in years:
            yr_names_lc = _lowered_names(hdf, SPOT_CHECK_YEAR, "AB")
            for check in HITTER_SPOT_CHECKS_PRESENT:
                found = any(check.lower() in n for n in yr_names_lc)
                status = "[PASS]" if found else "[FAIL]"
                if not found:
                    all_pass = False
                print(f" {status} Spot check — {check!r} in hitter {SPOT_CHECK_YEAR}: {'FOUND' if found else 'NOT FOUND'}")
            for check in HITTER_SPOT_CHECKS_ABSENT:
                found = any(check.lower() in n for n in yr_names_lc)
                status = "[PASS]" if not found else "[FAIL]"
                if found:
                    all_pass = False
                print(f" {status} Spot check — {check!r} NOT in hitter {SPOT_CHECK_YEAR}: {'ABSENT' if not found else 'PRESENT (FAIL)'}")
    # -------------------------------------------------------------------
    # Final verdict
    # -------------------------------------------------------------------
    print("\n" + ("=" * 50))
    if all_pass:
        print("RESULT: ALL PASS — safe to commit Parquet artifacts")
    else:
        print("RESULT: FAIL — fix issues above before committing")
    print("=" * 50)
    return all_pass
if __name__ == "__main__":
    # Non-zero exit lets CI / pre-commit hooks gate on the validation result.
    sys.exit(0 if validate() else 1)