# Source-page metadata (not part of the script): Spaces status "Running"; file size 7,567 bytes; revision 8f839e6.
"""
Build validation script — must pass before committing Card Lab Parquet artifacts.
Usage:
python scripts/validate_card_lab_build.py
Run from project root after build_player_identity_map.py and build_card_lab_season_summaries.py.
Exits with code 1 if any FAIL condition is detected.
"""
from __future__ import annotations
import sys
from collections import Counter
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import pandas as pd
# Directory holding the Parquet artifacts: the "data" folder two levels above
# this file (i.e. next to the directory that contains this script).
_ROOT = Path(__file__).parent.parent.joinpath("data")

# Artifacts inspected by validate().
_IDENTITY_MAP_PATH = _ROOT.joinpath("player_identity_map.parquet")
_PITCHER_OUTPUT_PATH = _ROOT.joinpath("card_lab_pitcher_seasons.parquet")
_HITTER_OUTPUT_PATH = _ROOT.joinpath("card_lab_hitter_seasons.parquet")

# Season whose rows are probed by the name spot checks.
SPOT_CHECK_YEAR = 2025

# Names that must appear among the spot-check season's pitcher rows.
PITCHER_SPOT_CHECKS_PRESENT = [
    "Tarik Skubal",
    "Sonny Gray",
    "Aaron Nola",
]

# Names that must appear among the spot-check season's hitter rows.
HITTER_SPOT_CHECKS_PRESENT = [
    "Lars Nootbaar",
    "Alec Burleson",
]

# Names that must NOT appear among the spot-check season's hitter rows.
HITTER_SPOT_CHECKS_ABSENT = [
    "Freddy Peralta",
    "Tarik Skubal",
]
def _check_file(path: Path, name: str) -> bool:
    """Report whether *path* exists on disk.

    Prints a ``[FAIL]`` line mentioning *name* and *path* when it does not.

    Returns:
        True if the path exists, False otherwise.
    """
    present = path.exists()
    if not present:
        print(f"\n[FAIL] {name} not found: {path}")
    return present
def _seasons_present(df: pd.DataFrame) -> list:
    """Return the sorted, distinct, non-null ``Season`` values of *df* as ints."""
    return sorted(df["Season"].dropna().unique().astype(int).tolist())


def _count_duplicate_names(names: pd.Series) -> int:
    """Return how many distinct non-null values in *names* occur more than once."""
    return sum(1 for count in Counter(names.dropna()).values() if count > 1)


def _active_rows(df: pd.DataFrame, season: int, activity_col: str) -> pd.DataFrame:
    """Return the rows of *df* for *season* whose *activity_col* is greater than zero."""
    return df[(df["Season"] == season) & (df[activity_col] > 0)]


def _check_duplicates_by_year(df: pd.DataFrame, years: list, activity_col: str, role: str) -> bool:
    """Print a PASS/FAIL line per season for duplicate ``display_name`` values.

    Only rows with ``activity_col > 0`` are considered. Returns True when no
    season contains a duplicated display name.
    """
    ok = True
    for yr in years:
        dup = _count_duplicate_names(_active_rows(df, yr, activity_col)["display_name"])
        if dup:
            print(f" [FAIL] Duplicate display_names in {role} {yr}: {dup}")
            ok = False
        else:
            print(f" [PASS] No duplicate display_names in {role} {yr}")
    return ok


def _spot_check_names(df: pd.DataFrame, activity_col: str) -> set:
    """Return the lower-cased display names of active rows in ``SPOT_CHECK_YEAR``.

    Null display names are dropped so the substring tests in the spot checks
    never see a float NaN (``"x" in nan`` raises ``TypeError``).
    """
    rows = _active_rows(df, SPOT_CHECK_YEAR, activity_col)
    return {name.lower() for name in rows["display_name"].dropna()}


def _report_spot_checks(names_lc: set, checks: list, role: str, *, expect_present: bool) -> bool:
    """Print one PASS/FAIL line per name in *checks* and return True if all passed.

    A name counts as found when its lower-cased form is a substring of any
    entry in *names_lc*. With ``expect_present=True`` a found name passes;
    with ``expect_present=False`` a found name fails.
    """
    ok = True
    for check in checks:
        needle = check.lower()
        found = any(needle in name for name in names_lc)
        passed = found == expect_present
        status = "[PASS]" if passed else "[FAIL]"
        if expect_present:
            outcome = "FOUND" if found else "NOT FOUND"
            print(f" {status} Spot check — {check!r} in {role} {SPOT_CHECK_YEAR}: {outcome}")
        else:
            outcome = "PRESENT (FAIL)" if found else "ABSENT"
            print(f" {status} Spot check — {check!r} NOT in {role} {SPOT_CHECK_YEAR}: {outcome}")
        ok = ok and passed
    return ok


def _validate_identity_map(idf: pd.DataFrame) -> bool:
    """Print identity-map coverage statistics; fail on duplicate display names."""
    total = len(idf)
    with_mlbam = int(idf["player_id"].notna().sum())
    miss_mlbam = total - with_mlbam
    with_statcast = int(idf["statcast_name"].notna().sum())
    miss_statcast = total - with_statcast
    collisions = int(idf["source_note"].str.contains("collision", na=False).sum())
    print(f" Total records: {total}")
    print(f" With MLBAM player_id: {with_mlbam}")
    print(f" Missing player_id: {miss_mlbam}")
    print(f" With statcast_name: {with_statcast}")
    print(f" Missing statcast_name: {miss_statcast}")
    print(f" Collision records: {collisions}")
    if miss_statcast:
        # Informational only: missing statcast names do not fail the build.
        missing_names = idf[idf["statcast_name"].isna()]["canonical_name"].head(20).tolist()
        print(f" First 20 missing statcast_name: {missing_names}")

    dup_display = _count_duplicate_names(idf["display_name"])
    if dup_display:
        print(f" [FAIL] Duplicate display_names in identity map: {dup_display}")
        return False
    return True


def _validate_pitcher_seasons(pdf: pd.DataFrame) -> bool:
    """Run row-count reporting, duplicate checks and spot checks on pitcher seasons."""
    years = _seasons_present(pdf)
    print(f" Years present: {years}")
    for yr in years:
        yr_df = pdf[pdf["Season"] == yr]
        with_pid = int(yr_df["player_id"].notna().sum())
        null_pid = len(yr_df) - with_pid
        print(f" {yr}: {len(yr_df)} rows ({with_pid} with player_id, {null_pid} without)")
    total_null_pid = int(pdf["player_id"].isna().sum())
    print(f" Total null player_id: {total_null_pid}")

    ok = _check_duplicates_by_year(pdf, years, "IP", "pitcher")

    if SPOT_CHECK_YEAR in years:
        names_lc = _spot_check_names(pdf, "IP")
        present_ok = _report_spot_checks(
            names_lc, PITCHER_SPOT_CHECKS_PRESENT, "pitcher", expect_present=True
        )
        ok = ok and present_ok
    else:
        # Make the skipped checks visible instead of passing silently.
        print(f" [WARN] Spot checks skipped — no {SPOT_CHECK_YEAR} rows in pitcher output")
    return ok


def _validate_hitter_seasons(hdf: pd.DataFrame) -> bool:
    """Run row-count reporting, null/duplicate checks and spot checks on hitter seasons."""
    years = _seasons_present(hdf)
    print(f" Years present: {years}")
    for yr in years:
        yr_df = hdf[hdf["Season"] == yr]
        print(f" {yr}: {len(yr_df)} rows (all have statcast_name — null rows excluded at build)")

    ok = True
    # The output must not contain rows with a null statcast_name.
    null_sc = int(hdf["statcast_name"].isna().sum())
    if null_sc:
        print(f" [FAIL] {null_sc} hitter rows with null statcast_name in output (should be 0)")
        ok = False
    else:
        print(f" [PASS] All hitter rows have statcast_name")

    dup_ok = _check_duplicates_by_year(hdf, years, "AB", "hitter")
    ok = ok and dup_ok

    if SPOT_CHECK_YEAR in years:
        names_lc = _spot_check_names(hdf, "AB")
        present_ok = _report_spot_checks(
            names_lc, HITTER_SPOT_CHECKS_PRESENT, "hitter", expect_present=True
        )
        absent_ok = _report_spot_checks(
            names_lc, HITTER_SPOT_CHECKS_ABSENT, "hitter", expect_present=False
        )
        ok = ok and present_ok and absent_ok
    else:
        # Make the skipped checks visible instead of passing silently.
        print(f" [WARN] Spot checks skipped — no {SPOT_CHECK_YEAR} rows in hitter output")
    return ok


def validate() -> bool:
    """Validate the three Card Lab Parquet artifacts and print a report.

    Checks, in order: the player identity map, the pitcher season summaries
    and the hitter season summaries. A missing identity map aborts
    immediately (no further checks, no final verdict banner); a missing
    pitcher or hitter file is recorded as a failure and the remaining checks
    still run.

    Returns:
        True when every check passed, False otherwise.

    Raises:
        KeyError: if an artifact lacks a column this function reads
            (propagated from pandas).
    """
    # -------------------------------------------------------------------
    # player_identity_map.parquet
    # -------------------------------------------------------------------
    print("\n=== player_identity_map.parquet ===")
    if not _check_file(_IDENTITY_MAP_PATH, "player_identity_map"):
        return False
    all_pass = _validate_identity_map(pd.read_parquet(_IDENTITY_MAP_PATH))

    # -------------------------------------------------------------------
    # card_lab_pitcher_seasons.parquet
    # -------------------------------------------------------------------
    print("\n=== card_lab_pitcher_seasons.parquet ===")
    if _check_file(_PITCHER_OUTPUT_PATH, "pitcher seasons"):
        pitchers_ok = _validate_pitcher_seasons(pd.read_parquet(_PITCHER_OUTPUT_PATH))
        all_pass = all_pass and pitchers_ok
    else:
        all_pass = False

    # -------------------------------------------------------------------
    # card_lab_hitter_seasons.parquet
    # -------------------------------------------------------------------
    print("\n=== card_lab_hitter_seasons.parquet ===")
    if _check_file(_HITTER_OUTPUT_PATH, "hitter seasons"):
        hitters_ok = _validate_hitter_seasons(pd.read_parquet(_HITTER_OUTPUT_PATH))
        all_pass = all_pass and hitters_ok
    else:
        all_pass = False

    # -------------------------------------------------------------------
    # Final verdict
    # -------------------------------------------------------------------
    print("\n" + ("=" * 50))
    if all_pass:
        print("RESULT: ALL PASS — safe to commit Parquet artifacts")
    else:
        print("RESULT: FAIL — fix issues above before committing")
    print("=" * 50)
    return all_pass
if __name__ == "__main__":
    # Exit status mirrors the verdict: 0 when validate() returns True, 1 otherwise.
    sys.exit(int(not validate()))
# (end of script)