# NOTE(review): the lines "Spaces:" / "Running" / "Running" here were web-page
# chrome captured during extraction (a hosting-platform status banner), not
# source code; preserved as this comment so the file stays valid Python.
| """ | |
| Build validation script — must pass before committing Card Lab Parquet artifacts. | |
| Usage: | |
| python scripts/validate_card_lab_build.py | |
| Run from project root after build_player_identity_map.py and build_card_lab_season_summaries.py. | |
| Exits with code 1 if any FAIL condition is detected. | |
| """ | |
| from __future__ import annotations | |
| import sys | |
| from collections import Counter | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| import pandas as pd | |
| _ROOT = Path(__file__).parent.parent / "data" | |
| _IDENTITY_MAP_PATH = _ROOT / "player_identity_map.parquet" | |
| _PITCHER_OUTPUT_PATH = _ROOT / "card_lab_pitcher_seasons.parquet" | |
| _HITTER_OUTPUT_PATH = _ROOT / "card_lab_hitter_seasons.parquet" | |
| SPOT_CHECK_YEAR = 2025 | |
| PITCHER_SPOT_CHECKS_PRESENT = ["Tarik Skubal", "Sonny Gray", "Aaron Nola"] | |
| HITTER_SPOT_CHECKS_PRESENT = ["Lars Nootbaar", "Alec Burleson"] | |
| HITTER_SPOT_CHECKS_ABSENT = ["Freddy Peralta", "Tarik Skubal"] | |
def _check_file(path: Path, name: str) -> bool:
    """Return True if *path* exists; otherwise print a [FAIL] line and return False."""
    if path.exists():
        return True
    print(f"\n[FAIL] {name} not found: {path}")
    return False
def _dup_count(names: "pd.Series") -> int:
    """Count distinct values that occur more than once in *names* (NaN ignored)."""
    return sum(1 for c in Counter(names.dropna()).values() if c > 1)


def _lowered_names(df: "pd.DataFrame", year: int, stat_col: str) -> set[str]:
    """Lower-cased display names for *year* rows where *stat_col* > 0, NaN dropped."""
    mask = (df["Season"] == year) & (df[stat_col] > 0)
    return {n.lower() for n in df.loc[mask, "display_name"].dropna()}


def validate() -> bool:
    """Validate the three Card Lab Parquet artifacts and print a report.

    Returns:
        True only if every check passes. A missing identity map aborts
        immediately; a missing pitcher/hitter file is recorded as a failure
        but the remaining artifacts are still checked.
    """
    all_pass = True
    # -------------------------------------------------------------------
    # player_identity_map.parquet
    # -------------------------------------------------------------------
    print("\n=== player_identity_map.parquet ===")
    if not _check_file(_IDENTITY_MAP_PATH, "player_identity_map"):
        return False
    idf = pd.read_parquet(_IDENTITY_MAP_PATH)
    total = len(idf)
    with_mlbam = int(idf["player_id"].notna().sum())
    miss_mlbam = total - with_mlbam
    with_statcast = int(idf["statcast_name"].notna().sum())
    miss_statcast = total - with_statcast
    # na=False so rows with a null source_note don't poison the boolean sum.
    collisions = int(idf["source_note"].str.contains("collision", na=False).sum())
    print(f" Total records: {total}")
    print(f" With MLBAM player_id: {with_mlbam}")
    print(f" Missing player_id: {miss_mlbam}")
    print(f" With statcast_name: {with_statcast}")
    print(f" Missing statcast_name: {miss_statcast}")
    print(f" Collision records: {collisions}")
    if miss_statcast:
        missing_names = idf[idf["statcast_name"].isna()]["canonical_name"].head(20).tolist()
        print(f" First 20 missing statcast_name: {missing_names}")
    # display_name must be unique across the whole identity map.
    dup_display = _dup_count(idf["display_name"])
    if dup_display:
        print(f" [FAIL] Duplicate display_names in identity map: {dup_display}")
        all_pass = False
    # -------------------------------------------------------------------
    # card_lab_pitcher_seasons.parquet
    # -------------------------------------------------------------------
    print("\n=== card_lab_pitcher_seasons.parquet ===")
    if not _check_file(_PITCHER_OUTPUT_PATH, "pitcher seasons"):
        all_pass = False
    else:
        pdf = pd.read_parquet(_PITCHER_OUTPUT_PATH)
        years = sorted(pdf["Season"].dropna().unique().astype(int).tolist())
        print(f" Years present: {years}")
        for yr in years:
            yr_df = pdf[pdf["Season"] == yr]
            with_pid = int(yr_df["player_id"].notna().sum())
            null_pid = len(yr_df) - with_pid
            print(f" {yr}: {len(yr_df)} rows ({with_pid} with player_id, {null_pid} without)")
        total_null_pid = int(pdf["player_id"].isna().sum())
        print(f" Total null player_id: {total_null_pid}")
        # Duplicate display_name check per year (only pitchers with innings).
        for yr in years:
            dup = _dup_count(pdf[(pdf["Season"] == yr) & (pdf["IP"] > 0)]["display_name"])
            if dup:
                print(f" [FAIL] Duplicate display_names in pitcher {yr}: {dup}")
                all_pass = False
            else:
                print(f" [PASS] No duplicate display_names in pitcher {yr}")
        # Spot checks (pitcher present checks).
        if SPOT_CHECK_YEAR in years:
            # BUG FIX: the original kept NaN display_names (floats) in the
            # list, so `check.lower() in n` could raise TypeError. Drop NaN
            # first, matching the hitter branch below.
            yr_names = _lowered_names(pdf, SPOT_CHECK_YEAR, "IP")
            for check in PITCHER_SPOT_CHECKS_PRESENT:
                found = any(check.lower() in n for n in yr_names)
                status = "[PASS]" if found else "[FAIL]"
                if not found:
                    all_pass = False
                print(f" {status} Spot check — {check!r} in pitcher {SPOT_CHECK_YEAR}: {'FOUND' if found else 'NOT FOUND'}")
    # -------------------------------------------------------------------
    # card_lab_hitter_seasons.parquet
    # -------------------------------------------------------------------
    print("\n=== card_lab_hitter_seasons.parquet ===")
    if not _check_file(_HITTER_OUTPUT_PATH, "hitter seasons"):
        all_pass = False
    else:
        hdf = pd.read_parquet(_HITTER_OUTPUT_PATH)
        years = sorted(hdf["Season"].dropna().unique().astype(int).tolist())
        print(f" Years present: {years}")
        for yr in years:
            yr_df = hdf[hdf["Season"] == yr]
            print(f" {yr}: {len(yr_df)} rows (all have statcast_name — null rows excluded at build)")
        # Confirm no null statcast_name in output (build script must have excluded them).
        null_sc = int(hdf["statcast_name"].isna().sum())
        if null_sc:
            print(f" [FAIL] {null_sc} hitter rows with null statcast_name in output (should be 0)")
            all_pass = False
        else:
            print(f" [PASS] All hitter rows have statcast_name")
        # Duplicate display_name check per year (only hitters with at-bats).
        for yr in years:
            dup = _dup_count(hdf[(hdf["Season"] == yr) & (hdf["AB"] > 0)]["display_name"])
            if dup:
                print(f" [FAIL] Duplicate display_names in hitter {yr}: {dup}")
                all_pass = False
            else:
                print(f" [PASS] No duplicate display_names in hitter {yr}")
        # Spot checks: expected hitters present, known pitchers absent.
        if SPOT_CHECK_YEAR in years:
            yr_names_lc = _lowered_names(hdf, SPOT_CHECK_YEAR, "AB")
            for check in HITTER_SPOT_CHECKS_PRESENT:
                found = any(check.lower() in n for n in yr_names_lc)
                status = "[PASS]" if found else "[FAIL]"
                if not found:
                    all_pass = False
                print(f" {status} Spot check — {check!r} in hitter {SPOT_CHECK_YEAR}: {'FOUND' if found else 'NOT FOUND'}")
            for check in HITTER_SPOT_CHECKS_ABSENT:
                found = any(check.lower() in n for n in yr_names_lc)
                status = "[PASS]" if not found else "[FAIL]"
                if found:
                    all_pass = False
                print(f" {status} Spot check — {check!r} NOT in hitter {SPOT_CHECK_YEAR}: {'ABSENT' if not found else 'PRESENT (FAIL)'}")
    # -------------------------------------------------------------------
    # Final verdict
    # -------------------------------------------------------------------
    print("\n" + ("=" * 50))
    if all_pass:
        print("RESULT: ALL PASS — safe to commit Parquet artifacts")
    else:
        print("RESULT: FAIL — fix issues above before committing")
    print("=" * 50)
    return all_pass
if __name__ == "__main__":
    # Non-zero exit lets CI / pre-commit hooks gate on the validation result.
    sys.exit(0 if validate() else 1)