File size: 7,567 Bytes
8f839e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
"""
Build validation script — must pass before committing Card Lab Parquet artifacts.

Usage:
    python scripts/validate_card_lab_build.py

Run from project root after build_player_identity_map.py and build_card_lab_season_summaries.py.
Exits with code 1 if any FAIL condition is detected.
"""
from __future__ import annotations

import sys
from collections import Counter
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

import pandas as pd

# Artifact directory: the "data" folder two levels above this script
# (the project root, given the scripts/ location named in the module docstring).
_ROOT = Path(__file__).parent.parent / "data"

# Parquet artifacts inspected by validate().
_IDENTITY_MAP_PATH = _ROOT.joinpath("player_identity_map.parquet")
_PITCHER_OUTPUT_PATH = _ROOT.joinpath("card_lab_pitcher_seasons.parquet")
_HITTER_OUTPUT_PATH = _ROOT.joinpath("card_lab_hitter_seasons.parquet")

# Season whose rows are used for the name spot checks.
SPOT_CHECK_YEAR = 2025

# Names that must appear among SPOT_CHECK_YEAR pitchers.
PITCHER_SPOT_CHECKS_PRESENT = [
    "Tarik Skubal",
    "Sonny Gray",
    "Aaron Nola",
]
# Names that must appear among SPOT_CHECK_YEAR hitters.
HITTER_SPOT_CHECKS_PRESENT = [
    "Lars Nootbaar",
    "Alec Burleson",
]
# Names that must NOT appear among SPOT_CHECK_YEAR hitters.
HITTER_SPOT_CHECKS_ABSENT = [
    "Freddy Peralta",
    "Tarik Skubal",
]


def _check_file(path: Path, name: str) -> bool:
    """Tell whether *path* exists on disk.

    When it does not, a ``[FAIL]`` line naming the artifact (*name*) and the
    path is printed so the problem shows up in the validation report.

    Returns:
        True if the path exists, False otherwise.
    """
    present = path.exists()
    if not present:
        print(f"\n[FAIL] {name} not found: {path}")
    return present


def _count_repeated_names(names: pd.Series) -> int:
    """Return how many distinct non-null values occur more than once in *names*."""
    return sum(1 for occurrences in Counter(names.dropna()).values() if occurrences > 1)


def _check_duplicates_by_year(df: pd.DataFrame, years: list, volume_col: str, label: str) -> bool:
    """Print one PASS/FAIL line per season for duplicated display_names.

    Only rows whose *volume_col* is greater than zero are considered.
    Returns True when no season has a duplicated display_name.
    """
    ok = True
    for yr in years:
        active = df[(df["Season"] == yr) & (df[volume_col] > 0)]
        dup = _count_repeated_names(active["display_name"])
        if dup:
            print(f"  [FAIL] Duplicate display_names in {label} {yr}: {dup}")
            ok = False
        else:
            print(f"  [PASS] No duplicate display_names in {label} {yr}")
    return ok


def _spot_check_names(df: pd.DataFrame, volume_col: str) -> set:
    """Return lower-cased display_names of SPOT_CHECK_YEAR rows with *volume_col* > 0.

    Null display_names are dropped: a NaN is a float, and a substring test
    against it would raise TypeError instead of reporting a clean FAIL.
    """
    rows = df[(df["Season"] == SPOT_CHECK_YEAR) & (df[volume_col] > 0)]
    return {str(name).lower() for name in rows["display_name"].dropna()}


def _run_spot_checks(names_lc: set, expected: list, label: str, *, should_exist: bool) -> bool:
    """Print a PASS/FAIL line for each name in *expected* and return overall success.

    A name counts as found when its lower-cased form is a substring of any
    entry in *names_lc*. With ``should_exist=True`` a found name passes; with
    ``should_exist=False`` a found name fails.
    """
    ok = True
    for check in expected:
        needle = check.lower()
        found = any(needle in name for name in names_lc)
        passed = found == should_exist
        status = "[PASS]" if passed else "[FAIL]"
        if should_exist:
            print(f"  {status} Spot check — {check!r} in {label} {SPOT_CHECK_YEAR}: {'FOUND' if found else 'NOT FOUND'}")
        else:
            print(f"  {status} Spot check — {check!r} NOT in {label} {SPOT_CHECK_YEAR}: {'ABSENT' if not found else 'PRESENT (FAIL)'}")
        if not passed:
            ok = False
    return ok


def validate() -> bool:
    """Validate the Card Lab Parquet artifacts and print a report.

    Three files are inspected in order:

    * the identity map: summary counts are printed, and duplicated
      display_names are a FAIL;
    * pitcher seasons: per-season row counts, duplicated display_names among
      rows with IP > 0, and the PITCHER_SPOT_CHECKS_PRESENT names;
    * hitter seasons: null statcast_name rows, duplicated display_names among
      rows with AB > 0, and the HITTER_SPOT_CHECKS_PRESENT / _ABSENT names.

    Returns:
        True when no FAIL condition was detected, False otherwise. If the
        identity map file is missing the function returns False immediately,
        without checking the other files or printing the final verdict.
    """
    all_pass = True

    # -------------------------------------------------------------------
    # player_identity_map.parquet
    # -------------------------------------------------------------------
    print("\n=== player_identity_map.parquet ===")
    if not _check_file(_IDENTITY_MAP_PATH, "player_identity_map"):
        return False

    idf = pd.read_parquet(_IDENTITY_MAP_PATH)
    total = len(idf)
    with_mlbam    = int(idf["player_id"].notna().sum())
    miss_mlbam    = total - with_mlbam
    with_statcast = int(idf["statcast_name"].notna().sum())
    miss_statcast = total - with_statcast
    collisions    = int(idf["source_note"].str.contains("collision", na=False).sum())

    print(f"  Total records:          {total}")
    print(f"  With MLBAM player_id:   {with_mlbam}")
    print(f"  Missing player_id:      {miss_mlbam}")
    print(f"  With statcast_name:     {with_statcast}")
    print(f"  Missing statcast_name:  {miss_statcast}")
    print(f"  Collision records:      {collisions}")

    if miss_statcast:
        missing_names = idf[idf["statcast_name"].isna()]["canonical_name"].head(20).tolist()
        print(f"  First 20 missing statcast_name: {missing_names}")

    # Duplicate display_names in the identity map are a hard failure.
    dup_display = _count_repeated_names(idf["display_name"])
    if dup_display:
        print(f"  [FAIL] Duplicate display_names in identity map: {dup_display}")
        all_pass = False

    # -------------------------------------------------------------------
    # card_lab_pitcher_seasons.parquet
    # -------------------------------------------------------------------
    print("\n=== card_lab_pitcher_seasons.parquet ===")
    if not _check_file(_PITCHER_OUTPUT_PATH, "pitcher seasons"):
        all_pass = False
    else:
        pdf = pd.read_parquet(_PITCHER_OUTPUT_PATH)
        years = sorted(pdf["Season"].dropna().unique().astype(int).tolist())
        print(f"  Years present: {years}")
        for yr in years:
            yr_df = pdf[pdf["Season"] == yr]
            with_pid = int(yr_df["player_id"].notna().sum())
            null_pid = len(yr_df) - with_pid
            print(f"  {yr}: {len(yr_df)} rows  ({with_pid} with player_id, {null_pid} without)")

        total_null_pid = int(pdf["player_id"].isna().sum())
        print(f"  Total null player_id: {total_null_pid}")

        # Duplicate display_name check per year
        if not _check_duplicates_by_year(pdf, years, "IP", "pitcher"):
            all_pass = False

        # Spot checks (pitcher present checks)
        if SPOT_CHECK_YEAR in years:
            pitcher_names = _spot_check_names(pdf, "IP")
            if not _run_spot_checks(pitcher_names, PITCHER_SPOT_CHECKS_PRESENT, "pitcher", should_exist=True):
                all_pass = False

    # -------------------------------------------------------------------
    # card_lab_hitter_seasons.parquet
    # -------------------------------------------------------------------
    print("\n=== card_lab_hitter_seasons.parquet ===")
    if not _check_file(_HITTER_OUTPUT_PATH, "hitter seasons"):
        all_pass = False
    else:
        hdf = pd.read_parquet(_HITTER_OUTPUT_PATH)
        years = sorted(hdf["Season"].dropna().unique().astype(int).tolist())
        print(f"  Years present: {years}")
        for yr in years:
            yr_df = hdf[hdf["Season"] == yr]
            print(f"  {yr}: {len(yr_df)} rows (all have statcast_name — null rows excluded at build)")

        # Confirm no null statcast_name in output (build script must have excluded them)
        null_sc = int(hdf["statcast_name"].isna().sum())
        if null_sc:
            print(f"  [FAIL] {null_sc} hitter rows with null statcast_name in output (should be 0)")
            all_pass = False
        else:
            print("  [PASS] All hitter rows have statcast_name")

        # Duplicate display_name check per year
        if not _check_duplicates_by_year(hdf, years, "AB", "hitter"):
            all_pass = False

        # Spot checks: some names must be present, others (pitchers) must be absent.
        if SPOT_CHECK_YEAR in years:
            hitter_names = _spot_check_names(hdf, "AB")
            if not _run_spot_checks(hitter_names, HITTER_SPOT_CHECKS_PRESENT, "hitter", should_exist=True):
                all_pass = False
            if not _run_spot_checks(hitter_names, HITTER_SPOT_CHECKS_ABSENT, "hitter", should_exist=False):
                all_pass = False

    # -------------------------------------------------------------------
    # Final verdict
    # -------------------------------------------------------------------
    print("\n" + ("=" * 50))
    if all_pass:
        print("RESULT: ALL PASS — safe to commit Parquet artifacts")
    else:
        print("RESULT: FAIL — fix issues above before committing")
    print("=" * 50)

    return all_pass


if __name__ == "__main__":
    # Exit status mirrors the validation outcome: 0 on success, 1 on any FAIL.
    exit_code = 0 if validate() else 1
    sys.exit(exit_code)