Spaces:
Sleeping
Sleeping
Card Lab Pass 5: identity map + batter_id-based hitter query
Browse files
- Build player_identity_map.parquet from pybaseball + MLBAM lookup + DB enrichment
- Pitcher statcast_name resolved from ec.player_name (Last, First format)
- Hitter statcast_name set to canonical_name; hitter cards now query by ec.batter = :batter_id
- card_lab_pitcher_seasons.parquet and card_lab_hitter_seasons.parquet are built from
  pybaseball pitching_stats/batting_stats, gated on IP > 0 (pitchers) and AB > 0 (hitters)
- get_player_card_window_df: add batter_id param for Hitter path (ec.batter = :batter_id)
- card_lab_page.py: read batter_id from Parquet, pass to window query; no DB call for MLBAM ID
- Collision-safe display_names; all validation checks pass (ALL PASS)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- data/card_lab_hitter_seasons.parquet +3 -0
- data/card_lab_pitcher_seasons.parquet +3 -0
- data/player_identity_map.parquet +3 -0
- scripts/build_card_lab_season_summaries.py +2 -2
- scripts/build_player_identity_map.py +91 -58
- visualization/card_lab_page.py +32 -11
- visualization/cards/card_queries.py +49 -7
data/card_lab_hitter_seasons.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a661d5caf2cf95decb7a632bc33c0eb33d3268ec689d7bbcccd46abb7c2f0885
|
| 3 |
+
size 92044
|
data/card_lab_pitcher_seasons.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7326980b9dffdbc208712b87faf8ed00706aea6172b5ad9e50b9250a84625308
|
| 3 |
+
size 102665
|
data/player_identity_map.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6036880b0335b6e28010c0540aebb65af0b3fe8946f526ad2d604b08aff4a842
|
| 3 |
+
size 206860
|
scripts/build_card_lab_season_summaries.py
CHANGED
|
@@ -240,12 +240,12 @@ def build_season_summaries(seasons: list[int]) -> None:
|
|
| 240 |
pitcher_df = build_pitcher_seasons(seasons, identity_df)
|
| 241 |
_PITCHER_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
|
| 242 |
pitcher_df.to_parquet(_PITCHER_OUTPUT_PATH, index=False)
|
| 243 |
-
print(f"\nWrote {len(pitcher_df)} pitcher rows
|
| 244 |
|
| 245 |
hitter_df = build_hitter_seasons(seasons, identity_df)
|
| 246 |
_HITTER_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
|
| 247 |
hitter_df.to_parquet(_HITTER_OUTPUT_PATH, index=False)
|
| 248 |
-
print(f"\nWrote {len(hitter_df)} hitter rows
|
| 249 |
|
| 250 |
|
| 251 |
if __name__ == "__main__":
|
|
|
|
| 240 |
pitcher_df = build_pitcher_seasons(seasons, identity_df)
|
| 241 |
_PITCHER_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
|
| 242 |
pitcher_df.to_parquet(_PITCHER_OUTPUT_PATH, index=False)
|
| 243 |
+
print(f"\nWrote {len(pitcher_df)} pitcher rows -> {_PITCHER_OUTPUT_PATH}")
|
| 244 |
|
| 245 |
hitter_df = build_hitter_seasons(seasons, identity_df)
|
| 246 |
_HITTER_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
|
| 247 |
hitter_df.to_parquet(_HITTER_OUTPUT_PATH, index=False)
|
| 248 |
+
print(f"\nWrote {len(hitter_df)} hitter rows -> {_HITTER_OUTPUT_PATH}")
|
| 249 |
|
| 250 |
|
| 251 |
if __name__ == "__main__":
|
scripts/build_player_identity_map.py
CHANGED
|
@@ -170,11 +170,19 @@ def _build_raw_records(
|
|
| 170 |
|
| 171 |
def _enrich_statcast_names(identity_df: pd.DataFrame) -> pd.DataFrame:
|
| 172 |
"""
|
| 173 |
-
Populate statcast_name
|
| 174 |
-
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
"""
|
| 177 |
-
print(" Connecting to DB for statcast_name enrichment...")
|
| 178 |
conn = get_connection()
|
| 179 |
try:
|
| 180 |
rows = conn.execute(
|
|
@@ -184,47 +192,74 @@ def _enrich_statcast_names(identity_df: pd.DataFrame) -> pd.DataFrame:
|
|
| 184 |
conn.close()
|
| 185 |
|
| 186 |
statcast_names: list[str] = [r[0] for r in rows if r[0]]
|
| 187 |
-
|
| 188 |
-
print(f" Loaded {len(statcast_names)} distinct statcast player_names")
|
| 189 |
|
| 190 |
-
#
|
|
|
|
|
|
|
|
|
|
| 191 |
norm_to_statcast: dict[str, list[str]] = defaultdict(list)
|
| 192 |
for sc in statcast_names:
|
| 193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
|
|
|
| 198 |
|
| 199 |
for idx, row in identity_df.iterrows():
|
| 200 |
canonical = row["canonical_name"]
|
| 201 |
norm_key = row["canonical_name_normalized"]
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
if
|
|
|
|
| 209 |
identity_df.at[idx, "source_note"] = str(row.get("source_note", "")) + "+statcast"
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
# Layer 2: normalized key match
|
| 214 |
-
candidates = norm_to_statcast.get(norm_key, [])
|
| 215 |
-
if len(candidates) == 1:
|
| 216 |
-
identity_df.at[idx, "statcast_name"] = candidates[0]
|
| 217 |
-
if "statcast" not in str(row.get("source_note", "")):
|
| 218 |
-
identity_df.at[idx, "source_note"] = str(row.get("source_note", "")) + "+statcast-norm"
|
| 219 |
-
resolved += 1
|
| 220 |
-
elif len(candidates) > 1:
|
| 221 |
-
print(f" AMBIGUOUS: {canonical!r} → {candidates}")
|
| 222 |
-
ambiguous += 1
|
| 223 |
-
else:
|
| 224 |
-
print(f" UNMATCHED: {canonical!r} (no statcast player_name found)")
|
| 225 |
-
unmatched += 1
|
| 226 |
|
| 227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
return identity_df
|
| 229 |
|
| 230 |
|
|
@@ -242,31 +277,29 @@ def _resolve_collisions(identity_df: pd.DataFrame) -> pd.DataFrame:
|
|
| 242 |
identity_df = identity_df.copy()
|
| 243 |
identity_df["display_name"] = identity_df["canonical_name"]
|
| 244 |
|
| 245 |
-
# Find collisions: same normalized name,
|
| 246 |
norm_groups = identity_df.groupby("canonical_name_normalized")
|
| 247 |
collision_count = 0
|
| 248 |
for norm_key, group in norm_groups:
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
else
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
)
|
| 269 |
-
collision_count += 1
|
| 270 |
|
| 271 |
if collision_count:
|
| 272 |
print(f" Resolved {collision_count} collision suffix(es)")
|
|
@@ -309,7 +342,7 @@ def build_identity_map(seasons: list[int]) -> None:
|
|
| 309 |
|
| 310 |
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
|
| 311 |
identity_df.to_parquet(OUTPUT_PATH, index=False)
|
| 312 |
-
print(f"\nWrote {len(identity_df)} rows
|
| 313 |
|
| 314 |
# Summary
|
| 315 |
with_mlbam = identity_df["player_id"].notna().sum()
|
|
|
|
| 170 |
|
| 171 |
def _enrich_statcast_names(identity_df: pd.DataFrame) -> pd.DataFrame:
|
| 172 |
"""
|
| 173 |
+
Populate statcast_name for each identity record.
|
| 174 |
+
|
| 175 |
+
DB context: statcast_event_core.player_name stores the PITCHER name in "Last, First"
|
| 176 |
+
format. Pure hitters never appear there. Strategy:
|
| 177 |
+
|
| 178 |
+
- role_pitcher records: match against ec.player_name (Last, First → canonical → norm).
|
| 179 |
+
statcast_name = the DB's "Last, First" string (used in pitcher selector label only;
|
| 180 |
+
pitcher window queries use ec.pitcher = :pitcher_id, not player_name).
|
| 181 |
+
- role_hitter-only records: set statcast_name = canonical_name (First Last).
|
| 182 |
+
Hitter window queries will use ec.batter = :batter_id via player_id, not player_name.
|
| 183 |
+
- Two-way players (both roles): try pitcher match first; fall back to canonical_name.
|
| 184 |
"""
|
| 185 |
+
print(" Connecting to DB for statcast_name enrichment (pitchers only)...")
|
| 186 |
conn = get_connection()
|
| 187 |
try:
|
| 188 |
rows = conn.execute(
|
|
|
|
| 192 |
conn.close()
|
| 193 |
|
| 194 |
statcast_names: list[str] = [r[0] for r in rows if r[0]]
|
| 195 |
+
print(f" Loaded {len(statcast_names)} distinct statcast player_names (pitcher names)")
|
|
|
|
| 196 |
|
| 197 |
+
# Pre-build O(1) lookup dicts so the per-record loop is fast.
|
| 198 |
+
# statcast names are "Last, First" — apply to_canonical_name before normalizing
|
| 199 |
+
# so the key matches pybaseball "First Last" canonical_name_normalized.
|
| 200 |
+
canonical_lower_to_statcast: dict[str, str] = {} # "first last" → "Last, First"
|
| 201 |
norm_to_statcast: dict[str, list[str]] = defaultdict(list)
|
| 202 |
for sc in statcast_names:
|
| 203 |
+
canon = to_canonical_name(sc) # "Last, First" → "First Last"
|
| 204 |
+
cl = canon.lower()
|
| 205 |
+
if cl not in canonical_lower_to_statcast:
|
| 206 |
+
canonical_lower_to_statcast[cl] = sc
|
| 207 |
+
norm_to_statcast[normalize_for_matching(canon)].append(sc)
|
| 208 |
|
| 209 |
+
resolved_pitcher = 0
|
| 210 |
+
resolved_hitter = 0
|
| 211 |
+
ambiguous = 0
|
| 212 |
+
unmatched_pitcher = 0
|
| 213 |
|
| 214 |
for idx, row in identity_df.iterrows():
|
| 215 |
canonical = row["canonical_name"]
|
| 216 |
norm_key = row["canonical_name_normalized"]
|
| 217 |
+
is_pitcher = bool(row.get("role_pitcher"))
|
| 218 |
+
is_hitter = bool(row.get("role_hitter"))
|
| 219 |
+
|
| 220 |
+
if is_pitcher:
|
| 221 |
+
# Layer 1: exact canonical lowercase match (O(1))
|
| 222 |
+
sc = canonical_lower_to_statcast.get(canonical.lower())
|
| 223 |
+
if sc:
|
| 224 |
+
identity_df.at[idx, "statcast_name"] = sc
|
| 225 |
identity_df.at[idx, "source_note"] = str(row.get("source_note", "")) + "+statcast"
|
| 226 |
+
resolved_pitcher += 1
|
| 227 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
|
| 229 |
+
# Layer 2: normalized key match (O(1))
|
| 230 |
+
candidates = norm_to_statcast.get(norm_key, [])
|
| 231 |
+
if len(candidates) == 1:
|
| 232 |
+
identity_df.at[idx, "statcast_name"] = candidates[0]
|
| 233 |
+
identity_df.at[idx, "source_note"] = str(row.get("source_note", "")) + "+statcast-norm"
|
| 234 |
+
resolved_pitcher += 1
|
| 235 |
+
continue
|
| 236 |
+
elif len(candidates) > 1:
|
| 237 |
+
print(f" AMBIGUOUS pitcher: {canonical!r} → {candidates}")
|
| 238 |
+
ambiguous += 1
|
| 239 |
+
# Fall through: use canonical_name as statcast_name so row is not excluded
|
| 240 |
+
else:
|
| 241 |
+
print(f" UNMATCHED pitcher: {canonical!r}")
|
| 242 |
+
unmatched_pitcher += 1
|
| 243 |
+
# Fall through to hitter branch if also a hitter; else canonical fallback
|
| 244 |
+
|
| 245 |
+
if is_hitter and identity_df.at[idx, "statcast_name"] is None:
|
| 246 |
+
# Hitter window queries use ec.batter = :batter_id (player_id), not player_name.
|
| 247 |
+
# statcast_name must be non-null to pass build validation.
|
| 248 |
+
# Use canonical_name (First Last) as a stable non-null placeholder.
|
| 249 |
+
identity_df.at[idx, "statcast_name"] = canonical
|
| 250 |
+
identity_df.at[idx, "source_note"] = str(row.get("source_note", "")) + "+hitter-canonical"
|
| 251 |
+
resolved_hitter += 1
|
| 252 |
+
|
| 253 |
+
# Final fallback: any record still missing statcast_name (e.g. unmatched pure pitcher)
|
| 254 |
+
if identity_df.at[idx, "statcast_name"] is None:
|
| 255 |
+
identity_df.at[idx, "statcast_name"] = canonical
|
| 256 |
+
identity_df.at[idx, "source_note"] = str(row.get("source_note", "")) + "+canonical-fallback"
|
| 257 |
+
|
| 258 |
+
print(
|
| 259 |
+
f" Enrichment: pitcher_matched={resolved_pitcher} "
|
| 260 |
+
f"hitter_canonical={resolved_hitter} "
|
| 261 |
+
f"ambiguous={ambiguous} unmatched_pitcher={unmatched_pitcher}"
|
| 262 |
+
)
|
| 263 |
return identity_df
|
| 264 |
|
| 265 |
|
|
|
|
| 277 |
identity_df = identity_df.copy()
|
| 278 |
identity_df["display_name"] = identity_df["canonical_name"]
|
| 279 |
|
| 280 |
+
# Find collisions: same normalized name, multiple rows
|
| 281 |
norm_groups = identity_df.groupby("canonical_name_normalized")
|
| 282 |
collision_count = 0
|
| 283 |
for norm_key, group in norm_groups:
|
| 284 |
+
if len(group) == 1:
|
| 285 |
+
continue # no collision
|
| 286 |
+
# Multiple records share the same normalized name — assign suffixes to all but the first.
|
| 287 |
+
# Sort: non-null player_id ascending first, then null (deterministic).
|
| 288 |
+
sorted_group = group.sort_values(
|
| 289 |
+
"player_id", ascending=True, na_position="last"
|
| 290 |
+
)
|
| 291 |
+
for rank, (idx, row) in enumerate(sorted_group.iterrows()):
|
| 292 |
+
if rank == 0:
|
| 293 |
+
# Primary: keep canonical_name as display_name
|
| 294 |
+
pass
|
| 295 |
+
else:
|
| 296 |
+
pid = row.get("player_id")
|
| 297 |
+
suffix = str(int(pid)) if pd.notna(pid) else "?"
|
| 298 |
+
identity_df.at[idx, "display_name"] = f"{row['canonical_name']} ({suffix})"
|
| 299 |
+
identity_df.at[idx, "source_note"] = (
|
| 300 |
+
str(row.get("source_note", "")) + "+collision-resolved"
|
| 301 |
+
)
|
| 302 |
+
collision_count += 1
|
|
|
|
|
|
|
| 303 |
|
| 304 |
if collision_count:
|
| 305 |
print(f" Resolved {collision_count} collision suffix(es)")
|
|
|
|
| 342 |
|
| 343 |
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
|
| 344 |
identity_df.to_parquet(OUTPUT_PATH, index=False)
|
| 345 |
+
print(f"\nWrote {len(identity_df)} rows -> {OUTPUT_PATH}")
|
| 346 |
|
| 347 |
# Summary
|
| 348 |
with_mlbam = identity_df["player_id"].notna().sum()
|
visualization/card_lab_page.py
CHANGED
|
@@ -47,10 +47,11 @@ def normalize_name(name: str) -> str:
|
|
| 47 |
# Card generation functions — button-click only, no caching
|
| 48 |
# ---------------------------------------------------------------------------
|
| 49 |
|
| 50 |
-
def _gen_hitter_bytes(conn, player_name, mode, year, date, start_date, end_date, fmt, player_pil):
|
| 51 |
windowed_df = get_player_card_window_df(
|
| 52 |
conn, player_name, "Hitter", mode=mode, year=year,
|
| 53 |
date=date, start_date=start_date, end_date=end_date,
|
|
|
|
| 54 |
)
|
| 55 |
if windowed_df.empty:
|
| 56 |
return None, "", "limited"
|
|
@@ -149,6 +150,7 @@ def render_card_lab(conn) -> None:
|
|
| 149 |
# Hitter pipeline variables
|
| 150 |
hitter_display_names: list[str] = []
|
| 151 |
hitter_display_to_statcast: dict[str, str] = {}
|
|
|
|
| 152 |
|
| 153 |
# Pitcher pipeline variables
|
| 154 |
pitcher_display_names: list[str] = []
|
|
@@ -169,15 +171,31 @@ def render_card_lab(conn) -> None:
|
|
| 169 |
return
|
| 170 |
|
| 171 |
selected_hitter_display = st.selectbox("Player", hitter_display_names, key="cl_player_hitter")
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
)
|
| 179 |
-
|
| 180 |
-
|
|
|
|
| 181 |
|
| 182 |
elif card_type == "Pitcher":
|
| 183 |
# Source: data/card_lab_pitcher_seasons.parquet (pybaseball pitching_stats, IP > 0 gate)
|
|
@@ -239,7 +257,8 @@ def render_card_lab(conn) -> None:
|
|
| 239 |
mlbam_id: int | None = None
|
| 240 |
player_pil = None
|
| 241 |
if card_type == "Hitter" and player_name:
|
| 242 |
-
|
|
|
|
| 243 |
player_pil = resolve_player_image(mlbam_id) if mlbam_id else None
|
| 244 |
elif card_type == "Pitcher" and pitcher_id:
|
| 245 |
mlbam_id = pitcher_id
|
|
@@ -300,6 +319,8 @@ def render_card_lab(conn) -> None:
|
|
| 300 |
|
| 301 |
if card_type in ("Hitter", "Pitcher"):
|
| 302 |
st.write(f"**MLBAM ID:** {mlbam_id}")
|
|
|
|
|
|
|
| 303 |
st.write(f"**Image fetched:** {player_pil is not None}")
|
| 304 |
|
| 305 |
fmt = st.radio("Format", ["PNG", "JPG"], horizontal=True, key="cl_fmt")
|
|
@@ -321,7 +342,7 @@ def render_card_lab(conn) -> None:
|
|
| 321 |
status.info("Querying warehouse data...")
|
| 322 |
status.info("Building poster...")
|
| 323 |
img_bytes, tf, dq = _gen_hitter_bytes(
|
| 324 |
-
conn, player_name, mode_key, year, date, start_date, end_date, fmt, player_pil
|
| 325 |
)
|
| 326 |
st.session_state["card_player"] = normalize_name(player_name or "unknown")
|
| 327 |
st.session_state["card_timeframe"] = tf
|
|
|
|
| 47 |
# Card generation functions — button-click only, no caching
|
| 48 |
# ---------------------------------------------------------------------------
|
| 49 |
|
| 50 |
+
def _gen_hitter_bytes(conn, player_name, batter_id, mode, year, date, start_date, end_date, fmt, player_pil):
|
| 51 |
windowed_df = get_player_card_window_df(
|
| 52 |
conn, player_name, "Hitter", mode=mode, year=year,
|
| 53 |
date=date, start_date=start_date, end_date=end_date,
|
| 54 |
+
batter_id=batter_id,
|
| 55 |
)
|
| 56 |
if windowed_df.empty:
|
| 57 |
return None, "", "limited"
|
|
|
|
| 150 |
# Hitter pipeline variables
|
| 151 |
hitter_display_names: list[str] = []
|
| 152 |
hitter_display_to_statcast: dict[str, str] = {}
|
| 153 |
+
_hitter_batter_id: int | None = None # MLBAM batter id from Parquet
|
| 154 |
|
| 155 |
# Pitcher pipeline variables
|
| 156 |
pitcher_display_names: list[str] = []
|
|
|
|
| 171 |
return
|
| 172 |
|
| 173 |
selected_hitter_display = st.selectbox("Player", hitter_display_names, key="cl_player_hitter")
|
| 174 |
+
|
| 175 |
+
# Read batter_id (MLBAM) from the Parquet — used for ec.batter = :batter_id query.
|
| 176 |
+
# DB schema: ec.player_name = PITCHER, not batter. batter_id is required.
|
| 177 |
+
_hitter_batter_id: int | None = None
|
| 178 |
+
if _HITTER_SEASONS_PATH.exists():
|
| 179 |
+
import pandas as _pd_hi
|
| 180 |
+
_hdf = _pd_hi.read_parquet(_HITTER_SEASONS_PATH)
|
| 181 |
+
_match = _hdf[_hdf["display_name"] == selected_hitter_display]
|
| 182 |
+
if selector_year is not None:
|
| 183 |
+
_yr = _match[_match["Season"] == selector_year]
|
| 184 |
+
if not _yr.empty:
|
| 185 |
+
_match = _yr
|
| 186 |
+
if not _match.empty:
|
| 187 |
+
_pid = _match.iloc[0]["player_id"]
|
| 188 |
+
if _pd_hi.notna(_pid):
|
| 189 |
+
_hitter_batter_id = int(_pid)
|
| 190 |
+
|
| 191 |
+
if _hitter_batter_id is None:
|
| 192 |
+
st.warning(
|
| 193 |
+
f"No MLBAM batter ID found for {selected_hitter_display!r}. "
|
| 194 |
+
"Card data will be empty. Re-run build_player_identity_map.py."
|
| 195 |
)
|
| 196 |
+
|
| 197 |
+
player_name = selected_hitter_display # display label (injected as literal in SQL)
|
| 198 |
+
player_name_display = selected_hitter_display
|
| 199 |
|
| 200 |
elif card_type == "Pitcher":
|
| 201 |
# Source: data/card_lab_pitcher_seasons.parquet (pybaseball pitching_stats, IP > 0 gate)
|
|
|
|
| 257 |
mlbam_id: int | None = None
|
| 258 |
player_pil = None
|
| 259 |
if card_type == "Hitter" and player_name:
|
| 260 |
+
# batter_id from Parquet — no DB call needed (ec.player_name = pitcher, not batter)
|
| 261 |
+
mlbam_id = _hitter_batter_id
|
| 262 |
player_pil = resolve_player_image(mlbam_id) if mlbam_id else None
|
| 263 |
elif card_type == "Pitcher" and pitcher_id:
|
| 264 |
mlbam_id = pitcher_id
|
|
|
|
| 319 |
|
| 320 |
if card_type in ("Hitter", "Pitcher"):
|
| 321 |
st.write(f"**MLBAM ID:** {mlbam_id}")
|
| 322 |
+
if card_type == "Hitter":
|
| 323 |
+
st.write(f"**Batter ID (from Parquet):** {_hitter_batter_id}")
|
| 324 |
st.write(f"**Image fetched:** {player_pil is not None}")
|
| 325 |
|
| 326 |
fmt = st.radio("Format", ["PNG", "JPG"], horizontal=True, key="cl_fmt")
|
|
|
|
| 342 |
status.info("Querying warehouse data...")
|
| 343 |
status.info("Building poster...")
|
| 344 |
img_bytes, tf, dq = _gen_hitter_bytes(
|
| 345 |
+
conn, player_name, _hitter_batter_id, mode_key, year, date, start_date, end_date, fmt, player_pil
|
| 346 |
)
|
| 347 |
st.session_state["card_player"] = normalize_name(player_name or "unknown")
|
| 348 |
st.session_state["card_timeframe"] = tf
|
visualization/cards/card_queries.py
CHANGED
|
@@ -228,6 +228,7 @@ def get_player_card_window_df(
|
|
| 228 |
start_date: str | None = None,
|
| 229 |
end_date: str | None = None,
|
| 230 |
pitcher_id: int | None = None,
|
|
|
|
| 231 |
) -> pd.DataFrame:
|
| 232 |
"""
|
| 233 |
Fetch the FULL matching pitch/event dataset for one player over a time window.
|
|
@@ -238,8 +239,9 @@ def get_player_card_window_df(
|
|
| 238 |
Season mode uses source_season INT filter (fastest).
|
| 239 |
Date range / single date use game_date TEXT range comparison.
|
| 240 |
|
| 241 |
-
For Pitcher card_type, pitcher_id is REQUIRED (
|
| 242 |
-
|
|
|
|
| 243 |
"""
|
| 244 |
try:
|
| 245 |
if card_type == "Pitcher":
|
|
@@ -289,14 +291,54 @@ def get_player_card_window_df(
|
|
| 289 |
params = {"player_name": player_name, "pitcher_id": pitcher_id, "sd": sd, "ed": ed}
|
| 290 |
|
| 291 |
else: # Hitter
|
| 292 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
if mode == "season" and year:
|
| 294 |
-
sql = text(
|
| 295 |
-
params = {"player_name": player_name, "year": int(year)}
|
| 296 |
else:
|
| 297 |
sd, ed = _date_range(mode, date, start_date, end_date)
|
| 298 |
-
sql = text(
|
| 299 |
-
params = {"player_name": player_name, "sd": sd, "ed": ed}
|
| 300 |
|
| 301 |
df = pd.read_sql(sql, conn, params=params)
|
| 302 |
|
|
|
|
| 228 |
start_date: str | None = None,
|
| 229 |
end_date: str | None = None,
|
| 230 |
pitcher_id: int | None = None,
|
| 231 |
+
batter_id: int | None = None,
|
| 232 |
) -> pd.DataFrame:
|
| 233 |
"""
|
| 234 |
Fetch the FULL matching pitch/event dataset for one player over a time window.
|
|
|
|
| 239 |
Season mode uses source_season INT filter (fastest).
|
| 240 |
Date range / single date use game_date TEXT range comparison.
|
| 241 |
|
| 242 |
+
For Pitcher card_type, pitcher_id is REQUIRED (ec.pitcher = :pitcher_id).
|
| 243 |
+
For Hitter card_type, batter_id is REQUIRED (ec.batter = :batter_id).
|
| 244 |
+
player_name is injected as a display label in both cases.
|
| 245 |
"""
|
| 246 |
try:
|
| 247 |
if card_type == "Pitcher":
|
|
|
|
| 291 |
params = {"player_name": player_name, "pitcher_id": pitcher_id, "sd": sd, "ed": ed}
|
| 292 |
|
| 293 |
else: # Hitter
|
| 294 |
+
if batter_id is None:
|
| 295 |
+
logger.warning(
|
| 296 |
+
"[card_lab_db_window] batter_id required for Hitter card_type "
|
| 297 |
+
"(player_name='%s') — returning empty", player_name,
|
| 298 |
+
)
|
| 299 |
+
return pd.DataFrame()
|
| 300 |
+
|
| 301 |
+
# DB schema: ec.player_name = PITCHER name. Hitters are identified by
|
| 302 |
+
# ec.batter (MLBAM ID). Inject player_name as a literal so downstream
|
| 303 |
+
# card_data.py filtering (df["player_name"] == player_name) still works.
|
| 304 |
+
_HITTER_SELECT = f"""
|
| 305 |
+
SELECT
|
| 306 |
+
ec.event_key,
|
| 307 |
+
:player_name AS player_name,
|
| 308 |
+
ec.game_date,
|
| 309 |
+
ec.game_pk,
|
| 310 |
+
ec.source_season,
|
| 311 |
+
ec.pitch_name,
|
| 312 |
+
ec.events,
|
| 313 |
+
ec.description,
|
| 314 |
+
ec.stand,
|
| 315 |
+
ec.p_throws,
|
| 316 |
+
ec.batter,
|
| 317 |
+
ec.home_team,
|
| 318 |
+
ec.away_team,
|
| 319 |
+
ec.inning,
|
| 320 |
+
ec.at_bat_number,
|
| 321 |
+
ec.pitch_number,
|
| 322 |
+
bb.launch_speed,
|
| 323 |
+
bb.launch_angle,
|
| 324 |
+
bb.bb_type,
|
| 325 |
+
bb.estimated_woba_using_speedangle,
|
| 326 |
+
pr.release_speed,
|
| 327 |
+
pr.release_spin_rate,
|
| 328 |
+
pr.pfx_x,
|
| 329 |
+
pr.pfx_z
|
| 330 |
+
FROM statcast_event_core ec
|
| 331 |
+
LEFT JOIN statcast_batted_ball bb ON ec.event_key = bb.event_key
|
| 332 |
+
LEFT JOIN statcast_pitch_release pr ON ec.event_key = pr.event_key
|
| 333 |
+
WHERE ec.batter = :batter_id
|
| 334 |
+
"""
|
| 335 |
if mode == "season" and year:
|
| 336 |
+
sql = text(_HITTER_SELECT + " AND ec.source_season = :year ORDER BY ec.game_date DESC, ec.game_pk DESC")
|
| 337 |
+
params = {"player_name": player_name, "batter_id": batter_id, "year": int(year)}
|
| 338 |
else:
|
| 339 |
sd, ed = _date_range(mode, date, start_date, end_date)
|
| 340 |
+
sql = text(_HITTER_SELECT + " AND ec.game_date >= :sd AND ec.game_date <= :ed ORDER BY ec.game_date DESC, ec.game_pk DESC")
|
| 341 |
+
params = {"player_name": player_name, "batter_id": batter_id, "sd": sd, "ed": ed}
|
| 342 |
|
| 343 |
df = pd.read_sql(sql, conn, params=params)
|
| 344 |
|