Syntrex Claude Sonnet 4.6 committed on
Commit
84163a9
·
1 Parent(s): 8f839e6

Card Lab Pass 5: identity map + batter_id-based hitter query

Browse files

- Build player_identity_map.parquet from pybaseball + MLBAM lookup + DB enrichment
- Pitcher statcast_name resolved from ec.player_name (Last, First format)
- Hitter statcast_name set to canonical_name; hitter cards now query by ec.batter = :batter_id
- card_lab_pitcher_seasons.parquet and card_lab_hitter_seasons.parquet built from
  pybaseball pitching_stats/batting_stats, gated on IP > 0 (pitchers) and AB > 0 (hitters)
- get_player_card_window_df: add batter_id param for Hitter path (ec.batter = :batter_id)
- card_lab_page.py: read batter_id from Parquet, pass to window query; no DB call for MLBAM ID
- Collision-safe display_names; all validation checks pass (ALL PASS)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

data/card_lab_hitter_seasons.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a661d5caf2cf95decb7a632bc33c0eb33d3268ec689d7bbcccd46abb7c2f0885
3
+ size 92044
data/card_lab_pitcher_seasons.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7326980b9dffdbc208712b87faf8ed00706aea6172b5ad9e50b9250a84625308
3
+ size 102665
data/player_identity_map.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6036880b0335b6e28010c0540aebb65af0b3fe8946f526ad2d604b08aff4a842
3
+ size 206860
scripts/build_card_lab_season_summaries.py CHANGED
@@ -240,12 +240,12 @@ def build_season_summaries(seasons: list[int]) -> None:
240
  pitcher_df = build_pitcher_seasons(seasons, identity_df)
241
  _PITCHER_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
242
  pitcher_df.to_parquet(_PITCHER_OUTPUT_PATH, index=False)
243
- print(f"\nWrote {len(pitcher_df)} pitcher rows {_PITCHER_OUTPUT_PATH}")
244
 
245
  hitter_df = build_hitter_seasons(seasons, identity_df)
246
  _HITTER_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
247
  hitter_df.to_parquet(_HITTER_OUTPUT_PATH, index=False)
248
- print(f"\nWrote {len(hitter_df)} hitter rows {_HITTER_OUTPUT_PATH}")
249
 
250
 
251
  if __name__ == "__main__":
 
240
  pitcher_df = build_pitcher_seasons(seasons, identity_df)
241
  _PITCHER_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
242
  pitcher_df.to_parquet(_PITCHER_OUTPUT_PATH, index=False)
243
+ print(f"\nWrote {len(pitcher_df)} pitcher rows -> {_PITCHER_OUTPUT_PATH}")
244
 
245
  hitter_df = build_hitter_seasons(seasons, identity_df)
246
  _HITTER_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
247
  hitter_df.to_parquet(_HITTER_OUTPUT_PATH, index=False)
248
+ print(f"\nWrote {len(hitter_df)} hitter rows -> {_HITTER_OUTPUT_PATH}")
249
 
250
 
251
  if __name__ == "__main__":
scripts/build_player_identity_map.py CHANGED
@@ -170,11 +170,19 @@ def _build_raw_records(
170
 
171
  def _enrich_statcast_names(identity_df: pd.DataFrame) -> pd.DataFrame:
172
  """
173
- Populate statcast_name by matching against statcast_event_core.player_name.
174
- This is REQUIRED — without it, hitter rows stay statcast_name=None and are
175
- excluded from the hitter selector at build time.
 
 
 
 
 
 
 
 
176
  """
177
- print(" Connecting to DB for statcast_name enrichment...")
178
  conn = get_connection()
179
  try:
180
  rows = conn.execute(
@@ -184,47 +192,74 @@ def _enrich_statcast_names(identity_df: pd.DataFrame) -> pd.DataFrame:
184
  conn.close()
185
 
186
  statcast_names: list[str] = [r[0] for r in rows if r[0]]
187
- statcast_names_set: set[str] = set(statcast_names)
188
- print(f" Loaded {len(statcast_names)} distinct statcast player_names")
189
 
190
- # Build normalized key [statcast_names] map for fallback matching
 
 
 
191
  norm_to_statcast: dict[str, list[str]] = defaultdict(list)
192
  for sc in statcast_names:
193
- norm_to_statcast[normalize_for_matching(sc)].append(sc)
 
 
 
 
194
 
195
- resolved = 0
196
- ambiguous = 0
197
- unmatched = 0
 
198
 
199
  for idx, row in identity_df.iterrows():
200
  canonical = row["canonical_name"]
201
  norm_key = row["canonical_name_normalized"]
202
-
203
- # Layer 1: exact match (case-insensitive)
204
- sc_lower = canonical.lower()
205
- exact = [sc for sc in statcast_names_set if sc.lower() == sc_lower]
206
- if len(exact) == 1:
207
- identity_df.at[idx, "statcast_name"] = exact[0]
208
- if "statcast" not in str(row.get("source_note", "")):
 
209
  identity_df.at[idx, "source_note"] = str(row.get("source_note", "")) + "+statcast"
210
- resolved += 1
211
- continue
212
-
213
- # Layer 2: normalized key match
214
- candidates = norm_to_statcast.get(norm_key, [])
215
- if len(candidates) == 1:
216
- identity_df.at[idx, "statcast_name"] = candidates[0]
217
- if "statcast" not in str(row.get("source_note", "")):
218
- identity_df.at[idx, "source_note"] = str(row.get("source_note", "")) + "+statcast-norm"
219
- resolved += 1
220
- elif len(candidates) > 1:
221
- print(f" AMBIGUOUS: {canonical!r} → {candidates}")
222
- ambiguous += 1
223
- else:
224
- print(f" UNMATCHED: {canonical!r} (no statcast player_name found)")
225
- unmatched += 1
226
 
227
- print(f" Enrichment: resolved={resolved} ambiguous={ambiguous} unmatched={unmatched}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  return identity_df
229
 
230
 
@@ -242,31 +277,29 @@ def _resolve_collisions(identity_df: pd.DataFrame) -> pd.DataFrame:
242
  identity_df = identity_df.copy()
243
  identity_df["display_name"] = identity_df["canonical_name"]
244
 
245
- # Find collisions: same normalized name, different player_id
246
  norm_groups = identity_df.groupby("canonical_name_normalized")
247
  collision_count = 0
248
  for norm_key, group in norm_groups:
249
- # Only a collision if multiple distinct player_ids exist in the group
250
- ids = group["player_id"].dropna().unique()
251
- if len(ids) <= 1 and len(group) == 1:
252
- continue
253
- if len(ids) > 1:
254
- # Sort by player_id ascending (deterministic) — None last
255
- sorted_group = group.sort_values(
256
- "player_id", ascending=True, na_position="last"
257
- )
258
- for rank, (idx, row) in enumerate(sorted_group.iterrows()):
259
- if rank == 0:
260
- # Primary: keep canonical_name as display_name
261
- pass
262
- else:
263
- pid = row.get("player_id")
264
- suffix = str(int(pid)) if pd.notna(pid) else "?"
265
- identity_df.at[idx, "display_name"] = f"{row['canonical_name']} ({suffix})"
266
- identity_df.at[idx, "source_note"] = (
267
- str(row.get("source_note", "")) + "+collision-resolved"
268
- )
269
- collision_count += 1
270
 
271
  if collision_count:
272
  print(f" Resolved {collision_count} collision suffix(es)")
@@ -309,7 +342,7 @@ def build_identity_map(seasons: list[int]) -> None:
309
 
310
  OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
311
  identity_df.to_parquet(OUTPUT_PATH, index=False)
312
- print(f"\nWrote {len(identity_df)} rows {OUTPUT_PATH}")
313
 
314
  # Summary
315
  with_mlbam = identity_df["player_id"].notna().sum()
 
170
 
171
  def _enrich_statcast_names(identity_df: pd.DataFrame) -> pd.DataFrame:
172
  """
173
+ Populate statcast_name for each identity record.
174
+
175
+ DB context: statcast_event_core.player_name stores the PITCHER name in "Last, First"
176
+ format. Pure hitters never appear there. Strategy:
177
+
178
+ - role_pitcher records: match against ec.player_name (Last, First → canonical → norm).
179
+ statcast_name = the DB's "Last, First" string (used in pitcher selector label only;
180
+ pitcher window queries use ec.pitcher = :pitcher_id, not player_name).
181
+ - role_hitter-only records: set statcast_name = canonical_name (First Last).
182
+ Hitter window queries will use ec.batter = :batter_id via player_id, not player_name.
183
+ - Two-way players (both roles): try pitcher match first; fall back to canonical_name.
184
  """
185
+ print(" Connecting to DB for statcast_name enrichment (pitchers only)...")
186
  conn = get_connection()
187
  try:
188
  rows = conn.execute(
 
192
  conn.close()
193
 
194
  statcast_names: list[str] = [r[0] for r in rows if r[0]]
195
+ print(f" Loaded {len(statcast_names)} distinct statcast player_names (pitcher names)")
 
196
 
197
+ # Pre-build O(1) lookup dicts so the per-record loop is fast.
198
+ # statcast names are "Last, First" — apply to_canonical_name before normalizing
199
+ # so the key matches pybaseball "First Last" canonical_name_normalized.
200
+ canonical_lower_to_statcast: dict[str, str] = {} # "first last" → "Last, First"
201
  norm_to_statcast: dict[str, list[str]] = defaultdict(list)
202
  for sc in statcast_names:
203
+ canon = to_canonical_name(sc) # "Last, First" → "First Last"
204
+ cl = canon.lower()
205
+ if cl not in canonical_lower_to_statcast:
206
+ canonical_lower_to_statcast[cl] = sc
207
+ norm_to_statcast[normalize_for_matching(canon)].append(sc)
208
 
209
+ resolved_pitcher = 0
210
+ resolved_hitter = 0
211
+ ambiguous = 0
212
+ unmatched_pitcher = 0
213
 
214
  for idx, row in identity_df.iterrows():
215
  canonical = row["canonical_name"]
216
  norm_key = row["canonical_name_normalized"]
217
+ is_pitcher = bool(row.get("role_pitcher"))
218
+ is_hitter = bool(row.get("role_hitter"))
219
+
220
+ if is_pitcher:
221
+ # Layer 1: exact canonical lowercase match (O(1))
222
+ sc = canonical_lower_to_statcast.get(canonical.lower())
223
+ if sc:
224
+ identity_df.at[idx, "statcast_name"] = sc
225
  identity_df.at[idx, "source_note"] = str(row.get("source_note", "")) + "+statcast"
226
+ resolved_pitcher += 1
227
+ continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
+ # Layer 2: normalized key match (O(1))
230
+ candidates = norm_to_statcast.get(norm_key, [])
231
+ if len(candidates) == 1:
232
+ identity_df.at[idx, "statcast_name"] = candidates[0]
233
+ identity_df.at[idx, "source_note"] = str(row.get("source_note", "")) + "+statcast-norm"
234
+ resolved_pitcher += 1
235
+ continue
236
+ elif len(candidates) > 1:
237
+ print(f" AMBIGUOUS pitcher: {canonical!r} → {candidates}")
238
+ ambiguous += 1
239
+ # Fall through: use canonical_name as statcast_name so row is not excluded
240
+ else:
241
+ print(f" UNMATCHED pitcher: {canonical!r}")
242
+ unmatched_pitcher += 1
243
+ # Fall through to hitter branch if also a hitter; else canonical fallback
244
+
245
+ if is_hitter and identity_df.at[idx, "statcast_name"] is None:
246
+ # Hitter window queries use ec.batter = :batter_id (player_id), not player_name.
247
+ # statcast_name must be non-null to pass build validation.
248
+ # Use canonical_name (First Last) as a stable non-null placeholder.
249
+ identity_df.at[idx, "statcast_name"] = canonical
250
+ identity_df.at[idx, "source_note"] = str(row.get("source_note", "")) + "+hitter-canonical"
251
+ resolved_hitter += 1
252
+
253
+ # Final fallback: any record still missing statcast_name (e.g. unmatched pure pitcher)
254
+ if identity_df.at[idx, "statcast_name"] is None:
255
+ identity_df.at[idx, "statcast_name"] = canonical
256
+ identity_df.at[idx, "source_note"] = str(row.get("source_note", "")) + "+canonical-fallback"
257
+
258
+ print(
259
+ f" Enrichment: pitcher_matched={resolved_pitcher} "
260
+ f"hitter_canonical={resolved_hitter} "
261
+ f"ambiguous={ambiguous} unmatched_pitcher={unmatched_pitcher}"
262
+ )
263
  return identity_df
264
 
265
 
 
277
  identity_df = identity_df.copy()
278
  identity_df["display_name"] = identity_df["canonical_name"]
279
 
280
+ # Find collisions: same normalized name, multiple rows
281
  norm_groups = identity_df.groupby("canonical_name_normalized")
282
  collision_count = 0
283
  for norm_key, group in norm_groups:
284
+ if len(group) == 1:
285
+ continue # no collision
286
+ # Multiple records share the same normalized name — assign suffixes to all but the first.
287
+ # Sort: non-null player_id ascending first, then null (deterministic).
288
+ sorted_group = group.sort_values(
289
+ "player_id", ascending=True, na_position="last"
290
+ )
291
+ for rank, (idx, row) in enumerate(sorted_group.iterrows()):
292
+ if rank == 0:
293
+ # Primary: keep canonical_name as display_name
294
+ pass
295
+ else:
296
+ pid = row.get("player_id")
297
+ suffix = str(int(pid)) if pd.notna(pid) else "?"
298
+ identity_df.at[idx, "display_name"] = f"{row['canonical_name']} ({suffix})"
299
+ identity_df.at[idx, "source_note"] = (
300
+ str(row.get("source_note", "")) + "+collision-resolved"
301
+ )
302
+ collision_count += 1
 
 
303
 
304
  if collision_count:
305
  print(f" Resolved {collision_count} collision suffix(es)")
 
342
 
343
  OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
344
  identity_df.to_parquet(OUTPUT_PATH, index=False)
345
+ print(f"\nWrote {len(identity_df)} rows -> {OUTPUT_PATH}")
346
 
347
  # Summary
348
  with_mlbam = identity_df["player_id"].notna().sum()
visualization/card_lab_page.py CHANGED
@@ -47,10 +47,11 @@ def normalize_name(name: str) -> str:
47
  # Card generation functions — button-click only, no caching
48
  # ---------------------------------------------------------------------------
49
 
50
- def _gen_hitter_bytes(conn, player_name, mode, year, date, start_date, end_date, fmt, player_pil):
51
  windowed_df = get_player_card_window_df(
52
  conn, player_name, "Hitter", mode=mode, year=year,
53
  date=date, start_date=start_date, end_date=end_date,
 
54
  )
55
  if windowed_df.empty:
56
  return None, "", "limited"
@@ -149,6 +150,7 @@ def render_card_lab(conn) -> None:
149
  # Hitter pipeline variables
150
  hitter_display_names: list[str] = []
151
  hitter_display_to_statcast: dict[str, str] = {}
 
152
 
153
  # Pitcher pipeline variables
154
  pitcher_display_names: list[str] = []
@@ -169,15 +171,31 @@ def render_card_lab(conn) -> None:
169
  return
170
 
171
  selected_hitter_display = st.selectbox("Player", hitter_display_names, key="cl_player_hitter")
172
- # statcast_name is required — null rows excluded at build time, no runtime fallback allowed
173
- statcast_name = hitter_display_to_statcast.get(selected_hitter_display)
174
- if not statcast_name:
175
- raise RuntimeError(
176
- f"Missing statcast_name for hitter selector row: {selected_hitter_display!r}. "
177
- "Re-run scripts/build_player_identity_map.py with DB enrichment."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  )
179
- player_name = statcast_name # exact name for DB queries
180
- player_name_display = selected_hitter_display # display-only
 
181
 
182
  elif card_type == "Pitcher":
183
  # Source: data/card_lab_pitcher_seasons.parquet (pybaseball pitching_stats, IP > 0 gate)
@@ -239,7 +257,8 @@ def render_card_lab(conn) -> None:
239
  mlbam_id: int | None = None
240
  player_pil = None
241
  if card_type == "Hitter" and player_name:
242
- mlbam_id = get_batter_mlbam_id(conn, player_name, selector_year)
 
243
  player_pil = resolve_player_image(mlbam_id) if mlbam_id else None
244
  elif card_type == "Pitcher" and pitcher_id:
245
  mlbam_id = pitcher_id
@@ -300,6 +319,8 @@ def render_card_lab(conn) -> None:
300
 
301
  if card_type in ("Hitter", "Pitcher"):
302
  st.write(f"**MLBAM ID:** {mlbam_id}")
 
 
303
  st.write(f"**Image fetched:** {player_pil is not None}")
304
 
305
  fmt = st.radio("Format", ["PNG", "JPG"], horizontal=True, key="cl_fmt")
@@ -321,7 +342,7 @@ def render_card_lab(conn) -> None:
321
  status.info("Querying warehouse data...")
322
  status.info("Building poster...")
323
  img_bytes, tf, dq = _gen_hitter_bytes(
324
- conn, player_name, mode_key, year, date, start_date, end_date, fmt, player_pil
325
  )
326
  st.session_state["card_player"] = normalize_name(player_name or "unknown")
327
  st.session_state["card_timeframe"] = tf
 
47
  # Card generation functions — button-click only, no caching
48
  # ---------------------------------------------------------------------------
49
 
50
+ def _gen_hitter_bytes(conn, player_name, batter_id, mode, year, date, start_date, end_date, fmt, player_pil):
51
  windowed_df = get_player_card_window_df(
52
  conn, player_name, "Hitter", mode=mode, year=year,
53
  date=date, start_date=start_date, end_date=end_date,
54
+ batter_id=batter_id,
55
  )
56
  if windowed_df.empty:
57
  return None, "", "limited"
 
150
  # Hitter pipeline variables
151
  hitter_display_names: list[str] = []
152
  hitter_display_to_statcast: dict[str, str] = {}
153
+ _hitter_batter_id: int | None = None # MLBAM batter id from Parquet
154
 
155
  # Pitcher pipeline variables
156
  pitcher_display_names: list[str] = []
 
171
  return
172
 
173
  selected_hitter_display = st.selectbox("Player", hitter_display_names, key="cl_player_hitter")
174
+
175
+ # Read batter_id (MLBAM) from the Parquet — used for ec.batter = :batter_id query.
176
+ # DB schema: ec.player_name = PITCHER, not batter. batter_id is required.
177
+ _hitter_batter_id: int | None = None
178
+ if _HITTER_SEASONS_PATH.exists():
179
+ import pandas as _pd_hi
180
+ _hdf = _pd_hi.read_parquet(_HITTER_SEASONS_PATH)
181
+ _match = _hdf[_hdf["display_name"] == selected_hitter_display]
182
+ if selector_year is not None:
183
+ _yr = _match[_match["Season"] == selector_year]
184
+ if not _yr.empty:
185
+ _match = _yr
186
+ if not _match.empty:
187
+ _pid = _match.iloc[0]["player_id"]
188
+ if _pd_hi.notna(_pid):
189
+ _hitter_batter_id = int(_pid)
190
+
191
+ if _hitter_batter_id is None:
192
+ st.warning(
193
+ f"No MLBAM batter ID found for {selected_hitter_display!r}. "
194
+ "Card data will be empty. Re-run build_player_identity_map.py."
195
  )
196
+
197
+ player_name = selected_hitter_display # display label (injected as literal in SQL)
198
+ player_name_display = selected_hitter_display
199
 
200
  elif card_type == "Pitcher":
201
  # Source: data/card_lab_pitcher_seasons.parquet (pybaseball pitching_stats, IP > 0 gate)
 
257
  mlbam_id: int | None = None
258
  player_pil = None
259
  if card_type == "Hitter" and player_name:
260
+ # batter_id from Parquet — no DB call needed (ec.player_name = pitcher, not batter)
261
+ mlbam_id = _hitter_batter_id
262
  player_pil = resolve_player_image(mlbam_id) if mlbam_id else None
263
  elif card_type == "Pitcher" and pitcher_id:
264
  mlbam_id = pitcher_id
 
319
 
320
  if card_type in ("Hitter", "Pitcher"):
321
  st.write(f"**MLBAM ID:** {mlbam_id}")
322
+ if card_type == "Hitter":
323
+ st.write(f"**Batter ID (from Parquet):** {_hitter_batter_id}")
324
  st.write(f"**Image fetched:** {player_pil is not None}")
325
 
326
  fmt = st.radio("Format", ["PNG", "JPG"], horizontal=True, key="cl_fmt")
 
342
  status.info("Querying warehouse data...")
343
  status.info("Building poster...")
344
  img_bytes, tf, dq = _gen_hitter_bytes(
345
+ conn, player_name, _hitter_batter_id, mode_key, year, date, start_date, end_date, fmt, player_pil
346
  )
347
  st.session_state["card_player"] = normalize_name(player_name or "unknown")
348
  st.session_state["card_timeframe"] = tf
visualization/cards/card_queries.py CHANGED
@@ -228,6 +228,7 @@ def get_player_card_window_df(
228
  start_date: str | None = None,
229
  end_date: str | None = None,
230
  pitcher_id: int | None = None,
 
231
  ) -> pd.DataFrame:
232
  """
233
  Fetch the FULL matching pitch/event dataset for one player over a time window.
@@ -238,8 +239,9 @@ def get_player_card_window_df(
238
  Season mode uses source_season INT filter (fastest).
239
  Date range / single date use game_date TEXT range comparison.
240
 
241
- For Pitcher card_type, pitcher_id is REQUIRED (the numeric ID from statcast_event_core).
242
- The player_name string is injected as a display label for the feature builders.
 
243
  """
244
  try:
245
  if card_type == "Pitcher":
@@ -289,14 +291,54 @@ def get_player_card_window_df(
289
  params = {"player_name": player_name, "pitcher_id": pitcher_id, "sd": sd, "ed": ed}
290
 
291
  else: # Hitter
292
- _HITTER_WHERE = f"SELECT {_HITTER_JOIN_SELECT} {_HITTER_JOIN_FROM} WHERE ec.player_name = :player_name"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  if mode == "season" and year:
294
- sql = text(_HITTER_WHERE + " AND ec.source_season = :year ORDER BY ec.game_date DESC, ec.game_pk DESC")
295
- params = {"player_name": player_name, "year": int(year)}
296
  else:
297
  sd, ed = _date_range(mode, date, start_date, end_date)
298
- sql = text(_HITTER_WHERE + " AND ec.game_date >= :sd AND ec.game_date <= :ed ORDER BY ec.game_date DESC, ec.game_pk DESC")
299
- params = {"player_name": player_name, "sd": sd, "ed": ed}
300
 
301
  df = pd.read_sql(sql, conn, params=params)
302
 
 
228
  start_date: str | None = None,
229
  end_date: str | None = None,
230
  pitcher_id: int | None = None,
231
+ batter_id: int | None = None,
232
  ) -> pd.DataFrame:
233
  """
234
  Fetch the FULL matching pitch/event dataset for one player over a time window.
 
239
  Season mode uses source_season INT filter (fastest).
240
  Date range / single date use game_date TEXT range comparison.
241
 
242
+ For Pitcher card_type, pitcher_id is REQUIRED (ec.pitcher = :pitcher_id).
243
+ For Hitter card_type, batter_id is REQUIRED (ec.batter = :batter_id).
244
+ player_name is injected as a display label in both cases.
245
  """
246
  try:
247
  if card_type == "Pitcher":
 
291
  params = {"player_name": player_name, "pitcher_id": pitcher_id, "sd": sd, "ed": ed}
292
 
293
  else: # Hitter
294
+ if batter_id is None:
295
+ logger.warning(
296
+ "[card_lab_db_window] batter_id required for Hitter card_type "
297
+ "(player_name='%s') — returning empty", player_name,
298
+ )
299
+ return pd.DataFrame()
300
+
301
+ # DB schema: ec.player_name = PITCHER name. Hitters are identified by
302
+ # ec.batter (MLBAM ID). Inject player_name as a literal so downstream
303
+ # card_data.py filtering (df["player_name"] == player_name) still works.
304
+ _HITTER_SELECT = f"""
305
+ SELECT
306
+ ec.event_key,
307
+ :player_name AS player_name,
308
+ ec.game_date,
309
+ ec.game_pk,
310
+ ec.source_season,
311
+ ec.pitch_name,
312
+ ec.events,
313
+ ec.description,
314
+ ec.stand,
315
+ ec.p_throws,
316
+ ec.batter,
317
+ ec.home_team,
318
+ ec.away_team,
319
+ ec.inning,
320
+ ec.at_bat_number,
321
+ ec.pitch_number,
322
+ bb.launch_speed,
323
+ bb.launch_angle,
324
+ bb.bb_type,
325
+ bb.estimated_woba_using_speedangle,
326
+ pr.release_speed,
327
+ pr.release_spin_rate,
328
+ pr.pfx_x,
329
+ pr.pfx_z
330
+ FROM statcast_event_core ec
331
+ LEFT JOIN statcast_batted_ball bb ON ec.event_key = bb.event_key
332
+ LEFT JOIN statcast_pitch_release pr ON ec.event_key = pr.event_key
333
+ WHERE ec.batter = :batter_id
334
+ """
335
  if mode == "season" and year:
336
+ sql = text(_HITTER_SELECT + " AND ec.source_season = :year ORDER BY ec.game_date DESC, ec.game_pk DESC")
337
+ params = {"player_name": player_name, "batter_id": batter_id, "year": int(year)}
338
  else:
339
  sd, ed = _date_range(mode, date, start_date, end_date)
340
+ sql = text(_HITTER_SELECT + " AND ec.game_date >= :sd AND ec.game_date <= :ed ORDER BY ec.game_date DESC, ec.game_pk DESC")
341
+ params = {"player_name": player_name, "batter_id": batter_id, "sd": sd, "ed": ed}
342
 
343
  df = pd.read_sql(sql, conn, params=params)
344