Syntrex committed on
Commit
d144d29
·
verified ·
1 Parent(s): 38cb69d

Update data/statcast.py

Browse files
Files changed (1) hide show
  1. data/statcast.py +88 -65
data/statcast.py CHANGED
@@ -1,96 +1,107 @@
1
  from __future__ import annotations
2
 
3
  from io import StringIO
4
- from typing import Any
5
 
6
  import pandas as pd
7
  import requests
8
 
9
  from config.settings import STATCAST_SEARCH_URL
10
 
11
-
12
  HEADERS = {
13
  "User-Agent": "Mozilla/5.0",
14
  "Accept-Language": "en-US,en;q=0.9",
15
  }
16
 
17
 
18
- def fetch_wbc_statcast_range(start_date: str, end_date: str, season: str = "2026") -> pd.DataFrame:
19
  """
20
- Pull WBC pitch/event-level Statcast-style CSV from Baseball Savant.
21
 
22
- Baseball Savant has a dedicated WBC search surface, but CSV exports are still served
23
- from the same csv backend path pattern. The key difference is using tournament filters.
24
  """
25
- params = {
26
- "all": "true",
27
- "hfPT": "",
28
- "hfAB": "",
29
- "hfBBT": "",
30
- "hfPR": "",
31
- "hfZ": "",
32
- "stadium": "",
33
- "hfBBL": "",
34
- "hfNewZones": "",
35
- "hfGT": "F|D|L|W|", # game types commonly used in savant filters
36
- "hfC": "",
37
- "hfSea": f"{season}|",
38
- "hfSit": "",
39
- "player_type": "batter",
40
- "hfOuts": "",
41
- "opponent": "",
42
- "pitcher_throws": "",
43
- "batter_stands": "",
44
- "hfSA": "",
45
- "game_date_gt": start_date,
46
- "game_date_lt": end_date,
47
- "team": "",
48
- "position": "",
49
- "hfRO": "",
50
- "home_road": "",
51
- "hfFlag": "",
52
- "metric_1": "",
53
- "hfInn": "",
54
- "min_pitches": "0",
55
- "min_results": "0",
56
- "group_by": "name",
57
- "sort_col": "pitches",
58
- "player_event_sort": "h_launch_speed",
59
- "sort_order": "desc",
60
- "min_abs": "0",
61
- "type": "details",
62
- }
63
-
64
- response = requests.get(STATCAST_SEARCH_URL, params=params, headers=HEADERS, timeout=60)
65
- response.raise_for_status()
66
-
67
- text = response.text.strip()
68
- if not text or text.startswith("<!DOCTYPE html"):
69
- return pd.DataFrame()
70
-
71
- try:
72
- df = pd.read_csv(StringIO(text))
73
- except Exception:
74
- return pd.DataFrame()
75
 
76
- return df
77
-
78
-
79
- def normalize_wbc_statcast(df: pd.DataFrame) -> pd.DataFrame:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  if df.empty:
81
  return df
82
 
83
  rename_map = {
84
  "player_name": "player_name",
85
  "pitch_type": "pitch_type",
 
86
  "release_speed": "release_speed",
87
  "release_spin_rate": "release_spin_rate",
88
- "pfx_x": "pfx_x",
89
- "pfx_z": "pfx_z",
90
  "release_pos_x": "release_pos_x",
91
  "release_pos_z": "release_pos_z",
92
  "plate_x": "plate_x",
93
  "plate_z": "plate_z",
 
 
94
  "launch_speed": "launch_speed",
95
  "launch_angle": "launch_angle",
96
  "estimated_ba_using_speedangle": "xba",
@@ -99,13 +110,18 @@ def normalize_wbc_statcast(df: pd.DataFrame) -> pd.DataFrame:
99
  "description": "description",
100
  "stand": "batter_stand",
101
  "p_throws": "pitcher_hand",
102
- "game_date": "game_date",
103
  "home_team": "home_team",
104
  "away_team": "away_team",
 
 
105
  "inning": "inning",
106
  "outs_when_up": "outs_when_up",
107
  "balls": "balls",
108
  "strikes": "strikes",
 
 
 
 
109
  }
110
 
111
  keep_cols = [col for col in rename_map if col in df.columns]
@@ -115,12 +131,12 @@ def normalize_wbc_statcast(df: pd.DataFrame) -> pd.DataFrame:
115
  numeric_cols = [
116
  "release_speed",
117
  "release_spin_rate",
118
- "pfx_x",
119
- "pfx_z",
120
  "release_pos_x",
121
  "release_pos_z",
122
  "plate_x",
123
  "plate_z",
 
 
124
  "launch_speed",
125
  "launch_angle",
126
  "xba",
@@ -129,10 +145,17 @@ def normalize_wbc_statcast(df: pd.DataFrame) -> pd.DataFrame:
129
  "outs_when_up",
130
  "balls",
131
  "strikes",
 
 
 
 
132
  ]
133
 
134
  for col in numeric_cols:
135
  if col in out.columns:
136
  out[col] = pd.to_numeric(out[col], errors="coerce")
137
 
 
 
 
138
  return out
 
1
  from __future__ import annotations
2
 
3
  from io import StringIO
 
4
 
5
  import pandas as pd
6
  import requests
7
 
8
  from config.settings import STATCAST_SEARCH_URL
9
 
 
10
  HEADERS = {
11
  "User-Agent": "Mozilla/5.0",
12
  "Accept-Language": "en-US,en;q=0.9",
13
  }
14
 
15
 
16
def _looks_like_html(text: str) -> bool:
    """Return True when *text* begins like an HTML page instead of CSV data."""
    # Case-insensitive, and also catches pages that omit the doctype, so an
    # HTML error page is never handed to the CSV parser.
    return text.lstrip().lower().startswith(("<!doctype html", "<html"))


def fetch_statcast_range(start_date: str, end_date: str) -> pd.DataFrame:
    """
    Fetch pitch/event-level CSV rows from the Statcast search endpoint.

    Issues a GET to ``STATCAST_SEARCH_URL`` once per candidate season
    ("2026" first, then "2023") and returns the first response that parses
    into a non-empty DataFrame. The original author describes this as a
    WBC-first pull with 2023 as the fallback WBC season.

    Args:
        start_date: Sent as the ``game_date_gt`` query parameter.
        end_date: Sent as the ``game_date_lt`` query parameter.

    Returns:
        The parsed CSV as a DataFrame, or an empty DataFrame when every
        candidate season yields an empty body, an HTML page, unparseable
        CSV, or zero rows.

    Raises:
        requests.HTTPError: If any request returns an error status. Note
            that an HTTP error on the first season propagates immediately,
            so the fallback season is not attempted in that case.
    """
    season_candidates = ("2026", "2023")

    # Everything except the season filter is identical across attempts, so
    # build it once. "hfSea" is kept as a placeholder to preserve key order.
    base_params = {
        "all": "true",
        "hfPT": "",
        "hfAB": "",
        "hfBBT": "",
        "hfPR": "",
        "hfZ": "",
        "stadium": "",
        "hfBBL": "",
        "hfNewZones": "",
        # NOTE(review): presumably the game-type filter; confirm these codes
        # actually select tournament games on the search backend.
        "hfGT": "F|D|L|W|",
        "hfC": "",
        "hfSea": "",
        "hfSit": "",
        "player_type": "batter",
        "hfOuts": "",
        "opponent": "",
        "pitcher_throws": "",
        "batter_stands": "",
        "hfSA": "",
        "game_date_gt": start_date,
        "game_date_lt": end_date,
        "team": "",
        "position": "",
        "hfRO": "",
        "home_road": "",
        "hfFlag": "",
        "metric_1": "",
        "hfInn": "",
        "min_pitches": "0",
        "min_results": "0",
        "group_by": "name",
        "sort_col": "pitches",
        "player_event_sort": "api_h_launch_speed",
        "sort_order": "desc",
        "min_abs": "0",
        "type": "details",
    }

    for season in season_candidates:
        params = {**base_params, "hfSea": f"{season}|"}

        response = requests.get(
            STATCAST_SEARCH_URL,
            params=params,
            headers=HEADERS,
            timeout=60,
        )
        response.raise_for_status()

        text = response.text.strip()
        if not text or _looks_like_html(text):
            # Nothing usable for this season; try the next candidate.
            continue

        try:
            df = pd.read_csv(StringIO(text))
        except (pd.errors.EmptyDataError, pd.errors.ParserError):
            # Body was not valid CSV; treat it like "no data" for this season.
            continue

        if not df.empty:
            return df

    return pd.DataFrame()
87
+
88
+
89
+ def normalize_statcast(df: pd.DataFrame) -> pd.DataFrame:
90
  if df.empty:
91
  return df
92
 
93
  rename_map = {
94
  "player_name": "player_name",
95
  "pitch_type": "pitch_type",
96
+ "pitch_name": "pitch_name",
97
  "release_speed": "release_speed",
98
  "release_spin_rate": "release_spin_rate",
 
 
99
  "release_pos_x": "release_pos_x",
100
  "release_pos_z": "release_pos_z",
101
  "plate_x": "plate_x",
102
  "plate_z": "plate_z",
103
+ "pfx_x": "pfx_x",
104
+ "pfx_z": "pfx_z",
105
  "launch_speed": "launch_speed",
106
  "launch_angle": "launch_angle",
107
  "estimated_ba_using_speedangle": "xba",
 
110
  "description": "description",
111
  "stand": "batter_stand",
112
  "p_throws": "pitcher_hand",
 
113
  "home_team": "home_team",
114
  "away_team": "away_team",
115
+ "game_date": "game_date",
116
+ "game_pk": "game_pk",
117
  "inning": "inning",
118
  "outs_when_up": "outs_when_up",
119
  "balls": "balls",
120
  "strikes": "strikes",
121
+ "bat_score": "bat_score",
122
+ "fld_score": "fld_score",
123
+ "post_bat_score": "post_bat_score",
124
+ "post_fld_score": "post_fld_score",
125
  }
126
 
127
  keep_cols = [col for col in rename_map if col in df.columns]
 
131
  numeric_cols = [
132
  "release_speed",
133
  "release_spin_rate",
 
 
134
  "release_pos_x",
135
  "release_pos_z",
136
  "plate_x",
137
  "plate_z",
138
+ "pfx_x",
139
+ "pfx_z",
140
  "launch_speed",
141
  "launch_angle",
142
  "xba",
 
145
  "outs_when_up",
146
  "balls",
147
  "strikes",
148
+ "bat_score",
149
+ "fld_score",
150
+ "post_bat_score",
151
+ "post_fld_score",
152
  ]
153
 
154
  for col in numeric_cols:
155
  if col in out.columns:
156
  out[col] = pd.to_numeric(out[col], errors="coerce")
157
 
158
+ if "game_date" in out.columns:
159
+ out["game_date"] = pd.to_datetime(out["game_date"], errors="coerce")
160
+
161
  return out