Syntrex committed on
Commit
d563c3d
·
verified ·
1 Parent(s): 1eb1bcd

Update data/statcast.py

Browse files
Files changed (1) hide show
  1. data/statcast.py +63 -12
data/statcast.py CHANGED
@@ -9,7 +9,19 @@ import requests
9
  from config.settings import STATCAST_SEARCH_URL
10
 
11
 
12
- def fetch_statcast_range(start_date: str, end_date: str) -> pd.DataFrame:
 
 
 
 
 
 
 
 
 
 
 
 
13
  params = {
14
  "all": "true",
15
  "hfPT": "",
@@ -20,9 +32,9 @@ def fetch_statcast_range(start_date: str, end_date: str) -> pd.DataFrame:
20
  "stadium": "",
21
  "hfBBL": "",
22
  "hfNewZones": "",
23
- "hfGT": "R|",
24
  "hfC": "",
25
- "hfSea": "",
26
  "hfSit": "",
27
  "player_type": "batter",
28
  "hfOuts": "",
@@ -49,17 +61,22 @@ def fetch_statcast_range(start_date: str, end_date: str) -> pd.DataFrame:
49
  "type": "details",
50
  }
51
 
52
- response = requests.get(STATCAST_SEARCH_URL, params=params, timeout=60)
53
  response.raise_for_status()
54
 
55
  text = response.text.strip()
56
  if not text or text.startswith("<!DOCTYPE html"):
57
  return pd.DataFrame()
58
 
59
- return pd.read_csv(StringIO(text))
 
 
 
 
 
60
 
61
 
62
- def normalize_statcast(df: pd.DataFrame) -> pd.DataFrame:
63
  if df.empty:
64
  return df
65
 
@@ -70,18 +87,52 @@ def normalize_statcast(df: pd.DataFrame) -> pd.DataFrame:
70
  "release_spin_rate": "release_spin_rate",
71
  "pfx_x": "pfx_x",
72
  "pfx_z": "pfx_z",
 
 
 
 
73
  "launch_speed": "launch_speed",
74
  "launch_angle": "launch_angle",
 
 
 
 
75
  "stand": "batter_stand",
76
  "p_throws": "pitcher_hand",
77
  "game_date": "game_date",
78
- "events": "events",
79
- "description": "description",
80
- "plate_x": "plate_x",
81
- "plate_z": "plate_z",
 
 
82
  }
83
 
84
- keep_cols = [c for c in rename_map if c in df.columns]
85
  out = df[keep_cols].copy()
86
- out = out.rename(columns={c: rename_map[c] for c in keep_cols})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  return out
 
9
  from config.settings import STATCAST_SEARCH_URL
10
 
11
 
12
+ HEADERS = {
13
+ "User-Agent": "Mozilla/5.0",
14
+ "Accept-Language": "en-US,en;q=0.9",
15
+ }
16
+
17
+
18
+ def fetch_wbc_statcast_range(start_date: str, end_date: str, season: str = "2026") -> pd.DataFrame:
19
+ """
20
+ Pull WBC pitch/event-level Statcast-style CSV from Baseball Savant.
21
+
22
+ Baseball Savant has a dedicated WBC search surface, but CSV exports are still served
23
+ from the same csv backend path pattern. The key difference is using tournament filters.
24
+ """
25
  params = {
26
  "all": "true",
27
  "hfPT": "",
 
32
  "stadium": "",
33
  "hfBBL": "",
34
  "hfNewZones": "",
35
+ "hfGT": "F|D|L|W|", # game types commonly used in savant filters
36
  "hfC": "",
37
+ "hfSea": f"{season}|",
38
  "hfSit": "",
39
  "player_type": "batter",
40
  "hfOuts": "",
 
61
  "type": "details",
62
  }
63
 
64
+ response = requests.get(STATCAST_SEARCH_URL, params=params, headers=HEADERS, timeout=60)
65
  response.raise_for_status()
66
 
67
  text = response.text.strip()
68
  if not text or text.startswith("<!DOCTYPE html"):
69
  return pd.DataFrame()
70
 
71
+ try:
72
+ df = pd.read_csv(StringIO(text))
73
+ except Exception:
74
+ return pd.DataFrame()
75
+
76
+ return df
77
 
78
 
79
+ def normalize_wbc_statcast(df: pd.DataFrame) -> pd.DataFrame:
80
  if df.empty:
81
  return df
82
 
 
87
  "release_spin_rate": "release_spin_rate",
88
  "pfx_x": "pfx_x",
89
  "pfx_z": "pfx_z",
90
+ "release_pos_x": "release_pos_x",
91
+ "release_pos_z": "release_pos_z",
92
+ "plate_x": "plate_x",
93
+ "plate_z": "plate_z",
94
  "launch_speed": "launch_speed",
95
  "launch_angle": "launch_angle",
96
+ "estimated_ba_using_speedangle": "xba",
97
+ "estimated_woba_using_speedangle": "xwoba",
98
+ "events": "events",
99
+ "description": "description",
100
  "stand": "batter_stand",
101
  "p_throws": "pitcher_hand",
102
  "game_date": "game_date",
103
+ "home_team": "home_team",
104
+ "away_team": "away_team",
105
+ "inning": "inning",
106
+ "outs_when_up": "outs_when_up",
107
+ "balls": "balls",
108
+ "strikes": "strikes",
109
  }
110
 
111
+ keep_cols = [col for col in rename_map if col in df.columns]
112
  out = df[keep_cols].copy()
113
+ out = out.rename(columns={col: rename_map[col] for col in keep_cols})
114
+
115
+ numeric_cols = [
116
+ "release_speed",
117
+ "release_spin_rate",
118
+ "pfx_x",
119
+ "pfx_z",
120
+ "release_pos_x",
121
+ "release_pos_z",
122
+ "plate_x",
123
+ "plate_z",
124
+ "launch_speed",
125
+ "launch_angle",
126
+ "xba",
127
+ "xwoba",
128
+ "inning",
129
+ "outs_when_up",
130
+ "balls",
131
+ "strikes",
132
+ ]
133
+
134
+ for col in numeric_cols:
135
+ if col in out.columns:
136
+ out[col] = pd.to_numeric(out[col], errors="coerce")
137
+
138
  return out