nesticot committed on
Commit
76d9bce
·
verified ·
1 Parent(s): bf288cc

Update api_scraper.py

Browse files
Files changed (1) hide show
  1. api_scraper.py +831 -912
api_scraper.py CHANGED
@@ -1,912 +1,831 @@
1
- import requests
2
- import polars as pl
3
- import numpy as np
4
- from datetime import datetime
5
- from tqdm import tqdm
6
- from pytz import timezone
7
- import re
8
- from concurrent.futures import ThreadPoolExecutor, as_completed
9
-
10
-
11
- class MLB_Scrape:
12
-
13
- def __init__(self):
14
- # Initialize your class here if needed
15
- pass
16
-
17
def get_sport_id(self):
    """
    Retrieves the list of sports from the MLB API and processes it into a Polars DataFrame.

    Returns:
    - df (pl.DataFrame): A DataFrame built from the 'sports' entry of the API response.

    Raises:
    - requests.exceptions.Timeout: If the API does not answer within 30 seconds.
    - KeyError: If the response carries no 'sports' entry.
    """
    # Without an explicit timeout requests waits indefinitely, so a stalled
    # connection would block the caller forever.
    response = requests.get(url='https://statsapi.mlb.com/api/v1/sports', timeout=30)
    payload = response.json()

    # Convert the 'sports' entry of the JSON response into a Polars DataFrame
    return pl.DataFrame(payload['sports'])
31
-
32
def get_sport_id_check(self, sport_id: int = 1):
    """
    Tells whether a sport ID is among those returned by get_sport_id().

    Parameters:
    - sport_id (int): The sport ID to look for. Default is 1.

    Returns:
    - bool: True when the ID is present in the 'id' column. Otherwise the
      available sports are printed and False is returned.
    """
    available_sports = self.get_sport_id()

    # Known ID: nothing to report
    if sport_id in available_sports['id']:
        return True

    # Unknown ID: show the caller what can be selected instead
    print('Please Select a New Sport ID from the following')
    print(available_sports)
    return False
52
-
53
-
54
def get_game_types(self):
    """
    Retrieves the different types of MLB games from the MLB API and processes them into a Polars DataFrame.

    Returns:
    - df (pl.DataFrame): A DataFrame built directly from the JSON body of the gameTypes endpoint.

    Raises:
    - requests.exceptions.Timeout: If the API does not answer within 30 seconds.
    """
    # Without an explicit timeout requests waits indefinitely, so a stalled
    # connection would block the caller forever.
    response = requests.get(url='https://statsapi.mlb.com/api/v1/gameTypes', timeout=30)
    payload = response.json()

    # The whole response body is handed to Polars as-is
    return pl.DataFrame(payload)
68
-
69
def get_schedule(self,
                 year_input: list = None,
                 sport_id: list = None,
                 game_type: list = None):

    """
    Retrieves the schedule of baseball games based on the specified parameters.

    Parameters:
    - year_input (list): A list of years (ints) to filter the schedule. Default is [2024].
    - sport_id (list): A list of sport IDs (ints) to filter the schedule. Default is [1].
    - game_type (list): A list of game types (strs) to filter the schedule. Default is ['R'].

    Returns:
    - game_df (pl.DataFrame): One row per unique game ID, sorted by date, with the columns
      game_id, time, date, away, away_id, home, home_id, state, venue_id, venue_name and
      gameday_type. 'time' is a US/Eastern "%I:%M %p" string. Missing IDs, dates and times
      are null; missing names and states are empty strings.
      Returns None (after printing a message) when the schedule contains no games.

    Raises:
    - ValueError: If an argument is not a list of the expected element type.
    - requests.exceptions.Timeout: If the API does not answer within 30 seconds.
    """

    # None stands in for the documented defaults so that no list object is
    # shared between calls.
    year_input = [2024] if year_input is None else year_input
    sport_id = [1] if sport_id is None else sport_id
    game_type = ['R'] if game_type is None else game_type

    # Type checks
    if not isinstance(year_input, list) or not all(isinstance(year, int) for year in year_input):
        raise ValueError("year_input must be a list of integers.")
    if not isinstance(sport_id, list) or not all(isinstance(sid, int) for sid in sport_id):
        raise ValueError("sport_id must be a list of integers.")
    if not isinstance(game_type, list) or not all(isinstance(gt, str) for gt in game_type):
        raise ValueError("game_type must be a list of strings.")

    # Convert input lists to comma-separated strings
    year_input_str = ','.join(str(x) for x in year_input)
    sport_id_str = ','.join(str(x) for x in sport_id)
    game_type_str = ','.join(str(x) for x in game_type)

    # Make API call to retrieve game schedule; the timeout keeps a stalled
    # connection from blocking forever.
    game_call = requests.get(
        url=f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id_str}&gameTypes={game_type_str}&season={year_input_str}&hydrate=lineup,players',
        timeout=30).json()

    def dig(node, keys, default=None):
        """Follow keys through nested dicts; return default as soon as one is missing."""
        for key in keys:
            if not isinstance(node, dict) or key not in node:
                return default
            node = node[key]
        return node

    # Walk the schedule once and build one complete record per game.
    # Missing IDs/dates use None rather than NaN: a float NaN cannot live in
    # an integer or string column.
    rows = []
    for day in game_call.get('dates', []):
        for game in day.get('games', []):
            rows.append({
                'game_id': game.get('gamePk'),
                'time': game.get('gameDate'),
                'date': game.get('officialDate'),
                'away': dig(game, ['teams', 'away', 'team', 'name'], ""),
                'away_id': dig(game, ['teams', 'away', 'team', 'id']),
                'home': dig(game, ['teams', 'home', 'team', 'name'], ""),
                'home_id': dig(game, ['teams', 'home', 'team', 'id']),
                'state': dig(game, ['status', 'codedGameState'], ""),
                'venue_id': dig(game, ['venue', 'id']),
                'venue_name': dig(game, ['venue', 'name'], ""),
                'gameday_type': dig(game, ['gamedayType'], ""),
            })

    # Nothing to build a frame from
    if not rows:
        print('Schedule Length of 0, please select different parameters.')
        return None

    # Create a Polars DataFrame with the extracted data (column order follows the record keys)
    game_df = pl.DataFrame(data={name: [row[name] for row in rows] for name in rows[0]})

    # Convert date and time columns to appropriate formats
    game_df = game_df.with_columns(
        pl.col('date').str.to_date(),
        pl.col('time').str.to_datetime().dt.convert_time_zone('US/Eastern').dt.strftime("%I:%M %p"))

    # Remove duplicate games and sort by date
    return game_df.unique(subset='game_id').sort('date')
160
-
161
-
162
def get_data(self, game_list_input: list):
    """
    Retrieves live game data for a list of game IDs, one request at a time.

    Parameters:
    - game_list_input (list): A list of game IDs for which to retrieve live data.

    Returns:
    - data_total (list): The decoded JSON response for each game ID, in input order.

    Raises:
    - requests.exceptions.Timeout: If a request is not answered within 30 seconds.
    """
    print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')

    data_total = []
    # Iterate over the game IDs directly; tqdm takes the total from len()
    for game_id in tqdm(game_list_input, desc="Processing", unit="iteration"):
        # The timeout keeps one stalled connection from hanging the whole run
        r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_id}/feed/live', timeout=30)
        data_total.append(r.json())

    return data_total
183
-
184
def get_data_new(self, game_list_input: list):
    """
    Retrieves live game data for a list of game IDs in parallel.

    Parameters:
    - game_list_input (list): A list of game IDs for which to retrieve live data.

    Returns:
    - data_total (list): The decoded JSON response for each game ID. Entry i belongs to
      game_list_input[i], regardless of the order in which the requests finished.

    Raises:
    - requests.exceptions.Timeout: If a request is not answered within 30 seconds.
    """
    game_ids = list(game_list_input)
    print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')

    def fetch_data(game_id):
        # The timeout keeps one stalled connection from tying up a worker forever
        r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_id}/feed/live', timeout=30)
        return r.json()

    # Pre-size the result so each response lands at its input position;
    # appending in completion order would make the output order nondeterministic.
    data_total = [None] * len(game_ids)

    with ThreadPoolExecutor() as executor:
        positions = {executor.submit(fetch_data, game_id): position
                     for position, game_id in enumerate(game_ids)}
        # as_completed still drives the progress bar as requests finish
        for future in tqdm(as_completed(positions), total=len(positions), desc="Processing", unit="iteration"):
            data_total[positions[future]] = future.result()

    return data_total
207
-
208
def get_data_df(self, data_list):
    """
    Converts a list of game data JSON objects into a Polars DataFrame.

    A row is produced for every play event that either has isPitch set or has a 'call'
    entry in its details. A play event that is neither, but whose count shows four balls,
    produces a sparse row that carries only the game/matchup/team fields, the count, the
    play result's event and event type, and the event's index/id/timing fields (inning and
    ab_number stay null on these rows).

    Parameters:
    - data_list (list): A list of JSON objects (dicts) containing game data.

    Returns:
    - data_df (pl.DataFrame): A DataFrame with one column per name in `columns` below.
      Fields absent from the JSON are null.

    Notes:
    - A game missing a required field (liveData/plays/allPlays, gamePk, gameData, playEvents,
      about/isTopInning) prints "No Data for Game"; rows completed before the failure are kept.
    - The result fields (type_ab, event, event_type, rbi, scores, is_out) are only filled on
      the last play event of each play.
    """
    swing_codes = ('X', 'F', 'S', 'D', 'E', 'T', 'W', 'L', 'M', 'Q', 'Z', 'R', 'O', 'J')
    whiff_codes = ('S', 'T', 'W', 'M', 'Q', 'O')
    print('Converting Data to Dataframe.')

    # Output columns, in output order
    columns = (
        'game_id', 'game_date',
        'batter_id', 'batter_name', 'batter_hand', 'batter_team', 'batter_team_id',
        'pitcher_id', 'pitcher_name', 'pitcher_hand', 'pitcher_team', 'pitcher_team_id',
        'ab_number', 'inning',
        'play_description', 'play_code', 'in_play', 'is_strike', 'is_swing', 'is_whiff',
        'is_out', 'is_ball', 'is_review', 'pitch_type', 'pitch_description',
        'strikes', 'balls', 'outs', 'strikes_after', 'balls_after', 'outs_after',
        'start_speed', 'end_speed', 'sz_top', 'sz_bot', 'x', 'y',
        'ax', 'ay', 'az', 'pfxx', 'pfxz', 'px', 'pz',
        'vx0', 'vy0', 'vz0', 'x0', 'y0', 'z0',
        'zone', 'type_confidence', 'plate_time', 'extension',
        'spin_rate', 'spin_direction', 'vb', 'ivb', 'hb',
        'launch_speed', 'launch_angle', 'launch_distance', 'launch_location',
        'trajectory', 'hardness', 'hit_x', 'hit_y',
        'index_play', 'play_id', 'start_time', 'end_time', 'is_pitch', 'type_type',
        'type_ab', 'event', 'event_type', 'rbi', 'away_score', 'home_score',
    )

    # (output column, JSON key) pairs, grouped by the JSON object they are read from
    detail_fields = (
        ('play_description', 'description'), ('play_code', 'code'),
        ('in_play', 'isInPlay'), ('is_strike', 'isStrike'),
        ('is_ball', 'isBall'), ('is_review', 'hasReview'),
    )
    pitch_fields = (
        ('start_speed', 'startSpeed'), ('end_speed', 'endSpeed'),
        ('sz_top', 'strikeZoneTop'), ('sz_bot', 'strikeZoneBottom'),
        ('zone', 'zone'), ('type_confidence', 'typeConfidence'),
        ('plate_time', 'plateTime'), ('extension', 'extension'),
    )
    coordinate_fields = (
        ('x', 'x'), ('y', 'y'),
        ('ax', 'aX'), ('ay', 'aY'), ('az', 'aZ'),
        ('pfxx', 'pfxX'), ('pfxz', 'pfxZ'), ('px', 'pX'), ('pz', 'pZ'),
        ('vx0', 'vX0'), ('vy0', 'vY0'), ('vz0', 'vZ0'),
        ('x0', 'x0'), ('y0', 'y0'), ('z0', 'z0'),
    )
    break_fields = (
        ('spin_rate', 'spinRate'), ('spin_direction', 'spinDirection'),
        ('vb', 'breakVertical'), ('ivb', 'breakVerticalInduced'),
        ('hb', 'breakHorizontal'),
    )
    hit_fields = (
        ('launch_speed', 'launchSpeed'), ('launch_angle', 'launchAngle'),
        ('launch_distance', 'totalDistance'), ('launch_location', 'location'),
        ('trajectory', 'trajectory'), ('hardness', 'hardness'),
    )
    hit_coordinate_fields = (('hit_x', 'coordX'), ('hit_y', 'coordY'))
    event_fields = (
        ('index_play', 'index'), ('play_id', 'playId'),
        ('start_time', 'startTime'), ('end_time', 'endTime'),
        ('is_pitch', 'isPitch'), ('type_type', 'type'),
    )
    result_fields = (
        ('type_ab', 'type'), ('event', 'event'), ('event_type', 'eventType'),
        ('rbi', 'rbi'), ('away_score', 'awayScore'), ('home_score', 'homeScore'),
        ('is_out', 'isOut'),
    )

    def dig(node, *keys):
        """Follow keys through nested dicts; None as soon as one is missing."""
        for key in keys:
            if not isinstance(node, dict):
                return None
            node = node.get(key)
        return node

    def pick(node, pairs):
        """Read each (column, key) pair from node; a missing key or a non-dict node gives None."""
        source = node if isinstance(node, dict) else {}
        return {column: source.get(key) for column, key in pairs}

    def new_row(game, play):
        """Start a row with every column null, then fill the game, matchup and team fields."""
        row = dict.fromkeys(columns)
        matchup = play.get('matchup')
        teams = game['gameData']['teams']
        # Top of the inning: away team bats and home team pitches; otherwise the reverse
        batting, fielding = ('away', 'home') if play['about']['isTopInning'] else ('home', 'away')
        row.update(
            game_id=game['gamePk'],
            game_date=game['gameData']['datetime']['officialDate'],
            batter_id=dig(matchup, 'batter', 'id'),
            batter_name=dig(matchup, 'batter', 'fullName'),
            batter_hand=dig(matchup, 'batSide', 'code'),
            pitcher_id=dig(matchup, 'pitcher', 'id'),
            pitcher_name=dig(matchup, 'pitcher', 'fullName'),
            pitcher_hand=dig(matchup, 'pitchHand', 'code'),
            batter_team=dig(teams, batting, 'abbreviation'),
            batter_team_id=dig(teams, batting, 'id'),
            pitcher_team=dig(teams, fielding, 'abbreviation'),
            pitcher_team_id=dig(teams, fielding, 'id'),
        )
        return row

    # Rows are built as complete dicts and appended only when finished, so a
    # failure part-way through a row can never leave columns at different lengths.
    rows = []
    for data in data_list:
        try:
            for play in data['liveData']['plays']['allPlays']:
                events = play['playEvents']
                result = play.get('result')
                last = len(events) - 1

                for n, event in enumerate(events):
                    details = event.get('details') or {}
                    count = event.get('count') or {}

                    if event.get('isPitch') is True or 'call' in details:
                        row = new_row(data, play)
                        row['ab_number'] = play.get('atBatIndex')
                        row['inning'] = dig(play, 'about', 'inning')

                        row.update(pick(details, detail_fields))
                        code = details.get('code')
                        # Flags are True or null, never False
                        row['is_swing'] = True if code in swing_codes else None
                        row['is_whiff'] = True if code in whiff_codes else None
                        row['pitch_type'] = dig(details, 'type', 'code')
                        row['pitch_description'] = dig(details, 'type', 'description')

                        if event.get('pitchNumber') == 1 or n == 0:
                            # First pitch: the count before it is 0-0. The n == 0 guard
                            # stops events[n - 1] from wrapping round to the last event.
                            row['strikes'] = 0
                            row['balls'] = 0
                            row['outs'] = count.get('outs')
                        else:
                            # The count before this event is the count after the previous one
                            before = events[n - 1].get('count') or {}
                            row['strikes'] = before.get('strikes')
                            row['balls'] = before.get('balls')
                            row['outs'] = before.get('outs')
                        row['strikes_after'] = count.get('strikes')
                        row['balls_after'] = count.get('balls')
                        row['outs_after'] = count.get('outs')

                        # pick() yields nulls when pitchData/hitData or their nested
                        # objects are absent, so no branch can skip a column.
                        pitch_data = event.get('pitchData')
                        row.update(pick(pitch_data, pitch_fields))
                        row.update(pick(dig(pitch_data, 'coordinates'), coordinate_fields))
                        row.update(pick(dig(pitch_data, 'breaks'), break_fields))

                        hit_data = event.get('hitData')
                        row.update(pick(hit_data, hit_fields))
                        row.update(pick(dig(hit_data, 'coordinates'), hit_coordinate_fields))

                        row.update(pick(event, event_fields))

                        # The play's result belongs to its final event only
                        if n == last:
                            row.update(pick(result, result_fields))

                        rows.append(row)

                    elif count.get('balls') == 4:
                        # Non-pitch event at four balls: sparse row, pitch/hit fields stay null
                        row = new_row(data, play)
                        row['event'] = dig(result, 'event')
                        row['event_type'] = dig(result, 'eventType')
                        row['strikes'] = count.get('strikes')
                        row['balls'] = count.get('balls')
                        row['outs'] = count.get('outs')
                        row['strikes_after'] = count.get('strikes')
                        row['balls_after'] = count.get('balls')
                        row['outs_after'] = count.get('outs')
                        row.update(pick(event, event_fields))
                        rows.append(row)

        except KeyError:
            print("No Data for Game")

    # strict=False lets Polars reconcile mixed int/float values within a column
    return pl.DataFrame(
        data={column: [row[column] for row in rows] for column in columns},
        strict=False,
    )
686
-
687
def get_teams(self):
    """
    Retrieves information about MLB teams from the MLB API and processes it into a Polars DataFrame.

    Only entries of the response that carry a 'franchiseName' key are kept.
    Missing parent organization IDs/names are filled with the team's own
    ID/franchise, and the parent organization's abbreviation is attached by
    joining the table against itself.

    Returns:
    - mlb_teams_df (pl.DataFrame): A DataFrame sorted by team_id with the columns
      team_id, city, name, franchise, abbreviation, parent_org_id, parent_org,
      league_id, league_name and parent_org_abbreviation.
    """
    # Make API call to retrieve team information
    teams = requests.get(url='https://statsapi.mlb.com/api/v1/teams/').json()['teams']

    # Entries without a 'franchiseName' previously produced a null team_id and
    # were removed by drop_nulls; filtering them up front gives the same rows.
    franchised = [team for team in teams if 'franchiseName' in team]

    # Create a Polars DataFrame with the extracted data.
    # 'city' is the franchise name (e.g. the city), 'franchise' is the full club name.
    mlb_teams_df = pl.DataFrame(data={
        'team_id': [team.get('id') for team in franchised],
        'city': [team.get('franchiseName') for team in franchised],
        'name': [team.get('teamName') for team in franchised],
        'franchise': [team.get('name') for team in franchised],
        'abbreviation': [team.get('abbreviation') for team in franchised],
        'parent_org_id': [team.get('parentOrgId') for team in franchised],
        'parent_org': [team.get('parentOrgName') for team in franchised],
        # A team without a 'league' object yields nulls instead of a KeyError
        'league_id': [team.get('league', {}).get('id') for team in franchised],
        'league_name': [team.get('league', {}).get('name') for team in franchised],
    }).unique().drop_nulls(subset=['team_id']).sort('team_id')

    # Fill missing parent organization IDs with team IDs
    mlb_teams_df = mlb_teams_df.with_columns(
        pl.when(pl.col('parent_org_id').is_null())
        .then(pl.col('team_id'))
        .otherwise(pl.col('parent_org_id'))
        .alias('parent_org_id')
    )

    # Fill missing parent organization names with franchise names
    mlb_teams_df = mlb_teams_df.with_columns(
        pl.when(pl.col('parent_org').is_null())
        .then(pl.col('franchise'))
        .otherwise(pl.col('parent_org'))
        .alias('parent_org')
    )

    # Lookup table: team_id -> abbreviation, renamed so it can be joined on the parent organization
    abbreviation_df = mlb_teams_df.select(['team_id', 'abbreviation']).rename(
        {'team_id': 'parent_org_id', 'abbreviation': 'parent_org_abbreviation'}
    )

    # Join the parent organization abbreviations with the main DataFrame
    mlb_teams_df = mlb_teams_df.join(abbreviation_df, on='parent_org_id', how='left')

    return mlb_teams_df
747
-
748
def get_leagues(self):
    """
    Fetches the league list from the MLB API and returns it as a Polars DataFrame.

    Returns:
    - leagues_df (pl.DataFrame): One row per entry of the response's 'leagues' list
      with the columns league_id, league_name, league_abbreviation and sport_id.
      Fields missing from an entry become null.
    """
    payload = requests.get(url='https://statsapi.mlb.com/api/v1/leagues/').json()

    # Accumulate one value per league in each output column
    columns = {
        'league_id': [],
        'league_name': [],
        'league_abbreviation': [],
        'sport_id': [],
    }
    for entry in payload['leagues']:
        columns['league_id'].append(entry.get('id'))
        columns['league_name'].append(entry.get('name'))
        columns['league_abbreviation'].append(entry.get('abbreviation'))
        # The sport ID is nested one level down and is only read when 'sport' exists
        columns['sport_id'].append(entry['sport']['id'] if 'sport' in entry else None)

    return pl.DataFrame(data=columns)
773
-
774
def get_player_games_list(self, player_id: int,
                          season: int,
                          start_date: str = None,
                          end_date: str = None,
                          sport_id: int = 1,
                          game_type: list = ['R'],
                          pitching: bool = True):
    """
    Retrieves a list of game IDs for a specific player in a given season.

    Parameters:
    - player_id (int): The ID of the player.
    - season (int): The season year for which to retrieve the game list.
    - start_date (str): The start date (YYYY-MM-DD) of the range (default is January 1st of the specified season).
    - end_date (str): The end date (YYYY-MM-DD) of the range (default is December 31st of the specified season).
    - sport_id (int): The ID of the sport for which to retrieve player data.
    - game_type (list): A list of game types to filter the schedule. Default is ['R']. The list is only read, never modified.
    - pitching (bool): If True query the 'pitching' game log, otherwise the 'hitting' game log.

    Returns:
    - player_game_list (list): A list of game IDs in which the player participated during the specified season.
      An empty list is returned (and a message printed) when the response holds no stats for the player.

    Raises:
    - ValueError: If start_date or end_date is not exactly in YYYY-MM-DD format.
    """
    # Set default start and end dates if not provided
    if not start_date:
        start_date = f'{season}-01-01'
    if not end_date:
        end_date = f'{season}-12-31'

    # Determine the group based on the pitching flag
    group = 'pitching' if pitching else 'hitting'

    # Validate date format. fullmatch is used because '$' with match() would
    # also accept a value that ends in a newline.
    date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}')
    if not date_pattern.fullmatch(start_date):
        raise ValueError(f"start_date {start_date} is not in YYYY-MM-DD format")
    if not date_pattern.fullmatch(end_date):
        raise ValueError(f"end_date {end_date} is not in YYYY-MM-DD format")

    # Convert game type list to a comma-separated string
    game_type_str = ','.join([str(x) for x in game_type])

    # Make API call to retrieve player game logs (HTTPS, like every other call in this class)
    response = requests.get(url=f'https://statsapi.mlb.com/api/v1/people/{player_id}?hydrate=stats(group={group},type=gameLog,season={season},startDate={start_date},endDate={end_date},sportId={sport_id},gameType=[{game_type_str}]),hydrations').json()

    # An error payload or an unknown player may come back without 'people',
    # and a known player may come back without (or with empty) 'stats'.
    people = response.get('people') or [{}]
    stats = people[0].get('stats')
    if not stats:
        print(f'No {group} games found for player {player_id} in season {season}')
        return []

    # Extract game IDs from the API response
    player_game_list = [x['game']['gamePk'] for x in stats[0].get('splits', [])]

    return player_game_list
827
-
828
def get_players(self, sport_id: int, season: int, game_type: list = ['R']):
    """
    Builds a DataFrame of players for a sport and season.

    Parameters:
    - sport_id (int): The ID of the sport for which to retrieve player data. It is only
      used for game types other than 'S'; the 'S' requests are issued with sportId=1.
    - season (int): The season year for which to retrieve player data.
    - game_type (list): A list of game types to filter the players. Default is ['R'].

    Returns:
    - player_df (pl.DataFrame): When game_type joins to exactly 'S', the columns are
      player_id, first_name, last_name, name, position and team (pitchers and hitters
      combined, de-duplicated and sorted by player_id). Otherwise the same columns plus
      weight, height, age and birthDate.
    """
    game_type_str = ','.join(str(x) for x in game_type)

    if game_type_str != 'S':
        # Every game type other than 'S' is served by the players endpoint
        people = requests.get(url=f'https://statsapi.mlb.com/api/v1/sports/{sport_id}/players?season={season}&gameType=[{game_type_str}]').json()['people']

        return pl.DataFrame(data={
            'player_id': [person['id'] for person in people],
            'first_name': [person['firstName'] for person in people],
            'last_name': [person['lastName'] for person in people],
            'name': [person['fullName'] for person in people],
            'position': [person['primaryPosition']['abbreviation'] if 'primaryPosition' in person else None for person in people],
            'team': [person['currentTeam']['id'] if 'currentTeam' in person else None for person in people],
            'weight': [person.get('weight') for person in people],
            'height': [person.get('height') for person in people],
            'age': [person.get('currentAge') for person in people],
            'birthDate': [person.get('birthDate') for person in people],
        })

    # Game type 'S': the stats endpoint is queried once for pitchers and once
    # for hitters, in that order, and both result sets share the same layout.
    stat_urls = (
        f'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?&env=prod&season={season}&sportId=1&stats=season&group=pitching&gameType=S&limit=1000000&offset=0&sortStat=inningsPitched&order=asc',
        f'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?&env=prod&season={season}&sportId=1&stats=season&group=hitting&gameType=S&limit=1000000&offset=0',
    )

    frames = []
    for stats_url in stat_urls:
        records = requests.get(stats_url).json()['stats']
        frames.append(pl.DataFrame(data={
            'player_id': [record['playerId'] for record in records],
            'first_name': [record['playerFirstName'] for record in records],
            'last_name': [record['playerLastName'] for record in records],
            'name': [record['playerFullName'] for record in records],
            'position': [record['primaryPositionAbbrev'] for record in records],
            'team': [record['teamId'] for record in records],
        }))

    # Combine pitcher and batter data
    return pl.concat(frames).unique().drop_nulls(subset=['player_id']).sort('player_id')
 
1
+ import requests
2
+ import polars as pl
3
+ import numpy as np
4
+ from datetime import datetime
5
+ from tqdm import tqdm
6
+ from pytz import timezone
7
+ import re
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+
10
+
11
+ class MLB_Scrape:
12
+
13
+ def __init__(self):
14
+ # Initialize your class here if needed
15
+ pass
16
+
17
def get_sport_id(self):
    """
    Fetches the sports list from the MLB API as a Polars DataFrame.

    Returns:
    - df (pl.DataFrame): A DataFrame built from the 'sports' entry of the API response.
    """
    sports_payload = requests.get(url='https://statsapi.mlb.com/api/v1/sports').json()

    # Only the 'sports' entry of the payload is turned into a table
    return pl.DataFrame(sports_payload['sports'])
31
+
32
def get_sport_id_check(self, sport_id: int = 1):
    """
    Reports whether a sport ID is present in the sports table returned by get_sport_id.

    Parameters:
    - sport_id (int): The sport ID to look for. Default is 1.

    Returns:
    - bool: True when the ID is found in the 'id' column. Otherwise a hint and the
      full sports table are printed and False is returned.
    """
    available_sports = self.get_sport_id()

    # Known ID: nothing to report
    if sport_id in available_sports['id']:
        return True

    # Unknown ID: show the caller what can be chosen instead
    print('Please Select a New Sport ID from the following')
    print(available_sports)
    return False
52
+
53
+
54
def get_game_types(self):
    """
    Fetches the MLB game types from the MLB API as a Polars DataFrame.

    Returns:
    - df (pl.DataFrame): A DataFrame built directly from the decoded JSON response.
    """
    game_types_payload = requests.get(url='https://statsapi.mlb.com/api/v1/gameTypes').json()

    # The decoded payload is handed to Polars unchanged
    return pl.DataFrame(game_types_payload)
68
+
69
def get_schedule(self,
                 year_input: list = [2024],
                 sport_id: list = [1],
                 game_type: list = ['R']):
    """
    Retrieves the schedule of baseball games based on the specified parameters.

    Parameters:
    - year_input (list): A list of years to filter the schedule. Default is [2024].
    - sport_id (list): A list of sport IDs to filter the schedule. Default is [1].
    - game_type (list): A list of game types to filter the schedule. Default is ['R'].
    The default lists are only read, never modified.

    Returns:
    - game_df (pl.DataFrame | None): A Polars DataFrame with one row per game_id, sorted by
      date, with the columns game_id, time (formatted "%I:%M %p" in US/Eastern), date,
      away, away_id, home, home_id, state, venue_id, venue_name and gameday_type.
      None is returned (after printing a message) when the schedule is empty or a
      KeyError occurs while processing the response.

    Raises:
    - ValueError: If year_input/sport_id is not a list of integers or game_type is not a list of strings.
    """
    # Type checks
    if not isinstance(year_input, list) or not all(isinstance(year, int) for year in year_input):
        raise ValueError("year_input must be a list of integers.")
    if not isinstance(sport_id, list) or not all(isinstance(sid, int) for sid in sport_id):
        raise ValueError("sport_id must be a list of integers.")
    if not isinstance(game_type, list) or not all(isinstance(gt, str) for gt in game_type):
        raise ValueError("game_type must be a list of strings.")

    eastern = timezone('US/Eastern')

    # Convert input lists to comma-separated strings
    year_input_str = ','.join(str(x) for x in year_input)
    sport_id_str = ','.join(str(x) for x in sport_id)
    game_type_str = ','.join(str(x) for x in game_type)

    # Make API call to retrieve game schedule
    game_call = requests.get(url=f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id_str}&gameTypes={game_type_str}&season={year_input_str}&hydrate=lineup,players').json()

    def safe_get(d, keys, default=None):
        """Safely retrieve nested dictionary values."""
        for key in keys:
            d = d.get(key, {})
            if not isinstance(d, dict):
                return d  # Reached a leaf value
        return default  # Path missing (or ended on a dict)

    try:
        # Flatten dates -> games once instead of re-walking the payload per column
        games = [game for day in game_call.get('dates', []) for game in day.get('games', [])]

        # Check if the schedule is empty
        if not games:
            print('Schedule Length of 0, please select different parameters.')
            return None

        # Missing values are stored as None so Polars records a proper null;
        # a float NaN cannot be mixed into integer or string columns.
        game_df = pl.DataFrame(data={
            'game_id': [g.get('gamePk') for g in games],
            'time': [g.get('gameDate') for g in games],
            'date': [g.get('officialDate') for g in games],
            'away': [safe_get(g, ['teams', 'away', 'team', 'name'], "") for g in games],
            'away_id': [safe_get(g, ['teams', 'away', 'team', 'id']) for g in games],
            'home': [safe_get(g, ['teams', 'home', 'team', 'name'], "") for g in games],
            'home_id': [safe_get(g, ['teams', 'home', 'team', 'id']) for g in games],
            'state': [safe_get(g, ['status', 'codedGameState'], "") for g in games],
            'venue_id': [safe_get(g, ['venue', 'id']) for g in games],
            'venue_name': [safe_get(g, ['venue', 'name'], "") for g in games],
            'gameday_type': [safe_get(g, ['gamedayType'], "") for g in games],
        })

        # Convert date and time columns to appropriate formats
        game_df = game_df.with_columns(
            pl.col('date').str.to_date(),
            pl.col('time').str.to_datetime().dt.convert_time_zone(eastern.zone).dt.strftime("%I:%M %p"))

        # Remove duplicate games and sort by date (a non-empty frame stays non-empty)
        game_df = game_df.unique(subset='game_id').sort('date')
    except KeyError:
        print('No Data for Selected Parameters')
        return None

    return game_df
160
+
161
+
162
def get_data(self, game_list_input: list):
    """
    Downloads the live game feed for every game ID, one request at a time.

    Parameters:
    - game_list_input (list): The game IDs whose live feeds should be fetched.

    Returns:
    - data_total (list): The decoded JSON feed of each game, in the order of game_list_input.
    """
    print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')

    feeds = []
    # tqdm wraps the ID list so the progress bar advances once per request
    for game_pk in tqdm(game_list_input, desc="Processing", unit="iteration"):
        feed_response = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_pk}/feed/live')
        feeds.append(feed_response.json())

    return feeds
183
+
184
+ # def get_data(self, game_list_input: list):
185
+ # """
186
+ # Retrieves live game data for a list of game IDs in parallel.
187
+
188
+ # Parameters:
189
+ # - game_list_input (list): A list of game IDs for which to retrieve live data.
190
+
191
+ # Returns:
192
+ # - data_total (list): A list of JSON responses containing live game data for each game ID.
193
+ # """
194
+ # data_total = []
195
+ # print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')
196
+
197
+ # def fetch_data(game_id):
198
+ # r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_id}/feed/live')
199
+ # return r.json()
200
+
201
+ # with ThreadPoolExecutor() as executor:
202
+ # futures = {executor.submit(fetch_data, game_id): game_id for game_id in game_list_input}
203
+ # for future in tqdm(as_completed(futures), total=len(futures), desc="Processing", unit="iteration"):
204
+ # data_total.append(future.result())
205
+
206
+ # return data_total
207
+
208
+
209
+ def get_data_df(self, data_list):
210
+ """
211
+ Converts a list of game data JSON objects into a Polars DataFrame.
212
+
213
+ Parameters:
214
+ - data_list (list): A list of JSON objects containing game data.
215
+
216
+ Returns:
217
+ - data_df (pl.DataFrame): A DataFrame containing the structured game data.
218
+ """
219
+ swing_list = ['X','F','S','D','E','T','W','L','M','Q','Z','R','O','J']
220
+ whiff_list = ['S','T','W','M','Q','O']
221
+ print('Converting Data to Dataframe.')
222
+ game_id = []
223
+ game_date = []
224
+ batter_id = []
225
+ batter_name = []
226
+ batter_hand = []
227
+ batter_team = []
228
+ batter_team_id = []
229
+ pitcher_id = []
230
+ pitcher_name = []
231
+ pitcher_hand = []
232
+ pitcher_team = []
233
+ pitcher_team_id = []
234
+
235
+ play_description = []
236
+ play_code = []
237
+ in_play = []
238
+ is_strike = []
239
+ is_swing = []
240
+ is_whiff = []
241
+ is_out = []
242
+ is_ball = []
243
+ is_review = []
244
+ pitch_type = []
245
+ pitch_description = []
246
+ strikes = []
247
+ balls = []
248
+ outs = []
249
+ strikes_after = []
250
+ balls_after = []
251
+ outs_after = []
252
+ inning = []
253
+
254
+ start_speed = []
255
+ end_speed = []
256
+ sz_top = []
257
+ sz_bot = []
258
+ x = []
259
+ y = []
260
+ ax = []
261
+ ay = []
262
+ az = []
263
+ pfxx = []
264
+ pfxz = []
265
+ px = []
266
+ pz = []
267
+ vx0 = []
268
+ vy0 = []
269
+ vz0 = []
270
+ x0 = []
271
+ y0 = []
272
+ z0 = []
273
+ zone = []
274
+ type_confidence = []
275
+ plate_time = []
276
+ extension = []
277
+ spin_rate = []
278
+ spin_direction = []
279
+ vb = []
280
+ ivb = []
281
+ hb = []
282
+
283
+ launch_speed = []
284
+ launch_angle = []
285
+ launch_distance = []
286
+ launch_location = []
287
+ trajectory = []
288
+ hardness = []
289
+ hit_x = []
290
+ hit_y = []
291
+
292
+ index_play = []
293
+ play_id = []
294
+ start_time = []
295
+ end_time = []
296
+ is_pitch = []
297
+ type_type = []
298
+
299
+ type_ab = []
300
+ ab_number = []
301
+ event = []
302
+ event_type = []
303
+ rbi = []
304
+ away_score = []
305
+ home_score = []
306
+
307
+ for data in data_list:
308
+ try:
309
+ for ab_id in range(len(data['liveData']['plays']['allPlays'])):
310
+ ab_list = data['liveData']['plays']['allPlays'][ab_id]
311
+
312
+ # Extract result data once per at-bat
313
+ ab_result = ab_list.get('result', {})
314
+
315
+ for n in range(len(ab_list['playEvents'])):
316
+
317
+ # Determine if this event should be recorded
318
+ is_pitch_or_call = ab_list['playEvents'][n].get('isPitch') == True or 'call' in ab_list['playEvents'][n].get('details', {})
319
+ is_walk = 'count' in ab_list['playEvents'][n] and ab_list['playEvents'][n]['count'].get('balls') == 4
320
+
321
+ if is_pitch_or_call or is_walk:
322
+ ab_number.append(ab_list.get('atBatIndex'))
323
+
324
+ game_id.append(data['gamePk'])
325
+ game_date.append(data['gameData']['datetime']['officialDate'])
326
+
327
+ if 'matchup' in ab_list:
328
+ batter_id.append(ab_list['matchup']['batter']['id'] if 'batter' in ab_list['matchup'] else None)
329
+ if 'batter' in ab_list['matchup']:
330
+ batter_name.append(ab_list['matchup']['batter']['fullName'] if 'fullName' in ab_list['matchup']['batter'] else None)
331
+ else:
332
+ batter_name.append(None)
333
+ batter_hand.append(ab_list['matchup']['batSide']['code'] if 'batSide' in ab_list['matchup'] else None)
334
+ pitcher_id.append(ab_list['matchup']['pitcher']['id'] if 'pitcher' in ab_list['matchup'] else None)
335
+ if 'pitcher' in ab_list['matchup']:
336
+ pitcher_name.append(ab_list['matchup']['pitcher']['fullName'] if 'fullName' in ab_list['matchup']['pitcher'] else None)
337
+ else:
338
+ pitcher_name.append(None)
339
+ pitcher_hand.append(ab_list['matchup']['pitchHand']['code'] if 'pitchHand' in ab_list['matchup'] else None)
340
+ else:
341
+ batter_id.append(None)
342
+ batter_name.append(None)
343
+ batter_hand.append(None)
344
+ pitcher_id.append(None)
345
+ pitcher_name.append(None)
346
+ pitcher_hand.append(None)
347
+
348
+ if ab_list['about']['isTopInning']:
349
+ batter_team.append(data['gameData']['teams']['away']['abbreviation'] if 'away' in data['gameData']['teams'] else None)
350
+ batter_team_id.append(data['gameData']['teams']['away']['id'] if 'away' in data['gameData']['teams'] else None)
351
+ pitcher_team.append(data['gameData']['teams']['home']['abbreviation'] if 'home' in data['gameData']['teams'] else None)
352
+ pitcher_team_id.append(data['gameData']['teams']['home']['id'] if 'home' in data['gameData']['teams'] else None)
353
+ else:
354
+ batter_team.append(data['gameData']['teams']['home']['abbreviation'] if 'home' in data['gameData']['teams'] else None)
355
+ batter_team_id.append(data['gameData']['teams']['home']['id'] if 'home' in data['gameData']['teams'] else None)
356
+ pitcher_team.append(data['gameData']['teams']['away']['abbreviation'] if 'away' in data['gameData']['teams'] else None)
357
+ pitcher_team_id.append(data['gameData']['teams']['away']['id'] if 'away' in data['gameData']['teams'] else None)
358
+
359
+ details = ab_list['playEvents'][n].get('details', {})
360
+ play_description.append(details.get('description'))
361
+ play_code.append(details.get('code'))
362
+ in_play.append(details.get('isInPlay'))
363
+ is_strike.append(details.get('isStrike'))
364
+
365
+ is_swing.append(True if details.get('code') in swing_list else None)
366
+ is_whiff.append(True if details.get('code') in whiff_list else None)
367
+
368
+ inning.append(ab_list['about'].get('inning'))
369
+ is_ball.append(details.get('isOut'))
370
+ is_review.append(details.get('hasReview'))
371
+
372
+ pitch_type.append(details.get('type', {}).get('code') if 'type' in details else None)
373
+ pitch_description.append(details.get('type', {}).get('description') if 'type' in details else None)
374
+
375
+ if ab_list['playEvents'][n].get('pitchNumber') == 1:
376
+ strikes.append(0)
377
+ balls.append(0)
378
+ strikes_after.append(ab_list['playEvents'][n]['count']['strikes'] if 'strikes' in ab_list['playEvents'][n].get('count', {}) else None)
379
+ balls_after.append(ab_list['playEvents'][n]['count']['balls'] if 'balls' in ab_list['playEvents'][n].get('count', {}) else None)
380
+ outs.append(ab_list['playEvents'][n]['count']['outs'] if 'outs' in ab_list['playEvents'][n].get('count', {}) else None)
381
+ outs_after.append(ab_list['playEvents'][n]['count']['outs'] if 'outs' in ab_list['playEvents'][n].get('count', {}) else None)
382
+ else:
383
+ strikes.append(ab_list['playEvents'][n-1]['count']['strikes'] if 'strikes' in ab_list['playEvents'][n-1].get('count', {}) else None)
384
+ balls.append(ab_list['playEvents'][n-1]['count']['balls'] if 'balls' in ab_list['playEvents'][n-1].get('count', {}) else None)
385
+ outs.append(ab_list['playEvents'][n-1]['count']['outs'] if 'outs' in ab_list['playEvents'][n-1].get('count', {}) else None)
386
+ strikes_after.append(ab_list['playEvents'][n]['count']['strikes'] if 'strikes' in ab_list['playEvents'][n].get('count', {}) else None)
387
+ balls_after.append(ab_list['playEvents'][n]['count']['balls'] if 'balls' in ab_list['playEvents'][n].get('count', {}) else None)
388
+ outs_after.append(ab_list['playEvents'][n]['count']['outs'] if 'outs' in ab_list['playEvents'][n].get('count', {}) else None)
389
+
390
+ if 'pitchData' in ab_list['playEvents'][n]:
391
+ pitch_data = ab_list['playEvents'][n]['pitchData']
392
+ start_speed.append(pitch_data.get('startSpeed'))
393
+ end_speed.append(pitch_data.get('endSpeed'))
394
+ sz_top.append(pitch_data.get('strikeZoneTop'))
395
+ sz_bot.append(pitch_data.get('strikeZoneBottom'))
396
+
397
+ coords = pitch_data.get('coordinates', {})
398
+ x.append(coords.get('x'))
399
+ y.append(coords.get('y'))
400
+ ax.append(coords.get('aX'))
401
+ ay.append(coords.get('aY'))
402
+ az.append(coords.get('aZ'))
403
+ pfxx.append(coords.get('pfxX'))
404
+ pfxz.append(coords.get('pfxZ'))
405
+ px.append(coords.get('pX'))
406
+ pz.append(coords.get('pZ'))
407
+ vx0.append(coords.get('vX0'))
408
+ vy0.append(coords.get('vY0'))
409
+ vz0.append(coords.get('vZ0'))
410
+ x0.append(coords.get('x0'))
411
+ y0.append(coords.get('y0'))
412
+ z0.append(coords.get('z0'))
413
+
414
+ zone.append(pitch_data.get('zone'))
415
+ type_confidence.append(pitch_data.get('typeConfidence'))
416
+ plate_time.append(pitch_data.get('plateTime'))
417
+ extension.append(pitch_data.get('extension'))
418
+
419
+ if 'breaks' in pitch_data:
420
+ breaks = pitch_data['breaks']
421
+ spin_rate.append(breaks.get('spinRate'))
422
+ spin_direction.append(breaks.get('spinDirection'))
423
+ vb.append(breaks.get('breakVertical'))
424
+ ivb.append(breaks.get('breakVerticalInduced'))
425
+ hb.append(breaks.get('breakHorizontal'))
426
+ else:
427
+ spin_rate.append(None)
428
+ spin_direction.append(None)
429
+ vb.append(None)
430
+ ivb.append(None)
431
+ hb.append(None)
432
+
433
+ else:
434
+ start_speed.append(None)
435
+ end_speed.append(None)
436
+ sz_top.append(None)
437
+ sz_bot.append(None)
438
+ x.append(None)
439
+ y.append(None)
440
+ ax.append(None)
441
+ ay.append(None)
442
+ az.append(None)
443
+ pfxx.append(None)
444
+ pfxz.append(None)
445
+ px.append(None)
446
+ pz.append(None)
447
+ vx0.append(None)
448
+ vy0.append(None)
449
+ vz0.append(None)
450
+ x0.append(None)
451
+ y0.append(None)
452
+ z0.append(None)
453
+ zone.append(None)
454
+ type_confidence.append(None)
455
+ plate_time.append(None)
456
+ extension.append(None)
457
+ spin_rate.append(None)
458
+ spin_direction.append(None)
459
+ vb.append(None)
460
+ ivb.append(None)
461
+ hb.append(None)
462
+
463
+ if 'hitData' in ab_list['playEvents'][n]:
464
+ hit_data = ab_list['playEvents'][n]['hitData']
465
+ launch_speed.append(hit_data.get('launchSpeed'))
466
+ launch_angle.append(hit_data.get('launchAngle'))
467
+ launch_distance.append(hit_data.get('totalDistance'))
468
+ launch_location.append(hit_data.get('location'))
469
+ trajectory.append(hit_data.get('trajectory'))
470
+ hardness.append(hit_data.get('hardness'))
471
+ hit_coords = hit_data.get('coordinates', {})
472
+ hit_x.append(hit_coords.get('coordX'))
473
+ hit_y.append(hit_coords.get('coordY'))
474
+ else:
475
+ launch_speed.append(None)
476
+ launch_angle.append(None)
477
+ launch_distance.append(None)
478
+ launch_location.append(None)
479
+ trajectory.append(None)
480
+ hardness.append(None)
481
+ hit_x.append(None)
482
+ hit_y.append(None)
483
+
484
+ index_play.append(ab_list['playEvents'][n].get('index'))
485
+ play_id.append(ab_list['playEvents'][n].get('playId'))
486
+ start_time.append(ab_list['playEvents'][n].get('startTime'))
487
+ end_time.append(ab_list['playEvents'][n].get('endTime'))
488
+ is_pitch.append(ab_list['playEvents'][n].get('isPitch'))
489
+ type_type.append(ab_list['playEvents'][n].get('type'))
490
+
491
+ # Handle result fields - only populate on last event
492
+ if n == len(ab_list['playEvents']) - 1:
493
+ type_ab.append(ab_result.get('type'))
494
+ event.append(ab_result.get('event'))
495
+ event_type.append(ab_result.get('eventType'))
496
+ rbi.append(ab_result.get('rbi'))
497
+ away_score.append(ab_result.get('awayScore'))
498
+ home_score.append(ab_result.get('homeScore'))
499
+ is_out.append(ab_result.get('isOut'))
500
+ else:
501
+ type_ab.append(None)
502
+ event.append(None)
503
+ event_type.append(None)
504
+ rbi.append(None)
505
+ away_score.append(None)
506
+ home_score.append(None)
507
+ is_out.append(None)
508
+
509
+ except KeyError as e:
510
+ print(f"No Data for Game: {e}")
511
+
512
+ df = pl.DataFrame(data={
513
+ 'game_id':game_id,
514
+ 'game_date':game_date,
515
+ 'batter_id':batter_id,
516
+ 'batter_name':batter_name,
517
+ 'batter_hand':batter_hand,
518
+ 'batter_team':batter_team,
519
+ 'batter_team_id':batter_team_id,
520
+ 'pitcher_id':pitcher_id,
521
+ 'pitcher_name':pitcher_name,
522
+ 'pitcher_hand':pitcher_hand,
523
+ 'pitcher_team':pitcher_team,
524
+ 'pitcher_team_id':pitcher_team_id,
525
+ 'ab_number':ab_number,
526
+ 'inning':inning,
527
+ 'play_description':play_description,
528
+ 'play_code':play_code,
529
+ 'in_play':in_play,
530
+ 'is_strike':is_strike,
531
+ 'is_swing':is_swing,
532
+ 'is_whiff':is_whiff,
533
+ 'is_out':is_out,
534
+ 'is_ball':is_ball,
535
+ 'is_review':is_review,
536
+ 'pitch_type':pitch_type,
537
+ 'pitch_description':pitch_description,
538
+ 'strikes':strikes,
539
+ 'balls':balls,
540
+ 'outs':outs,
541
+ 'strikes_after':strikes_after,
542
+ 'balls_after':balls_after,
543
+ 'outs_after':outs_after,
544
+ 'start_speed':start_speed,
545
+ 'end_speed':end_speed,
546
+ 'sz_top':sz_top,
547
+ 'sz_bot':sz_bot,
548
+ 'x':x,
549
+ 'y':y,
550
+ 'ax':ax,
551
+ 'ay':ay,
552
+ 'az':az,
553
+ 'pfxx':pfxx,
554
+ 'pfxz':pfxz,
555
+ 'px':px,
556
+ 'pz':pz,
557
+ 'vx0':vx0,
558
+ 'vy0':vy0,
559
+ 'vz0':vz0,
560
+ 'x0':x0,
561
+ 'y0':y0,
562
+ 'z0':z0,
563
+ 'zone':zone,
564
+ 'type_confidence':type_confidence,
565
+ 'plate_time':plate_time,
566
+ 'extension':extension,
567
+ 'spin_rate':spin_rate,
568
+ 'spin_direction':spin_direction,
569
+ 'vb':vb,
570
+ 'ivb':ivb,
571
+ 'hb':hb,
572
+ 'launch_speed':launch_speed,
573
+ 'launch_angle':launch_angle,
574
+ 'launch_distance':launch_distance,
575
+ 'launch_location':launch_location,
576
+ 'trajectory':trajectory,
577
+ 'hardness':hardness,
578
+ 'hit_x':hit_x,
579
+ 'hit_y':hit_y,
580
+ 'index_play':index_play,
581
+ 'play_id':play_id,
582
+ 'start_time':start_time,
583
+ 'end_time':end_time,
584
+ 'is_pitch':is_pitch,
585
+ 'type_type':type_type,
586
+ 'type_ab':type_ab,
587
+ 'event':event,
588
+ 'event_type':event_type,
589
+ 'rbi':rbi,
590
+ 'away_score':away_score,
591
+ 'home_score':home_score,
592
+ },strict=False
593
+ )
594
+
595
+ return df
596
+
597
+
598
def get_teams(self):
    """
    Retrieves information about MLB teams from the MLB API and processes it into a Polars DataFrame.

    Only teams whose API entry carries a 'franchiseName' are kept. Missing optional
    fields are stored as nulls instead of raising a KeyError.

    Returns:
    - mlb_teams_df (pl.DataFrame): A DataFrame with one row per team and the columns
      team_id, city, name, franchise, abbreviation, parent_org_id, parent_org,
      league_id, league_name and parent_org_abbreviation, sorted by team_id.
    """
    # Make API call to retrieve team information
    response = requests.get(url='https://statsapi.mlb.com/api/v1/teams/').json()

    # Entries without a franchise name are not usable as team rows, so skip them up front
    team_entries = [team for team in response['teams'] if 'franchiseName' in team]

    # 'league' may be absent (or null) on an entry; fall back to an empty mapping
    league_entries = [team.get('league') or {} for team in team_entries]

    # Create a Polars DataFrame with the extracted data
    mlb_teams_df = (
        pl.DataFrame(data={
            'team_id': [team.get('id') for team in team_entries],
            # 'city' is the franchise location (e.g. the city/region), not the full club name
            'city': [team['franchiseName'] for team in team_entries],
            'name': [team.get('teamName') for team in team_entries],
            # 'franchise' is the full club name reported by the API
            'franchise': [team.get('name') for team in team_entries],
            'abbreviation': [team.get('abbreviation') for team in team_entries],
            'parent_org_id': [team.get('parentOrgId') for team in team_entries],
            'parent_org': [team.get('parentOrgName') for team in team_entries],
            'league_id': [league.get('id') for league in league_entries],
            'league_name': [league.get('name') for league in league_entries],
        })
        .unique()
        .drop_nulls(subset=['team_id'])
        .sort('team_id')
    )

    # Teams without a parent organization are treated as their own parent:
    # fall back to the team's own ID and franchise name.
    mlb_teams_df = mlb_teams_df.with_columns(
        pl.when(pl.col('parent_org_id').is_null())
        .then(pl.col('team_id'))
        .otherwise(pl.col('parent_org_id'))
        .alias('parent_org_id'),
        pl.when(pl.col('parent_org').is_null())
        .then(pl.col('franchise'))
        .otherwise(pl.col('parent_org'))
        .alias('parent_org'),
    )

    # Create a DataFrame for parent organization abbreviations
    abbreviation_df = mlb_teams_df.select(['team_id', 'abbreviation']).rename(
        {'team_id': 'parent_org_id', 'abbreviation': 'parent_org_abbreviation'}
    )

    # Join the parent organization abbreviations with the main DataFrame
    mlb_teams_df = mlb_teams_df.join(abbreviation_df, on='parent_org_id', how='left')

    return mlb_teams_df
658
+
659
def get_leagues(self):
    """
    Fetches the league listing from the MLB API and returns it as a Polars DataFrame.

    Returns:
    - leagues_df (pl.DataFrame): One row per league with the columns league_id,
      league_name, league_abbreviation and sport_id. Fields missing from an API
      entry are stored as nulls.
    """
    # Query the leagues endpoint
    payload = requests.get(url='https://statsapi.mlb.com/api/v1/leagues/').json()

    # Collect every column in a single pass over the league entries
    columns = {
        'league_id': [],
        'league_name': [],
        'league_abbreviation': [],
        'sport_id': [],
    }
    for entry in payload['leagues']:
        # The sport ID is nested; it is only read when the entry has a 'sport' object
        if 'sport' in entry:
            columns['sport_id'].append(entry['sport']['id'])
        else:
            columns['sport_id'].append(None)
        columns['league_id'].append(entry.get('id'))
        columns['league_name'].append(entry.get('name'))
        columns['league_abbreviation'].append(entry.get('abbreviation'))

    # Build the DataFrame from the collected columns
    leagues_df = pl.DataFrame(data=columns)

    return leagues_df
684
+
685
def get_player_games_list(self, player_id: int,
                          season: int,
                          start_date: str = None,
                          end_date: str = None,
                          sport_id: int = 1,
                          game_type: list = ['R'],
                          pitching: bool = True):
    """
    Retrieves a list of game IDs for a specific player in a given season.

    Parameters:
    - player_id (int): The ID of the player.
    - season (int): The season year for which to retrieve the game list.
    - start_date (str): The start date (YYYY-MM-DD) of the range (default is January 1st of the specified season).
    - end_date (str): The end date (YYYY-MM-DD) of the range (default is December 31st of the specified season).
    - sport_id (int): The ID of the sport for which to retrieve player data.
    - game_type (list): A list of game types to filter the schedule. Default is ['R']. The list is only read, never modified.
    - pitching (bool): Return pitching games when True, hitting games when False.

    Returns:
    - player_game_list (list): A list of game IDs in which the player participated during the specified season.
      An empty list is returned (and a message printed) when the API reports no player or no stats.

    Raises:
    - ValueError: If start_date or end_date is not in YYYY-MM-DD format.
    """
    # Set default start and end dates if not provided
    if not start_date:
        start_date = f'{season}-01-01'
    if not end_date:
        end_date = f'{season}-12-31'

    # Validate date format before any network traffic; fullmatch rejects trailing
    # characters (including a newline) that would otherwise leak into the URL
    date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}')
    if not date_pattern.fullmatch(start_date):
        raise ValueError(f"start_date {start_date} is not in YYYY-MM-DD format")
    if not date_pattern.fullmatch(end_date):
        raise ValueError(f"end_date {end_date} is not in YYYY-MM-DD format")

    # Determine the group based on the pitching flag
    group = 'pitching' if pitching else 'hitting'

    # Convert game type list to a comma-separated string
    game_type_str = ','.join([str(x) for x in game_type])

    # Make API call to retrieve player game logs (HTTPS, like the other endpoints in this file)
    response = requests.get(url=f'https://statsapi.mlb.com/api/v1/people/{player_id}?hydrate=stats(group={group},type=gameLog,season={season},startDate={start_date},endDate={end_date},sportId={sport_id},gameType=[{game_type_str}]),hydrations').json()

    # A response without a 'people' entry (e.g. unknown player ID) is treated
    # the same way as a player without stats: report it and return no games
    people = response.get('people') or []
    if not people or 'stats' not in people[0]:
        print(f'No {group} games found for player {player_id} in season {season}')
        return []

    # Extract game IDs from the API response
    player_game_list = [split['game']['gamePk'] for split in people[0]['stats'][0]['splits']]

    return player_game_list
738
+
739
def get_players(self, sport_id: int, season: int, game_type: list = ['R']):
    """
    Builds a DataFrame of the players for a given sport and season.

    Parameters:
    - sport_id (int): The ID of the sport for which to retrieve player data.
    - season (int): The season year for which to retrieve player data.
    - game_type (list): A list of game types to filter the players. Default is ['R'].

    Returns:
    - df (pl.DataFrame): Player information. For every game type selection except
      exactly ['S'] the columns are player_id, first_name, use_name, last_name, name,
      position, team, weight, height, age and birthDate. For ['S'] only player_id,
      first_name, use_name, last_name, name, position and team are returned,
      de-duplicated and sorted by player_id.
    """
    game_type_str = ','.join([str(x) for x in game_type])

    if game_type_str != 'S':
        # Regular path: one request against the stats API player listing
        people = requests.get(url=f'https://statsapi.mlb.com/api/v1/sports/{sport_id}/players?season={season}&gameType=[{game_type_str}]').json()['people']

        # Mandatory fields are indexed directly; optional ones fall back to None
        full_names = [person['fullName'] for person in people]
        first_names = [person['firstName'] for person in people]
        use_names = [person['useName'] for person in people]
        last_names = [person['lastName'] for person in people]
        player_ids = [person['id'] for person in people]
        positions = [
            person['primaryPosition']['abbreviation'] if 'primaryPosition' in person else None
            for person in people
        ]
        teams = [
            person['currentTeam']['id'] if 'currentTeam' in person else None
            for person in people
        ]
        weights = [person.get('weight') for person in people]
        heights = [person.get('height') for person in people]
        ages = [person.get('currentAge') for person in people]
        birth_dates = [person.get('birthDate') for person in people]

        return pl.DataFrame(data={
            'player_id': player_ids,
            'first_name': first_names,
            'use_name': use_names,
            'last_name': last_names,
            'name': full_names,
            'position': positions,
            'team': teams,
            'weight': weights,
            'height': heights,
            'age': ages,
            'birthDate': birth_dates
        })

    # Spring training path: players come from the season stats endpoint,
    # queried once for pitchers and once for hitters.
    # NOTE(review): both URLs hard-code sportId=1, so sport_id is not used here - confirm this is intended.
    spring_urls = (
        f'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?&env=prod&season={season}&sportId=1&stats=season&group=pitching&gameType=S&limit=1000000&offset=0&sortStat=inningsPitched&order=asc',
        f'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?&env=prod&season={season}&sportId=1&stats=season&group=hitting&gameType=S&limit=1000000&offset=0',
    )
    # API keys read from every stats row
    stat_keys = (
        'playerFullName',
        'playerFirstName',
        'playerUseName',
        'playerLastName',
        'playerId',
        'primaryPositionAbbrev',
        'teamId',
    )

    frames = []
    for spring_url in spring_urls:
        rows = requests.get(spring_url).json()['stats']
        extracted = {key: [row[key] for row in rows] for key in stat_keys}
        frames.append(pl.DataFrame(data={
            'player_id': extracted['playerId'],
            'first_name': extracted['playerFirstName'],
            'use_name': extracted['playerUseName'],
            'last_name': extracted['playerLastName'],
            'name': extracted['playerFullName'],
            'position': extracted['primaryPositionAbbrev'],
            'team': extracted['teamId']
        }))

    # Combine pitcher and batter data, dropping duplicates and rows without an ID
    df = pl.concat(frames).unique().drop_nulls(subset=['player_id']).sort('player_id')

    return df
830
+
831
+