Spaces:
Sleeping
Sleeping
Upload api_scraper.py with huggingface_hub
Browse files- api_scraper.py +61 -40
api_scraper.py
CHANGED
|
@@ -100,20 +100,25 @@ class MLB_Scrape:
|
|
| 100 |
# Make API call to retrieve game schedule
|
| 101 |
game_call = requests.get(url=f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id_str}&gameTypes={game_type_str}&season={year_input_str}&hydrate=lineup,players').json()
|
| 102 |
try:
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
# Create a Polars DataFrame with the extracted data
|
| 119 |
game_df = pl.DataFrame(data={'game_id': game_list,
|
|
@@ -154,40 +159,51 @@ class MLB_Scrape:
|
|
| 154 |
return game_df
|
| 155 |
|
| 156 |
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
# Parameters:
|
| 162 |
-
# - game_list_input (list): A list of game IDs for which to retrieve live data.
|
| 163 |
-
|
| 164 |
-
# Returns:
|
| 165 |
-
# - data_total (list): A list of JSON responses containing live game data for each game ID.
|
| 166 |
-
# """
|
| 167 |
-
# data_total = []
|
| 168 |
-
# print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')
|
| 169 |
-
|
| 170 |
-
# def fetch_data(game_id):
|
| 171 |
-
# r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_id}/feed/live')
|
| 172 |
-
# return r.json()
|
| 173 |
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
# for future in tqdm(as_completed(futures), total=len(futures), desc="Processing", unit="iteration"):
|
| 177 |
-
# data_total.append(future.result())
|
| 178 |
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
def get_data(self, game_list_input=None):
    """
    Retrieves live game data for a list of game IDs.

    Parameters:
    - game_list_input (list): A list of game IDs for which to retrieve live data.
      When omitted, the single game ID 748540 is used.

    Returns:
    - data_total (list): A list of JSON responses containing live game data,
      one per game ID, in the order the IDs were given.
    """
    # A list literal in the signature is created once and shared by every
    # call; build the default per call so it can never be mutated globally.
    if game_list_input is None:
        game_list_input = [748540]

    data_total = []
    print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')

    # Iterate over the game IDs directly; tqdm takes its total from len().
    for game_id in tqdm(game_list_input, desc="Processing", unit="iteration"):
        r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_id}/feed/live')
        data_total.append(r.json())

    return data_total
|
| 190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
def get_data_df(self, data_list):
|
| 193 |
"""
|
|
@@ -232,6 +248,7 @@ class MLB_Scrape:
|
|
| 232 |
strikes_after = []
|
| 233 |
balls_after = []
|
| 234 |
outs_after = []
|
|
|
|
| 235 |
|
| 236 |
start_speed = []
|
| 237 |
end_speed = []
|
|
@@ -339,7 +356,8 @@ class MLB_Scrape:
|
|
| 339 |
else:
|
| 340 |
is_swing.append(None)
|
| 341 |
is_whiff.append(None)
|
| 342 |
-
|
|
|
|
| 343 |
is_ball.append(ab_list['playEvents'][n]['details']['isOut'] if 'isOut' in ab_list['playEvents'][n]['details'] else None)
|
| 344 |
is_review.append(ab_list['playEvents'][n]['details']['hasReview'] if 'hasReview' in ab_list['playEvents'][n]['details'] else None)
|
| 345 |
pitch_type.append(ab_list['playEvents'][n]['details']['type']['code'] if 'type' in ab_list['playEvents'][n]['details'] else None)
|
|
@@ -431,6 +449,7 @@ class MLB_Scrape:
|
|
| 431 |
vb.append(None)
|
| 432 |
ivb.append(None)
|
| 433 |
hb.append(None)
|
|
|
|
| 434 |
|
| 435 |
if 'hitData' in ab_list['playEvents'][n]:
|
| 436 |
launch_speed.append(ab_list['playEvents'][n]['hitData']['launchSpeed'] if 'launchSpeed' in ab_list['playEvents'][n]['hitData'] else None)
|
|
@@ -514,6 +533,7 @@ class MLB_Scrape:
|
|
| 514 |
is_review.append(None)
|
| 515 |
pitch_type.append(None)
|
| 516 |
pitch_description.append(None)
|
|
|
|
| 517 |
strikes.append(ab_list['playEvents'][n]['count']['balls'] if 'balls' in ab_list['playEvents'][n]['count'] else None)
|
| 518 |
balls.append(ab_list['playEvents'][n]['count']['strikes'] if 'strikes' in ab_list['playEvents'][n]['count'] else None)
|
| 519 |
outs.append(ab_list['playEvents'][n]['count']['outs'] if 'outs' in ab_list['playEvents'][n]['count'] else None)
|
|
@@ -592,6 +612,7 @@ class MLB_Scrape:
|
|
| 592 |
'pitcher_team':pitcher_team,
|
| 593 |
'pitcher_team_id':pitcher_team_id,
|
| 594 |
'ab_number':ab_number,
|
|
|
|
| 595 |
'play_description':play_description,
|
| 596 |
'play_code':play_code,
|
| 597 |
'in_play':in_play,
|
|
|
|
| 100 |
# Make API call to retrieve game schedule
|
| 101 |
game_call = requests.get(url=f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id_str}&gameTypes={game_type_str}&season={year_input_str}&hydrate=lineup,players').json()
|
| 102 |
try:
|
| 103 |
+
def safe_get(d, keys, default=np.nan):
    """
    Safely retrieve a value from nested dictionaries.

    Parameters:
    - d (dict): The outermost dictionary to search.
    - keys (list): Keys to follow, outermost first.
    - default: Value returned when the key path cannot be followed.

    Returns:
    - The value found at the end of the key path, or `default` when any key
      is missing or a value along the path is not a dictionary.
    """
    current = d
    for key in keys:
        # A non-dict cannot be descended into, so the remaining keys cannot
        # be resolved; report the path as missing instead of returning the
        # intermediate value as if it were the requested leaf.
        if not isinstance(current, dict) or key not in current:
            return default
        current = current[key]
    return current
|
| 110 |
+
|
| 111 |
+
game_list = [item for sublist in [[y.get('gamePk', np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
| 112 |
+
time_list = [item for sublist in [[y.get('gameDate', np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
| 113 |
+
date_list = [item for sublist in [[y.get('officialDate', np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
| 114 |
+
away_team_list = [item for sublist in [[safe_get(y, ['teams', 'away', 'team', 'name'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
| 115 |
+
away_team_id_list = [item for sublist in [[safe_get(y, ['teams', 'away', 'team', 'id'], np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
| 116 |
+
home_team_list = [item for sublist in [[safe_get(y, ['teams', 'home', 'team', 'name'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
| 117 |
+
home_team_id_list = [item for sublist in [[safe_get(y, ['teams', 'home', 'team', 'id'], np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
| 118 |
+
state_list = [item for sublist in [[safe_get(y, ['status', 'codedGameState'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
| 119 |
+
venue_id = [item for sublist in [[safe_get(y, ['venue', 'id'], np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
| 120 |
+
venue_name = [item for sublist in [[safe_get(y, ['venue', 'name'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
| 121 |
+
gameday_type = [item for sublist in [[safe_get(y, ['gamedayType'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
| 122 |
|
| 123 |
# Create a Polars DataFrame with the extracted data
|
| 124 |
game_df = pl.DataFrame(data={'game_id': game_list,
|
|
|
|
| 159 |
return game_df
|
| 160 |
|
| 161 |
|
| 162 |
+
def get_data(self, game_list_input: list):
    """
    Download the live feed for each game ID, one request at a time.

    Parameters:
    - game_list_input (list): Game IDs whose live data should be fetched.

    Returns:
    - list: The decoded JSON response for each game ID, in the order the
      IDs appear in game_list_input.
    """
    print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')

    responses = []
    # Walk the positions of the input so the progress bar total equals the
    # number of game IDs.
    progress = tqdm(range(len(game_list_input)), desc="Processing", unit="iteration")
    for position in progress:
        game_id = game_list_input[position]
        live_feed = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_id}/feed/live')
        # Keep only the decoded JSON payload of the response
        responses.append(live_feed.json())

    return responses
|
| 183 |
|
| 184 |
+
def get_data_new(self, game_list_input: list):
    """
    Retrieves live game data for a list of game IDs in parallel.

    Parameters:
    - game_list_input (list): A list of game IDs for which to retrieve live data.

    Returns:
    - data_total (list): A list of JSON responses containing live game data,
      one per game ID, in the same order as game_list_input.
    """
    print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')

    def fetch_data(game_id):
        r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_id}/feed/live')
        return r.json()

    game_ids = list(game_list_input)
    # Pre-size the output so each result can be stored at its input position.
    data_total = [None] * len(game_ids)

    with ThreadPoolExecutor() as executor:
        # Map each future to the position of its game ID in the input.
        futures = {executor.submit(fetch_data, game_id): position for position, game_id in enumerate(game_ids)}
        # as_completed yields futures as they finish, which keeps the progress
        # bar responsive; writing by position keeps the output order
        # deterministic despite the arbitrary completion order.
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing", unit="iteration"):
            data_total[futures[future]] = future.result()

    return data_total
|
| 207 |
|
| 208 |
def get_data_df(self, data_list):
|
| 209 |
"""
|
|
|
|
| 248 |
strikes_after = []
|
| 249 |
balls_after = []
|
| 250 |
outs_after = []
|
| 251 |
+
inning = []
|
| 252 |
|
| 253 |
start_speed = []
|
| 254 |
end_speed = []
|
|
|
|
| 356 |
else:
|
| 357 |
is_swing.append(None)
|
| 358 |
is_whiff.append(None)
|
| 359 |
+
|
| 360 |
+
inning.append(ab_list['about']['inning'] if 'inning' in ab_list['about'] else None)
|
| 361 |
is_ball.append(ab_list['playEvents'][n]['details']['isOut'] if 'isOut' in ab_list['playEvents'][n]['details'] else None)
|
| 362 |
is_review.append(ab_list['playEvents'][n]['details']['hasReview'] if 'hasReview' in ab_list['playEvents'][n]['details'] else None)
|
| 363 |
pitch_type.append(ab_list['playEvents'][n]['details']['type']['code'] if 'type' in ab_list['playEvents'][n]['details'] else None)
|
|
|
|
| 449 |
vb.append(None)
|
| 450 |
ivb.append(None)
|
| 451 |
hb.append(None)
|
| 452 |
+
|
| 453 |
|
| 454 |
if 'hitData' in ab_list['playEvents'][n]:
|
| 455 |
launch_speed.append(ab_list['playEvents'][n]['hitData']['launchSpeed'] if 'launchSpeed' in ab_list['playEvents'][n]['hitData'] else None)
|
|
|
|
| 533 |
is_review.append(None)
|
| 534 |
pitch_type.append(None)
|
| 535 |
pitch_description.append(None)
|
| 536 |
+
inning.append(None)
|
| 537 |
strikes.append(ab_list['playEvents'][n]['count']['balls'] if 'balls' in ab_list['playEvents'][n]['count'] else None)
|
| 538 |
balls.append(ab_list['playEvents'][n]['count']['strikes'] if 'strikes' in ab_list['playEvents'][n]['count'] else None)
|
| 539 |
outs.append(ab_list['playEvents'][n]['count']['outs'] if 'outs' in ab_list['playEvents'][n]['count'] else None)
|
|
|
|
| 612 |
'pitcher_team':pitcher_team,
|
| 613 |
'pitcher_team_id':pitcher_team_id,
|
| 614 |
'ab_number':ab_number,
|
| 615 |
+
'inning':inning,
|
| 616 |
'play_description':play_description,
|
| 617 |
'play_code':play_code,
|
| 618 |
'in_play':in_play,
|