rairo committed on
Commit
97cf964
·
verified ·
1 Parent(s): 3016fda

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +303 -23
src/streamlit_app.py CHANGED
@@ -145,6 +145,7 @@ def parse_table(html, table_id=None):
145
  return pd.DataFrame()
146
 
147
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
 
148
  @st.cache_data(ttl=3600)
149
  def get_player_index():
150
  """
@@ -188,58 +189,99 @@ def get_player_index():
188
  def player_season_stats(bbr_url):
189
  """
190
  Scrapes a player's per‑season table (id="per_game") from their BBR page.
191
- Returns cleaned DataFrame.
192
  """
193
  html = fetch_html(bbr_url)
194
  if not html:
195
  return pd.DataFrame()
196
 
197
- df = parse_table(html, table_id="per_game")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  if df.empty:
 
199
  return pd.DataFrame()
200
 
201
  # Handle potential MultiIndex columns
202
  if isinstance(df.columns, pd.MultiIndex):
203
- # Flatten MultiIndex columns
204
- df.columns = ['_'.join(str(col).strip() for col in cols if str(col).strip() and str(col).strip() != 'Unnamed: 0_level_0')
205
- for cols in df.columns.values]
 
 
 
 
 
 
 
206
 
207
  # Clean column names
208
  df.columns = [str(col).strip() for col in df.columns]
209
 
210
- # Find season column (could be 'Season' or similar)
 
 
 
211
  season_col = None
212
  for col in df.columns:
213
- if 'season' in col.lower() or col == 'Season':
214
  season_col = col
215
  break
216
 
217
- if season_col is None:
218
- # Try to find it by looking for columns with year patterns
219
  for col in df.columns:
220
- if df[col].dtype == 'object' and not df[col].isna().all():
221
- sample_val = str(df[col].iloc[0]) if len(df) > 0 else ""
222
- if re.match(r'\d{4}-\d{2}', sample_val):
223
- season_col = col
224
- break
 
 
 
225
 
226
  if season_col is None:
227
- st.warning(f"Could not find season column in player stats. Available columns: {df.columns.tolist()}")
228
  return pd.DataFrame()
229
 
230
  # Rename season column to standard name
231
  if season_col != 'Season':
232
  df = df.rename(columns={season_col: 'Season'})
233
 
234
- # Remove header rows that might have been included
235
- df = df[df["Season"].astype(str) != "Season"].copy()
236
  df = df[df["Season"].notna()].copy()
237
 
 
 
 
 
 
 
 
238
  # Clean season format
239
- df["Season"] = df["Season"].astype(str)
240
  df['Season'] = df['Season'].str.replace('-', '–') # Ensure en-dash for consistency
241
 
242
- # Standardize column names to match previous nba_api output expectations
243
  column_mapping = {
244
  'G': 'GP', 'GS': 'GS', 'MP': 'MIN',
245
  'FG%': 'FG_PCT', '3P%': 'FG3_PCT', 'FT%': 'FT_PCT',
@@ -256,6 +298,10 @@ def player_season_stats(bbr_url):
256
  if old_col in df.columns:
257
  df = df.rename(columns={old_col: new_col})
258
 
 
 
 
 
259
  # Convert numeric columns
260
  non_numeric_cols = {'Season', 'TEAM_ABBREVIATION', 'LEAGUE_ID', 'POSITION', 'Player'}
261
  for col in df.columns:
@@ -264,6 +310,123 @@ def player_season_stats(bbr_url):
264
 
265
  return df
266
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
268
  @st.cache_data(ttl=300)
269
  def team_per_game(year):
@@ -347,6 +510,9 @@ def team_per_game(year):
347
  if old_col in df.columns:
348
  df = df.rename(columns={old_col: new_col})
349
 
 
 
 
350
  # Convert numeric columns
351
  non_numeric_cols = {"Tm", "RANK"}
352
  for col in df.columns:
@@ -356,7 +522,119 @@ def team_per_game(year):
356
  return df
357
 
358
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
359
- # Additional utility functions for better error handling and data validation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
 
361
  def validate_dataframe(df, required_columns=None):
362
  """
@@ -382,12 +660,13 @@ def clean_team_name(team_name):
382
  # Remove any asterisks or other symbols
383
  team_name = str(team_name).strip().replace('*', '')
384
 
385
- # Handle special cases
386
  team_mapping = {
387
- 'TOT': 'TOT', # Total for players who played for multiple teams
388
  'NOP': 'NO', # New Orleans Pelicans sometimes shown as NOP
389
  'PHX': 'PHO', # Phoenix Suns sometimes shown as PHX
390
  'BRK': 'BKN', # Brooklyn Nets sometimes shown as BRK
 
 
391
  }
392
 
393
  return team_mapping.get(team_name, team_name)
@@ -408,7 +687,8 @@ def retry_fetch(func, *args, max_retries=3, **kwargs):
408
  time.sleep(2 ** attempt) # Exponential backoff
409
 
410
  return pd.DataFrame()
411
-
 
412
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
413
  # Perplexity integration
414
  PERP_KEY = os.getenv("PERPLEXITY_API_KEY")
 
145
  return pd.DataFrame()
146
 
147
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
148
+ β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
149
  @st.cache_data(ttl=3600)
150
  def get_player_index():
151
  """
 
189
  def player_season_stats(bbr_url):
190
  """
191
  Scrapes a player's per‑season table (id="per_game") from their BBR page.
192
+ Returns cleaned DataFrame with full player statistics.
193
  """
194
  html = fetch_html(bbr_url)
195
  if not html:
196
  return pd.DataFrame()
197
 
198
+ # Try multiple table IDs that Basketball Reference uses for player stats
199
+ possible_table_ids = ["per_game", "per-game", "stats", "player-stats"]
200
+ df = pd.DataFrame()
201
+
202
+ for table_id in possible_table_ids:
203
+ df = parse_table(html, table_id=table_id)
204
+ if not df.empty:
205
+ break
206
+
207
+ # If no table found with IDs, try to find any table with season data
208
+ if df.empty:
209
+ soup = BeautifulSoup(html, "html.parser")
210
+ tables = soup.find_all("table")
211
+ for table in tables:
212
+ # Look for tables that have season-like headers
213
+ headers = [th.text.strip() for th in table.find_all("th")]
214
+ if any("season" in h.lower() or re.match(r'\d{4}-\d{2}', h) for h in headers):
215
+ df = parse_table(str(table))
216
+ if not df.empty:
217
+ break
218
+
219
  if df.empty:
220
+ st.warning(f"Could not find player stats table at {bbr_url}")
221
  return pd.DataFrame()
222
 
223
  # Handle potential MultiIndex columns
224
  if isinstance(df.columns, pd.MultiIndex):
225
+ # Flatten MultiIndex columns more carefully
226
+ new_columns = []
227
+ for col in df.columns:
228
+ if isinstance(col, tuple):
229
+ # Join non-empty parts of the tuple
230
+ col_parts = [str(part).strip() for part in col if str(part).strip() and 'Unnamed' not in str(part)]
231
+ new_columns.append('_'.join(col_parts) if col_parts else 'Unknown')
232
+ else:
233
+ new_columns.append(str(col).strip())
234
+ df.columns = new_columns
235
 
236
  # Clean column names
237
  df.columns = [str(col).strip() for col in df.columns]
238
 
239
+ # Debug: print available columns
240
+ st.info(f"Available columns: {df.columns.tolist()}")
241
+
242
+ # Find season column with more flexible matching
243
  season_col = None
244
  for col in df.columns:
245
+ if any(keyword in col.lower() for keyword in ['season', 'year']):
246
  season_col = col
247
  break
248
 
249
+ # If still no season column, look for columns with year-like data
250
+ if season_col is None and len(df) > 0:
251
  for col in df.columns:
252
+ if df[col].dtype == 'object':
253
+ try:
254
+ sample_values = df[col].dropna().head(3).astype(str)
255
+ if any(re.match(r'\d{4}[-–]\d{2}', val) for val in sample_values):
256
+ season_col = col
257
+ break
258
+ except:
259
+ continue
260
 
261
  if season_col is None:
262
+ st.warning(f"Could not find season column. Available columns: {df.columns.tolist()}")
263
  return pd.DataFrame()
264
 
265
  # Rename season column to standard name
266
  if season_col != 'Season':
267
  df = df.rename(columns={season_col: 'Season'})
268
 
269
+ # Clean the data
270
+ df = df[df["Season"].astype(str).str.strip() != "Season"].copy()
271
  df = df[df["Season"].notna()].copy()
272
 
273
+ # Remove any completely empty rows
274
+ df = df.dropna(how='all').copy()
275
+
276
+ if df.empty:
277
+ st.warning("No valid season data found after cleaning")
278
+ return pd.DataFrame()
279
+
280
  # Clean season format
281
+ df["Season"] = df["Season"].astype(str).str.strip()
282
  df['Season'] = df['Season'].str.replace('-', '–') # Ensure en-dash for consistency
283
 
284
+ # Standardize column names to match expected format
285
  column_mapping = {
286
  'G': 'GP', 'GS': 'GS', 'MP': 'MIN',
287
  'FG%': 'FG_PCT', '3P%': 'FG3_PCT', 'FT%': 'FT_PCT',
 
298
  if old_col in df.columns:
299
  df = df.rename(columns={old_col: new_col})
300
 
301
+ # Clean team names if TEAM_ABBREVIATION column exists
302
+ if 'TEAM_ABBREVIATION' in df.columns:
303
+ df['TEAM_ABBREVIATION'] = df['TEAM_ABBREVIATION'].apply(clean_team_name)
304
+
305
  # Convert numeric columns
306
  non_numeric_cols = {'Season', 'TEAM_ABBREVIATION', 'LEAGUE_ID', 'POSITION', 'Player'}
307
  for col in df.columns:
 
310
 
311
  return df
312
 
313
+ # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
314
+ @st.cache_data(ttl=300)
315
+ def get_player_stats_by_name(player_name):
316
+ """
317
+ Get player stats by searching for the player name in the index.
318
+ Returns DataFrame with player's career statistics.
319
+ """
320
+ # Get player index
321
+ player_index = get_player_index()
322
+ if player_index.empty:
323
+ return pd.DataFrame()
324
+
325
+ # Search for player (case insensitive)
326
+ matches = player_index[player_index['name'].str.contains(player_name, case=False, na=False)]
327
+
328
+ if matches.empty:
329
+ st.warning(f"No player found matching '{player_name}'")
330
+ return pd.DataFrame()
331
+
332
+ if len(matches) > 1:
333
+ st.info(f"Multiple players found matching '{player_name}': {matches['name'].tolist()}")
334
+ st.info("Using first match")
335
+
336
+ # Get stats for the first match
337
+ player_url = matches.iloc[0]['url']
338
+ return player_season_stats(player_url)
339
+
340
+ # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
341
+
342
+ import requests
343
+ import pandas as pd
344
+ import streamlit as st
345
+ from bs4 import BeautifulSoup
346
+ import re
347
+ import time
348
+ import random
349
+ from urllib.parse import urljoin
350
+
351
+ @st.cache_data(ttl=3600)
352
+ def fetch_html(url):
353
+ """Fetch raw HTML for a URL (with error handling and rate limiting)."""
354
+ try:
355
+ # Add random delay to be respectful to basketball-reference.com
356
+ time.sleep(random.uniform(0.5, 1.5))
357
+
358
+ headers = {
359
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
360
+ }
361
+
362
+ resp = requests.get(url, timeout=30, headers=headers)
363
+ resp.raise_for_status()
364
+ return resp.text
365
+ except requests.exceptions.RequestException as e:
366
+ st.error(f"Failed to fetch {url}: {e}")
367
+ return ""
368
+ except Exception as e:
369
+ st.error(f"An unexpected error occurred while fetching {url}: {e}")
370
+ return ""
371
+
372
+ def parse_table(html, table_id=None):
373
+ """
374
+ Given raw HTML and optional table_id, locate that <table>,
375
+ handling cases where it's commented out, then parse it with pandas.read_html.
376
+ """
377
+ if not html:
378
+ return pd.DataFrame()
379
+
380
+ soup = BeautifulSoup(html, "html.parser") # Changed from lxml to html.parser for better compatibility
381
+ tbl_html = ""
382
+
383
+ if table_id:
384
+ # First, try to find the table directly
385
+ tbl = soup.find("table", {"id": table_id})
386
+ if tbl:
387
+ tbl_html = str(tbl)
388
+ else:
389
+ # If not found directly, search for it within HTML comments
390
+ # Basketball-Reference often comments out tables
391
+ comment_pattern = re.compile(
392
+ r'<!--.*?<table[^>]*?id=["\']' + re.escape(table_id) + r'["\'][^>]*?>.*?</table>.*?-->',
393
+ re.DOTALL | re.IGNORECASE
394
+ )
395
+ comment_match = comment_pattern.search(html)
396
+ if comment_match:
397
+ # Extract the content of the comment
398
+ comment_content = comment_match.group(0)
399
+ # Remove the comment tags
400
+ comment_content = comment_content.replace('<!--', '').replace('-->', '')
401
+ # Parse the comment content as new HTML
402
+ comment_soup = BeautifulSoup(comment_content, 'html.parser')
403
+ tbl = comment_soup.find('table', {'id': table_id})
404
+ if tbl:
405
+ tbl_html = str(tbl)
406
+ else:
407
+ # fallback: first table on page (only if no table_id specified)
408
+ first = soup.find("table")
409
+ if first:
410
+ tbl_html = str(first)
411
+
412
+ if not tbl_html:
413
+ return pd.DataFrame()
414
+
415
+ try:
416
+ # pd.read_html returns a list of DataFrames, we want the first one
417
+ dfs = pd.read_html(tbl_html, header=0)
418
+ if dfs:
419
+ return dfs[0]
420
+ else:
421
+ return pd.DataFrame()
422
+ except ValueError as e:
423
+ # No tables found in the provided HTML string
424
+ st.warning(f"No tables found in HTML: {e}")
425
+ return pd.DataFrame()
426
+ except Exception as e:
427
+ st.error(f"Error parsing table with pandas: {e}")
428
+ return pd.DataFrame()
429
+
430
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
431
  @st.cache_data(ttl=300)
432
  def team_per_game(year):
 
510
  if old_col in df.columns:
511
  df = df.rename(columns={old_col: new_col})
512
 
513
+ # Clean team names
514
+ df['Tm'] = df['Tm'].apply(clean_team_name)
515
+
516
  # Convert numeric columns
517
  non_numeric_cols = {"Tm", "RANK"}
518
  for col in df.columns:
 
522
  return df
523
 
524
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
525
+ # Additional utility functions for team data processing
526
+
527
+ @st.cache_data(ttl=300)
528
+ def team_opponent_stats(year):
529
+ """
530
+ Scrapes the league's opponent per‑game team stats table from:
531
+ https://www.basketball-reference.com/leagues/NBA_{year}_opp_per_game.html
532
+ Returns cleaned DataFrame with opponent stats.
533
+ """
534
+ url = f"https://www.basketball-reference.com/leagues/NBA_{year}_opp_per_game.html"
535
+ html = fetch_html(url)
536
+ if not html:
537
+ return pd.DataFrame()
538
+
539
+ # Try multiple possible table IDs for opponent stats
540
+ possible_table_ids = ["opp-stats-per_game", "opp_per_game", "opponent-stats-per_game"]
541
+ df = pd.DataFrame()
542
+
543
+ for table_id in possible_table_ids:
544
+ df = parse_table(html, table_id=table_id)
545
+ if not df.empty:
546
+ break
547
+
548
+ if df.empty:
549
+ st.warning(f"Could not find opponent stats table for {year}")
550
+ return pd.DataFrame()
551
+
552
+ # Clean and process the same way as regular team stats
553
+ if isinstance(df.columns, pd.MultiIndex):
554
+ df.columns = ['_'.join(str(col).strip() for col in cols if str(col).strip() and str(col).strip() != 'Unnamed: 0_level_0')
555
+ for cols in df.columns.values]
556
+
557
+ df.columns = [str(col).strip() for col in df.columns]
558
+
559
+ # Find team column
560
+ team_col = None
561
+ for col in df.columns:
562
+ if 'team' in col.lower() or col in ['Team', 'Tm']:
563
+ team_col = col
564
+ break
565
+
566
+ if team_col is None:
567
+ return pd.DataFrame()
568
+
569
+ if team_col != 'Team':
570
+ df = df.rename(columns={team_col: 'Team'})
571
+
572
+ df = df[df["Team"].astype(str) != "Team"].copy()
573
+ df = df[df["Team"].notna()].copy()
574
+ df = df.rename(columns={"Team": "Tm"})
575
+
576
+ # Apply team name cleaning
577
+ df['Tm'] = df['Tm'].apply(clean_team_name)
578
+
579
+ # Same column standardization as regular team stats
580
+ column_mapping = {
581
+ 'G': 'OPP_GP', 'MP': 'OPP_MIN',
582
+ 'FG%': 'OPP_FG_PCT', '3P%': 'OPP_FG3_PCT', 'FT%': 'OPP_FT_PCT',
583
+ 'TRB': 'OPP_REB', 'AST': 'OPP_AST', 'STL': 'OPP_STL', 'BLK': 'OPP_BLK', 'TOV': 'OPP_TO',
584
+ 'PF': 'OPP_PF', 'PTS': 'OPP_PTS',
585
+ 'FG': 'OPP_FGM', 'FGA': 'OPP_FGA', '3P': 'OPP_FG3M', '3PA': 'OPP_FG3A',
586
+ '2P': 'OPP_FGM2', '2PA': 'OPP_FGA2', '2P%': 'OPP_FG2_PCT', 'eFG%': 'OPP_EFG_PCT',
587
+ 'FT': 'OPP_FTM', 'FTA': 'OPP_FTA', 'ORB': 'OPP_OREB', 'DRB': 'OPP_DREB'
588
+ }
589
+
590
+ for old_col, new_col in column_mapping.items():
591
+ if old_col in df.columns:
592
+ df = df.rename(columns={old_col: new_col})
593
+
594
+ # Convert numeric columns
595
+ non_numeric_cols = {"Tm"}
596
+ for col in df.columns:
597
+ if col not in non_numeric_cols:
598
+ df[col] = pd.to_numeric(df[col], errors="coerce")
599
+
600
+ return df
601
+
602
+ @st.cache_data(ttl=300)
603
+ def team_standings(year):
604
+ """
605
+ Scrapes team standings from Basketball Reference.
606
+ Returns DataFrame with team records and standings info.
607
+ """
608
+ url = f"https://www.basketball-reference.com/leagues/NBA_{year}_standings.html"
609
+ html = fetch_html(url)
610
+ if not html:
611
+ return pd.DataFrame()
612
+
613
+ # Try to find standings tables (usually split by conference)
614
+ soup = BeautifulSoup(html, "html.parser")
615
+ standings_data = []
616
+
617
+ # Look for conference tables
618
+ for conference in ['E', 'W']: # Eastern and Western conference IDs
619
+ table_id = f"standings_{conference}"
620
+ table = soup.find("table", {"id": table_id})
621
+ if table:
622
+ df = parse_table(str(table))
623
+ if not df.empty:
624
+ df['Conference'] = 'Eastern' if conference == 'E' else 'Western'
625
+ standings_data.append(df)
626
+
627
+ if not standings_data:
628
+ return pd.DataFrame()
629
+
630
+ # Combine conference standings
631
+ df = pd.concat(standings_data, ignore_index=True)
632
+
633
+ # Clean team names if 'Team' column exists
634
+ if 'Team' in df.columns:
635
+ df['Team'] = df['Team'].apply(clean_team_name)
636
+
637
+ return df
638
 
639
  def validate_dataframe(df, required_columns=None):
640
  """
 
660
  # Remove any asterisks or other symbols
661
  team_name = str(team_name).strip().replace('*', '')
662
 
663
+ # Handle special cases for team name variations
664
  team_mapping = {
 
665
  'NOP': 'NO', # New Orleans Pelicans sometimes shown as NOP
666
  'PHX': 'PHO', # Phoenix Suns sometimes shown as PHX
667
  'BRK': 'BKN', # Brooklyn Nets sometimes shown as BRK
668
+ 'CHA': 'CHO', # Charlotte sometimes inconsistent
669
+ 'UTA': 'UTH' # Utah Jazz sometimes shown as UTA
670
  }
671
 
672
  return team_mapping.get(team_name, team_name)
 
687
  time.sleep(2 ** attempt) # Exponential backoff
688
 
689
  return pd.DataFrame()
690
+ # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
691
+ # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
692
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
693
  # Perplexity integration
694
  PERP_KEY = os.getenv("PERPLEXITY_API_KEY")