Update src/streamlit_app.py
Browse files- src/streamlit_app.py +303 -23
src/streamlit_app.py
CHANGED
|
@@ -145,6 +145,7 @@ def parse_table(html, table_id=None):
|
|
| 145 |
return pd.DataFrame()
|
| 146 |
|
| 147 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 148 |
@st.cache_data(ttl=3600)
|
| 149 |
def get_player_index():
|
| 150 |
"""
|
|
@@ -188,58 +189,99 @@ def get_player_index():
|
|
| 188 |
def player_season_stats(bbr_url):
|
| 189 |
"""
|
| 190 |
Scrapes a player's perβseason table (id="per_game") from their BBR page.
|
| 191 |
-
Returns cleaned DataFrame.
|
| 192 |
"""
|
| 193 |
html = fetch_html(bbr_url)
|
| 194 |
if not html:
|
| 195 |
return pd.DataFrame()
|
| 196 |
|
| 197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
if df.empty:
|
|
|
|
| 199 |
return pd.DataFrame()
|
| 200 |
|
| 201 |
# Handle potential MultiIndex columns
|
| 202 |
if isinstance(df.columns, pd.MultiIndex):
|
| 203 |
-
# Flatten MultiIndex columns
|
| 204 |
-
|
| 205 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
# Clean column names
|
| 208 |
df.columns = [str(col).strip() for col in df.columns]
|
| 209 |
|
| 210 |
-
#
|
|
|
|
|
|
|
|
|
|
| 211 |
season_col = None
|
| 212 |
for col in df.columns:
|
| 213 |
-
if
|
| 214 |
season_col = col
|
| 215 |
break
|
| 216 |
|
| 217 |
-
|
| 218 |
-
|
| 219 |
for col in df.columns:
|
| 220 |
-
if df[col].dtype == 'object'
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
| 225 |
|
| 226 |
if season_col is None:
|
| 227 |
-
st.warning(f"Could not find season column
|
| 228 |
return pd.DataFrame()
|
| 229 |
|
| 230 |
# Rename season column to standard name
|
| 231 |
if season_col != 'Season':
|
| 232 |
df = df.rename(columns={season_col: 'Season'})
|
| 233 |
|
| 234 |
-
#
|
| 235 |
-
df = df[df["Season"].astype(str) != "Season"].copy()
|
| 236 |
df = df[df["Season"].notna()].copy()
|
| 237 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
# Clean season format
|
| 239 |
-
df["Season"] = df["Season"].astype(str)
|
| 240 |
df['Season'] = df['Season'].str.replace('-', 'β') # Ensure en-dash for consistency
|
| 241 |
|
| 242 |
-
# Standardize column names to match
|
| 243 |
column_mapping = {
|
| 244 |
'G': 'GP', 'GS': 'GS', 'MP': 'MIN',
|
| 245 |
'FG%': 'FG_PCT', '3P%': 'FG3_PCT', 'FT%': 'FT_PCT',
|
|
@@ -256,6 +298,10 @@ def player_season_stats(bbr_url):
|
|
| 256 |
if old_col in df.columns:
|
| 257 |
df = df.rename(columns={old_col: new_col})
|
| 258 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
# Convert numeric columns
|
| 260 |
non_numeric_cols = {'Season', 'TEAM_ABBREVIATION', 'LEAGUE_ID', 'POSITION', 'Player'}
|
| 261 |
for col in df.columns:
|
|
@@ -264,6 +310,123 @@ def player_season_stats(bbr_url):
|
|
| 264 |
|
| 265 |
return df
|
| 266 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 268 |
@st.cache_data(ttl=300)
|
| 269 |
def team_per_game(year):
|
|
@@ -347,6 +510,9 @@ def team_per_game(year):
|
|
| 347 |
if old_col in df.columns:
|
| 348 |
df = df.rename(columns={old_col: new_col})
|
| 349 |
|
|
|
|
|
|
|
|
|
|
| 350 |
# Convert numeric columns
|
| 351 |
non_numeric_cols = {"Tm", "RANK"}
|
| 352 |
for col in df.columns:
|
|
@@ -356,7 +522,119 @@ def team_per_game(year):
|
|
| 356 |
return df
|
| 357 |
|
| 358 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 359 |
-
# Additional utility functions for
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
|
| 361 |
def validate_dataframe(df, required_columns=None):
|
| 362 |
"""
|
|
@@ -382,12 +660,13 @@ def clean_team_name(team_name):
|
|
| 382 |
# Remove any asterisks or other symbols
|
| 383 |
team_name = str(team_name).strip().replace('*', '')
|
| 384 |
|
| 385 |
-
# Handle special cases
|
| 386 |
team_mapping = {
|
| 387 |
-
'TOT': 'TOT', # Total for players who played for multiple teams
|
| 388 |
'NOP': 'NO', # New Orleans Pelicans sometimes shown as NOP
|
| 389 |
'PHX': 'PHO', # Phoenix Suns sometimes shown as PHX
|
| 390 |
'BRK': 'BKN', # Brooklyn Nets sometimes shown as BRK
|
|
|
|
|
|
|
| 391 |
}
|
| 392 |
|
| 393 |
return team_mapping.get(team_name, team_name)
|
|
@@ -408,7 +687,8 @@ def retry_fetch(func, *args, max_retries=3, **kwargs):
|
|
| 408 |
time.sleep(2 ** attempt) # Exponential backoff
|
| 409 |
|
| 410 |
return pd.DataFrame()
|
| 411 |
-
|
|
|
|
| 412 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 413 |
# Perplexity integration
|
| 414 |
PERP_KEY = os.getenv("PERPLEXITY_API_KEY")
|
|
|
|
| 145 |
return pd.DataFrame()
|
| 146 |
|
| 147 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 148 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 149 |
@st.cache_data(ttl=3600)
|
| 150 |
def get_player_index():
|
| 151 |
"""
|
|
|
|
| 189 |
def player_season_stats(bbr_url):
|
| 190 |
"""
|
| 191 |
Scrapes a player's perβseason table (id="per_game") from their BBR page.
|
| 192 |
+
Returns cleaned DataFrame with full player statistics.
|
| 193 |
"""
|
| 194 |
html = fetch_html(bbr_url)
|
| 195 |
if not html:
|
| 196 |
return pd.DataFrame()
|
| 197 |
|
| 198 |
+
# Try multiple table IDs that Basketball Reference uses for player stats
|
| 199 |
+
possible_table_ids = ["per_game", "per-game", "stats", "player-stats"]
|
| 200 |
+
df = pd.DataFrame()
|
| 201 |
+
|
| 202 |
+
for table_id in possible_table_ids:
|
| 203 |
+
df = parse_table(html, table_id=table_id)
|
| 204 |
+
if not df.empty:
|
| 205 |
+
break
|
| 206 |
+
|
| 207 |
+
# If no table found with IDs, try to find any table with season data
|
| 208 |
+
if df.empty:
|
| 209 |
+
soup = BeautifulSoup(html, "html.parser")
|
| 210 |
+
tables = soup.find_all("table")
|
| 211 |
+
for table in tables:
|
| 212 |
+
# Look for tables that have season-like headers
|
| 213 |
+
headers = [th.text.strip() for th in table.find_all("th")]
|
| 214 |
+
if any("season" in h.lower() or re.match(r'\d{4}-\d{2}', h) for h in headers):
|
| 215 |
+
df = parse_table(str(table))
|
| 216 |
+
if not df.empty:
|
| 217 |
+
break
|
| 218 |
+
|
| 219 |
if df.empty:
|
| 220 |
+
st.warning(f"Could not find player stats table at {bbr_url}")
|
| 221 |
return pd.DataFrame()
|
| 222 |
|
| 223 |
# Handle potential MultiIndex columns
|
| 224 |
if isinstance(df.columns, pd.MultiIndex):
|
| 225 |
+
# Flatten MultiIndex columns more carefully
|
| 226 |
+
new_columns = []
|
| 227 |
+
for col in df.columns:
|
| 228 |
+
if isinstance(col, tuple):
|
| 229 |
+
# Join non-empty parts of the tuple
|
| 230 |
+
col_parts = [str(part).strip() for part in col if str(part).strip() and 'Unnamed' not in str(part)]
|
| 231 |
+
new_columns.append('_'.join(col_parts) if col_parts else 'Unknown')
|
| 232 |
+
else:
|
| 233 |
+
new_columns.append(str(col).strip())
|
| 234 |
+
df.columns = new_columns
|
| 235 |
|
| 236 |
# Clean column names
|
| 237 |
df.columns = [str(col).strip() for col in df.columns]
|
| 238 |
|
| 239 |
+
# Debug: print available columns
|
| 240 |
+
st.info(f"Available columns: {df.columns.tolist()}")
|
| 241 |
+
|
| 242 |
+
# Find season column with more flexible matching
|
| 243 |
season_col = None
|
| 244 |
for col in df.columns:
|
| 245 |
+
if any(keyword in col.lower() for keyword in ['season', 'year']):
|
| 246 |
season_col = col
|
| 247 |
break
|
| 248 |
|
| 249 |
+
# If still no season column, look for columns with year-like data
|
| 250 |
+
if season_col is None and len(df) > 0:
|
| 251 |
for col in df.columns:
|
| 252 |
+
if df[col].dtype == 'object':
|
| 253 |
+
try:
|
| 254 |
+
sample_values = df[col].dropna().head(3).astype(str)
|
| 255 |
+
if any(re.match(r'\d{4}[-β]\d{2}', val) for val in sample_values):
|
| 256 |
+
season_col = col
|
| 257 |
+
break
|
| 258 |
+
except:
|
| 259 |
+
continue
|
| 260 |
|
| 261 |
if season_col is None:
|
| 262 |
+
st.warning(f"Could not find season column. Available columns: {df.columns.tolist()}")
|
| 263 |
return pd.DataFrame()
|
| 264 |
|
| 265 |
# Rename season column to standard name
|
| 266 |
if season_col != 'Season':
|
| 267 |
df = df.rename(columns={season_col: 'Season'})
|
| 268 |
|
| 269 |
+
# Clean the data
|
| 270 |
+
df = df[df["Season"].astype(str).str.strip() != "Season"].copy()
|
| 271 |
df = df[df["Season"].notna()].copy()
|
| 272 |
|
| 273 |
+
# Remove any completely empty rows
|
| 274 |
+
df = df.dropna(how='all').copy()
|
| 275 |
+
|
| 276 |
+
if df.empty:
|
| 277 |
+
st.warning("No valid season data found after cleaning")
|
| 278 |
+
return pd.DataFrame()
|
| 279 |
+
|
| 280 |
# Clean season format
|
| 281 |
+
df["Season"] = df["Season"].astype(str).str.strip()
|
| 282 |
df['Season'] = df['Season'].str.replace('-', 'β') # Ensure en-dash for consistency
|
| 283 |
|
| 284 |
+
# Standardize column names to match expected format
|
| 285 |
column_mapping = {
|
| 286 |
'G': 'GP', 'GS': 'GS', 'MP': 'MIN',
|
| 287 |
'FG%': 'FG_PCT', '3P%': 'FG3_PCT', 'FT%': 'FT_PCT',
|
|
|
|
| 298 |
if old_col in df.columns:
|
| 299 |
df = df.rename(columns={old_col: new_col})
|
| 300 |
|
| 301 |
+
# Clean team names if TEAM_ABBREVIATION column exists
|
| 302 |
+
if 'TEAM_ABBREVIATION' in df.columns:
|
| 303 |
+
df['TEAM_ABBREVIATION'] = df['TEAM_ABBREVIATION'].apply(clean_team_name)
|
| 304 |
+
|
| 305 |
# Convert numeric columns
|
| 306 |
non_numeric_cols = {'Season', 'TEAM_ABBREVIATION', 'LEAGUE_ID', 'POSITION', 'Player'}
|
| 307 |
for col in df.columns:
|
|
|
|
| 310 |
|
| 311 |
return df
|
| 312 |
|
| 313 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 314 |
+
@st.cache_data(ttl=300)
def get_player_stats_by_name(player_name):
    """
    Get player stats by searching for the player name in the index.

    Parameters
    ----------
    player_name : str
        Full or partial player name; matched case-insensitively as a
        literal substring (not a regex).

    Returns
    -------
    pd.DataFrame
        The first matching player's career statistics, or an empty
        DataFrame when the index is unavailable or nobody matches.
    """
    # Get player index
    player_index = get_player_index()
    if player_index.empty:
        return pd.DataFrame()

    # Search for player (case insensitive). regex=False: treat the user's
    # input as a literal string so names containing regex metacharacters
    # (e.g. dots or apostrophes in "P.J.", parentheses) cannot break or
    # subvert the match.
    matches = player_index[player_index['name'].str.contains(player_name, case=False, na=False, regex=False)]

    if matches.empty:
        st.warning(f"No player found matching '{player_name}'")
        return pd.DataFrame()

    if len(matches) > 1:
        st.info(f"Multiple players found matching '{player_name}': {matches['name'].tolist()}")
        st.info("Using first match")

    # Get stats for the first match
    player_url = matches.iloc[0]['url']
    return player_season_stats(player_url)
|
| 339 |
+
|
| 340 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 341 |
+
|
| 342 |
+
import requests
|
| 343 |
+
import pandas as pd
|
| 344 |
+
import streamlit as st
|
| 345 |
+
from bs4 import BeautifulSoup
|
| 346 |
+
import re
|
| 347 |
+
import time
|
| 348 |
+
import random
|
| 349 |
+
from urllib.parse import urljoin
|
| 350 |
+
|
| 351 |
+
@st.cache_data(ttl=3600)
def fetch_html(url):
    """Fetch raw HTML for a URL (with error handling and rate limiting)."""
    # Browser-like User-Agent: basketball-reference.com tends to reject
    # requests from default Python clients.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Add random delay to be respectful to basketball-reference.com
        time.sleep(random.uniform(0.5, 1.5))

        response = requests.get(url, timeout=30, headers=request_headers)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as exc:
        st.error(f"Failed to fetch {url}: {exc}")
        return ""
    except Exception as exc:
        st.error(f"An unexpected error occurred while fetching {url}: {exc}")
        return ""
|
| 371 |
+
|
| 372 |
+
def parse_table(html, table_id=None):
    """
    Given raw HTML and optional table_id, locate that <table>,
    handling cases where it's commented out, then parse it with
    pandas.read_html.

    Parameters
    ----------
    html : str
        Raw page HTML (may be empty).
    table_id : str, optional
        The id attribute of the target table. When omitted, the first
        table on the page is used as a fallback.

    Returns
    -------
    pd.DataFrame
        Parsed table, or an empty DataFrame when nothing is found.
    """
    from io import StringIO  # local import: wraps literal HTML for read_html

    if not html:
        return pd.DataFrame()

    soup = BeautifulSoup(html, "html.parser")  # html.parser for better compatibility
    tbl_html = ""

    if table_id:
        # First, try to find the table directly
        tbl = soup.find("table", {"id": table_id})
        if tbl:
            tbl_html = str(tbl)
        else:
            # Basketball-Reference often ships tables inside HTML comments;
            # search the raw text for a commented-out table with this id.
            comment_pattern = re.compile(
                r'<!--.*?<table[^>]*?id=["\']' + re.escape(table_id) + r'["\'][^>]*?>.*?</table>.*?-->',
                re.DOTALL | re.IGNORECASE
            )
            comment_match = comment_pattern.search(html)
            if comment_match:
                # Extract the comment payload, strip the delimiters,
                # and re-parse it as standalone HTML.
                comment_content = comment_match.group(0)
                comment_content = comment_content.replace('<!--', '').replace('-->', '')
                comment_soup = BeautifulSoup(comment_content, 'html.parser')
                tbl = comment_soup.find('table', {'id': table_id})
                if tbl:
                    tbl_html = str(tbl)
    else:
        # fallback: first table on page (only if no table_id specified)
        first = soup.find("table")
        if first:
            tbl_html = str(first)

    if not tbl_html:
        return pd.DataFrame()

    try:
        # Wrap in StringIO: passing a literal HTML string to read_html is
        # deprecated since pandas 2.1. read_html returns a list of
        # DataFrames; we want the first one.
        dfs = pd.read_html(StringIO(tbl_html), header=0)
        if dfs:
            return dfs[0]
        return pd.DataFrame()
    except ValueError as e:
        # No tables found in the provided HTML string
        st.warning(f"No tables found in HTML: {e}")
        return pd.DataFrame()
    except Exception as e:
        st.error(f"Error parsing table with pandas: {e}")
        return pd.DataFrame()
|
| 429 |
+
|
| 430 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 431 |
@st.cache_data(ttl=300)
|
| 432 |
def team_per_game(year):
|
|
|
|
| 510 |
if old_col in df.columns:
|
| 511 |
df = df.rename(columns={old_col: new_col})
|
| 512 |
|
| 513 |
+
# Clean team names
|
| 514 |
+
df['Tm'] = df['Tm'].apply(clean_team_name)
|
| 515 |
+
|
| 516 |
# Convert numeric columns
|
| 517 |
non_numeric_cols = {"Tm", "RANK"}
|
| 518 |
for col in df.columns:
|
|
|
|
| 522 |
return df
|
| 523 |
|
| 524 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 525 |
+
# Additional utility functions for team data processing
|
| 526 |
+
|
| 527 |
+
@st.cache_data(ttl=300)
def team_opponent_stats(year):
    """
    Scrape the league's opponent per-game team stats table from:
    https://www.basketball-reference.com/leagues/NBA_{year}_opp_per_game.html

    Parameters
    ----------
    year : int or str
        Season end year (e.g. 2024 for the 2023-24 season).

    Returns
    -------
    pd.DataFrame
        Cleaned DataFrame with OPP_-prefixed stat columns and a 'Tm'
        team column, or an empty DataFrame on any failure.
    """
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_opp_per_game.html"
    html = fetch_html(url)
    if not html:
        return pd.DataFrame()

    # Try multiple possible table IDs for opponent stats
    possible_table_ids = ["opp-stats-per_game", "opp_per_game", "opponent-stats-per_game"]
    df = pd.DataFrame()

    for table_id in possible_table_ids:
        df = parse_table(html, table_id=table_id)
        if not df.empty:
            break

    if df.empty:
        st.warning(f"Could not find opponent stats table for {year}")
        return pd.DataFrame()

    # Flatten MultiIndex headers. Filter out *any* 'Unnamed' placeholder
    # part (not just the exact 'Unnamed: 0_level_0'), so levels like
    # 'Unnamed: 1_level_0' don't leak into joined names — consistent with
    # the player-stats flattener elsewhere in this file.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [
            '_'.join(
                str(part).strip()
                for part in cols
                if str(part).strip() and 'Unnamed' not in str(part)
            )
            for cols in df.columns.values
        ]

    df.columns = [str(col).strip() for col in df.columns]

    # Find team column
    team_col = None
    for col in df.columns:
        if 'team' in col.lower() or col in ['Team', 'Tm']:
            team_col = col
            break

    if team_col is None:
        return pd.DataFrame()

    if team_col != 'Team':
        df = df.rename(columns={team_col: 'Team'})

    # Drop repeated header rows / missing teams, then standardize to 'Tm'
    df = df[df["Team"].astype(str) != "Team"].copy()
    df = df[df["Team"].notna()].copy()
    df = df.rename(columns={"Team": "Tm"})

    # Apply team name cleaning
    df['Tm'] = df['Tm'].apply(clean_team_name)

    # Same column standardization as regular team stats
    column_mapping = {
        'G': 'OPP_GP', 'MP': 'OPP_MIN',
        'FG%': 'OPP_FG_PCT', '3P%': 'OPP_FG3_PCT', 'FT%': 'OPP_FT_PCT',
        'TRB': 'OPP_REB', 'AST': 'OPP_AST', 'STL': 'OPP_STL', 'BLK': 'OPP_BLK', 'TOV': 'OPP_TO',
        'PF': 'OPP_PF', 'PTS': 'OPP_PTS',
        'FG': 'OPP_FGM', 'FGA': 'OPP_FGA', '3P': 'OPP_FG3M', '3PA': 'OPP_FG3A',
        '2P': 'OPP_FGM2', '2PA': 'OPP_FGA2', '2P%': 'OPP_FG2_PCT', 'eFG%': 'OPP_EFG_PCT',
        'FT': 'OPP_FTM', 'FTA': 'OPP_FTA', 'ORB': 'OPP_OREB', 'DRB': 'OPP_DREB'
    }

    for old_col, new_col in column_mapping.items():
        if old_col in df.columns:
            df = df.rename(columns={old_col: new_col})

    # Convert numeric columns
    non_numeric_cols = {"Tm"}
    for col in df.columns:
        if col not in non_numeric_cols:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    return df
|
| 601 |
+
|
| 602 |
+
@st.cache_data(ttl=300)
def team_standings(year):
    """
    Scrape team standings from Basketball Reference.

    Combines the Eastern and Western conference tables into one
    DataFrame with an added 'Conference' column; returns an empty
    DataFrame when the page or tables cannot be found.
    """
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_standings.html"
    html = fetch_html(url)
    if not html:
        return pd.DataFrame()

    soup = BeautifulSoup(html, "html.parser")
    frames = []

    # Standings are usually split into per-conference tables with ids
    # 'standings_E' and 'standings_W'.
    conference_labels = {'E': 'Eastern', 'W': 'Western'}
    for conf_id, conf_name in conference_labels.items():
        table = soup.find("table", {"id": f"standings_{conf_id}"})
        if table is None:
            continue
        conf_df = parse_table(str(table))
        if conf_df.empty:
            continue
        conf_df['Conference'] = conf_name
        frames.append(conf_df)

    if not frames:
        return pd.DataFrame()

    # Combine conference standings into a single frame
    combined = pd.concat(frames, ignore_index=True)

    # Normalize team names when a 'Team' column is present
    if 'Team' in combined.columns:
        combined['Team'] = combined['Team'].apply(clean_team_name)

    return combined
|
| 638 |
|
| 639 |
def validate_dataframe(df, required_columns=None):
|
| 640 |
"""
|
|
|
|
| 660 |
# Remove any asterisks or other symbols
|
| 661 |
team_name = str(team_name).strip().replace('*', '')
|
| 662 |
|
| 663 |
+
# Handle special cases for team name variations
|
| 664 |
team_mapping = {
|
|
|
|
| 665 |
'NOP': 'NO', # New Orleans Pelicans sometimes shown as NOP
|
| 666 |
'PHX': 'PHO', # Phoenix Suns sometimes shown as PHX
|
| 667 |
'BRK': 'BKN', # Brooklyn Nets sometimes shown as BRK
|
| 668 |
+
'CHA': 'CHO', # Charlotte sometimes inconsistent
|
| 669 |
+
'UTA': 'UTH' # Utah Jazz sometimes shown as UTA
|
| 670 |
}
|
| 671 |
|
| 672 |
return team_mapping.get(team_name, team_name)
|
|
|
|
| 687 |
time.sleep(2 ** attempt) # Exponential backoff
|
| 688 |
|
| 689 |
return pd.DataFrame()
|
| 690 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 691 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 692 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 693 |
# Perplexity integration
|
| 694 |
PERP_KEY = os.getenv("PERPLEXITY_API_KEY")
|