Update src/streamlit_app.py
Browse files- src/streamlit_app.py +303 -23
src/streamlit_app.py
CHANGED
|
@@ -145,6 +145,7 @@ def parse_table(html, table_id=None):
|
|
| 145 |
return pd.DataFrame()
|
| 146 |
|
| 147 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 148 |
@st.cache_data(ttl=3600)
|
| 149 |
def get_player_index():
|
| 150 |
"""
|
|
@@ -188,58 +189,99 @@ def get_player_index():
|
|
| 188 |
def player_season_stats(bbr_url):
|
| 189 |
"""
|
| 190 |
Scrapes a player's perβseason table (id="per_game") from their BBR page.
|
| 191 |
-
Returns cleaned DataFrame.
|
| 192 |
"""
|
| 193 |
html = fetch_html(bbr_url)
|
| 194 |
if not html:
|
| 195 |
return pd.DataFrame()
|
| 196 |
|
| 197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
if df.empty:
|
|
|
|
| 199 |
return pd.DataFrame()
|
| 200 |
|
| 201 |
# Handle potential MultiIndex columns
|
| 202 |
if isinstance(df.columns, pd.MultiIndex):
|
| 203 |
-
# Flatten MultiIndex columns
|
| 204 |
-
|
| 205 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
# Clean column names
|
| 208 |
df.columns = [str(col).strip() for col in df.columns]
|
| 209 |
|
| 210 |
-
#
|
|
|
|
|
|
|
|
|
|
| 211 |
season_col = None
|
| 212 |
for col in df.columns:
|
| 213 |
-
if
|
| 214 |
season_col = col
|
| 215 |
break
|
| 216 |
|
| 217 |
-
|
| 218 |
-
|
| 219 |
for col in df.columns:
|
| 220 |
-
if df[col].dtype == 'object'
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
| 225 |
|
| 226 |
if season_col is None:
|
| 227 |
-
st.warning(f"Could not find season column
|
| 228 |
return pd.DataFrame()
|
| 229 |
|
| 230 |
# Rename season column to standard name
|
| 231 |
if season_col != 'Season':
|
| 232 |
df = df.rename(columns={season_col: 'Season'})
|
| 233 |
|
| 234 |
-
#
|
| 235 |
-
df = df[df["Season"].astype(str) != "Season"].copy()
|
| 236 |
df = df[df["Season"].notna()].copy()
|
| 237 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
# Clean season format
|
| 239 |
-
df["Season"] = df["Season"].astype(str)
|
| 240 |
df['Season'] = df['Season'].str.replace('-', 'β') # Ensure en-dash for consistency
|
| 241 |
|
| 242 |
-
# Standardize column names to match
|
| 243 |
column_mapping = {
|
| 244 |
'G': 'GP', 'GS': 'GS', 'MP': 'MIN',
|
| 245 |
'FG%': 'FG_PCT', '3P%': 'FG3_PCT', 'FT%': 'FT_PCT',
|
|
@@ -256,6 +298,10 @@ def player_season_stats(bbr_url):
|
|
| 256 |
if old_col in df.columns:
|
| 257 |
df = df.rename(columns={old_col: new_col})
|
| 258 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
# Convert numeric columns
|
| 260 |
non_numeric_cols = {'Season', 'TEAM_ABBREVIATION', 'LEAGUE_ID', 'POSITION', 'Player'}
|
| 261 |
for col in df.columns:
|
|
@@ -264,6 +310,123 @@ def player_season_stats(bbr_url):
|
|
| 264 |
|
| 265 |
return df
|
| 266 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 268 |
@st.cache_data(ttl=300)
|
| 269 |
def team_per_game(year):
|
|
@@ -347,6 +510,9 @@ def team_per_game(year):
|
|
| 347 |
if old_col in df.columns:
|
| 348 |
df = df.rename(columns={old_col: new_col})
|
| 349 |
|
|
|
|
|
|
|
|
|
|
| 350 |
# Convert numeric columns
|
| 351 |
non_numeric_cols = {"Tm", "RANK"}
|
| 352 |
for col in df.columns:
|
|
@@ -356,7 +522,119 @@ def team_per_game(year):
|
|
| 356 |
return df
|
| 357 |
|
| 358 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 359 |
-
# Additional utility functions for
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
|
| 361 |
def validate_dataframe(df, required_columns=None):
|
| 362 |
"""
|
|
@@ -382,12 +660,13 @@ def clean_team_name(team_name):
|
|
| 382 |
# Remove any asterisks or other symbols
|
| 383 |
team_name = str(team_name).strip().replace('*', '')
|
| 384 |
|
| 385 |
-
# Handle special cases
|
| 386 |
team_mapping = {
|
| 387 |
-
'TOT': 'TOT', # Total for players who played for multiple teams
|
| 388 |
'NOP': 'NO', # New Orleans Pelicans sometimes shown as NOP
|
| 389 |
'PHX': 'PHO', # Phoenix Suns sometimes shown as PHX
|
| 390 |
'BRK': 'BKN', # Brooklyn Nets sometimes shown as BRK
|
|
|
|
|
|
|
| 391 |
}
|
| 392 |
|
| 393 |
return team_mapping.get(team_name, team_name)
|
|
@@ -408,7 +687,8 @@ def retry_fetch(func, *args, max_retries=3, **kwargs):
|
|
| 408 |
time.sleep(2 ** attempt) # Exponential backoff
|
| 409 |
|
| 410 |
return pd.DataFrame()
|
| 411 |
-
|
|
|
|
| 412 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 413 |
# Perplexity integration
|
| 414 |
PERP_KEY = os.getenv("PERPLEXITY_API_KEY")
|
|
|
|
| 145 |
return pd.DataFrame()
|
| 146 |
|
| 147 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 148 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 149 |
@st.cache_data(ttl=3600)
|
| 150 |
def get_player_index():
|
| 151 |
"""
|
|
|
|
| 189 |
def player_season_stats(bbr_url):
|
| 190 |
"""
|
| 191 |
Scrapes a player's perβseason table (id="per_game") from their BBR page.
|
| 192 |
+
Returns cleaned DataFrame with full player statistics.
|
| 193 |
"""
|
| 194 |
html = fetch_html(bbr_url)
|
| 195 |
if not html:
|
| 196 |
return pd.DataFrame()
|
| 197 |
|
| 198 |
+
# Try multiple table IDs that Basketball Reference uses for player stats
|
| 199 |
+
possible_table_ids = ["per_game", "per-game", "stats", "player-stats"]
|
| 200 |
+
df = pd.DataFrame()
|
| 201 |
+
|
| 202 |
+
for table_id in possible_table_ids:
|
| 203 |
+
df = parse_table(html, table_id=table_id)
|
| 204 |
+
if not df.empty:
|
| 205 |
+
break
|
| 206 |
+
|
| 207 |
+
# If no table found with IDs, try to find any table with season data
|
| 208 |
+
if df.empty:
|
| 209 |
+
soup = BeautifulSoup(html, "html.parser")
|
| 210 |
+
tables = soup.find_all("table")
|
| 211 |
+
for table in tables:
|
| 212 |
+
# Look for tables that have season-like headers
|
| 213 |
+
headers = [th.text.strip() for th in table.find_all("th")]
|
| 214 |
+
if any("season" in h.lower() or re.match(r'\d{4}-\d{2}', h) for h in headers):
|
| 215 |
+
df = parse_table(str(table))
|
| 216 |
+
if not df.empty:
|
| 217 |
+
break
|
| 218 |
+
|
| 219 |
if df.empty:
|
| 220 |
+
st.warning(f"Could not find player stats table at {bbr_url}")
|
| 221 |
return pd.DataFrame()
|
| 222 |
|
| 223 |
# Handle potential MultiIndex columns
|
| 224 |
if isinstance(df.columns, pd.MultiIndex):
|
| 225 |
+
# Flatten MultiIndex columns more carefully
|
| 226 |
+
new_columns = []
|
| 227 |
+
for col in df.columns:
|
| 228 |
+
if isinstance(col, tuple):
|
| 229 |
+
# Join non-empty parts of the tuple
|
| 230 |
+
col_parts = [str(part).strip() for part in col if str(part).strip() and 'Unnamed' not in str(part)]
|
| 231 |
+
new_columns.append('_'.join(col_parts) if col_parts else 'Unknown')
|
| 232 |
+
else:
|
| 233 |
+
new_columns.append(str(col).strip())
|
| 234 |
+
df.columns = new_columns
|
| 235 |
|
| 236 |
# Clean column names
|
| 237 |
df.columns = [str(col).strip() for col in df.columns]
|
| 238 |
|
| 239 |
+
# Debug: print available columns
|
| 240 |
+
st.info(f"Available columns: {df.columns.tolist()}")
|
| 241 |
+
|
| 242 |
+
# Find season column with more flexible matching
|
| 243 |
season_col = None
|
| 244 |
for col in df.columns:
|
| 245 |
+
if any(keyword in col.lower() for keyword in ['season', 'year']):
|
| 246 |
season_col = col
|
| 247 |
break
|
| 248 |
|
| 249 |
+
# If still no season column, look for columns with year-like data
|
| 250 |
+
if season_col is None and len(df) > 0:
|
| 251 |
for col in df.columns:
|
| 252 |
+
if df[col].dtype == 'object':
|
| 253 |
+
try:
|
| 254 |
+
sample_values = df[col].dropna().head(3).astype(str)
|
| 255 |
+
if any(re.match(r'\d{4}[-β]\d{2}', val) for val in sample_values):
|
| 256 |
+
season_col = col
|
| 257 |
+
break
|
| 258 |
+
except:
|
| 259 |
+
continue
|
| 260 |
|
| 261 |
if season_col is None:
|
| 262 |
+
st.warning(f"Could not find season column. Available columns: {df.columns.tolist()}")
|
| 263 |
return pd.DataFrame()
|
| 264 |
|
| 265 |
# Rename season column to standard name
|
| 266 |
if season_col != 'Season':
|
| 267 |
df = df.rename(columns={season_col: 'Season'})
|
| 268 |
|
| 269 |
+
# Clean the data
|
| 270 |
+
df = df[df["Season"].astype(str).str.strip() != "Season"].copy()
|
| 271 |
df = df[df["Season"].notna()].copy()
|
| 272 |
|
| 273 |
+
# Remove any completely empty rows
|
| 274 |
+
df = df.dropna(how='all').copy()
|
| 275 |
+
|
| 276 |
+
if df.empty:
|
| 277 |
+
st.warning("No valid season data found after cleaning")
|
| 278 |
+
return pd.DataFrame()
|
| 279 |
+
|
| 280 |
# Clean season format
|
| 281 |
+
df["Season"] = df["Season"].astype(str).str.strip()
|
| 282 |
df['Season'] = df['Season'].str.replace('-', 'β') # Ensure en-dash for consistency
|
| 283 |
|
| 284 |
+
# Standardize column names to match expected format
|
| 285 |
column_mapping = {
|
| 286 |
'G': 'GP', 'GS': 'GS', 'MP': 'MIN',
|
| 287 |
'FG%': 'FG_PCT', '3P%': 'FG3_PCT', 'FT%': 'FT_PCT',
|
|
|
|
| 298 |
if old_col in df.columns:
|
| 299 |
df = df.rename(columns={old_col: new_col})
|
| 300 |
|
| 301 |
+
# Clean team names if TEAM_ABBREVIATION column exists
|
| 302 |
+
if 'TEAM_ABBREVIATION' in df.columns:
|
| 303 |
+
df['TEAM_ABBREVIATION'] = df['TEAM_ABBREVIATION'].apply(clean_team_name)
|
| 304 |
+
|
| 305 |
# Convert numeric columns
|
| 306 |
non_numeric_cols = {'Season', 'TEAM_ABBREVIATION', 'LEAGUE_ID', 'POSITION', 'Player'}
|
| 307 |
for col in df.columns:
|
|
|
|
| 310 |
|
| 311 |
return df
|
| 312 |
|
| 313 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 314 |
+
@st.cache_data(ttl=300)
def get_player_stats_by_name(player_name):
    """
    Get player stats by searching for the player name in the index.

    Parameters
    ----------
    player_name : str
        Full or partial player name; matched case-insensitively as a
        literal substring (not a regex).

    Returns
    -------
    pd.DataFrame
        The first matching player's career statistics, or an empty
        DataFrame when the index is unavailable or nobody matches.
    """
    # Get player index
    player_index = get_player_index()
    if player_index.empty:
        return pd.DataFrame()

    # Search for player (case insensitive). regex=False: treat the user's
    # input as a literal string so names containing regex metacharacters
    # (e.g. dots or apostrophes in "P.J.", parentheses) cannot break or
    # subvert the match.
    matches = player_index[player_index['name'].str.contains(player_name, case=False, na=False, regex=False)]

    if matches.empty:
        st.warning(f"No player found matching '{player_name}'")
        return pd.DataFrame()

    if len(matches) > 1:
        st.info(f"Multiple players found matching '{player_name}': {matches['name'].tolist()}")
        st.info("Using first match")

    # Get stats for the first match
    player_url = matches.iloc[0]['url']
    return player_season_stats(player_url)
|
| 339 |
+
|
| 340 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 341 |
+
|
| 342 |
+
import requests
|
| 343 |
+
import pandas as pd
|
| 344 |
+
import streamlit as st
|
| 345 |
+
from bs4 import BeautifulSoup
|
| 346 |
+
import re
|
| 347 |
+
import time
|
| 348 |
+
import random
|
| 349 |
+
from urllib.parse import urljoin
|
| 350 |
+
|
| 351 |
+
@st.cache_data(ttl=3600)
def fetch_html(url):
    """Fetch raw HTML for a URL (with error handling and rate limiting)."""
    # Browser-like User-Agent: basketball-reference.com tends to reject
    # requests from default Python clients.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Add random delay to be respectful to basketball-reference.com
        time.sleep(random.uniform(0.5, 1.5))

        response = requests.get(url, timeout=30, headers=request_headers)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as exc:
        st.error(f"Failed to fetch {url}: {exc}")
        return ""
    except Exception as exc:
        st.error(f"An unexpected error occurred while fetching {url}: {exc}")
        return ""
|
| 371 |
+
|
| 372 |
+
def parse_table(html, table_id=None):
    """
    Given raw HTML and optional table_id, locate that <table>,
    handling cases where it's commented out, then parse it with
    pandas.read_html.

    Parameters
    ----------
    html : str
        Raw page HTML (may be empty).
    table_id : str, optional
        The id attribute of the target table. When omitted, the first
        table on the page is used as a fallback.

    Returns
    -------
    pd.DataFrame
        Parsed table, or an empty DataFrame when nothing is found.
    """
    from io import StringIO  # local import: wraps literal HTML for read_html

    if not html:
        return pd.DataFrame()

    soup = BeautifulSoup(html, "html.parser")  # html.parser for better compatibility
    tbl_html = ""

    if table_id:
        # First, try to find the table directly
        tbl = soup.find("table", {"id": table_id})
        if tbl:
            tbl_html = str(tbl)
        else:
            # Basketball-Reference often ships tables inside HTML comments;
            # search the raw text for a commented-out table with this id.
            comment_pattern = re.compile(
                r'<!--.*?<table[^>]*?id=["\']' + re.escape(table_id) + r'["\'][^>]*?>.*?</table>.*?-->',
                re.DOTALL | re.IGNORECASE
            )
            comment_match = comment_pattern.search(html)
            if comment_match:
                # Extract the comment payload, strip the delimiters,
                # and re-parse it as standalone HTML.
                comment_content = comment_match.group(0)
                comment_content = comment_content.replace('<!--', '').replace('-->', '')
                comment_soup = BeautifulSoup(comment_content, 'html.parser')
                tbl = comment_soup.find('table', {'id': table_id})
                if tbl:
                    tbl_html = str(tbl)
    else:
        # fallback: first table on page (only if no table_id specified)
        first = soup.find("table")
        if first:
            tbl_html = str(first)

    if not tbl_html:
        return pd.DataFrame()

    try:
        # Wrap in StringIO: passing a literal HTML string to read_html is
        # deprecated since pandas 2.1. read_html returns a list of
        # DataFrames; we want the first one.
        dfs = pd.read_html(StringIO(tbl_html), header=0)
        if dfs:
            return dfs[0]
        return pd.DataFrame()
    except ValueError as e:
        # No tables found in the provided HTML string
        st.warning(f"No tables found in HTML: {e}")
        return pd.DataFrame()
    except Exception as e:
        st.error(f"Error parsing table with pandas: {e}")
        return pd.DataFrame()
|
| 429 |
+
|
| 430 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 431 |
@st.cache_data(ttl=300)
|
| 432 |
def team_per_game(year):
|
|
|
|
| 510 |
if old_col in df.columns:
|
| 511 |
df = df.rename(columns={old_col: new_col})
|
| 512 |
|
| 513 |
+
# Clean team names
|
| 514 |
+
df['Tm'] = df['Tm'].apply(clean_team_name)
|
| 515 |
+
|
| 516 |
# Convert numeric columns
|
| 517 |
non_numeric_cols = {"Tm", "RANK"}
|
| 518 |
for col in df.columns:
|
|
|
|
| 522 |
return df
|
| 523 |
|
| 524 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 525 |
+
# Additional utility functions for team data processing
|
| 526 |
+
|
| 527 |
+
@st.cache_data(ttl=300)
def team_opponent_stats(year):
    """
    Scrape the league's opponent per-game team stats table from:
    https://www.basketball-reference.com/leagues/NBA_{year}_opp_per_game.html

    Parameters
    ----------
    year : int or str
        Season end year (e.g. 2024 for the 2023-24 season).

    Returns
    -------
    pd.DataFrame
        Cleaned DataFrame with OPP_-prefixed stat columns and a 'Tm'
        team column, or an empty DataFrame on any failure.
    """
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_opp_per_game.html"
    html = fetch_html(url)
    if not html:
        return pd.DataFrame()

    # Try multiple possible table IDs for opponent stats
    possible_table_ids = ["opp-stats-per_game", "opp_per_game", "opponent-stats-per_game"]
    df = pd.DataFrame()

    for table_id in possible_table_ids:
        df = parse_table(html, table_id=table_id)
        if not df.empty:
            break

    if df.empty:
        st.warning(f"Could not find opponent stats table for {year}")
        return pd.DataFrame()

    # Flatten MultiIndex headers. Filter out *any* 'Unnamed' placeholder
    # part (not just the exact 'Unnamed: 0_level_0'), so levels like
    # 'Unnamed: 1_level_0' don't leak into joined names — consistent with
    # the player-stats flattener elsewhere in this file.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [
            '_'.join(
                str(part).strip()
                for part in cols
                if str(part).strip() and 'Unnamed' not in str(part)
            )
            for cols in df.columns.values
        ]

    df.columns = [str(col).strip() for col in df.columns]

    # Find team column
    team_col = None
    for col in df.columns:
        if 'team' in col.lower() or col in ['Team', 'Tm']:
            team_col = col
            break

    if team_col is None:
        return pd.DataFrame()

    if team_col != 'Team':
        df = df.rename(columns={team_col: 'Team'})

    # Drop repeated header rows / missing teams, then standardize to 'Tm'
    df = df[df["Team"].astype(str) != "Team"].copy()
    df = df[df["Team"].notna()].copy()
    df = df.rename(columns={"Team": "Tm"})

    # Apply team name cleaning
    df['Tm'] = df['Tm'].apply(clean_team_name)

    # Same column standardization as regular team stats
    column_mapping = {
        'G': 'OPP_GP', 'MP': 'OPP_MIN',
        'FG%': 'OPP_FG_PCT', '3P%': 'OPP_FG3_PCT', 'FT%': 'OPP_FT_PCT',
        'TRB': 'OPP_REB', 'AST': 'OPP_AST', 'STL': 'OPP_STL', 'BLK': 'OPP_BLK', 'TOV': 'OPP_TO',
        'PF': 'OPP_PF', 'PTS': 'OPP_PTS',
        'FG': 'OPP_FGM', 'FGA': 'OPP_FGA', '3P': 'OPP_FG3M', '3PA': 'OPP_FG3A',
        '2P': 'OPP_FGM2', '2PA': 'OPP_FGA2', '2P%': 'OPP_FG2_PCT', 'eFG%': 'OPP_EFG_PCT',
        'FT': 'OPP_FTM', 'FTA': 'OPP_FTA', 'ORB': 'OPP_OREB', 'DRB': 'OPP_DREB'
    }

    for old_col, new_col in column_mapping.items():
        if old_col in df.columns:
            df = df.rename(columns={old_col: new_col})

    # Convert numeric columns
    non_numeric_cols = {"Tm"}
    for col in df.columns:
        if col not in non_numeric_cols:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    return df
|
| 601 |
+
|
| 602 |
+
@st.cache_data(ttl=300)
def team_standings(year):
    """
    Scrape team standings from Basketball Reference.

    Combines the Eastern and Western conference tables into one
    DataFrame with an added 'Conference' column; returns an empty
    DataFrame when the page or tables cannot be found.
    """
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_standings.html"
    html = fetch_html(url)
    if not html:
        return pd.DataFrame()

    soup = BeautifulSoup(html, "html.parser")
    frames = []

    # Standings are usually split into per-conference tables with ids
    # 'standings_E' and 'standings_W'.
    conference_labels = {'E': 'Eastern', 'W': 'Western'}
    for conf_id, conf_name in conference_labels.items():
        table = soup.find("table", {"id": f"standings_{conf_id}"})
        if table is None:
            continue
        conf_df = parse_table(str(table))
        if conf_df.empty:
            continue
        conf_df['Conference'] = conf_name
        frames.append(conf_df)

    if not frames:
        return pd.DataFrame()

    # Combine conference standings into a single frame
    combined = pd.concat(frames, ignore_index=True)

    # Normalize team names when a 'Team' column is present
    if 'Team' in combined.columns:
        combined['Team'] = combined['Team'].apply(clean_team_name)

    return combined
|
| 638 |
|
| 639 |
def validate_dataframe(df, required_columns=None):
|
| 640 |
"""
|
|
|
|
| 660 |
# Remove any asterisks or other symbols
|
| 661 |
team_name = str(team_name).strip().replace('*', '')
|
| 662 |
|
| 663 |
+
# Handle special cases for team name variations
|
| 664 |
team_mapping = {
|
|
|
|
| 665 |
'NOP': 'NO', # New Orleans Pelicans sometimes shown as NOP
|
| 666 |
'PHX': 'PHO', # Phoenix Suns sometimes shown as PHX
|
| 667 |
'BRK': 'BKN', # Brooklyn Nets sometimes shown as BRK
|
| 668 |
+
'CHA': 'CHO', # Charlotte sometimes inconsistent
|
| 669 |
+
'UTA': 'UTH' # Utah Jazz sometimes shown as UTA
|
| 670 |
}
|
| 671 |
|
| 672 |
return team_mapping.get(team_name, team_name)
|
|
|
|
| 687 |
time.sleep(2 ** attempt) # Exponential backoff
|
| 688 |
|
| 689 |
return pd.DataFrame()
|
| 690 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 691 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 692 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 693 |
# Perplexity integration
|
| 694 |
PERP_KEY = os.getenv("PERPLEXITY_API_KEY")
|