Spaces:

rairo
/

NBA-Test

Sleeping

App Files Files Community

rairo committed on May 26, 2025

Commit

3016fda

verified ·

1 Parent(s): 0cae82f

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +202 -47

src/streamlit_app.py CHANGED Viewed

@@ -53,11 +53,30 @@ if 'chat_history' not in st.session_state:
 # Basketball-Reference Data Fetching Utilities
 # —————————————————————————————————————————————————————————————————————————————
 @st.cache_data(ttl=3600)
 def fetch_html(url):
-    """Fetch raw HTML for a URL (with error handling)."""
     try:
-        resp = requests.get(url, timeout=20)
         resp.raise_for_status()
         return resp.text
     except requests.exceptions.RequestException as e:
@@ -72,7 +91,10 @@ def parse_table(html, table_id=None):
     Given raw HTML and optional table_id, locate that <table>,
     handling cases where it's commented out, then parse it with pandas.read_html.
     """
-    soup = BeautifulSoup(html, "lxml") # Using lxml for potentially faster parsing
     tbl_html = ""
     if table_id:
@@ -83,7 +105,10 @@ def parse_table(html, table_id=None):
         else:
             # If not found directly, search for it within HTML comments
             # Basketball-Reference often comments out tables
-            comment_pattern = re.compile(r'<!--.*?<table[^>]*id="%s"[^>]*>.*?</table>.*?-->' % table_id, re.DOTALL)
             comment_match = comment_pattern.search(html)
             if comment_match:
                 # Extract the content of the comment
@@ -91,7 +116,7 @@ def parse_table(html, table_id=None):
                 # Remove the comment tags
                 comment_content = comment_content.replace('<!--', '').replace('-->', '')
                 # Parse the comment content as new HTML
-                comment_soup = BeautifulSoup(comment_content, 'lxml')
                 tbl = comment_soup.find('table', {'id': table_id})
                 if tbl:
                     tbl_html = str(tbl)
@@ -106,8 +131,14 @@ def parse_table(html, table_id=None):
     try:
         # pd.read_html returns a list of DataFrames, we want the first one
-        return pd.read_html(tbl_html)[0]
-    except ValueError: # No tables found in the provided HTML string
         return pd.DataFrame()
     except Exception as e:
         st.error(f"Error parsing table with pandas: {e}")
@@ -129,13 +160,16 @@ def get_player_index():
         if not html:
             continue
-        soup = BeautifulSoup(html, "lxml")
         # The players table is usually directly available, not commented out.
         table = soup.find("table", {"id": "players"})
         if not table:
             continue
-        for row in table.select("tbody tr"):
             th = row.find("th", {"data-stat": "player"})
             if not th:
                 continue
@@ -144,7 +178,7 @@ def get_player_index():
                 continue
             name = a.text.strip()
             href = a["href"].strip()
-            full_url = f"https://www.basketball-reference.com{href}"
             records.append({"name": name, "url": full_url})
     return pd.DataFrame(records)
@@ -153,7 +187,7 @@ def get_player_index():
 @st.cache_data(ttl=300)
 def player_season_stats(bbr_url):
     """
-    Scrapes a player’s per‑season table (id="per_game") from their BBR page.
     Returns cleaned DataFrame.
     """
     html = fetch_html(bbr_url)
@@ -161,29 +195,52 @@ def player_season_stats(bbr_url):
         return pd.DataFrame()
     df = parse_table(html, table_id="per_game")
-    if df.empty: # Check if df is empty first
         return pd.DataFrame()
-    # Flatten multi-index columns if they exist (common with pd.read_html)
     if isinstance(df.columns, pd.MultiIndex):
-        df.columns = ['_'.join(col).strip() for col in df.columns.values]
-    else:
-        df.columns = [col.strip() for col in df.columns.values]
-    # Now check for 'Season' column after flattening
-    if "Season" not in df.columns:
-        # This is the critical point. If 'Season' is still not found,
-        # it means the table either doesn't exist or has a completely different structure.
-        # st.warning(f"Could not find 'Season' column in the parsed table from {bbr_url}. Columns found: {df.columns.tolist()}")
         return pd.DataFrame()
-    # drop repeated header rows (e.g., rows where 'Season' is literally 'Season')
-    df = df[df["Season"] != "Season"].copy()
-    df["Season"] = df["Season"].astype(str) # Ensure season is string
-    df['Season'] = df['Season'].str.replace('-', '–') # Ensure en-dash for consistency
     # Standardize column names to match previous nba_api output expectations
-    df = df.rename(columns={
         'G': 'GP', 'GS': 'GS', 'MP': 'MIN',
         'FG%': 'FG_PCT', '3P%': 'FG3_PCT', 'FT%': 'FT_PCT',
         'TRB': 'REB', 'AST': 'AST', 'STL': 'STL', 'BLK': 'BLK', 'TOV': 'TO',
@@ -192,10 +249,14 @@ def player_season_stats(bbr_url):
         'FG': 'FGM', 'FGA': 'FGA', '3P': 'FG3M', '3PA': 'FG3A',
         '2P': 'FGM2', '2PA': 'FGA2', '2P%': 'FG2_PCT', 'eFG%': 'EFG_PCT',
         'FT': 'FTM', 'FTA': 'FTA', 'ORB': 'OREB', 'DRB': 'DREB'
-    })
-    # Coerce all numeric columns
-    # Exclude columns that are definitely not numeric or are identifiers
     non_numeric_cols = {'Season', 'TEAM_ABBREVIATION', 'LEAGUE_ID', 'POSITION', 'Player'}
     for col in df.columns:
         if col not in non_numeric_cols:
@@ -207,7 +268,7 @@ def player_season_stats(bbr_url):
 @st.cache_data(ttl=300)
 def team_per_game(year):
     """
-    Scrapes the league’s per‑game team stats table from:
       https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html
     Returns cleaned DataFrame.
     """
@@ -216,26 +277,61 @@ def team_per_game(year):
     if not html:
         return pd.DataFrame()
-    df = parse_table(html, table_id="per_game-team") # Correct table ID for team stats
-    if df.empty: # Check if df is empty first
         return pd.DataFrame()
-    # Flatten multi-index columns if they exist
     if isinstance(df.columns, pd.MultiIndex):
-        df.columns = ['_'.join(col).strip() for col in df.columns.values]
-    else:
-        df.columns = [col.strip() for col in df.columns.values]
-    # Now check for 'Team' column after flattening
-    if "Team" not in df.columns:
         return pd.DataFrame()
-    # drop repeated headers & rename
-    df = df[df["Team"] != "Team"].copy()
-    df.rename(columns={"Team": "Tm"}, inplace=True)
     # Standardize column names
-    df = df.rename(columns={
         'G': 'GP', 'MP': 'MIN',
         'FG%': 'FG_PCT', '3P%': 'FG3_PCT', 'FT%': 'FT_PCT',
         'TRB': 'REB', 'AST': 'AST', 'STL': 'STL', 'BLK': 'BLK', 'TOV': 'TO',
@@ -244,9 +340,14 @@ def team_per_game(year):
         'FG': 'FGM', 'FGA': 'FGA', '3P': 'FG3M', '3PA': 'FG3A',
         '2P': 'FGM2', '2PA': 'FGA2', '2P%': 'FG2_PCT', 'eFG%': 'EFG_PCT',
         'FT': 'FTM', 'FTA': 'FTA', 'ORB': 'OREB', 'DRB': 'DREB'
-    })
-    # coerce numeric columns
     non_numeric_cols = {"Tm", "RANK"}
     for col in df.columns:
         if col not in non_numeric_cols:
@@ -254,6 +355,60 @@ def team_per_game(year):
     return df
 # —————————————————————————————————————————————————————————————————————————————
 # Perplexity integration
 PERP_KEY = os.getenv("PERPLEXITY_API_KEY")
@@ -265,7 +420,7 @@ def ask_perp(prompt, system="You are a helpful NBA analyst AI.", max_tokens=500,
         return ""
     hdr = {'Authorization':f'Bearer {PERP_KEY}','Content-Type':'application/json'}
     payload = {
-      "model":"sonar-medium-online", # Changed to a commonly available online model
       "messages":[{"role":"system","content":system},{"role":"user","content":prompt}],
       "max_tokens":max_tokens, "temperature":temp
     }

 # Basketball-Reference Data Fetching Utilities
 # —————————————————————————————————————————————————————————————————————————————
+# Basketball-Reference Data Fetching Utilities
+# —————————————————————————————————————————————————————————————————————————————
+import requests
+import pandas as pd
+import streamlit as st
+from bs4 import BeautifulSoup
+import re
+import time
+import random
+from urllib.parse import urljoin
 @st.cache_data(ttl=3600)
 def fetch_html(url):
+    """Fetch raw HTML for a URL (with error handling and rate limiting)."""
     try:
+        # Add random delay to be respectful to basketball-reference.com
+        time.sleep(random.uniform(0.5, 1.5))
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        resp = requests.get(url, timeout=30, headers=headers)
         resp.raise_for_status()
         return resp.text
     except requests.exceptions.RequestException as e:
     Given raw HTML and optional table_id, locate that <table>,
     handling cases where it's commented out, then parse it with pandas.read_html.
     """
+    if not html:
+        return pd.DataFrame()
+    soup = BeautifulSoup(html, "html.parser")  # Changed from lxml to html.parser for better compatibility
     tbl_html = ""
     if table_id:
         else:
             # If not found directly, search for it within HTML comments
             # Basketball-Reference often comments out tables
+            comment_pattern = re.compile(
+                r'<!--.*?<table[^>]*?id=["\']' + re.escape(table_id) + r'["\'][^>]*?>.*?</table>.*?-->',
+                re.DOTALL | re.IGNORECASE
+            )
             comment_match = comment_pattern.search(html)
             if comment_match:
                 # Extract the content of the comment
                 # Remove the comment tags
                 comment_content = comment_content.replace('<!--', '').replace('-->', '')
                 # Parse the comment content as new HTML
+                comment_soup = BeautifulSoup(comment_content, 'html.parser')
                 tbl = comment_soup.find('table', {'id': table_id})
                 if tbl:
                     tbl_html = str(tbl)
     try:
         # pd.read_html returns a list of DataFrames, we want the first one
+        dfs = pd.read_html(tbl_html, header=0)
+        if dfs:
+            return dfs[0]
+        else:
+            return pd.DataFrame()
+    except ValueError as e:
+        # No tables found in the provided HTML string
+        st.warning(f"No tables found in HTML: {e}")
         return pd.DataFrame()
     except Exception as e:
         st.error(f"Error parsing table with pandas: {e}")
         if not html:
             continue
+        soup = BeautifulSoup(html, "html.parser")
         # The players table is usually directly available, not commented out.
         table = soup.find("table", {"id": "players"})
         if not table:
             continue
+        # Look for both tbody and direct tr children
+        rows = table.select("tbody tr") if table.select("tbody tr") else table.select("tr")
+        for row in rows:
             th = row.find("th", {"data-stat": "player"})
             if not th:
                 continue
                 continue
             name = a.text.strip()
             href = a["href"].strip()
+            full_url = urljoin("https://www.basketball-reference.com", href)
             records.append({"name": name, "url": full_url})
     return pd.DataFrame(records)
 @st.cache_data(ttl=300)
 def player_season_stats(bbr_url):
     """
+    Scrapes a player's per‑season table (id="per_game") from their BBR page.
     Returns cleaned DataFrame.
     """
     html = fetch_html(bbr_url)
         return pd.DataFrame()
     df = parse_table(html, table_id="per_game")
+    if df.empty:
         return pd.DataFrame()
+    # Handle potential MultiIndex columns
     if isinstance(df.columns, pd.MultiIndex):
+        # Flatten MultiIndex columns
+        df.columns = ['_'.join(str(col).strip() for col in cols if str(col).strip() and str(col).strip() != 'Unnamed: 0_level_0')
+                     for cols in df.columns.values]
+    # Clean column names
+    df.columns = [str(col).strip() for col in df.columns]
+    # Find season column (could be 'Season' or similar)
+    season_col = None
+    for col in df.columns:
+        if 'season' in col.lower() or col == 'Season':
+            season_col = col
+            break
+    if season_col is None:
+        # Try to find it by looking for columns with year patterns
+        for col in df.columns:
+            if df[col].dtype == 'object' and not df[col].isna().all():
+                sample_val = str(df[col].iloc[0]) if len(df) > 0 else ""
+                if re.match(r'\d{4}-\d{2}', sample_val):
+                    season_col = col
+                    break
+    if season_col is None:
+        st.warning(f"Could not find season column in player stats. Available columns: {df.columns.tolist()}")
         return pd.DataFrame()
+    # Rename season column to standard name
+    if season_col != 'Season':
+        df = df.rename(columns={season_col: 'Season'})
+    # Remove header rows that might have been included
+    df = df[df["Season"].astype(str) != "Season"].copy()
+    df = df[df["Season"].notna()].copy()
+    # Clean season format
+    df["Season"] = df["Season"].astype(str)
+    df['Season'] = df['Season'].str.replace('-', '–')  # Ensure en-dash for consistency
     # Standardize column names to match previous nba_api output expectations
+    column_mapping = {
         'G': 'GP', 'GS': 'GS', 'MP': 'MIN',
         'FG%': 'FG_PCT', '3P%': 'FG3_PCT', 'FT%': 'FT_PCT',
         'TRB': 'REB', 'AST': 'AST', 'STL': 'STL', 'BLK': 'BLK', 'TOV': 'TO',
         'FG': 'FGM', 'FGA': 'FGA', '3P': 'FG3M', '3PA': 'FG3A',
         '2P': 'FGM2', '2PA': 'FGA2', '2P%': 'FG2_PCT', 'eFG%': 'EFG_PCT',
         'FT': 'FTM', 'FTA': 'FTA', 'ORB': 'OREB', 'DRB': 'DREB'
+    }
+    # Apply column mapping only for columns that exist
+    for old_col, new_col in column_mapping.items():
+        if old_col in df.columns:
+            df = df.rename(columns={old_col: new_col})
+    # Convert numeric columns
     non_numeric_cols = {'Season', 'TEAM_ABBREVIATION', 'LEAGUE_ID', 'POSITION', 'Player'}
     for col in df.columns:
         if col not in non_numeric_cols:
 @st.cache_data(ttl=300)
 def team_per_game(year):
     """
+    Scrapes the league's per‑game team stats table from:
       https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html
     Returns cleaned DataFrame.
     """
     if not html:
         return pd.DataFrame()
+    # Try multiple possible table IDs for team stats
+    possible_table_ids = ["per_game-team", "per_game_team", "team-stats-per_game", "teams_per_game"]
+    df = pd.DataFrame()
+    for table_id in possible_table_ids:
+        df = parse_table(html, table_id=table_id)
+        if not df.empty:
+            break
+    # If no specific table found, try to find any table with team data
+    if df.empty:
+        soup = BeautifulSoup(html, "html.parser")
+        tables = soup.find_all("table")
+        for table in tables:
+            if table.find("th", string=lambda text: text and "team" in text.lower()):
+                df = parse_table(str(table))
+                if not df.empty:
+                    break
+    if df.empty:
+        st.warning(f"Could not find team stats table for {year}")
         return pd.DataFrame()
+    # Handle potential MultiIndex columns
     if isinstance(df.columns, pd.MultiIndex):
+        df.columns = ['_'.join(str(col).strip() for col in cols if str(col).strip() and str(col).strip() != 'Unnamed: 0_level_0')
+                     for cols in df.columns.values]
+    # Clean column names
+    df.columns = [str(col).strip() for col in df.columns]
+    # Find team column
+    team_col = None
+    for col in df.columns:
+        if 'team' in col.lower() or col in ['Team', 'Tm']:
+            team_col = col
+            break
+    if team_col is None:
+        st.warning(f"Could not find team column in team stats. Available columns: {df.columns.tolist()}")
         return pd.DataFrame()
+    # Rename team column to standard name
+    if team_col != 'Team':
+        df = df.rename(columns={team_col: 'Team'})
+    # Remove header rows
+    df = df[df["Team"].astype(str) != "Team"].copy()
+    df = df[df["Team"].notna()].copy()
+    # Rename Team to Tm for consistency
+    df = df.rename(columns={"Team": "Tm"})
     # Standardize column names
+    column_mapping = {
         'G': 'GP', 'MP': 'MIN',
         'FG%': 'FG_PCT', '3P%': 'FG3_PCT', 'FT%': 'FT_PCT',
         'TRB': 'REB', 'AST': 'AST', 'STL': 'STL', 'BLK': 'BLK', 'TOV': 'TO',
         'FG': 'FGM', 'FGA': 'FGA', '3P': 'FG3M', '3PA': 'FG3A',
         '2P': 'FGM2', '2PA': 'FGA2', '2P%': 'FG2_PCT', 'eFG%': 'EFG_PCT',
         'FT': 'FTM', 'FTA': 'FTA', 'ORB': 'OREB', 'DRB': 'DREB'
+    }
+    # Apply column mapping only for columns that exist
+    for old_col, new_col in column_mapping.items():
+        if old_col in df.columns:
+            df = df.rename(columns={old_col: new_col})
+    # Convert numeric columns
     non_numeric_cols = {"Tm", "RANK"}
     for col in df.columns:
         if col not in non_numeric_cols:
     return df
+# —————————————————————————————————————————————————————————————————————————————
+# Additional utility functions for better error handling and data validation
def validate_dataframe(df, required_columns=None):
    """
    Validate that a DataFrame has the expected structure and data.

    Args:
        df: The DataFrame to check. ``None`` is accepted and reported as
            invalid rather than raising.
        required_columns: Optional collection of column names that must all
            be present in ``df.columns``. Skipped when falsy.

    Returns:
        A ``(is_valid, message)`` tuple. ``is_valid`` is False when ``df`` is
        None, when it is empty, or when any required column is absent;
        ``message`` describes the first problem found.
    """
    # Guard against None so callers always get the (bool, message) tuple
    # instead of an AttributeError from ``None.empty``.
    if df is None:
        return False, "DataFrame is None"
    if df.empty:
        return False, "DataFrame is empty"
    if required_columns:
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            return False, f"Missing required columns: {missing_cols}"
    return True, "DataFrame is valid"
def clean_team_name(team_name):
    """
    Clean and standardize team names from Basketball Reference.

    Args:
        team_name: Raw team name or abbreviation. Missing values (anything
            ``pd.isna`` reports as NA) are returned unchanged.

    Returns:
        The name converted to ``str`` with asterisks removed and surrounding
        whitespace trimmed, then remapped through the abbreviation table
        below when it matches one of its keys.
    """
    if pd.isna(team_name):
        return team_name
    # Remove asterisks BEFORE trimming: stripping first would leave behind the
    # whitespace that preceded a trailing marker (e.g. "Celtics *"), and a
    # padded abbreviation such as "PHX *" would then miss the mapping below.
    team_name = str(team_name).replace('*', '').strip()
    # Handle special cases
    team_mapping = {
        'TOT': 'TOT',  # Total for players who played for multiple teams
        'NOP': 'NO',   # New Orleans Pelicans sometimes shown as NOP
        'PHX': 'PHO',  # Phoenix Suns sometimes shown as PHX
        'BRK': 'BKN',  # Brooklyn Nets sometimes shown as BRK
    }
    return team_mapping.get(team_name, team_name)
def retry_fetch(func, *args, max_retries=3, **kwargs):
    """
    Retry a function call with exponential backoff.

    An attempt counts as failed when ``func`` raises or when it returns an
    empty DataFrame. Between failed attempts the call sleeps ``2 ** attempt``
    seconds (1s, 2s, 4s, ...); no sleep happens after the final attempt.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        max_retries: Maximum number of attempts (keyword-only).
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        The first result that is not an empty DataFrame. If every attempt
        fails, an empty DataFrame is returned; when the last attempt failed
        with an exception, the error is also reported via ``st.error``.
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            result = func(*args, **kwargs)
        except Exception as e:
            if is_last_attempt:
                st.error(f"Failed after {max_retries} attempts: {e}")
                return pd.DataFrame()
        else:
            if not (isinstance(result, pd.DataFrame) and result.empty):
                return result
        # Back off after ANY failed attempt (exception or empty result) so an
        # empty response does not trigger an immediate burst of re-requests.
        if not is_last_attempt:
            time.sleep(2 ** attempt)  # Exponential backoff
    return pd.DataFrame()
 # —————————————————————————————————————————————————————————————————————————————
 # Perplexity integration
 PERP_KEY = os.getenv("PERPLEXITY_API_KEY")
         return ""
     hdr = {'Authorization':f'Bearer {PERP_KEY}','Content-Type':'application/json'}
     payload = {
+      "model":"sonar-pro", # Changed to a commonly available online model
       "messages":[{"role":"system","content":system},{"role":"user","content":prompt}],
       "max_tokens":max_tokens, "temperature":temp
     }