rairo committed on
Commit
957d8fd
Β·
verified Β·
1 Parent(s): 80e3608

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +94 -28
src/streamlit_app.py CHANGED
@@ -16,56 +16,122 @@ st.markdown("""
16
  """, unsafe_allow_html=True)
17
 
18
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
19
- # Caching helpers
 
 
 
20
  @st.cache_data(ttl=3600)
21
- def fetch_table(url, idx=0):
 
22
  try:
23
  resp = requests.get(url, timeout=20)
24
  resp.raise_for_status()
25
- dfs = pd.read_html(resp.text)
26
- return dfs[idx]
27
  except Exception as e:
28
  st.error(f"Failed to fetch {url}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  return pd.DataFrame()
30
 
31
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
32
- # Basketball-Reference scrapers
33
-
34
  @st.cache_data(ttl=3600)
35
  def get_player_index():
 
 
 
 
36
  base = "https://www.basketball-reference.com/players/"
37
- rows = []
 
38
  for letter in map(chr, range(ord('a'), ord('z')+1)):
39
- df = fetch_table(f"{base}{letter}/")
40
- if df.empty: continue
41
- for _, r in df.iterrows():
42
- raw = r['Player']
43
- href = raw.split('href="')[1].split('"')[0]
44
- name = raw.split('>')[1].split('<')[0]
45
- rows.append({'name': name, 'url': f"https://www.basketball-reference.com{href}"})
46
- return pd.DataFrame(rows)
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
 
48
  @st.cache_data(ttl=300)
49
  def player_season_stats(bbr_url):
50
- df = fetch_table(bbr_url, 0)
51
- if 'Season' not in df.columns:
 
 
 
 
 
52
  return pd.DataFrame()
53
- df = df[df['Season']!='Season']
54
- df['Season'] = df['Season'].astype(str)
55
- nonnum = ['Season','Tm','Lg','Pos']
56
- for c in df.columns.difference(nonnum):
57
- df[c] = pd.to_numeric(df[c], errors='coerce')
 
 
 
 
58
  return df
59
 
 
60
  @st.cache_data(ttl=300)
61
  def team_per_game(year):
 
 
 
 
 
62
  url = f"https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"
63
- df = fetch_table(url)
64
- if df.empty: return df
65
- df = df[df['Player']!='Player']
66
- df.rename(columns={'Team':'Tm'}, inplace=True)
67
- for c in df.columns.difference(['Player','Pos','Tm']):
68
- df[c] = pd.to_numeric(df[c], errors='coerce')
 
 
 
 
 
 
 
 
69
  return df
70
 
71
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
 
16
  """, unsafe_allow_html=True)
17
 
18
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
19
+ import requests
20
+ from bs4 import BeautifulSoup
21
+
22
+ # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
23
@st.cache_data(ttl=3600)
def fetch_html(url):
    """Download a page and return its raw HTML text.

    On any failure (connection error, 20s timeout, non-2xx status) a
    Streamlit error banner is shown and an empty string is returned so
    callers can degrade gracefully instead of crashing.
    """
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()
    except Exception as exc:
        # Broad catch is deliberate: this is a UI boundary — surface the
        # problem to the user and hand back a harmless sentinel.
        st.error(f"Failed to fetch {url}: {exc}")
        return ""
    return response.text
33
+
34
def parse_table(html, table_id=None):
    """
    Extract a single <table> from raw HTML and parse it into a DataFrame.

    Parameters
    ----------
    html : str
        Raw page HTML (may be "" when the upstream fetch failed).
    table_id : str, optional
        The ``id`` attribute of the target <table>. When omitted, the
        first table on the page is used as a fallback.

    Returns
    -------
    pandas.DataFrame
        The parsed table, or an empty DataFrame when the table is
        missing or cannot be parsed.
    """
    from io import StringIO  # local import: keeps the module import block untouched

    soup = BeautifulSoup(html, "html.parser")
    # Collapse the previous duplicated if/else: one lookup expression.
    tbl = soup.find("table", {"id": table_id}) if table_id else soup.find("table")
    if tbl is None:
        return pd.DataFrame()

    try:
        # Wrap in StringIO: passing a literal HTML string to read_html is
        # deprecated since pandas 2.1 and will raise in a future release.
        return pd.read_html(StringIO(str(tbl)))[0]
    except ValueError:
        # read_html raises ValueError when no parseable table is found.
        return pd.DataFrame()
56
 
57
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
 
 
58
@st.cache_data(ttl=3600)
def get_player_index():
    """
    Scrape the master list of players from BBR (players/a -> players/z).

    Returns
    -------
    pandas.DataFrame
        Columns ['name', 'url'], one row per player link found.
    """
    base = "https://www.basketball-reference.com/players/"
    records = []

    for letter in map(chr, range(ord('a'), ord('z') + 1)):
        html = fetch_html(f"{base}{letter}/")
        if not html:
            # Fetch failed — fetch_html already surfaced the error.
            continue

        # Parse once with BeautifulSoup. (Previously the same HTML was also
        # run through pandas via parse_table just to test emptiness, doubling
        # the parse cost; an absent table simply selects zero rows here.)
        soup = BeautifulSoup(html, "html.parser")
        # each row: <th data-stat="player"><a href="/players/x/xxxxx.html">Name</a></th>
        for row in soup.select("table#players tbody tr"):
            th = row.find("th", {"data-stat": "player"})
            a = th.find("a", href=True) if th else None
            if not a:
                continue
            records.append({
                "name": a.text.strip(),
                "url": f"https://www.basketball-reference.com{a['href'].strip()}",
            })

    return pd.DataFrame(records)
89
 
90
+ # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
91
@st.cache_data(ttl=300)
def player_season_stats(bbr_url):
    """
    Fetch a player's per-season table (id="per_game") from their BBR page.

    Returns a cleaned DataFrame: repeated header rows dropped, 'Season'
    forced to str, every non-label column coerced to numeric. Returns an
    empty DataFrame when the table is missing or malformed.
    """
    table = parse_table(fetch_html(bbr_url), table_id="per_game")
    if table.empty or "Season" not in table.columns:
        return pd.DataFrame()

    # Mid-table header repeats appear as rows whose Season cell is "Season".
    table = table[table["Season"] != "Season"].copy()
    table["Season"] = table["Season"].astype(str)

    # Everything outside the label columns holds numeric stats.
    label_cols = ["Season", "Tm", "Lg", "Pos"]
    for col in table.columns.difference(label_cols):
        table[col] = pd.to_numeric(table[col], errors="coerce")

    return table
111
 
112
+ # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
113
@st.cache_data(ttl=300)
def team_per_game(year):
    """
    Fetch the league-wide per-game team stats table for one season from
    https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html
    (table id "per_game-team") and return it cleaned.

    Returns an empty DataFrame when the page or table is unavailable.
    """
    page = fetch_html(
        f"https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"
    )
    stats = parse_table(page, table_id="per_game-team")
    if stats.empty or "Team" not in stats.columns:
        return pd.DataFrame()

    # Drop repeated header rows, then normalize the team column name.
    stats = stats[stats["Team"] != "Team"].copy()
    stats = stats.rename(columns={"Team": "Tm"})

    # Coerce every non-label column to numeric in one pass.
    label_cols = {"Player", "Pos", "Tm"}
    numeric_cols = [c for c in stats.columns if c not in label_cols]
    for col in numeric_cols:
        stats[col] = pd.to_numeric(stats[col], errors="coerce")

    return stats
136
 
137
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”