rairo committed on
Commit
957d8fd
Β·
verified Β·
1 Parent(s): 80e3608

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +94 -28
src/streamlit_app.py CHANGED
@@ -16,56 +16,122 @@ st.markdown("""
16
  """, unsafe_allow_html=True)
17
 
18
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
19
- # Caching helpers
 
 
 
20
  @st.cache_data(ttl=3600)
21
- def fetch_table(url, idx=0):
 
22
  try:
23
  resp = requests.get(url, timeout=20)
24
  resp.raise_for_status()
25
- dfs = pd.read_html(resp.text)
26
- return dfs[idx]
27
  except Exception as e:
28
  st.error(f"Failed to fetch {url}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  return pd.DataFrame()
30
 
31
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
32
- # Basketball-Reference scrapers
33
-
34
  @st.cache_data(ttl=3600)
35
  def get_player_index():
 
 
 
 
36
  base = "https://www.basketball-reference.com/players/"
37
- rows = []
 
38
  for letter in map(chr, range(ord('a'), ord('z')+1)):
39
- df = fetch_table(f"{base}{letter}/")
40
- if df.empty: continue
41
- for _, r in df.iterrows():
42
- raw = r['Player']
43
- href = raw.split('href="')[1].split('"')[0]
44
- name = raw.split('>')[1].split('<')[0]
45
- rows.append({'name': name, 'url': f"https://www.basketball-reference.com{href}"})
46
- return pd.DataFrame(rows)
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
 
48
  @st.cache_data(ttl=300)
49
  def player_season_stats(bbr_url):
50
- df = fetch_table(bbr_url, 0)
51
- if 'Season' not in df.columns:
 
 
 
 
 
52
  return pd.DataFrame()
53
- df = df[df['Season']!='Season']
54
- df['Season'] = df['Season'].astype(str)
55
- nonnum = ['Season','Tm','Lg','Pos']
56
- for c in df.columns.difference(nonnum):
57
- df[c] = pd.to_numeric(df[c], errors='coerce')
 
 
 
 
58
  return df
59
 
 
60
  @st.cache_data(ttl=300)
61
  def team_per_game(year):
 
 
 
 
 
62
  url = f"https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"
63
- df = fetch_table(url)
64
- if df.empty: return df
65
- df = df[df['Player']!='Player']
66
- df.rename(columns={'Team':'Tm'}, inplace=True)
67
- for c in df.columns.difference(['Player','Pos','Tm']):
68
- df[c] = pd.to_numeric(df[c], errors='coerce')
 
 
 
 
 
 
 
 
69
  return df
70
 
71
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
 
16
  """, unsafe_allow_html=True)
17
 
18
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
19
+ import requests
20
+ from bs4 import BeautifulSoup
21
+
22
+ # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
23
@st.cache_data(ttl=3600)
def fetch_html(url):
    """Download a page and return its raw HTML text.

    On any failure (connection error, 20s timeout, non-2xx status) a
    Streamlit error banner is shown and an empty string is returned so
    callers can degrade gracefully instead of crashing.
    """
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()
    except Exception as exc:
        # Broad catch is deliberate: this is a UI boundary — surface the
        # problem to the user and hand back a harmless sentinel.
        st.error(f"Failed to fetch {url}: {exc}")
        return ""
    return response.text
33
+
34
def parse_table(html, table_id=None):
    """
    Extract a single <table> from raw HTML and parse it into a DataFrame.

    Parameters
    ----------
    html : str
        Raw page HTML (may be "" when the upstream fetch failed).
    table_id : str, optional
        The ``id`` attribute of the target <table>. When omitted, the
        first table on the page is used as a fallback.

    Returns
    -------
    pandas.DataFrame
        The parsed table, or an empty DataFrame when the table is
        missing or cannot be parsed.
    """
    from io import StringIO  # local import: keeps the module import block untouched

    soup = BeautifulSoup(html, "html.parser")
    # Collapse the previous duplicated if/else: one lookup expression.
    tbl = soup.find("table", {"id": table_id}) if table_id else soup.find("table")
    if tbl is None:
        return pd.DataFrame()

    try:
        # Wrap in StringIO: passing a literal HTML string to read_html is
        # deprecated since pandas 2.1 and will raise in a future release.
        return pd.read_html(StringIO(str(tbl)))[0]
    except ValueError:
        # read_html raises ValueError when no parseable table is found.
        return pd.DataFrame()
56
 
57
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
 
 
58
@st.cache_data(ttl=3600)
def get_player_index():
    """
    Scrape the master list of players from BBR (players/a -> players/z).

    Returns
    -------
    pandas.DataFrame
        Columns ['name', 'url'], one row per player link found.
    """
    base = "https://www.basketball-reference.com/players/"
    records = []

    for letter in map(chr, range(ord('a'), ord('z') + 1)):
        html = fetch_html(f"{base}{letter}/")
        if not html:
            # Fetch failed — fetch_html already surfaced the error.
            continue

        # Parse once with BeautifulSoup. (Previously the same HTML was also
        # run through pandas via parse_table just to test emptiness, doubling
        # the parse cost; an absent table simply selects zero rows here.)
        soup = BeautifulSoup(html, "html.parser")
        # each row: <th data-stat="player"><a href="/players/x/xxxxx.html">Name</a></th>
        for row in soup.select("table#players tbody tr"):
            th = row.find("th", {"data-stat": "player"})
            a = th.find("a", href=True) if th else None
            if not a:
                continue
            records.append({
                "name": a.text.strip(),
                "url": f"https://www.basketball-reference.com{a['href'].strip()}",
            })

    return pd.DataFrame(records)
89
 
90
+ # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
91
@st.cache_data(ttl=300)
def player_season_stats(bbr_url):
    """
    Fetch a player's per-season table (id="per_game") from their BBR page.

    Returns a cleaned DataFrame: repeated header rows dropped, 'Season'
    forced to str, every non-label column coerced to numeric. Returns an
    empty DataFrame when the table is missing or malformed.
    """
    table = parse_table(fetch_html(bbr_url), table_id="per_game")
    if table.empty or "Season" not in table.columns:
        return pd.DataFrame()

    # Mid-table header repeats appear as rows whose Season cell is "Season".
    table = table[table["Season"] != "Season"].copy()
    table["Season"] = table["Season"].astype(str)

    # Everything outside the label columns holds numeric stats.
    label_cols = ["Season", "Tm", "Lg", "Pos"]
    for col in table.columns.difference(label_cols):
        table[col] = pd.to_numeric(table[col], errors="coerce")

    return table
111
 
112
+ # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
113
@st.cache_data(ttl=300)
def team_per_game(year):
    """
    Fetch the league-wide per-game team stats table for one season from
    https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html
    (table id "per_game-team") and return it cleaned.

    Returns an empty DataFrame when the page or table is unavailable.
    """
    page = fetch_html(
        f"https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"
    )
    stats = parse_table(page, table_id="per_game-team")
    if stats.empty or "Team" not in stats.columns:
        return pd.DataFrame()

    # Drop repeated header rows, then normalize the team column name.
    stats = stats[stats["Team"] != "Team"].copy()
    stats = stats.rename(columns={"Team": "Tm"})

    # Coerce every non-label column to numeric in one pass.
    label_cols = {"Player", "Pos", "Tm"}
    numeric_cols = [c for c in stats.columns if c not in label_cols]
    for col in numeric_cols:
        stats[col] = pd.to_numeric(stats[col], errors="coerce")

    return stats
136
 
137
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”