# Fbref-test / app.py
# rairo's picture
# Create app.py
# 61b713b verified
import re
import time
from datetime import datetime, timezone
from io import StringIO

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
# ---------- Configuration & Constants ----------
# Each supported league: (internal key, FBref competition id, FBref URL slug).
# The display name is the slug with hyphens replaced by spaces.
_FBREF_COMPETITIONS = (
    ('premier_league', 9, 'Premier-League'),
    ('la_liga', 12, 'La-Liga'),
    ('serie_a', 11, 'Serie-A'),
    ('bundesliga', 20, 'Bundesliga'),
    ('ligue_1', 13, 'Ligue-1'),
)
_FBREF_COMPS_ROOT = 'https://fbref.com/en/comps'

# League key -> the three FBref page URLs that get scraped plus a display name.
LEAGUES = {
    league_key: {
        'player_stats_url': f'{_FBREF_COMPS_ROOT}/{comp_id}/stats/{slug}-Stats',
        'squad_stats_url': f'{_FBREF_COMPS_ROOT}/{comp_id}/{slug}-Stats',
        'fixtures_url': f'{_FBREF_COMPS_ROOT}/{comp_id}/schedule/{slug}-Scores-and-Fixtures',
        'name': slug.replace('-', ' '),
    }
    for league_key, comp_id, slug in _FBREF_COMPETITIONS
}

# Headers sent with every scraping request.
SCRAPE_HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

# Endpoint that get_perplexity_response posts chat requests to.
PERPLEXITY_API_URL = 'https://api.perplexity.ai/chat/completions'
# Initialize session state for storing data
# Give every session-state slot the app reads an empty default, leaving any
# value that is already present untouched.
for _state_key, _empty_value in (
    ('player_stats_data', {}),
    ('squad_stats_data', {}),
    ('fixtures_data', {}),
    ('perplexity_api_key', ""),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _empty_value
# ---------- Helper Functions (from Flask app) ----------
def clean_fbref_df_columns(df):
    """Return a copy of ``df`` with flat, identifier-like, unique column names.

    Steps applied to each column label:

    1. If the columns are a ``MultiIndex``, the top level is dropped.
    2. Every character that is neither alphanumeric nor ``%`` becomes ``_``.
    3. ``%`` becomes ``Pct``.
    4. Runs of ``_`` collapse to one and leading/trailing ``_`` are removed.
    5. A label that repeats an earlier one gets a numeric suffix (``Gls``,
       ``Gls_2``, ...). The first occurrence keeps its plain name.

    Step 5 matters because dropping the top header level can leave the same
    sub-header under several groups; with duplicate labels ``df['Gls']``
    returns a DataFrame instead of a Series and numeric conversion fails.

    Args:
        df: DataFrame as returned by ``pandas.read_html``.

    Returns:
        A new DataFrame with the cleaned column names; ``df`` is not modified.
    """
    columns = df.columns
    if isinstance(columns, pd.MultiIndex):
        columns = columns.droplevel(0)

    used_names = set()
    cleaned_names = []
    for col in columns:
        name = "".join(c if c.isalnum() or c == '%' else "_" for c in str(col))
        name = name.replace('%', 'Pct')
        name = re.sub(r'_+', '_', name).strip('_')

        # Disambiguate repeats; keep counting in case "<name>_<n>" is taken too.
        unique_name = name
        occurrence = 1
        while unique_name in used_names:
            occurrence += 1
            unique_name = f"{name}_{occurrence}"
        used_names.add(unique_name)
        cleaned_names.append(unique_name)

    result = df.copy()
    result.columns = cleaned_names
    return result
# ---------- Scraping Functions (modified for Streamlit) ----------
def scrape_player_stats_st(league_keys_to_scrape):
    """Scrape the player standard-stats table for each league into session state.

    For every key, the league's ``player_stats_url`` is fetched, the table with
    id ``stats_standard`` is parsed, repeated header rows are removed, all
    non-text columns are converted to numbers (unparsable values become NaN),
    remaining NaNs are replaced with 0, and the frame is stored in
    ``st.session_state.player_stats_data[key]``. Progress and per-league
    success/error messages are written to the Streamlit page.

    Args:
        league_keys_to_scrape: Iterable of keys present in ``LEAGUES``.

    Returns:
        None. A failure for one league is reported with ``st.error`` and does
        not stop the remaining leagues.
    """
    # Columns that hold text and must not be coerced to numbers.
    text_columns = {'Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Matches'}

    st.write("### Scraping Player Stats...")
    progress_bar = st.progress(0)
    total_leagues = len(league_keys_to_scrape)
    for i, key in enumerate(league_keys_to_scrape):
        url = LEAGUES[key]['player_stats_url']
        st.write(f"Fetching player stats for: {LEAGUES[key]['name']}...")
        try:
            r = requests.get(url, headers=SCRAPE_HEADERS, timeout=30)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')
            table_player_standard = soup.find('table', {'id': 'stats_standard'})
            if table_player_standard:
                # read_html wants a file-like object; literal HTML strings are deprecated.
                df = pd.read_html(StringIO(str(table_player_standard)))[0]
                df = clean_fbref_df_columns(df)
                # The table repeats its header row inside the body; drop those rows.
                df = df[df['Player'].notna() & (df['Player'] != 'Player')]
                df = df[df['Rk'].notna() & (df['Rk'] != 'Rk')]
                for col in df.columns:
                    if col in text_columns:
                        continue
                    try:
                        df[col] = pd.to_numeric(df[col], errors='coerce')
                    except (TypeError, ValueError):
                        # Best effort: when a label is duplicated, df[col] is a
                        # DataFrame and to_numeric rejects it; keep it as scraped.
                        continue
                df = df.fillna(0)
                st.session_state.player_stats_data[key] = df
                st.success(f"Successfully scraped player stats for {LEAGUES[key]['name']}.")
            else:
                st.error(f"Could not find player stats table for {LEAGUES[key]['name']}.")
        except Exception as e:
            st.error(f"Error scraping player stats for {LEAGUES[key]['name']}: {e}")
        # Pause between requests even after a failure, so an error response is
        # not immediately followed by another hit on the site.
        time.sleep(3)
        progress_bar.progress((i + 1) / total_leagues)
    st.write("Player stats scraping complete.")
def _is_standings_table(table_tag):
    """Return True if ``table_tag`` parses to a table with league-standings columns."""
    required = ('Squad', 'MP', 'W', 'D', 'L', 'Pts')
    if table_tag is None:
        return False
    try:
        columns = pd.read_html(StringIO(str(table_tag)))[0].columns
    except ValueError:
        # read_html raises ValueError when it finds no parsable table.
        return False
    if isinstance(columns, pd.MultiIndex):
        columns = columns.droplevel(0)
    return all(col in columns for col in required)


def scrape_squad_stats_st(league_keys_to_scrape):
    """Scrape the league table for each league into session state.

    The league table is located by trying, in order:

    1. a ``<caption>`` containing "table" (but not "squad" or "standard
       stats") whose table has the standings columns;
    2. any table whose id contains ``overall``;
    3. the ``stats_standard`` table, if it has the standings columns.

    The chosen table is cleaned, repeated header rows are dropped, the known
    numeric columns are converted, NaNs are replaced with 0, and the frame is
    stored in ``st.session_state.squad_stats_data[key]``.

    Args:
        league_keys_to_scrape: Iterable of keys present in ``LEAGUES``.

    Returns:
        None. A failure for one league is reported with ``st.error`` and does
        not stop the remaining leagues.
    """
    numeric_cols = ['MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'xG', 'xGA', 'xGD']

    st.write("### Scraping Squad Stats (League Tables)...")
    progress_bar = st.progress(0)
    total_leagues = len(league_keys_to_scrape)
    for i, key in enumerate(league_keys_to_scrape):
        url = LEAGUES[key]['squad_stats_url']
        st.write(f"Fetching squad stats for: {LEAGUES[key]['name']}...")
        try:
            r = requests.get(url, headers=SCRAPE_HEADERS, timeout=30)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')

            league_table = None
            for caption_tag in soup.find_all('caption'):
                caption_text = caption_tag.get_text().lower()
                if ("table" not in caption_text
                        or "squad" in caption_text
                        or "standard stats" in caption_text):
                    continue
                parent_table = caption_tag.find_parent('table')
                if _is_standings_table(parent_table):
                    league_table = parent_table
                    break

            if league_table is None:
                league_table = soup.find('table', id=lambda x: x and 'overall' in x)

            if league_table is None:
                table_squad_standard = soup.find('table', {'id': 'stats_standard'})
                if _is_standings_table(table_squad_standard):
                    league_table = table_squad_standard

            if league_table is not None:
                # read_html wants a file-like object; literal HTML strings are deprecated.
                df = pd.read_html(StringIO(str(league_table)))[0]
                df = clean_fbref_df_columns(df)
                # Drop header rows repeated inside the table body.
                df = df[df['Squad'].notna() & (df['Squad'] != 'Squad')]
                df = df[df['Rk'].notna() & (df['Rk'] != 'Rk')]
                for col in df.columns:
                    if col in numeric_cols:
                        df[col] = pd.to_numeric(df[col], errors='coerce')
                df = df.fillna(0)
                st.session_state.squad_stats_data[key] = df
                st.success(f"Successfully scraped squad stats for {LEAGUES[key]['name']}.")
            else:
                st.error(f"Could not find squad stats table for {LEAGUES[key]['name']}.")
        except Exception as e:
            st.error(f"Error scraping squad stats for {LEAGUES[key]['name']}: {e}")
        # Pause between requests even after a failure, so an error response is
        # not immediately followed by another hit on the site.
        time.sleep(3)
        progress_bar.progress((i + 1) / total_leagues)
    st.write("Squad stats scraping complete.")
def scrape_fixtures_st(league_keys_to_scrape):
    """Scrape the scores-and-fixtures table for each league into session state.

    The fixtures table is the one whose caption contains "scores and
    fixtures", falling back to the first table with class ``stats_table``.
    Rows without a week number or home team (spacer and repeated header rows)
    are dropped. ``HomeGoals`` and ``AwayGoals`` are always added: parsed from
    ``Score`` where a "<home>–<away>" result is present, otherwise missing.
    ``Date`` is normalised to ``YYYY-MM-DD`` strings. The frame is stored in
    ``st.session_state.fixtures_data[key]``.

    Args:
        league_keys_to_scrape: Iterable of keys present in ``LEAGUES``.

    Returns:
        None. A failure for one league is reported with ``st.error`` and does
        not stop the remaining leagues.
    """
    st.write("### Scraping Fixtures...")
    progress_bar = st.progress(0)
    total_leagues = len(league_keys_to_scrape)
    for i, key in enumerate(league_keys_to_scrape):
        url = LEAGUES[key]['fixtures_url']
        st.write(f"Fetching fixtures for: {LEAGUES[key]['name']}...")
        try:
            r = requests.get(url, headers=SCRAPE_HEADERS, timeout=30)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')

            fixture_table = None
            for caption_tag in soup.find_all('caption'):
                if "scores and fixtures" in caption_tag.get_text().lower():
                    fixture_table = caption_tag.find_parent('table')
                    if fixture_table:
                        break
            if not fixture_table:
                potential_tables = soup.find_all('table', class_="stats_table")
                if potential_tables:
                    fixture_table = potential_tables[0]

            if fixture_table:
                # read_html wants a file-like object; literal HTML strings are deprecated.
                df = pd.read_html(StringIO(str(fixture_table)))[0]
                df = clean_fbref_df_columns(df)
                if 'Wk' in df.columns:
                    df = df[df['Wk'].notna()]
                # Copy so the column assignments below do not write to a slice.
                df = df[df['Home'].notna() & (df['Home'] != 'Home')].copy()

                # Always create the goal columns: the fixture-analysis view
                # filters on HomeGoals to find matches that have been played.
                if 'Score' in df.columns:
                    # Accept the en dash used in the scraped scores as well as
                    # a plain hyphen; rows with no result yield NaN.
                    goals = df['Score'].astype(str).str.extract(r'(\d+)\s*[–-]\s*(\d+)')
                    df['HomeGoals'] = pd.to_numeric(goals[0], errors='coerce')
                    df['AwayGoals'] = pd.to_numeric(goals[1], errors='coerce')
                else:
                    df['HomeGoals'] = None
                    df['AwayGoals'] = None

                if 'Date' in df.columns:
                    df['Date'] = pd.to_datetime(df['Date'], errors='coerce').dt.strftime('%Y-%m-%d')
                st.session_state.fixtures_data[key] = df
                st.success(f"Successfully scraped fixtures for {LEAGUES[key]['name']}.")
            else:
                st.error(f"Could not find fixtures table for {LEAGUES[key]['name']}.")
        except Exception as e:
            st.error(f"Error scraping fixtures for {LEAGUES[key]['name']}: {e}")
        # Pause between requests even after a failure, so an error response is
        # not immediately followed by another hit on the site.
        time.sleep(3)
        progress_bar.progress((i + 1) / total_leagues)
    st.write("Fixtures scraping complete.")
# ---------- Perplexity API Functions ----------
def _perplexity_error_detail(response):
    """Extract a human-readable error description from a failed API response."""
    try:
        body = response.json()
    except ValueError:
        # Body is not JSON; fall back to the HTTP status line.
        return f"{response.status_code} - {response.reason}"
    error = body.get("error") if isinstance(body, dict) else None
    if isinstance(error, dict):
        return error.get("message", response.text)
    if isinstance(error, str) and error:
        return error
    return response.text


def get_perplexity_response(api_key, prompt, system_message="You are a helpful football analyst AI.",
                            model='sonar-medium-online'):
    """Send one system+user chat request to the Perplexity API and return the reply.

    Args:
        api_key: Bearer token for the API. If empty, an error is shown and
            ``None`` is returned without making a request.
        prompt: Content of the user message.
        system_message: Content of the system message.
        model: Model identifier sent in the payload.
            NOTE(review): confirm the default is still offered by Perplexity;
            it is kept unchanged here for backward compatibility.

    Returns:
        The ``content`` of the first choice's message (``''`` if the response
        has no choices or no content), or ``None`` if the request failed. All
        failures are reported with ``st.error`` rather than raised.
    """
    if not api_key:
        st.error("Perplexity API Key is not set. Please enter it in the sidebar.")
        return None
    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json'
    }
    payload = {
        'model': model,
        'messages': [
            {'role': 'system', 'content': system_message},
            {'role': 'user', 'content': prompt}
        ]
    }
    try:
        with st.spinner("Querying Perplexity AI..."):
            response = requests.post(PERPLEXITY_API_URL, headers=headers, json=payload, timeout=45)
        response.raise_for_status()
        data = response.json()
        # `or [{}]` also covers an explicit empty "choices" list.
        choices = data.get('choices') or [{}]
        return choices[0].get('message', {}).get('content', '')
    except requests.exceptions.RequestException as e:
        error_message = f"Error communicating with Perplexity API: {e}"
        if e.response is not None:
            # The helper never raises on an odd payload shape; an exception
            # here would escape, since sibling except clauses do not apply
            # inside a handler.
            error_message = f"Perplexity API error: {_perplexity_error_detail(e.response)}"
        st.error(error_message)
        return None
    except Exception as e:
        st.error(f"An unexpected error occurred with Perplexity API: {e}")
        return None
# ---------- Streamlit UI ----------
# Page chrome.
st.set_page_config(layout="wide")
st.title("⚽ Football Data Scraper & Perplexity Tester")
st.markdown("Test data retrieval from FBRef and Perplexity API integration. No Firebase calls.")

# --- Sidebar ---
_sidebar = st.sidebar


def _league_display_name(league_key):
    """Human-readable name for a league key from LEAGUES."""
    return LEAGUES[league_key]['name']


def _league_display_name_or_prompt(league_key):
    """Like _league_display_name, but the empty placeholder option reads as a prompt."""
    if league_key:
        return LEAGUES[league_key]['name']
    return "Select a league"


# API key entry; the typed value is written back to session state on each run.
_sidebar.header("API Keys")
st.session_state.perplexity_api_key = _sidebar.text_input(
    "Perplexity API Key:",
    type="password",
    value=st.session_state.perplexity_api_key,
    help="Your Perplexity AI API key. Will not be stored permanently."
)
_sidebar.markdown("---")

# League picker plus one button per scraper.
_sidebar.header("Scraping Controls")
selected_league_keys = _sidebar.multiselect(
    "Select leagues to scrape:",
    options=list(LEAGUES.keys()),
    format_func=_league_display_name,
    default=[]
)
for _button_label, _scrape in (
    ("Scrape Player Stats", scrape_player_stats_st),
    ("Scrape Squad Stats", scrape_squad_stats_st),
    ("Scrape Fixtures", scrape_fixtures_st),
):
    if not _sidebar.button(_button_label):
        continue
    if selected_league_keys:
        _scrape(selected_league_keys)
    else:
        _sidebar.warning("Select leagues.")
_sidebar.markdown("---")

# Which league's scraped tables the main area shows ("" means none chosen).
_sidebar.header("View Scraped Data")
display_league_key = _sidebar.selectbox(
    "Select league to display data for:",
    options=[""] + list(LEAGUES.keys()),
    format_func=_league_display_name_or_prompt
)
# --- Main Content Area ---
# One tab per scraped dataset for the league chosen in the sidebar.
if not display_league_key:
    st.info("Select a league from the sidebar to view its scraped data, or use the feature testers below.")
else:
    _shown_league_name = LEAGUES[display_league_key]['name']
    # (tab label, subheader prefix, session-state store, message when empty)
    _dataset_views = (
        ("Player Stats", "Player Stats for",
         st.session_state.player_stats_data, "No player stats data loaded. Scrape first."),
        ("Squad Stats (League Table)", "Squad Stats for",
         st.session_state.squad_stats_data, "No squad stats data loaded. Scrape first."),
        ("Fixtures", "Fixtures for",
         st.session_state.fixtures_data, "No fixtures data loaded. Scrape first."),
    )
    _dataset_tabs = st.tabs([_view[0] for _view in _dataset_views])
    for _tab, (_tab_label, _heading, _store, _empty_message) in zip(_dataset_tabs, _dataset_views):
        with _tab:
            st.subheader(f"{_heading} {_shown_league_name}")
            if display_league_key in _store:
                st.dataframe(_store[display_league_key])
            else:
                st.info(_empty_message)
st.markdown("---")
st.header("FBRef Data Feature Testing (Local)")
# --- 1. Player Comparison Tool ---
# Shows the scraped stat rows for two players, matched by case-insensitive
# substring against the Player column of the selected league.
st.subheader("1. Player Comparison (Local Data)")
col1_pc, col2_pc, col3_pc = st.columns(3)
pc_league = col1_pc.selectbox("League (Player Comparison):", options=[""] + list(st.session_state.player_stats_data.keys()), format_func=lambda k: LEAGUES[k]['name'] if k else "Select")
pc_player1_name = col2_pc.text_input("Player 1 Name:", key="pc_p1")
pc_player2_name = col3_pc.text_input("Player 2 Name:", key="pc_p2")
if st.button("Compare Players (Local)", key="compare_local_btn"):
    # Ignore surrounding whitespace so a stray space does not defeat the match.
    _player_queries = [pc_player1_name.strip(), pc_player2_name.strip()]
    if not (pc_league and all(_player_queries)):
        st.warning("Please select a league and enter two player names for comparison.")
    elif pc_league not in st.session_state.player_stats_data:
        st.error(f"Player stats data for {LEAGUES[pc_league]['name']} not loaded. Please scrape first.")
    else:
        all_players_df = st.session_state.player_stats_data[pc_league]
        _player_names = all_players_df['Player'].astype(str)
        for _query in _player_queries:
            # regex=False: the query is user text, not a pattern; characters
            # such as "(" or "+" would otherwise raise re.error.
            _matches = all_players_df[
                _player_names.str.contains(_query, case=False, na=False, regex=False)
            ]
            if not _matches.empty:
                st.write(f"**Stats for {_query}:**")
                st.dataframe(_matches)
            else:
                st.warning(f"Could not find data for player: {_query} in {LEAGUES[pc_league]['name']}")
# --- 2. Fixture Analysis (Local Data) ---
st.subheader("2. Fixture Analysis (Local Data)")


def get_form_df(team_name, all_fixtures, num_matches=5):
    """Return the most recent played matches involving ``team_name``.

    Args:
        team_name: Team to look up; compared case-insensitively, ignoring
            surrounding whitespace, against the Home and Away columns.
        all_fixtures: Fixtures frame as stored by the fixtures scraper.
        num_matches: Maximum number of matches to return.

    Returns:
        Rows with a non-missing ``HomeGoals`` value, newest ``Date`` first,
        capped at ``num_matches``. Empty if the frame has no ``HomeGoals``
        column.
    """
    team = team_name.strip().lower()
    team_matches = all_fixtures[
        (all_fixtures['Home'].str.lower() == team) | (all_fixtures['Away'].str.lower() == team)
    ]
    if 'HomeGoals' not in team_matches.columns:
        # No parsed scores available, so nothing can be classed as played.
        return team_matches.iloc[0:0]
    played_matches = team_matches[team_matches['HomeGoals'].notna()]
    return played_matches.sort_values(by='Date', ascending=False).head(num_matches)


col1_fa, col2_fa, col3_fa = st.columns(3)
fa_league = col1_fa.selectbox("League (Fixture Analysis):", options=[""] + list(st.session_state.fixtures_data.keys()), format_func=lambda k: LEAGUES[k]['name'] if k else "Select")
fa_home_team = col2_fa.text_input("Home Team Name:", key="fa_home")
fa_away_team = col3_fa.text_input("Away Team Name:", key="fa_away")
if st.button("Analyze Fixture (Local)", key="analyze_local_btn"):
    # Normalise once so head-to-head and form lookups treat the input alike.
    _home_query = fa_home_team.strip()
    _away_query = fa_away_team.strip()
    if not (fa_league and _home_query and _away_query):
        st.warning("Please select a league and enter home/away team names for analysis.")
    elif fa_league not in st.session_state.fixtures_data:
        st.error(f"Fixtures data for {LEAGUES[fa_league]['name']} not loaded. Please scrape first.")
    else:
        all_fixtures_df = st.session_state.fixtures_data[fa_league]
        home_team_norm = _home_query.lower()
        away_team_norm = _away_query.lower()
        _home_col = all_fixtures_df['Home'].str.lower()
        _away_col = all_fixtures_df['Away'].str.lower()
        # Either team may have been the home side.
        h2h_matches = all_fixtures_df[
            ((_home_col == home_team_norm) & (_away_col == away_team_norm))
            | ((_home_col == away_team_norm) & (_away_col == home_team_norm))
        ]
        st.write(f"**Head-to-Head between {_home_query} and {_away_query}:**")
        if not h2h_matches.empty:
            st.dataframe(h2h_matches.sort_values(by='Date', ascending=False))
        else:
            st.info("No H2H matches found in the scraped data.")

        for _team in (_home_query, _away_query):
            st.write(f"**Recent Form for {_team} (last 5 played):**")
            _form_df = get_form_df(_team, all_fixtures_df)
            if not _form_df.empty:
                st.dataframe(_form_df)
            else:
                st.info(f"No recent played matches found for {_team}.")
# --- 3. Visualization Data (Local Data) ---
st.subheader("3. Visualization Data (Example: Top Scorers - Local Data)")
# Ranks the scraped players of one league by goals, then assists, and shows
# the top ten as a table and a bar chart.
col1_vd, col2_vd = st.columns(2)
vd_league = col1_vd.selectbox("League (Visualization):", options=[""] + list(st.session_state.player_stats_data.keys()), format_func=lambda k: LEAGUES[k]['name'] if k else "Select")
if st.button("Show Top Scorers (Local)", key="top_scorers_local_btn"):
    if not vd_league:
        st.warning("Please select a league for visualization data.")
    elif vd_league not in st.session_state.player_stats_data:
        st.error(f"Player stats data for {LEAGUES[vd_league]['name']} not loaded. Please scrape first.")
    else:
        player_df = st.session_state.player_stats_data[vd_league]
        # Keep only the first column for each label: with repeated labels,
        # player_df['Gls'] would be a DataFrame and cannot be ranked.
        player_df = player_df.loc[:, ~player_df.columns.duplicated()].copy()
        _needed_columns = ['Player', 'Squad', 'Gls', 'Ast']
        _missing_columns = [c for c in _needed_columns if c not in player_df.columns]
        if _missing_columns:
            st.error(
                f"Player stats data for {LEAGUES[vd_league]['name']} is missing "
                f"expected column(s): {', '.join(_missing_columns)}."
            )
        else:
            for _stat in ('Gls', 'Ast'):
                player_df[_stat] = pd.to_numeric(player_df[_stat], errors='coerce').fillna(0)
            top_scorers = player_df.sort_values(by=['Gls', 'Ast'], ascending=[False, False]).head(10)
            st.write(f"**Top 10 Scorers Data for {LEAGUES[vd_league]['name']}:**")
            st.dataframe(top_scorers[_needed_columns])
            if not top_scorers.empty:
                st.write("**Chart: Goals by Top Scorers**")
                chart_data = top_scorers.set_index('Player')[['Gls', 'Ast']]
                st.bar_chart(chart_data)
st.markdown("---")
st.header("Perplexity API Testing")
# --- 4. Fixture Report via Perplexity ---
# Asks the Perplexity API for a short pre-match report on a user-specified fixture.
st.subheader("4. Fixture Report (via Perplexity AI)")
fr_home_team = st.text_input("Home Team (for Perplexity Report):", key="fr_home")
fr_away_team = st.text_input("Away Team (for Perplexity Report):", key="fr_away")
fr_match_date = st.text_input("Match Date (e.g., YYYY-MM-DD) (for Perplexity Report):", key="fr_date")
if st.button("Get Fixture Report from Perplexity", key="fr_perplexity_btn"):
    _report_inputs_complete = bool(fr_home_team and fr_away_team and fr_match_date)
    if not _report_inputs_complete:
        st.warning("Please enter Home Team, Away Team, and Match Date for the report.")
    elif not st.session_state.perplexity_api_key:
        st.error("Perplexity API Key is not set in the sidebar.")
    else:
        _report_prompt = "\n".join([
            f"Generate a concise pre-match report for the football match: {fr_home_team} vs {fr_away_team} scheduled for {fr_match_date}.",
            "Include the following sections if possible, keeping each brief:",
            "1. Recent Form (last 3-5 matches for each team, e.g., WWLDW).",
            "2. Head-to-Head (H2H) summary of their last few encounters.",
            "3. Key Players to Watch (one or two from each team with brief reason).",
            "4. Brief Tactical Outlook or Prediction (optional, if confident).",
            "Prioritize information from reputable football sources. Be objective.",
        ])
        _fixture_report = get_perplexity_response(
            st.session_state.perplexity_api_key,
            _report_prompt,
            "You are a football analyst providing pre-match reports.",
        )
        # A None or empty reply means nothing to render.
        if _fixture_report:
            st.markdown("**Perplexity AI Fixture Report:**")
            st.markdown(_fixture_report)
# --- 5. Custom Query via Perplexity ---
# Free-form question box whose text is forwarded to the Perplexity API.
st.subheader("5. Custom Query (via Perplexity AI)")
custom_query_text = st.text_area("Enter your football-related question:", height=100, key="custom_q")
if st.button("Ask Perplexity AI", key="custom_q_btn"):
    if not custom_query_text:
        st.warning("Please enter a question to ask Perplexity AI.")
    elif not st.session_state.perplexity_api_key:
        st.error("Perplexity API Key is not set in the sidebar.")
    else:
        _custom_answer = get_perplexity_response(
            st.session_state.perplexity_api_key,
            custom_query_text,
        )
        # A None or empty reply means nothing to render.
        if _custom_answer:
            st.markdown("**Perplexity AI Answer:**")
            st.markdown(_custom_answer)

# Page footer.
st.markdown("---")
st.caption("Streamlit test app by your AI assistant. API keys are not stored after session.")