# Page header captured together with the file (not Python code):
#   ositamiles's picture / predict / c3e91e9 verified
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from itertools import combinations
import plotly.express as px
# European national teams, lower-case and in alphabetical order (one line per
# initial letter). load_data() lower-cases team names before testing membership
# in this list, so entries must stay lower-case.
european_countries = [
    "albania", "andorra", "armenia", "austria", "azerbaijan",
    "belarus", "belgium", "bosnia and herzegovina", "bulgaria",
    "croatia", "cyprus", "czech republic",
    "denmark",
    "england", "estonia",
    "faroe islands", "finland", "france",
    "georgia", "germany", "gibraltar", "greece",
    "hungary",
    "iceland", "ireland", "israel", "italy",
    "kazakhstan", "kosovo",
    "latvia", "liechtenstein", "lithuania", "luxembourg",
    "malta", "moldova", "monaco", "montenegro",
    "netherlands", "north macedonia", "northern ireland", "norway",
    "poland", "portugal",
    "romania", "russia",
    "san marino", "scotland", "serbia", "slovakia", "slovenia", "spain",
    "sweden", "switzerland",
    "turkey",
    "ukraine",
    "wales",
]
# Load and preprocess data
@st.cache_data
def load_data():
    """Load ``results.csv`` and return cleaned all-European fixtures since 2010.

    Processing steps, in order:

    1. Coerce ``home_score``/``away_score`` to numbers and ``date`` to
       datetimes; values that cannot be parsed become NaN/NaT.
    2. Keep rows dated 2010 or later.
    3. Drop rows missing a score, a team name or a date.
    4. Lower-case every text column.
    5. Keep only fixtures where both teams are in ``european_countries``.
    6. Add ``match_outcome`` (``'home_win'``, ``'away_win'`` or ``'draw'``),
       derived from the two scores.

    Returns:
        pandas.DataFrame: the filtered rows with the extra ``match_outcome``
        column.
    """
    data = pd.read_csv("results.csv")

    # Ensure numeric columns are correctly formatted
    numeric_columns = ['home_score', 'away_score']
    for col in numeric_columns:
        data[col] = pd.to_numeric(data[col], errors='coerce')

    # Ensure date column is in datetime format
    data['date'] = pd.to_datetime(data['date'], errors='coerce')

    # Filter data from the year 2010 onwards (NaT dates compare False and drop out)
    data = data[data['date'].dt.year >= 2010]

    # Drop rows with missing values in critical columns
    data = data.dropna(subset=numeric_columns + ['home_team', 'away_team', 'date'])

    def _lower_if_text(column):
        # Text may be stored as plain ``object`` or as pandas' dedicated string
        # dtype; both need lower-casing for the membership test below to work.
        if column.dtype == "object" or isinstance(column.dtype, pd.StringDtype):
            return column.str.lower()
        return column

    data = data.apply(_lower_if_text)

    # Filter to include only matches where both teams are European countries.
    # .copy() makes the result an independent frame so adding a column below
    # does not write into a slice of the larger frame.
    both_european = (
        data['home_team'].isin(european_countries)
        & data['away_team'].isin(european_countries)
    )
    data = data[both_european].copy()

    # Vectorised outcome labelling; also well-defined when no rows remain.
    home_goals = data['home_score']
    away_goals = data['away_score']
    data['match_outcome'] = np.select(
        [home_goals > away_goals, home_goals < away_goals],
        ['home_win', 'away_win'],
        default='draw',
    )
    return data
# Train model
@st.cache_resource
def train_model(data):
    """Fit a random forest that predicts the match outcome from the two teams.

    The target is ``match_outcome`` encoded as 1 (home win), 0 (draw) and
    -1 (away win). Features are the one-hot encoded ``home_team`` and
    ``away_team`` columns (first category of each dropped) plus whatever
    columns of ``data`` are not explicitly removed below.

    Args:
        data: frame with at least the columns referenced in this function
            (team names, scores, date, tournament, city, country and
            ``match_outcome``).

    Returns:
        tuple: ``(fitted model, feature column index, accuracy)`` where the
        accuracy is measured on a held-out 30% split.
    """
    outcome_codes = {'home_win': 1, 'draw': 0, 'away_win': -1}
    target = data['match_outcome'].map(outcome_codes)

    # Columns that must not be fed to the model (identifiers, leakage, free text).
    excluded = ['date', 'home_score', 'away_score', 'match_outcome', 'tournament', 'city', 'country']
    features = pd.get_dummies(
        data.drop(columns=excluded),
        columns=['home_team', 'away_team'],
        drop_first=True,
    )

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42
    )
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)

    holdout_accuracy = accuracy_score(y_test, forest.predict(X_test))
    return forest, features.columns, holdout_accuracy
# Euro 2024 group-stage draw: group label -> its four teams (lower-case).
# The order inside each list matters: simulate_group_matches() treats the
# earlier team of each pairing as the home side.
groups = {
    "Group A": [
        "germany",
        "hungary",
        "scotland",
        "switzerland",
    ],
    "Group B": [
        "albania",
        "croatia",
        "italy",
        "spain",
    ],
    "Group C": [
        "denmark",
        "england",
        "serbia",
        "slovenia",
    ],
    "Group D": [
        "austria",
        "france",
        "netherlands",
        "poland",
    ],
    "Group E": [
        "belgium",
        "romania",
        "slovakia",
        "ukraine",
    ],
    "Group F": [
        "czech republic",
        "portugal",
        "turkey",
        "georgia",
    ],
}
# Simulate group matches
def simulate_group_matches(group_teams, model, data_columns):
    """Play one round-robin inside a group and rank the teams by points.

    Each pair produced by ``combinations(group_teams, 2)`` is played once, with
    the earlier team of the pair as the home side. A win is worth 3 points and
    a draw 1 point to each team.

    Args:
        group_teams: team names; their order decides who is "home" in a pairing
            and how ties in the final ranking are ordered.
        model: object whose ``predict(frame)`` returns one label per row:
            ``1`` for a home win, ``-1`` for an away win, anything else is
            treated as a draw.
        data_columns: feature column names expected by ``model``. Team flags
            are the columns named exactly ``home_team_<name>`` and
            ``away_team_<name>``; every other column is set to 0.

    Returns:
        list[tuple[str, int]]: ``(team, points)`` pairs sorted by points,
        highest first. Teams level on points keep their ``group_teams`` order.
    """
    points = {team: 0 for team in group_teams}

    for home_team, away_team in combinations(group_teams, 2):
        # One all-zero feature row, then switch on the two team flags. Exact
        # name matching (rather than substring matching) keeps a team whose
        # name is a prefix of another from switching on both flags.
        match_data = pd.DataFrame(0, index=[0], columns=data_columns)
        for flag in (f'home_team_{home_team}', f'away_team_{away_team}'):
            # A flag can be absent, e.g. when the encoder dropped that category
            # or the team never appeared in training; the row then stays 0.
            if flag in match_data.columns:
                match_data.loc[0, flag] = 1

        # predict() returns an array with one entry per row; take the scalar.
        prediction = model.predict(match_data)[0]
        if prediction == 1:
            points[home_team] += 3
        elif prediction == -1:
            points[away_team] += 3
        else:
            points[home_team] += 1
            points[away_team] += 1

    ranked_teams = sorted(points.items(), key=lambda item: item[1], reverse=True)
    return ranked_teams
# Simulate knockout match with probability
def simulate_knockout_match(home_team, away_team, model, data_columns):
    """Predict the winner of a single knockout tie and report class probabilities.

    Args:
        home_team: name of the team encoded in the ``home_team_<name>`` flag.
        away_team: name of the team encoded in the ``away_team_<name>`` flag.
        model: object with ``predict(frame)`` and ``predict_proba(frame)``.
            Labels are ``1`` (home win), ``0`` (draw) and ``-1`` (away win).
            If the model exposes ``classes_``, it is used to line probabilities
            up with labels; otherwise the order ``(-1, 0, 1)`` is assumed.
        data_columns: feature column names expected by ``model``; columns other
            than the two team flags are set to 0.

    Returns:
        tuple[str, dict]: the winning team and a dict with keys ``'home_win'``,
        ``'draw'`` and ``'away_win'``. A class the model does not know gets
        probability 0.0. When the model predicts a draw, the home team wins
        only if its win probability is strictly higher than the away team's.
    """
    # All-zero feature row with the two exact team flags switched on; a flag
    # missing from data_columns simply leaves the row at 0.
    match_data = pd.DataFrame(0, index=[0], columns=data_columns)
    for flag in (f'home_team_{home_team}', f'away_team_{away_team}'):
        if flag in match_data.columns:
            match_data.loc[0, flag] = 1

    prediction_proba = model.predict_proba(match_data)[0]
    prediction = model.predict(match_data)[0]

    # Map probabilities by label instead of by fixed position, so a model that
    # was fitted without one of the three classes neither raises IndexError
    # nor reports probabilities under the wrong outcome.
    class_labels = getattr(model, 'classes_', (-1, 0, 1))
    proba_by_label = {
        label: float(probability)
        for label, probability in zip(class_labels, prediction_proba)
    }
    outcome_probabilities = {
        'home_win': proba_by_label.get(1, 0.0),
        'draw': proba_by_label.get(0, 0.0),
        'away_win': proba_by_label.get(-1, 0.0),
    }

    if prediction == 1:
        return home_team, outcome_probabilities
    if prediction == -1:
        return away_team, outcome_probabilities
    # A knockout tie needs a winner: break a predicted draw on win probability.
    if outcome_probabilities['home_win'] > outcome_probabilities['away_win']:
        return home_team, outcome_probabilities
    return away_team, outcome_probabilities
# Streamlit app: page title, cached data/model, and the sidebar controls for
# the simulation count and the single-match predictor.
st.title("Euro 2024 Prediction App")

# Load data and train model (both calls are cached by their decorators)
data = load_data()
model, data_columns, model_accuracy = train_model(data)

# Display model accuracy
st.sidebar.metric("Model Accuracy", f"{model_accuracy:.2%}")

# Simulation options
st.sidebar.header("Simulation Options")
num_simulations = st.sidebar.slider(
    "Number of Simulations",
    min_value=1,
    max_value=1000,
    value=100,
    step=10,
)

# Predict a specific match
st.sidebar.header("Predict a Match")
teams = sorted(european_countries)
home_team = st.sidebar.selectbox("Home Team", teams)
away_team = st.sidebar.selectbox("Away Team", teams)

predict_clicked = st.sidebar.button("Predict Match")
if predict_clicked and home_team == away_team:
    # Both select boxes share one option list, so the same team can be picked twice.
    st.sidebar.write("Home team and away team cannot be the same.")
elif predict_clicked:
    winner, outcome_probabilities = simulate_knockout_match(home_team, away_team, model, data_columns)
    st.sidebar.write(f"Predicted Winner: {winner}")
    st.sidebar.write(f"Home Win Probability: {outcome_probabilities['home_win']:.2%}")
    st.sidebar.write(f"Draw Probability: {outcome_probabilities['draw']:.2%}")
    st.sidebar.write(f"Away Win Probability: {outcome_probabilities['away_win']:.2%}")
# Run simulations: play the whole tournament `num_simulations` times, chart how
# often each team wins, then print a stage-by-stage walkthrough of every run.
if st.sidebar.button("Run Simulations"):
    progress_bar = st.progress(0)
    simulation_results = []
    # One (group_rankings, round_results) record per simulation, so that the
    # walkthrough below shows each run rather than repeating the last one.
    simulation_details = []

    # Knockout rounds in playing order: (lookup key, heading shown in the walkthrough)
    knockout_rounds = (
        ("Round of 16", "**Round of 16 Matches:**"),
        ("Quarter-Final", "**Quarter-Final Matches:**"),
        ("Semi-Final", "**Semi-Final Matches:**"),
        ("Final", "**Final Match:**"),
    )

    # NOTE(review): simulate_group_matches/simulate_knockout_match call
    # model.predict with identical inputs every iteration, so if the model's
    # predictions are deterministic all simulations will come out the same.
    for i in range(num_simulations):
        group_rankings = {}
        for group_name, group_teams in groups.items():
            group_rankings[group_name] = simulate_group_matches(group_teams, model, data_columns)

        # Round of 16: group winners meet the runner-up of the paired group;
        # the third-placed teams of groups A-D fill the last two ties.
        fixtures = [
            (group_rankings['Group A'][0][0], group_rankings['Group B'][1][0]),
            (group_rankings['Group B'][0][0], group_rankings['Group A'][1][0]),
            (group_rankings['Group C'][0][0], group_rankings['Group D'][1][0]),
            (group_rankings['Group D'][0][0], group_rankings['Group C'][1][0]),
            (group_rankings['Group E'][0][0], group_rankings['Group F'][1][0]),
            (group_rankings['Group F'][0][0], group_rankings['Group E'][1][0]),
            (group_rankings['Group A'][2][0], group_rankings['Group B'][2][0]),
            (group_rankings['Group C'][2][0], group_rankings['Group D'][2][0])
        ]

        round_results = {}
        for round_name, _heading in knockout_rounds:
            played = []
            for home, away in fixtures:
                winner, probabilities = simulate_knockout_match(home, away, model, data_columns)
                played.append((home, away, winner, probabilities))
            round_results[round_name] = played
            # Winners of ties 1&2, 3&4, ... meet in the next round.
            winners = [result[2] for result in played]
            fixtures = list(zip(winners[0::2], winners[1::2]))

        final_winner = round_results["Final"][0][2]
        simulation_results.append(final_winner)
        simulation_details.append((group_rankings, round_results))

        # Update progress bar
        progress_bar.progress((i + 1) / num_simulations)

    # Calculate and display results
    winner_counts = pd.Series(simulation_results).value_counts()
    fig_winner = px.bar(winner_counts, x=winner_counts.index, y=winner_counts.values, labels={'x': 'Team', 'y': 'Number of Wins'},
                        title=f"Euro 2024 Winner Distribution (Based on {num_simulations} Simulations)")
    st.plotly_chart(fig_winner)

    # Top 5 winners (value_counts is already sorted by count, head() keeps 5)
    st.subheader("Top 5 Most Likely Winners")
    top_5_winners = winner_counts.head()
    for team, wins in top_5_winners.items():
        st.metric(team, f"{wins} wins", f"{wins/num_simulations:.1%}")

    # Display the process from group stage to final
    st.subheader("Simulation Process")
    for i, (group_rankings, round_results) in enumerate(simulation_details):
        st.markdown(f"### Simulation {i + 1}")

        # Group stage
        st.markdown("**Group Stage Results:**")
        for group_name, ranked_teams in group_rankings.items():
            st.write(f"{group_name}: {', '.join([team for team, points in ranked_teams])}")

        # Knockout rounds, Round of 16 through Final
        for round_name, heading in knockout_rounds:
            st.markdown(heading)
            for home, away, winner, probabilities in round_results[round_name]:
                st.write(f"{home} vs {away} - Winner: {winner} (Home Win: {probabilities['home_win']:.2%}, Draw: {probabilities['draw']:.2%}, Away Win: {probabilities['away_win']:.2%})")
# Sidebar summary of the loaded dataset, optional raw-data table, and footer.
st.sidebar.header("Data and Model Information")

match_count = len(data)
st.sidebar.write(f"Total matches in dataset: {match_count}")

first_day = data['date'].min().date()
last_day = data['date'].max().date()
st.sidebar.write(f"Date range: {first_day} to {last_day}")

# Allow users to view raw data
show_raw_data = st.sidebar.checkbox("Show raw data")
if show_raw_data:
    st.subheader("Raw data")
    st.write(data)

# Footer
st.sidebar.markdown("---")
st.sidebar.markdown("Created with ❤️ using Streamlit")