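# Spaces: Sleeping
# (The line above appears to be page-status text captured together with the
# file; it is not part of the program.)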
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.metrics import accuracy_score | |
| from itertools import combinations | |
| import plotly.express as px | |
# List of European countries.
# Names are lowercase and kept in alphabetical order, one initial letter per row.
european_countries = [
    "albania", "andorra", "armenia", "austria", "azerbaijan",
    "belarus", "belgium", "bosnia and herzegovina", "bulgaria",
    "croatia", "cyprus", "czech republic",
    "denmark",
    "england", "estonia",
    "faroe islands", "finland", "france",
    "georgia", "germany", "gibraltar", "greece",
    "hungary",
    "iceland", "ireland", "israel", "italy",
    "kazakhstan", "kosovo",
    "latvia", "liechtenstein", "lithuania", "luxembourg",
    "malta", "moldova", "monaco", "montenegro",
    "netherlands", "north macedonia", "northern ireland", "norway",
    "poland", "portugal",
    "romania", "russia",
    "san marino", "scotland", "serbia", "slovakia", "slovenia", "spain",
    "sweden", "switzerland",
    "turkey",
    "ukraine",
    "wales",
]
# Load and preprocess data
def load_data(path="results.csv", countries=None):
    """Load the match results CSV and prepare it for modelling.

    The file must provide at least the columns ``date``, ``home_team``,
    ``away_team``, ``home_score`` and ``away_score``.

    Processing steps, in order:
      1. coerce the score columns to numbers and ``date`` to datetimes
         (values that cannot be parsed become NaN / NaT),
      2. keep only matches played in 2010 or later,
      3. drop rows missing a score, a team name or a date,
      4. lowercase every text column,
      5. keep only matches in which both teams appear in ``countries``,
      6. add a ``match_outcome`` column holding 'home_win', 'away_win'
         or 'draw'.

    Args:
        path: CSV file to read. Defaults to "results.csv".
        countries: Lowercase team names to keep. Defaults to the
            module-level ``european_countries`` list.

    Returns:
        A new DataFrame containing the filtered matches plus the
        ``match_outcome`` column.
    """
    if countries is None:
        countries = european_countries

    data = pd.read_csv(path)

    # Ensure numeric columns are correctly formatted
    numeric_columns = ['home_score', 'away_score']
    for col in numeric_columns:
        data[col] = pd.to_numeric(data[col], errors='coerce')

    # Ensure date column is in datetime format
    data['date'] = pd.to_datetime(data['date'], errors='coerce')

    # Filter data from the year 2010 onwards (NaT rows fail the comparison
    # and are removed here as well)
    data = data[data['date'].dt.year >= 2010]

    # Drop rows with missing values in critical columns
    data = data.dropna(subset=numeric_columns + ['home_team', 'away_team', 'date'])

    # Lowercase text columns. Text may be stored as plain object columns or
    # with pandas' dedicated string dtype, so both are accepted.
    data = data.apply(
        lambda col: col.str.lower()
        if col.dtype == object or isinstance(col.dtype, pd.StringDtype)
        else col
    )

    # Filter to include only matches where both teams are European countries.
    # The explicit copy makes the result an independent frame, so adding a
    # column below does not write into a slice of another frame.
    both_listed = data['home_team'].isin(countries) & data['away_team'].isin(countries)
    data = data[both_listed].copy()

    # Vectorised outcome label; also well-defined when no rows remain.
    home_goals = data['home_score'].to_numpy()
    away_goals = data['away_score'].to_numpy()
    data['match_outcome'] = np.select(
        [home_goals > away_goals, home_goals < away_goals],
        ['home_win', 'away_win'],
        default='draw',
    )
    return data
# Train model
def train_model(data):
    """Fit a random-forest classifier that predicts a match's outcome.

    The team columns are one-hot encoded (``home_team_<name>`` /
    ``away_team_<name>``) with ``drop_first=True``, so the first category of
    each team column has no indicator column of its own. Every other column
    of ``data`` that is not listed as a non-feature below is passed to the
    model unchanged.

    The target encodes the outcome as 1 (home win), 0 (draw), -1 (away win).

    Args:
        data: DataFrame with ``home_team``, ``away_team``, ``match_outcome``
            and the non-feature columns named below.

    Returns:
        A tuple ``(model, feature_columns, accuracy)``: the fitted
        classifier, the feature column index it was trained on, and its
        accuracy on the held-out 30% of the rows.
    """
    non_feature_columns = [
        'date', 'home_score', 'away_score', 'match_outcome',
        'tournament', 'city', 'country',
    ]
    outcome_codes = {'home_win': 1, 'draw': 0, 'away_win': -1}

    target = data['match_outcome'].map(outcome_codes)
    features = pd.get_dummies(
        data.drop(columns=non_feature_columns),
        columns=['home_team', 'away_team'],
        drop_first=True,
    )

    train_features, test_features, train_target, test_target = train_test_split(
        features, target, test_size=0.3, random_state=42
    )

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(train_features, train_target)

    held_out_accuracy = accuracy_score(test_target, forest.predict(test_features))
    return forest, features.columns, held_out_accuracy
# Euro 2024 groups: group label -> the four teams drawn into it (lowercase names).
groups = {
    "Group A": [
        "germany",
        "hungary",
        "scotland",
        "switzerland",
    ],
    "Group B": [
        "albania",
        "croatia",
        "italy",
        "spain",
    ],
    "Group C": [
        "denmark",
        "england",
        "serbia",
        "slovenia",
    ],
    "Group D": [
        "austria",
        "france",
        "netherlands",
        "poland",
    ],
    "Group E": [
        "belgium",
        "romania",
        "slovakia",
        "ukraine",
    ],
    "Group F": [
        "czech republic",
        "portugal",
        "turkey",
        "georgia",
    ],
}
# Simulate group matches
def simulate_group_matches(group_teams, model, data_columns):
    """Play every pairing of a group once and rank the teams by points.

    For each pair produced by ``combinations(group_teams, 2)`` the first team
    is treated as the home side. A single-row feature frame is built with a 1
    in the ``home_team_<home>`` and ``away_team_<away>`` columns (when those
    columns exist) and 0 everywhere else, and ``model.predict`` decides the
    result: 1 gives the home side 3 points, -1 gives the away side 3 points,
    anything else gives both sides 1 point.

    Args:
        group_teams: Team names in the group.
        model: Object with a ``predict(frame)`` method returning one label
            per row.
        data_columns: Feature column names, in the order the model expects.

    Returns:
        A list of ``(team, points)`` tuples sorted by points, highest first.
        Teams level on points keep their order from ``group_teams``.
    """
    columns = list(data_columns)
    points = {team: 0 for team in group_teams}

    for home_team, away_team in combinations(group_teams, 2):
        # Exact column names: a substring test would also switch on the
        # column of any team whose name merely starts with this team's name.
        active = {f'home_team_{home_team}', f'away_team_{away_team}'}
        # One numeric row built in a single step; filling an empty frame
        # cell by cell leaves every column with object dtype.
        match_data = pd.DataFrame(
            [[int(column in active) for column in columns]], columns=columns
        )

        # predict returns an array with one entry per row; take the scalar.
        prediction = model.predict(match_data)[0]
        if prediction == 1:
            points[home_team] += 3
        elif prediction == -1:
            points[away_team] += 3
        else:
            points[home_team] += 1
            points[away_team] += 1

    ranked_teams = sorted(points.items(), key=lambda item: item[1], reverse=True)
    return ranked_teams
# Simulate knockout match with probability
def simulate_knockout_match(home_team, away_team, model, data_columns):
    """Predict the winner of a single knockout match.

    A single-row feature frame is built with a 1 in the
    ``home_team_<home_team>`` and ``away_team_<away_team>`` columns (when
    those columns exist) and 0 everywhere else.

    Args:
        home_team: Name of the side treated as the home team.
        away_team: Name of the side treated as the away team.
        model: Classifier exposing ``predict``, ``predict_proba`` and
            ``classes_``, with labels 1 (home win), 0 (draw), -1 (away win).
        data_columns: Feature column names, in the order the model expects.

    Returns:
        A tuple ``(winner, outcome_probabilities)`` where ``winner`` is one
        of the two team names and ``outcome_probabilities`` maps
        'home_win', 'draw' and 'away_win' to the predicted probabilities.
        An outcome the model has no class for is reported as 0.0.
    """
    columns = list(data_columns)
    # Exact column names: a substring test would also switch on the column
    # of any team whose name merely starts with this team's name.
    active = {f'home_team_{home_team}', f'away_team_{away_team}'}
    # One numeric row built in a single step; filling an empty frame cell by
    # cell leaves every column with object dtype.
    match_data = pd.DataFrame(
        [[int(column in active) for column in columns]], columns=columns
    )

    prediction_proba = model.predict_proba(match_data)[0]
    # predict_proba columns follow model.classes_; look the labels up there
    # instead of assuming that all three outcomes exist at fixed positions.
    proba_by_label = {
        int(label): proba for label, proba in zip(model.classes_, prediction_proba)
    }
    outcome_probabilities = {
        'home_win': proba_by_label.get(1, 0.0),
        'draw': proba_by_label.get(0, 0.0),
        'away_win': proba_by_label.get(-1, 0.0),
    }

    # predict returns an array with one entry per row; take the scalar.
    prediction = model.predict(match_data)[0]
    if prediction == 1:
        winner = home_team
    elif prediction == -1:
        winner = away_team
    elif outcome_probabilities['home_win'] > outcome_probabilities['away_win']:
        # A knockout tie needs a winner: a predicted draw goes to the side
        # with the higher win probability ...
        winner = home_team
    else:
        # ... and to the away side when the two probabilities are level.
        winner = away_team
    return winner, outcome_probabilities
# Streamlit app
st.title("Euro 2024 Prediction App")


# Load data and train model
@st.cache_resource
def _load_data_and_train():
    """Load the match data and fit the model, cached across script reruns.

    Streamlit re-executes this script on every widget interaction; without
    the cache the CSV would be re-read and the forest re-trained each time.
    The cached objects are shared between reruns, so they must be treated as
    read-only.
    """
    loaded = load_data()
    fitted_model, feature_columns, accuracy = train_model(loaded)
    return loaded, fitted_model, feature_columns, accuracy


def _team_at(rankings, group_name, place):
    """Return the name of the team ranked at ``place`` (0-based) in a group."""
    return rankings[group_name][place][0]


def _pair_up(ordered_teams):
    """Pair consecutive teams: (0, 1), (2, 3), ..."""
    return list(zip(ordered_teams[0::2], ordered_teams[1::2]))


def _play_round(pairings):
    """Simulate each (home, away) pairing.

    Returns ``(winners, results)`` where ``results`` holds one
    ``(home, away, winner, probabilities)`` tuple per match.
    """
    winners = []
    results = []
    for home, away in pairings:
        winner, probabilities = simulate_knockout_match(home, away, model, data_columns)
        winners.append(winner)
        results.append((home, away, winner, probabilities))
    return winners, results


def _write_matches(heading, results):
    """Write a heading followed by one line per simulated match."""
    st.markdown(heading)
    for home, away, winner, probabilities in results:
        st.write(f"{home} vs {away} - Winner: {winner} (Home Win: {probabilities['home_win']:.2%}, Draw: {probabilities['draw']:.2%}, Away Win: {probabilities['away_win']:.2%})")


data, model, data_columns, model_accuracy = _load_data_and_train()

# Display model accuracy
st.sidebar.metric("Model Accuracy", f"{model_accuracy:.2%}")

# Simulation options
st.sidebar.header("Simulation Options")
num_simulations = st.sidebar.slider("Number of Simulations", min_value=1, max_value=1000, value=100, step=10)

# Predict a specific match
st.sidebar.header("Predict a Match")
teams = sorted(european_countries)
home_team = st.sidebar.selectbox("Home Team", teams)
away_team = st.sidebar.selectbox("Away Team", teams)
if st.sidebar.button("Predict Match"):
    if home_team != away_team:
        winner, outcome_probabilities = simulate_knockout_match(home_team, away_team, model, data_columns)
        st.sidebar.write(f"Predicted Winner: {winner}")
        st.sidebar.write(f"Home Win Probability: {outcome_probabilities['home_win']:.2%}")
        st.sidebar.write(f"Draw Probability: {outcome_probabilities['draw']:.2%}")
        st.sidebar.write(f"Away Win Probability: {outcome_probabilities['away_win']:.2%}")
    else:
        st.sidebar.write("Home team and away team cannot be the same.")

# Run simulations
if st.sidebar.button("Run Simulations"):
    progress_bar = st.progress(0)
    simulation_results = []
    # One record per simulation, so the process view below can show each
    # run's own bracket instead of repeating the last run.
    simulation_logs = []

    # NOTE(review): both simulation helpers decide matches with
    # model.predict, which returns the same label for the same input, so
    # every iteration reproduces the same bracket. Sampling outcomes from the
    # predicted probabilities would be needed for the runs to differ.
    for i in range(num_simulations):
        group_rankings = {
            group_name: simulate_group_matches(group_teams, model, data_columns)
            for group_name, group_teams in groups.items()
        }

        # Round of 16: winners against runners-up of the paired group, plus
        # the third-placed teams of groups A-D.
        round_of_16_matches = [
            (_team_at(group_rankings, 'Group A', 0), _team_at(group_rankings, 'Group B', 1)),
            (_team_at(group_rankings, 'Group B', 0), _team_at(group_rankings, 'Group A', 1)),
            (_team_at(group_rankings, 'Group C', 0), _team_at(group_rankings, 'Group D', 1)),
            (_team_at(group_rankings, 'Group D', 0), _team_at(group_rankings, 'Group C', 1)),
            (_team_at(group_rankings, 'Group E', 0), _team_at(group_rankings, 'Group F', 1)),
            (_team_at(group_rankings, 'Group F', 0), _team_at(group_rankings, 'Group E', 1)),
            (_team_at(group_rankings, 'Group A', 2), _team_at(group_rankings, 'Group B', 2)),
            (_team_at(group_rankings, 'Group C', 2), _team_at(group_rankings, 'Group D', 2)),
        ]
        quarter_finalists, round_of_16_results = _play_round(round_of_16_matches)

        # Quarter-finals and semi-finals pair consecutive winners.
        semi_finalists, quarter_final_results = _play_round(_pair_up(quarter_finalists))
        finalists, semi_final_results = _play_round(_pair_up(semi_finalists))

        # Final
        final_winner, probabilities = simulate_knockout_match(finalists[0], finalists[1], model, data_columns)
        simulation_results.append(final_winner)
        final_result = (finalists[0], finalists[1], final_winner, probabilities)

        simulation_logs.append({
            'group_rankings': group_rankings,
            'round_of_16': round_of_16_results,
            'quarter_finals': quarter_final_results,
            'semi_finals': semi_final_results,
            'final': final_result,
        })

        # Update progress bar
        progress_bar.progress((i + 1) / num_simulations)

    # Calculate and display results
    winner_counts = pd.Series(simulation_results).value_counts()
    fig_winner = px.bar(winner_counts, x=winner_counts.index, y=winner_counts.values, labels={'x': 'Team', 'y': 'Number of Wins'},
                        title=f"Euro 2024 Winner Distribution (Based on {num_simulations} Simulations)")
    st.plotly_chart(fig_winner)

    # Top 5 winners
    st.subheader("Top 5 Most Likely Winners")
    top_5_winners = winner_counts.head()
    for team, wins in top_5_winners.items():
        st.metric(team, f"{wins} wins", f"{wins/num_simulations:.1%}")

    # Display the process from group stage to final
    st.subheader("Simulation Process")
    for i, log in enumerate(simulation_logs):
        st.markdown(f"### Simulation {i + 1}")

        # Group stage
        st.markdown("**Group Stage Results:**")
        for group_name, ranked_teams in log['group_rankings'].items():
            st.write(f"{group_name}: {', '.join([team for team, points in ranked_teams])}")

        _write_matches("**Round of 16 Matches:**", log['round_of_16'])
        _write_matches("**Quarter-Final Matches:**", log['quarter_finals'])
        _write_matches("**Semi-Final Matches:**", log['semi_finals'])
        _write_matches("**Final Match:**", [log['final']])

# Display data and model information
st.sidebar.header("Data and Model Information")
st.sidebar.write(f"Total matches in dataset: {len(data)}")
st.sidebar.write(f"Date range: {data['date'].min().date()} to {data['date'].max().date()}")

# Allow users to view raw data
if st.sidebar.checkbox("Show raw data"):
    st.subheader("Raw data")
    st.write(data)

# Footer
st.sidebar.markdown("---")
st.sidebar.markdown("Created with ❤️ using Streamlit")