Spaces:
Sleeping
Sleeping
| # model/train_model.py | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.linear_model import LogisticRegression, LinearRegression | |
| from sklearn.preprocessing import LabelEncoder | |
| import joblib | |
| import os | |
| import logging | |
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load datasets
try:
    matches_df = pd.read_csv('data/results.csv')
except FileNotFoundError as e:
    # Log for operators, then re-raise: training cannot continue without data.
    logger.error("Dataset not found: %s", e)
    raise

# Coerce the score columns to numbers; anything unparseable becomes NaN.
for _score_column in ('home_score', 'away_score'):
    matches_df[_score_column] = pd.to_numeric(matches_df[_score_column], errors='coerce')

# Rows without a usable score are dropped instead of being filled with 0.
# Filling with 0 would turn every match that has no recorded result into a
# fabricated 0-0 draw and feed it to the models as a real training example.
_missing_score = matches_df[['home_score', 'away_score']].isna().any(axis=1)
if _missing_score.any():
    logger.warning(
        "Dropping %d matches with missing or non-numeric scores",
        int(_missing_score.sum()),
    )
    matches_df = matches_df.loc[~_missing_score].reset_index(drop=True)
# Define the training function
def _orient_matches(df):
    """Return per-match features and targets in alphabetical team order.

    For every row of *df* (which must provide ``home_team``, ``away_team``,
    ``home_score`` and ``away_score``), ``team1`` is the alphabetically
    smaller team name and ``team2`` the larger one. Goals and the outcome are
    re-oriented the same way so that targets always describe the same team as
    the feature column they are paired with.

    Returns a new DataFrame, indexed like *df*, with the columns ``team1``,
    ``team2``, ``team1_goals``, ``team2_goals`` and ``outcome``
    (0 = team1 win, 1 = draw, 2 = team2 win). *df* itself is not modified.
    """
    home, away = df['home_team'], df['away_team']
    # True where the home team is already "team1". When both names are equal
    # the comparison is False, which matches the min()/max() ordering.
    home_first = home < away

    team1_goals = df['home_score'].where(home_first, df['away_score'])
    team2_goals = df['away_score'].where(home_first, df['home_score'])

    # Start from "draw" and overwrite the decided matches.
    outcome = pd.Series(1, index=df.index)
    outcome = outcome.mask(team1_goals > team2_goals, 0)
    outcome = outcome.mask(team1_goals < team2_goals, 2)

    return pd.DataFrame({
        'team1': home.where(home_first, away),
        'team2': away.where(home_first, home),
        'team1_goals': team1_goals,
        'team2_goals': team2_goals,
        'outcome': outcome,
    })


def train_and_save_models():
    """Train the outcome and goal models and save them under ``model/``.

    Reads the module-level ``matches_df``, derives alphabetically ordered
    team features, and fits:

    * a logistic regression predicting the match outcome
      (0 = team1 win, 1 = draw, 2 = team2 win), and
    * two linear regressions predicting the goals of team1 and of team2.

    The three models and the team ``LabelEncoder`` are written with joblib to
    ``model/``, which is created if missing. As before, the columns
    ``outcome``, ``team1`` and ``team2`` are added to ``matches_df``.

    NOTE(review): the goal targets are now oriented to team1/team2. Previously
    the raw home/away scores were used, so whenever the home team sorted
    second the two goal models were trained on swapped labels. Confirm that
    the prediction code reads the goal models as "team1"/"team2" goals.
    """
    # --- Prepare Data ---
    oriented = _orient_matches(matches_df)
    # Keep the derived columns on the module-level frame, as the original did.
    for column in ('outcome', 'team1', 'team2'):
        matches_df[column] = oriented[column]

    # One encoder fitted on every team seen as either home or away side.
    team_encoder = LabelEncoder()
    team_encoder.fit(
        pd.concat([matches_df['home_team'], matches_df['away_team']]).unique()
    )

    # NOTE(review): integer team ids are fed to linear models as if they were
    # ordinal values. One-hot encoding would likely model this better, but it
    # changes the input the saved models expect, so it is left as is.
    features = pd.DataFrame(
        {
            'team1': team_encoder.transform(oriented['team1']),
            'team2': team_encoder.transform(oriented['team2']),
        },
        index=oriented.index,
    )

    # A single split keeps features and all three targets row-aligned.
    (
        X_train, X_test,
        y_train_outcome, y_test_outcome,
        y_train_team1_goals, _,
        y_train_team2_goals, _,
    ) = train_test_split(
        features,
        oriented['outcome'],
        oriented['team1_goals'],
        oriented['team2_goals'],
        test_size=0.2,
        random_state=42,
    )

    # --- Logistic Regression for Match Outcome ---
    # `multi_class` is intentionally not passed: it is deprecated since
    # scikit-learn 1.5, and the default solver already fits a multinomial
    # model when there are more than two classes.
    logistic_model = LogisticRegression(max_iter=1000)
    logistic_model.fit(X_train, y_train_outcome)
    logger.info(
        "Outcome model hold-out accuracy: %.3f",
        logistic_model.score(X_test, y_test_outcome),
    )

    # --- Linear Regression for Goal Prediction ---
    linear_model_team1 = LinearRegression()
    linear_model_team2 = LinearRegression()
    linear_model_team1.fit(X_train, y_train_team1_goals)
    linear_model_team2.fit(X_train, y_train_team2_goals)

    # Ensure the model directory exists
    os.makedirs('model', exist_ok=True)

    # Save all models and the label encoder
    joblib.dump(logistic_model, 'model/logistic_regression_model.pkl')
    joblib.dump(linear_model_team1, 'model/linear_regression_team1_goals.pkl')
    joblib.dump(linear_model_team2, 'model/linear_regression_team2_goals.pkl')
    joblib.dump(team_encoder, 'model/label_encoder.pkl')
    logger.info("Logistic Regression and Linear Regression models, along with LabelEncoder, saved successfully.")
# Script entry point: train and persist the models only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    train_and_save_models()