# model/train_model.py
"""Train and persist the match-outcome and goal-prediction models.

Importing this module reads ``data/results.csv`` into ``matches_df``.
Running it as a script trains the models and writes them to ``model/``.
"""
import logging
import os

import joblib
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load datasets
try:
    matches_df = pd.read_csv('data/results.csv')
except FileNotFoundError as e:
    logger.error(f"Dataset not found: {e}")
    raise

# NOTE(review): unparseable or missing scores are coerced to 0, so such rows
# are later trained on as 0-0 draws -- confirm this is intended.
matches_df['home_score'] = pd.to_numeric(matches_df['home_score'], errors='coerce').fillna(0)
matches_df['away_score'] = pd.to_numeric(matches_df['away_score'], errors='coerce').fillna(0)


# Define the training function
def train_and_save_models():
    """Fit the outcome and goal models on ``matches_df`` and save them.

    Every match is put into a venue-independent orientation: ``team1`` is the
    alphabetically smaller team name and ``team2`` the larger one. All targets
    are expressed in that same orientation:

    * ``outcome`` -- 0 if team1 won, 1 for a draw, 2 if team2 won.
    * team1 / team2 goals -- the goals scored by team1 and team2 respectively
      (NOT simply the home / away scores).

    Side effects:
        * Adds the ``outcome``, ``team1`` and ``team2`` columns to the
          module-level ``matches_df``.
        * Creates the ``model`` directory if needed and writes four joblib
          files into it: the logistic-regression outcome model, the two
          linear-regression goal models and the fitted team ``LabelEncoder``.

    Returns:
        None
    """
    # --- Prepare Data ---
    home_team = matches_df['home_team']
    away_team = matches_df['away_team']

    # True where the home side is team1 (the alphabetically smaller name).
    home_is_team1 = home_team < away_team

    # Sort teams alphabetically to ensure consistency
    matches_df['team1'] = home_team.where(home_is_team1, away_team)
    matches_df['team2'] = away_team.where(home_is_team1, home_team)

    # Re-orient the scores the same way as the teams. Using the raw home/away
    # scores as targets would pair the wrong score with the wrong team for
    # every match in which the home side sorts after the away side.
    team1_goals = matches_df['home_score'].where(home_is_team1, matches_df['away_score'])
    team2_goals = matches_df['away_score'].where(home_is_team1, matches_df['home_score'])

    # Symmetric outcome: 0 for team1 win, 1 for draw, 2 for team2 win
    outcome = pd.Series(1, index=matches_df.index)
    outcome = outcome.mask(team1_goals > team2_goals, 0)
    outcome = outcome.mask(team1_goals < team2_goals, 2)
    matches_df['outcome'] = outcome

    # Get all unique team names from both home_team and away_team and encode
    # them with a single LabelEncoder so team1 and team2 share one id space.
    all_teams = pd.concat([home_team, away_team]).unique()
    team_encoder = LabelEncoder()
    team_encoder.fit(all_teams)

    # NOTE(review): team ids are fed to linear models as plain integers, which
    # imposes an arbitrary ordering on teams. The feature layout is kept as-is
    # because anything loading the saved models presumably passes these same
    # two columns -- confirm before switching to one-hot encoding.
    features = pd.DataFrame({
        'team1': team_encoder.transform(matches_df['team1']),
        'team2': team_encoder.transform(matches_df['team2']),
    })

    # One split for all targets. With a fixed random_state the partition
    # depends only on the number of rows, so this selects the same training
    # rows as splitting each target separately.
    (
        X_train, _,
        y_train_outcome, _,
        y_train_team1_goals, _,
        y_train_team2_goals, _,
    ) = train_test_split(
        features, matches_df['outcome'], team1_goals, team2_goals,
        test_size=0.2, random_state=42,
    )

    # --- Logistic Regression for Match Outcome ---
    # The default lbfgs solver already fits a multinomial model for the three
    # outcome classes; the explicit ``multi_class`` argument is deprecated
    # since scikit-learn 1.5 and is therefore not passed.
    logistic_model = LogisticRegression(max_iter=1000)
    logistic_model.fit(X_train, y_train_outcome)

    # --- Linear Regression for Goal Prediction ---
    linear_model_team1 = LinearRegression()
    linear_model_team2 = LinearRegression()
    linear_model_team1.fit(X_train, y_train_team1_goals)
    linear_model_team2.fit(X_train, y_train_team2_goals)

    # Ensure the model directory exists
    os.makedirs('model', exist_ok=True)

    # Save all models and the label encoder
    joblib.dump(logistic_model, 'model/logistic_regression_model.pkl')
    joblib.dump(linear_model_team1, 'model/linear_regression_team1_goals.pkl')
    joblib.dump(linear_model_team2, 'model/linear_regression_team2_goals.pkl')
    joblib.dump(team_encoder, 'model/label_encoder.pkl')

    logger.info("Logistic Regression and Linear Regression models, along with LabelEncoder, saved successfully.")


if __name__ == "__main__":
    train_and_save_models()