Spaces:

Kabila22
/

Backend_Caps

Sleeping

File size: 3,948 Bytes

abdf1bb

# model/train_model.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import LabelEncoder
import joblib
import os
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Read the historical match results. A missing file is logged and the
# FileNotFoundError is re-raised so the script stops instead of training on nothing.
try:
    matches_df = pd.read_csv('data/results.csv')
except FileNotFoundError as e:
    logger.error(f"Dataset not found: {e}")
    raise

# Force both score columns to numeric; anything unparseable or missing becomes 0.
# NOTE(review): rows without a real score are therefore treated as 0 goals —
# confirm that this is intended for matches that have not been played.
matches_df = matches_df.assign(
    home_score=pd.to_numeric(matches_df['home_score'], errors='coerce').fillna(0),
    away_score=pd.to_numeric(matches_df['away_score'], errors='coerce').fillna(0),
)

# Define the training function
def train_and_save_models():
    """Train the match-outcome and goal models on ``matches_df`` and save them.

    Every match is re-oriented so that ``team1`` is the alphabetically smaller
    team name and ``team2`` the larger one; the features therefore do not
    depend on which side played at home. Three models are fitted on the
    label-encoded ``(team1, team2)`` pair:

    * a logistic regression for the outcome
      (0 = team1 win, 1 = draw, 2 = team2 win),
    * a linear regression for the goals scored by team1,
    * a linear regression for the goals scored by team2.

    The three models and the fitted ``LabelEncoder`` are written to the
    ``model/`` directory with joblib.

    Side effects:
        Adds the columns ``team1``, ``team2`` and ``outcome`` to the
        module-level ``matches_df`` and creates ``model/`` if it is missing.
    """
    # --- Prepare Data ---
    home_team = matches_df['home_team']
    away_team = matches_df['away_team']

    # True where the home side is the alphabetically smaller name, i.e. team1.
    home_is_team1 = home_team < away_team

    # Sort teams alphabetically to ensure consistency
    matches_df['team1'] = home_team.where(home_is_team1, away_team)
    matches_df['team2'] = away_team.where(home_is_team1, home_team)

    # Orient the scores the same way as the teams. Using the raw
    # home/away scores here would hand team2's goals to the team1 model
    # whenever the home team sorts after the away team.
    team1_goals = matches_df['home_score'].where(home_is_team1, matches_df['away_score'])
    team2_goals = matches_df['away_score'].where(home_is_team1, matches_df['home_score'])

    # Symmetric outcome: 0 for team1 win, 1 for draw, 2 for team2 win
    outcome = (
        pd.Series(1, index=matches_df.index)
        .mask(team1_goals > team2_goals, 0)
        .mask(team1_goals < team2_goals, 2)
    )
    matches_df['outcome'] = outcome

    # Encode team names with a single LabelEncoder fitted on every team that
    # appears as either home or away side.
    all_teams = pd.concat([home_team, away_team]).unique()
    team_encoder = LabelEncoder()
    team_encoder.fit(all_teams)

    # The same feature matrix feeds the classifier and both regressors.
    features = pd.DataFrame({
        'team1': team_encoder.transform(matches_df['team1']),
        'team2': team_encoder.transform(matches_df['team2']),
    })

    # One split keeps the features and all three targets aligned; the held-out
    # 20% is not used by this script.
    (
        X_train, _,
        y_train_outcome, _,
        y_train_team1_goals, _,
        y_train_team2_goals, _,
    ) = train_test_split(
        features, outcome, team1_goals, team2_goals,
        test_size=0.2, random_state=42,
    )

    # --- Logistic Regression for Match Outcome ---
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn
    # releases — confirm against the pinned version before upgrading.
    logistic_model = LogisticRegression(multi_class='multinomial', max_iter=1000)
    logistic_model.fit(X_train, y_train_outcome)

    # --- Linear Regression for Goal Prediction ---
    linear_model_team1 = LinearRegression()
    linear_model_team2 = LinearRegression()
    linear_model_team1.fit(X_train, y_train_team1_goals)
    linear_model_team2.fit(X_train, y_train_team2_goals)

    # Ensure the model directory exists
    os.makedirs('model', exist_ok=True)

    # Save all models and the label encoder
    joblib.dump(logistic_model, 'model/logistic_regression_model.pkl')
    joblib.dump(linear_model_team1, 'model/linear_regression_team1_goals.pkl')
    joblib.dump(linear_model_team2, 'model/linear_regression_team2_goals.pkl')
    joblib.dump(team_encoder, 'model/label_encoder.pkl')

    logger.info("Logistic Regression and Linear Regression models, along with LabelEncoder, saved successfully.")

# Run the training pipeline only when this file is executed as a script.
if __name__ == "__main__":
    train_and_save_models()