# Backend_Caps / app / model / train_model.py
# Kabila22's picture
# backend commit
# c40dc5c
# model/train_model.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import LabelEncoder
import joblib
import os
import logging
# Module-wide logger, named after this module so records are attributable.
logger = logging.getLogger(__name__)
# Configure the root logger so INFO-level progress messages are emitted.
logging.basicConfig(level=logging.INFO)
# Load the historical match results once, at import time. The training
# function below reads this module-level frame.
try:
    matches_df = pd.read_csv('data/results.csv')
except FileNotFoundError as e:
    logger.error("Dataset not found: %s", e)
    raise

# Parse the scores as numbers; anything missing or unparseable becomes NaN.
matches_df['home_score'] = pd.to_numeric(matches_df['home_score'], errors='coerce')
matches_df['away_score'] = pd.to_numeric(matches_df['away_score'], errors='coerce')

# Rows without a real result (e.g. fixtures not played yet, corrupt cells)
# must not be turned into 0-0 draws: that would teach the models results
# that never happened. Rows without team names are dropped as well because
# they cannot be ordered or label-encoded later on.
_required_columns = ['home_team', 'away_team', 'home_score', 'away_score']
_incomplete_rows = matches_df[_required_columns].isna().any(axis=1)
if _incomplete_rows.any():
    logger.warning(
        "Dropping %d of %d rows with missing teams or scores",
        int(_incomplete_rows.sum()),
        len(matches_df),
    )
    # reset_index returns a fresh frame, so later column assignments are safe.
    matches_df = matches_df.loc[~_incomplete_rows].reset_index(drop=True)
# Define the training function
def train_and_save_models(model_dir: str = 'model') -> None:
    """Train the outcome and goal models on ``matches_df`` and save them.

    Each match is re-expressed for an order-independent pair of teams:
    ``team1`` is the alphabetically smaller name and ``team2`` the larger
    one, so a fixture maps to the same features regardless of who played at
    home. Outcome and goal targets are oriented the same way.

    Three models are fitted on an 80% training split (``random_state=42``):

    * a logistic regression for the outcome class
      (0 = team1 win, 1 = draw, 2 = team2 win);
    * one linear regression for team1's goals and one for team2's goals.

    The remaining 20% is used only to log validation scores.

    Side effects:
        Adds ``outcome``, ``team1`` and ``team2`` columns to the module-level
        ``matches_df`` and writes four joblib files into ``model_dir``
        (three models plus the team-name ``LabelEncoder``).

    Args:
        model_dir: Directory the artefacts are written to; created if it
            does not exist. Defaults to ``'model'`` (relative to the current
            working directory).
    """
    # --- Prepare data ---
    home_team = matches_df['home_team']
    away_team = matches_df['away_team']
    home_score = matches_df['home_score']
    away_score = matches_df['away_score']

    # True where the home side is the alphabetically smaller name, i.e. where
    # "home" corresponds to team1. Everything below is oriented by this mask.
    home_is_team1 = home_team < away_team

    # Goals from the (team1, team2) point of view. Using the raw home/away
    # scores here would mismatch targets and features whenever the home team
    # sorts second.
    team1_goals = home_score.where(home_is_team1, away_score)
    team2_goals = away_score.where(home_is_team1, home_score)

    # 0 = team1 win, 1 = draw, 2 = team2 win.
    matches_df['outcome'] = (
        1
        + (team2_goals > team1_goals).astype(int)
        - (team1_goals > team2_goals).astype(int)
    )
    matches_df['team1'] = home_team.where(home_is_team1, away_team)
    matches_df['team2'] = away_team.where(home_is_team1, home_team)

    # A single encoder fitted on every team name seen on either side.
    team_encoder = LabelEncoder()
    team_encoder.fit(pd.concat([home_team, away_team]).unique())

    # NOTE(review): the integer team ids are fed to linear models as ordinal
    # numbers; kept as-is so the saved artefacts keep the same input format.
    features = pd.DataFrame({
        'team1': team_encoder.transform(matches_df['team1']),
        'team2': team_encoder.transform(matches_df['team2']),
    })

    # One split for all targets keeps the rows aligned across the models.
    (
        X_train, X_test,
        y_train_outcome, y_test_outcome,
        y_train_team1_goals, y_test_team1_goals,
        y_train_team2_goals, y_test_team2_goals,
    ) = train_test_split(
        features,
        matches_df['outcome'],
        team1_goals,
        team2_goals,
        test_size=0.2,
        random_state=42,
    )

    # --- Logistic Regression for match outcome ---
    # `multi_class` is deprecated since scikit-learn 1.5; the default solver
    # already fits a multinomial model for multi-class targets.
    logistic_model = LogisticRegression(max_iter=1000)
    logistic_model.fit(X_train, y_train_outcome)

    # --- Linear Regression for goal prediction (one model per side) ---
    linear_model_team1 = LinearRegression()
    linear_model_team2 = LinearRegression()
    linear_model_team1.fit(X_train, y_train_team1_goals)
    linear_model_team2.fit(X_train, y_train_team2_goals)

    logger.info(
        "Hold-out scores: outcome accuracy=%.3f, team1 goals R^2=%.3f, "
        "team2 goals R^2=%.3f",
        logistic_model.score(X_test, y_test_outcome),
        linear_model_team1.score(X_test, y_test_team1_goals),
        linear_model_team2.score(X_test, y_test_team2_goals),
    )

    # --- Save the models and the encoder ---
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(logistic_model, os.path.join(model_dir, 'logistic_regression_model.pkl'))
    joblib.dump(linear_model_team1, os.path.join(model_dir, 'linear_regression_team1_goals.pkl'))
    joblib.dump(linear_model_team2, os.path.join(model_dir, 'linear_regression_team2_goals.pkl'))
    joblib.dump(team_encoder, os.path.join(model_dir, 'label_encoder.pkl'))
    logger.info("Logistic Regression and Linear Regression models, along with LabelEncoder, saved successfully.")
# Script entry point: train and save the models only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    train_and_save_models()