Spaces:

Kabila22
/

Backend_Caps

Sleeping

App Files Files Community

Backend_Caps / app /model /train_model.py

Kabila22

backend commit

c40dc5c 10 months ago

raw

history blame contribute delete

3.95 kB

	# model/train_model.py
	import pandas as pd
	from sklearn.model_selection import train_test_split
	from sklearn.linear_model import LogisticRegression, LinearRegression
	from sklearn.preprocessing import LabelEncoder
	import joblib
	import os
	import logging

	# Configure logging once at import time and take a logger named after this module.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Read the historical match results; if the CSV is absent, log it and re-raise
	# so the caller still sees the original FileNotFoundError.
	try:
	    matches_df = pd.read_csv('data/results.csv')
	except FileNotFoundError as exc:
	    logger.error(f"Dataset not found: {exc}")
	    raise

	# Coerce both score columns to numbers; values that cannot be parsed (and
	# missing values) become 0.
	for _score_col in ('home_score', 'away_score'):
	    matches_df[_score_col] = pd.to_numeric(
	        matches_df[_score_col], errors='coerce'
	    ).fillna(0)

	# Define the training function
	def train_and_save_models():
	    """Train the outcome and goal models on ``matches_df`` and save them.

	    Every match is put into a venue-independent form: ``team1`` is the
	    alphabetically smaller team name and ``team2`` the larger one. Three
	    models are fitted on the label-encoded ``(team1, team2)`` pair:

	    * a logistic regression for the outcome
	      (0 = team1 win, 1 = draw, 2 = team2 win);
	    * a linear regression for the goals scored by team1;
	    * a linear regression for the goals scored by team2.

	    The three models and the fitted ``LabelEncoder`` are written with joblib
	    into the ``model/`` directory (created if missing), relative to the
	    current working directory.

	    Side effects:
	        Adds the ``outcome``, ``team1`` and ``team2`` columns to the
	        module-level ``matches_df``.
	    """
	    # --- Prepare Data ---
	    home_team = matches_df['home_team']
	    away_team = matches_df['away_team']

	    # True where the home side is the alphabetically smaller name, i.e. team1.
	    home_is_team1 = home_team < away_team

	    # Goals must follow the same team1/team2 ordering as the features. Using
	    # the raw home/away scores as targets would attach the wrong team's goals
	    # to every match in which the home side sorts second.
	    team1_goals = matches_df['home_score'].where(home_is_team1, matches_df['away_score'])
	    team2_goals = matches_df['away_score'].where(home_is_team1, matches_df['home_score'])

	    # Symmetric outcome: 0 for team1 win, 1 for draw, 2 for team2 win.
	    outcome = pd.Series(1, index=matches_df.index)
	    outcome = outcome.mask(team1_goals > team2_goals, 0)
	    outcome = outcome.mask(team1_goals < team2_goals, 2)
	    matches_df['outcome'] = outcome

	    # Sort teams alphabetically so a pairing looks the same regardless of venue.
	    matches_df['team1'] = home_team.where(home_is_team1, away_team)
	    matches_df['team2'] = away_team.where(home_is_team1, home_team)

	    # Encode team names with a single LabelEncoder fitted on every team that
	    # appears as either home or away side.
	    all_teams = pd.concat([home_team, away_team]).unique()
	    team_encoder = LabelEncoder()
	    team_encoder.fit(all_teams)

	    # The same encoded (team1, team2) features feed all three models.
	    features = pd.DataFrame({
	        'team1': team_encoder.transform(matches_df['team1']),
	        'team2': team_encoder.transform(matches_df['team2']),
	    })

	    # One split keeps features and all three targets row-aligned; the held-out
	    # 20% is discarded, as only the training part is used.
	    (
	        X_train, _,
	        y_train_outcome, _,
	        y_train_team1_goals, _,
	        y_train_team2_goals, _,
	    ) = train_test_split(
	        features, matches_df['outcome'], team1_goals, team2_goals,
	        test_size=0.2, random_state=42,
	    )

	    # --- Logistic Regression for Match Outcome ---
	    # `multi_class` is deprecated in recent scikit-learn releases; the default
	    # already fits a multinomial model for the three outcome classes.
	    logistic_model = LogisticRegression(max_iter=1000)
	    logistic_model.fit(X_train, y_train_outcome)

	    # --- Linear Regression for Goal Prediction ---
	    linear_model_team1 = LinearRegression()
	    linear_model_team2 = LinearRegression()
	    linear_model_team1.fit(X_train, y_train_team1_goals)
	    linear_model_team2.fit(X_train, y_train_team2_goals)

	    # Ensure the model directory exists
	    os.makedirs('model', exist_ok=True)

	    # Save all models and the label encoder
	    joblib.dump(logistic_model, 'model/logistic_regression_model.pkl')
	    joblib.dump(linear_model_team1, 'model/linear_regression_team1_goals.pkl')
	    joblib.dump(linear_model_team2, 'model/linear_regression_team2_goals.pkl')
	    joblib.dump(team_encoder, 'model/label_encoder.pkl')

	    logger.info("Logistic Regression and Linear Regression models, along with LabelEncoder, saved successfully.")

	# Script entry point: train and persist the models only when this file is
	# executed directly, not when it is imported.
	if __name__ == "__main__":
	    train_and_save_models()