Spaces:
Sleeping
Sleeping
| from abc import ABC, abstractmethod | |
| import sys | |
| import os | |
| import pandas as pd | |
| from typing import Dict, Any, Optional, List | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.svm import SVC | |
| from sklearn.naive_bayes import BernoulliNB | |
| from sklearn.ensemble import RandomForestClassifier | |
| from xgboost import XGBClassifier | |
| from lightgbm import LGBMClassifier | |
| from ..analysis.elo import process_fights_for_elo, INITIAL_ELO | |
| from ..config import FIGHTERS_CSV_PATH | |
| from .preprocess import preprocess_for_ml, _get_fighter_history_stats | |
| from .utils import calculate_age, prepare_fighters_data | |
| from .config import DEFAULT_ELO | |
class BaseModel(ABC):
    """
    Abstract base class for all prediction models.

    Ensures that every model exposes a standard interface for training and
    prediction so concrete models can be used interchangeably.
    """

    @abstractmethod
    def train(self, train_fights):
        """
        Trains or prepares the model using historical fight data.

        :param train_fights: A list of historical fight data dictionaries.
        """
        # @abstractmethod forces subclasses to implement this; previously the
        # method silently did nothing if a subclass forgot to override it.
        raise NotImplementedError

    @abstractmethod
    def predict(self, fight):
        """
        Predicts the winner of a single fight.

        :param fight: A dictionary representing a single fight.
        :return: The name of the predicted winning fighter.
        """
        raise NotImplementedError
class EloBaselineModel(BaseModel):
    """
    Baseline model: whichever fighter holds the higher ELO rating is
    predicted to win.
    """

    def __init__(self):
        # Populated by train(); indexed by the fighter's full name.
        self.fighters_df = None

    def train(self, train_fights):
        """
        'Training' for the ELO baseline just loads the fighter table so that
        ELO scores can be looked up at prediction time.
        """
        print("Training EloBaselineModel: Loading fighter ELO data...")
        table = pd.read_csv(FIGHTERS_CSV_PATH)
        table['full_name'] = table['first_name'] + ' ' + table['last_name']
        table = table.drop_duplicates(subset=['full_name'])
        self.fighters_df = table.set_index('full_name')

    def predict(self, fight: Dict[str, Any]) -> Dict[str, Optional[float]]:
        """Predicts the winner based on ELO and calculates win probability."""
        name_a = fight['fighter_1']
        name_b = fight['fighter_2']
        try:
            elo_a = self.fighters_df.loc[name_a, 'elo']
            elo_b = self.fighters_df.loc[name_b, 'elo']
        except KeyError as e:
            print(f"Warning: Could not find ELO for fighter {e}. Skipping prediction.")
            return {'winner': None, 'probability': None}
        # Standard logistic ELO expectation for fighter A; ties go to fighter A.
        prob_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
        if prob_a < 0.5:
            return {'winner': name_b, 'probability': 1 - prob_a}
        return {'winner': name_a, 'probability': prob_a}
class BaseMLModel(BaseModel):
    """
    An abstract base class for machine learning models that handles all common
    data preparation, training, and prediction logic.
    """

    def __init__(self, model):
        """
        :param model: A scikit-learn-style classifier instance exposing
                      ``fit`` and ``predict_proba``.
        :raises ValueError: If no model instance is supplied.
        """
        if model is None:
            raise ValueError("A model must be provided.")
        self.model = model
        self.fighters_df = None      # fighter attribute table, set in train()
        self.fighter_histories = {}  # fighter name -> chronologically sorted fights

    def train(self, train_fights: List[Dict[str, Any]]) -> None:
        """
        Trains the machine learning model. This involves loading fighter data,
        pre-calculating histories, and fitting the model on the preprocessed data.

        :param train_fights: Historical fight dicts. NOTE: each dict is mutated
                             in place to gain a parsed ``date_obj`` timestamp,
                             which downstream history stats rely on.
        """
        print(f"--- Training {self.model.__class__.__name__} ---")
        # 1. Prepare data for prediction-time feature generation
        self.fighters_df = prepare_fighters_data(pd.read_csv(FIGHTERS_CSV_PATH))
        # 2. Pre-calculate fighter histories in a single pass over the fights.
        #    (Previously this rescanned the full fight list once per fighter,
        #    i.e. O(fighters * fights).)
        grouped: Dict[str, List[Dict[str, Any]]] = {}
        for fight in train_fights:
            fight['date_obj'] = pd.to_datetime(fight['event_date'])
            grouped.setdefault(fight['fighter_1'], []).append(fight)
            if fight['fighter_2'] != fight['fighter_1']:
                grouped.setdefault(fight['fighter_2'], []).append(fight)
        # Only fighters present in the fighter table get a history entry,
        # matching the original behavior.
        for fighter_name in self.fighters_df.index:
            self.fighter_histories[fighter_name] = sorted(
                grouped.get(fighter_name, []), key=lambda f: f['date_obj']
            )
        # 3. Preprocess and fit
        X_train, y_train, _ = preprocess_for_ml(train_fights, FIGHTERS_CSV_PATH)
        print(f"Fitting model on {X_train.shape[0]} samples...")
        self.model.fit(X_train, y_train)
        print("Model training complete.")

    def predict(self, fight):
        """
        Predicts the outcome of a single fight, returning the winner and probability.

        :param fight: Dict with at least 'fighter_1', 'fighter_2', 'event_date'.
        :return: ``{'winner': name, 'probability': float}`` or
                 ``{'winner': None, 'probability': None}`` if a fighter is unknown.
        """
        f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
        fight_date = pd.to_datetime(fight['event_date'])
        if f1_name not in self.fighters_df.index or f2_name not in self.fighters_df.index:
            print(f"Warning: Fighter not found. Skipping prediction for {f1_name} vs {f2_name}")
            return {'winner': None, 'probability': None}
        f1_stats = self.fighters_df.loc[f1_name]
        f2_stats = self.fighters_df.loc[f2_name]
        # A duplicated index entry yields a DataFrame; collapse to the first row.
        if isinstance(f1_stats, pd.DataFrame): f1_stats = f1_stats.iloc[0]
        if isinstance(f2_stats, pd.DataFrame): f2_stats = f2_stats.iloc[0]
        f1_hist = self.fighter_histories.get(f1_name, [])
        f2_hist = self.fighter_histories.get(f2_name, [])
        f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, f1_hist, self.fighters_df)
        f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, f2_hist, self.fighters_df)
        f1_age = calculate_age(f1_stats.get('dob'), fight['event_date'])
        f2_age = calculate_age(f2_stats.get('dob'), fight['event_date'])
        # Differential features (fighter 1 minus fighter 2); must mirror the
        # feature set produced by preprocess_for_ml at training time.
        features = {
            'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
            'height_diff_cm': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
            'reach_diff_in': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
            # Falls back to 0 when either date of birth is missing.
            'age_diff_years': (f1_age - f2_age) if f1_age and f2_age else 0,
            'stance_is_different': 1 if f1_stats.get('stance') != f2_stats.get('stance') else 0,
            'wins_last_5_diff': f1_hist_stats['wins_last_n'] - f2_hist_stats['wins_last_n'],
            'avg_opp_elo_last_5_diff': f1_hist_stats['avg_opp_elo_last_n'] - f2_hist_stats['avg_opp_elo_last_n'],
            'ko_percent_last_5_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
            'sig_str_landed_per_min_last_5_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
            'takedown_accuracy_last_5_diff': f1_hist_stats['takedown_accuracy_last_n'] - f2_hist_stats['takedown_accuracy_last_n'],
            'sub_attempts_per_min_last_5_diff': f1_hist_stats['sub_attempts_per_min_last_n'] - f2_hist_stats['sub_attempts_per_min_last_n'],
        }
        feature_vector = pd.DataFrame([features]).fillna(0)
        # predict_proba returns [P(class 0), P(class 1)]; class 1 = fighter 1 wins.
        probabilities = self.model.predict_proba(feature_vector)[0]
        prob_f1_wins = probabilities[1]
        if prob_f1_wins >= 0.5:
            return {'winner': f1_name, 'probability': prob_f1_wins}
        else:
            return {'winner': f2_name, 'probability': 1 - prob_f1_wins}
class LogisticRegressionModel(BaseMLModel):
    """Convenience wrapper exposing scikit-learn's LogisticRegression."""

    def __init__(self):
        classifier = LogisticRegression()
        super().__init__(model=classifier)
class XGBoostModel(BaseMLModel):
    """A thin wrapper for XGBoost's XGBClassifier."""

    def __init__(self):
        # `use_label_encoder` was deprecated in XGBoost 1.3 and removed in 2.0;
        # passing it only triggers warnings on modern versions, so it is omitted.
        model = XGBClassifier(eval_metric='logloss', random_state=42)
        super().__init__(model=model)
class SVCModel(BaseMLModel):
    """Convenience wrapper exposing scikit-learn's Support Vector Classifier."""

    def __init__(self):
        # probability=True is required so predict_proba works downstream,
        # at the cost of slower training.
        classifier = SVC(probability=True, random_state=42)
        super().__init__(model=classifier)
class RandomForestModel(BaseMLModel):
    """Convenience wrapper exposing scikit-learn's RandomForestClassifier."""

    def __init__(self):
        classifier = RandomForestClassifier(random_state=42)
        super().__init__(model=classifier)
class BernoulliNBModel(BaseMLModel):
    """Convenience wrapper exposing scikit-learn's Bernoulli Naive Bayes classifier."""

    def __init__(self):
        classifier = BernoulliNB()
        super().__init__(model=classifier)
class LGBMModel(BaseMLModel):
    """Convenience wrapper exposing LightGBM's LGBMClassifier."""

    def __init__(self):
        classifier = LGBMClassifier(random_state=42)
        super().__init__(model=classifier)