# footypredict-pro / src/backtesting.py
# NetBoss
# V3.0 Ultimate Enhancement - Complete production system
# 6f7e932
"""
Backtesting System
Test model accuracy on historical data to validate predictions.
Features:
- Walk-forward validation
- Multiple time periods
- Profit/loss simulation
- Accuracy by predicted outcome and confidence bucket
"""
import json
import logging
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional
import numpy as np
import pandas as pd
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Directory three levels above this file.
# NOTE(review): confirm this matches the repository layout, i.e. that the
# intended "data" folder really lives three levels up from this module.
BASE_DIR = Path(__file__).parent.parent.parent
# Holds the optional local cache file "training_data.csv".
DATA_DIR = BASE_DIR / "data"
# Each backtest run writes one "backtest_<timestamp>.json" file here.
RESULTS_DIR = DATA_DIR / "backtest_results"
# Import-time side effect: create the results directory (and any missing
# parents) so later writes do not fail on a missing folder.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
class Backtester:
    """Test Elo-based match predictions against historical results.

    The backtest sorts matches by date, builds Elo ratings from the first
    70% of the selected period and then walks forward through the last 30%,
    predicting each match before folding its result into the ratings.
    """

    # Elo parameters shared by the training pass and the walk-forward pass.
    ELO_K = 32
    ELO_DEFAULT = 1500

    # Flat-stake betting simulation: $10 stake at decimal odds of 1.9,
    # i.e. a correct pick wins $9 and a wrong pick loses the $10 stake.
    STAKE = 10
    WIN_PROFIT = 9

    # Fallback data source used when no local cache file exists.
    RESULTS_URL = (
        'https://raw.githubusercontent.com/martj42/'
        'international_results/master/results.csv'
    )

    def __init__(self):
        self.data = None
        self.results = []

    def load_data(self) -> pd.DataFrame:
        """Load historical match data, caching it on the instance.

        The local cache file ``DATA_DIR / "training_data.csv"`` is preferred;
        errors while reading it propagate to the caller. If the cache file
        does not exist the data is downloaded; a failed download is logged
        and an empty DataFrame is returned (and not cached, so a later call
        retries).

        Returns:
            DataFrame with a parsed ``date`` column, or an empty DataFrame
            when the download fails.
        """
        if self.data is not None:
            return self.data

        cache_file = DATA_DIR / "training_data.csv"
        if cache_file.exists():
            frame = pd.read_csv(cache_file)
            frame['date'] = pd.to_datetime(frame['date'])
            self.data = frame
            return self.data

        try:
            frame = pd.read_csv(self.RESULTS_URL)
            frame['date'] = pd.to_datetime(frame['date'])
        except Exception:
            # Best-effort download: report the failure instead of silently
            # swallowing it, and leave self.data unset so nothing
            # half-parsed is cached.
            logger.exception(
                "Failed to download historical results from %s",
                self.RESULTS_URL,
            )
            return pd.DataFrame()

        self.data = frame
        return self.data

    @staticmethod
    def _actual_outcome(home_score, away_score) -> str:
        """Return 'H', 'A' or 'D' for a final score."""
        if home_score > away_score:
            return 'H'
        if home_score < away_score:
            return 'A'
        return 'D'

    def _apply_elo_result(self, elo: Dict[str, float], home: str, away: str,
                          outcome: str) -> None:
        """Update ``elo`` in place with the result of one match."""
        h_elo = elo.get(home, self.ELO_DEFAULT)
        a_elo = elo.get(away, self.ELO_DEFAULT)
        # No home advantage is applied in the rating update itself.
        exp_h = 1 / (1 + 10**((a_elo - h_elo) / 400))
        if outcome == 'H':
            s_h, s_a = 1, 0
        elif outcome == 'A':
            s_h, s_a = 0, 1
        else:
            s_h, s_a = 0.5, 0.5
        elo[home] = h_elo + self.ELO_K * (s_h - exp_h)
        elo[away] = a_elo + self.ELO_K * (s_a - (1 - exp_h))

    def calculate_elo(self, df: pd.DataFrame) -> Dict[str, float]:
        """Calculate Elo ratings from all played matches in ``df``.

        Args:
            df: Matches with ``date``, ``home_team``, ``away_team``,
                ``home_score`` and ``away_score`` columns.

        Returns:
            Mapping of team name to rating after processing the matches in
            date order. Teams start at ``ELO_DEFAULT``.
        """
        elo: Dict[str, float] = {}
        # Rows without a score (e.g. fixtures not yet played) would compare
        # as neither win nor loss and be counted as draws; skip them.
        played = df.dropna(subset=['home_score', 'away_score'])
        played = played.sort_values('date', kind='stable')
        for home, away, home_score, away_score in zip(
            played['home_team'], played['away_team'],
            played['home_score'], played['away_score'],
        ):
            outcome = self._actual_outcome(home_score, away_score)
            self._apply_elo_result(elo, home, away, outcome)
        return elo

    def predict_match(self, home_elo: float, away_elo: float,
                      home_advantage: float = 100) -> Dict:
        """Predict a match from the two Elo ratings.

        Args:
            home_elo: Rating of the home team.
            away_elo: Rating of the away team.
            home_advantage: Rating points added to the home team.

        Returns:
            Dict with ``home_prob``, ``draw_prob``, ``away_prob``,
            ``prediction`` ('H', 'D' or 'A') and ``confidence`` (the largest
            of the three probabilities).
        """
        h_elo = home_elo + home_advantage

        # Expected score of the home side.
        exp_h = 1 / (1 + 10**((away_elo - h_elo) / 400))

        # Rough 3-way split: the draw share is largest (0.25) when the teams
        # are even and shrinks towards 0.20 as the match gets one-sided.
        draw_prob = 0.25 - 0.1 * abs(exp_h - 0.5)
        home_prob = exp_h * (1 - draw_prob)
        away_prob = (1 - exp_h) * (1 - draw_prob)

        # The three shares already sum to 1 up to floating-point rounding;
        # normalising removes that drift.
        total = home_prob + draw_prob + away_prob
        home_prob /= total
        draw_prob /= total
        away_prob /= total

        # Tie-breaking is kept as-is: if home and away are level and both
        # beat the draw, the away side is picked.
        if home_prob > draw_prob and home_prob > away_prob:
            pred = 'H'
        elif away_prob > draw_prob:
            pred = 'A'
        else:
            pred = 'D'

        return {
            'home_prob': home_prob,
            'draw_prob': draw_prob,
            'away_prob': away_prob,
            'prediction': pred,
            'confidence': max(home_prob, draw_prob, away_prob)
        }

    @staticmethod
    def _empty_results() -> Dict:
        """Return the zeroed tally structure filled in by a backtest run."""
        return {
            'total': 0,
            'correct': 0,
            'by_outcome': {'H': {'total': 0, 'correct': 0},
                           'D': {'total': 0, 'correct': 0},
                           'A': {'total': 0, 'correct': 0}},
            'by_confidence': {
                'high': {'total': 0, 'correct': 0},    # > 0.6
                'medium': {'total': 0, 'correct': 0},  # > 0.5 and <= 0.6
                'low': {'total': 0, 'correct': 0}      # <= 0.5
            },
            'profit_loss': 0,  # Flat $10 bets at 1.9 odds
            'predictions': []
        }

    def _record_prediction(self, results: Dict, match_date, home: str,
                           away: str, pred: Dict, actual: str) -> None:
        """Add one scored prediction to the running ``results`` tallies."""
        picked = pred['prediction']
        confidence = pred['confidence']
        correct = picked == actual

        results['total'] += 1
        if correct:
            results['correct'] += 1
            results['profit_loss'] += self.WIN_PROFIT
        else:
            results['profit_loss'] -= self.STAKE

        if confidence > 0.6:
            bucket = 'high'
        elif confidence > 0.5:
            bucket = 'medium'
        else:
            bucket = 'low'

        for tally in (results['by_outcome'][picked],
                      results['by_confidence'][bucket]):
            tally['total'] += 1
            if correct:
                tally['correct'] += 1

        results['predictions'].append({
            'date': str(match_date.date()),
            'match': f"{home} vs {away}",
            'predicted': picked,
            'actual': actual,
            'correct': correct,
            'confidence': round(confidence, 3)
        })

    def run_backtest(self,
                     start_year: int = 2020,
                     end_year: int = 2024,
                     min_confidence: float = 0.5) -> Dict:
        """Run a walk-forward backtest over ``start_year``..``end_year``.

        Args:
            start_year: First calendar year (inclusive) of matches to use.
            end_year: Last calendar year (inclusive) of matches to use.
            min_confidence: Predictions below this confidence are not
                counted as bets; their results still update the ratings.

        Returns:
            Dict of tallies and summary statistics (``accuracy``, ``roi``,
            per-outcome and per-confidence accuracy, the last 50
            predictions), or ``{'error': ...}`` when there is no data or
            fewer than 100 played matches in the period. On success the
            same dict is also written to a timestamped JSON file in
            ``RESULTS_DIR``.
        """
        df = self.load_data()
        if df.empty:
            return {'error': 'No data available'}

        # Filter to the requested period and drop matches without a score,
        # which would otherwise be scored as draws.
        years = df['date'].dt.year
        df = df[(years >= start_year) & (years <= end_year)]
        df = df.dropna(subset=['home_score', 'away_score'])
        df = df.sort_values('date', kind='stable')

        if len(df) < 100:
            return {'error': 'Not enough data for backtest'}

        # Split: use first 70% to build Elo, test on last 30%.
        split_idx = int(len(df) * 0.7)
        train_df = df.iloc[:split_idx]
        test_df = df.iloc[split_idx:]

        elo = self.calculate_elo(train_df)
        results = self._empty_results()

        for match_date, home, away, home_score, away_score in zip(
            test_df['date'], test_df['home_team'], test_df['away_team'],
            test_df['home_score'], test_df['away_score'],
        ):
            pred = self.predict_match(
                elo.get(home, self.ELO_DEFAULT),
                elo.get(away, self.ELO_DEFAULT),
            )
            actual = self._actual_outcome(home_score, away_score)

            if pred['confidence'] >= min_confidence:
                self._record_prediction(
                    results, match_date, home, away, pred, actual
                )

            # Ratings must follow every played match, including the ones
            # that were not confident enough to bet on; otherwise later
            # predictions are made from stale ratings.
            self._apply_elo_result(elo, home, away, actual)

        # Summary statistics.
        total = results['total']
        results['accuracy'] = results['correct'] / total if total > 0 else 0
        results['roi'] = (
            results['profit_loss'] / (total * self.STAKE) if total > 0 else 0
        )

        for outcome in results['by_outcome'].values():
            outcome['accuracy'] = (
                outcome['correct'] / outcome['total']
                if outcome['total'] > 0 else 0
            )

        for conf in results['by_confidence'].values():
            conf['accuracy'] = (
                conf['correct'] / conf['total'] if conf['total'] > 0 else 0
            )

        results['period'] = f"{start_year}-{end_year}"
        results['test_matches'] = len(test_df)
        results['predictions'] = results['predictions'][-50:]  # Last 50 only

        # Persist the run so get_summary() can list it later.
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        out_path = RESULTS_DIR / f'backtest_{timestamp}.json'
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)

        return results

    def get_summary(self) -> Dict:
        """Summarise every saved backtest found in ``RESULTS_DIR``.

        Files that cannot be read or parsed are skipped with a warning
        rather than aborting the whole summary.

        Returns:
            ``{'backtests': [...]}`` with one entry per readable
            ``backtest_*.json`` file, ordered by file path.
        """
        summaries = []
        for path in sorted(RESULTS_DIR.glob('backtest_*.json')):
            try:
                with open(path, 'r', encoding='utf-8') as fh:
                    data = json.load(fh)
            except (OSError, ValueError) as exc:
                logger.warning(
                    "Skipping unreadable backtest file %s: %s", path.name, exc
                )
                continue
            if not isinstance(data, dict):
                logger.warning(
                    "Skipping backtest file %s: unexpected content", path.name
                )
                continue
            summaries.append({
                'file': path.name,
                'period': data.get('period'),
                'accuracy': data.get('accuracy'),
                'roi': data.get('roi'),
                'total_predictions': data.get('total')
            })

        return {'backtests': summaries}
# Module-wide Backtester, created lazily on first request.
_backtester: Optional[Backtester] = None


def get_backtester() -> Backtester:
    """Return the shared Backtester, creating it on the first call."""
    global _backtester
    if _backtester is not None:
        return _backtester
    _backtester = Backtester()
    return _backtester
def run_backtest(start_year: int = 2020, end_year: int = 2024, min_confidence: float = 0.5):
    """Run a backtest on the shared Backtester and return its result dict."""
    shared = get_backtester()
    return shared.run_backtest(start_year, end_year, min_confidence)
def get_backtest_summary():
    """Return the shared Backtester's summary of all saved backtests."""
    shared = get_backtester()
    return shared.get_summary()