porty / prediction_engine.py
babaTEEpe's picture
Upload 6 files
dd75617 verified
"""
PREDICTION ENGINE — The Brain
Pure math/statistics module. No Selenium. No browser.
Loads historical data, builds team profiles, computes predictions.
Used by both the 1X2 and Double Chance predictors + backtester.
"""
import os
import numpy as np
from collections import defaultdict
from math import factorial, exp
def poisson_pmf(k, lam):
    """Evaluate the Poisson probability P(X = k) for rate ``lam``.

    Uses only ``math.exp`` and ``math.factorial`` so scipy is not needed.
    A non-positive rate is treated as a point mass at zero: the result is
    1.0 for ``k == 0`` and 0.0 for any other ``k``.
    """
    if lam <= 0:
        # Degenerate distribution: all mass sits on k == 0.
        return float(k == 0)
    weight = exp(-lam) * (lam ** k)
    return weight / factorial(k)
class MatchData:
    """One finished fixture: home/away team codes plus the final score.

    ``hs`` is the home side's goals and ``as_`` the away side's goals.
    """

    __slots__ = ('home', 'away', 'hs', 'as_')

    def __init__(self, home, away, hs, as_):
        self.home, self.away = home, away
        self.hs, self.as_ = hs, as_

    @property
    def result(self):
        """1X2 outcome: 'H' (home win), 'A' (away win) or 'D' (draw)."""
        home_goals, away_goals = self.hs, self.as_
        if home_goals > away_goals:
            outcome = 'H'
        elif home_goals < away_goals:
            outcome = 'A'
        else:
            outcome = 'D'
        return outcome

    @property
    def dc_outcome(self):
        """Single Double Chance label assigned to this result.

        Every result is covered by two DC options; this property reports
        only one of them per result (home win -> 'HoA', draw -> 'HoD',
        away win -> 'DoA').  Use ``dc_covers`` to test a specific option.
        """
        outcome = self.result
        if outcome == 'H':
            return 'HoA'
        return 'HoD' if outcome == 'D' else 'DoA'

    def dc_covers(self, option):
        """Return True when this result wins the given DC option.

        ``option`` is one of 'HoD', 'HoA', 'DoA'; anything else is False.
        """
        outcome = self.result
        coverage = (
            ('HoD', ('H', 'D')),
            ('HoA', ('H', 'A')),
            ('DoA', ('D', 'A')),
        )
        for label, winning_results in coverage:
            if option == label:
                return outcome in winning_results
        return False
# ─────────────────────────────────────────────
# TEAM ALIASES — BRN (2024 data) = BRE (current)
# ─────────────────────────────────────────────
TEAM_ALIASES = {'BRN': 'BRE'}


def normalize_team(name):
    """Translate an aliased team code to its canonical code.

    Codes without an entry in ``TEAM_ALIASES`` are returned unchanged.
    """
    try:
        return TEAM_ALIASES[name]
    except KeyError:
        return name
def load_results_file(path):
    """Parse a results text file into a list of MatchData.

    A usable line has exactly three whitespace-separated fields,
    ``HOME H:A AWAY``, where the middle field is two integers joined by a
    colon.  Team codes are passed through ``normalize_team``.  Blank or
    malformed lines are skipped silently, and a path that does not exist
    yields an empty list.
    """
    parsed = []
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                # Blank lines split to [] and are rejected by the length check.
                fields = raw_line.strip().split()
                if len(fields) != 3:
                    continue
                home_code, score_text, away_code = fields
                home = normalize_team(home_code)
                away = normalize_team(away_code)
                goals = score_text.split(':')
                if len(goals) != 2:
                    continue
                try:
                    parsed.append(MatchData(home, away, *map(int, goals)))
                except ValueError:
                    # Non-numeric score: ignore the line.
                    continue
    return parsed
class PredictionEngine:
    """
    The statistical brain.

    Loads all historical data and precomputes:
    - Team win/draw/loss rates (home & away separately)
    - Head-to-head records
    - Goal scoring/conceding averages (for Poisson)
    - Recent form (Markov transition matrices)
    - League-wide base rates
    """

    # The Poisson score grid enumerates 0 .. _POISSON_GOAL_CAP - 1 goals per side.
    _POISSON_GOAL_CAP = 8

    def __init__(self, data_dir='.'):
        """Load the result files found in ``data_dir`` and build all indices."""
        self.data_dir = data_dir
        self.matches = []
        self._load_all_data()
        self._build_indices()

    # ── DATA LOADING ───────────────────────────

    def _load_all_data(self):
        """Load both result files (missing files contribute no matches)."""
        for fname in ['2024-england_virtual_results.txt', 'england_virtual_results.txt']:
            path = os.path.join(self.data_dir, fname)
            self.matches.extend(load_results_file(path))
        print(f"[Engine] Loaded {len(self.matches)} total matches")

    def _build_indices(self):
        """Build lookup structures and league-wide rates from ``self.matches``."""
        self.teams = set()
        # Per-team stats
        self.home_matches = defaultdict(list)  # team → [MatchData played at home]
        self.away_matches = defaultdict(list)  # team → [MatchData played away]
        self.h2h = defaultdict(list)           # (home, away) → [MatchData]
        self.all_by_team = defaultdict(list)   # team → [MatchData] in load order
        for m in self.matches:
            self.teams.add(m.home)
            self.teams.add(m.away)
            self.home_matches[m.home].append(m)
            self.away_matches[m.away].append(m)
            self.h2h[(m.home, m.away)].append(m)
            self.all_by_team[m.home].append(m)
            self.all_by_team[m.away].append(m)

        # League-wide averages
        if self.matches:
            total_goals = sum(m.hs + m.as_ for m in self.matches)
            self.league_avg_goals = total_goals / len(self.matches)
            self.league_avg_per_side = self.league_avg_goals / 2
            results = [m.result for m in self.matches]
            self.league_home_rate = results.count('H') / len(results)
            self.league_draw_rate = results.count('D') / len(results)
            self.league_away_rate = results.count('A') / len(results)
        else:
            # No data loaded: fall back to fixed default rates.
            self.league_avg_goals = 2.5
            self.league_avg_per_side = 1.25
            self.league_home_rate = 0.40
            self.league_draw_rate = 0.25
            self.league_away_rate = 0.35
        print(f"[Engine] {len(self.teams)} teams indexed")
        print(f"[Engine] League avg goals/match: {self.league_avg_goals:.2f}")
        print(f"[Engine] Base rates: H={self.league_home_rate:.1%} D={self.league_draw_rate:.1%} A={self.league_away_rate:.1%}")

    # ── TEAM STATISTICS ────────────────────────

    def team_home_record(self, team):
        """Returns (win_rate, draw_rate, loss_rate, avg_scored, avg_conceded) at home.

        A team with no home matches gets the league base rates instead.
        """
        ms = self.home_matches.get(team, [])
        if not ms:
            return (self.league_home_rate, self.league_draw_rate, self.league_away_rate,
                    self.league_avg_per_side, self.league_avg_per_side)
        n = len(ms)
        w = sum(1 for m in ms if m.result == 'H')
        d = sum(1 for m in ms if m.result == 'D')
        l = n - w - d
        gs = sum(m.hs for m in ms)
        gc = sum(m.as_ for m in ms)
        return w/n, d/n, l/n, gs/n, gc/n

    def team_away_record(self, team):
        """Returns (win_rate, draw_rate, loss_rate, avg_scored, avg_conceded) away.

        A team with no away matches gets the league base rates instead
        (seen from the away side: win = league away rate).
        """
        ms = self.away_matches.get(team, [])
        if not ms:
            return (self.league_away_rate, self.league_draw_rate, self.league_home_rate,
                    self.league_avg_per_side, self.league_avg_per_side)
        n = len(ms)
        w = sum(1 for m in ms if m.result == 'A')
        d = sum(1 for m in ms if m.result == 'D')
        l = n - w - d
        gs = sum(m.as_ for m in ms)
        gc = sum(m.hs for m in ms)
        return w/n, d/n, l/n, gs/n, gc/n

    def head_to_head(self, home, away):
        """H2H record for this exact matchup (home=home, away=away).

        Returns None when the pairing has never been played in that
        orientation, otherwise a dict with the match count and the
        home-win / draw / away-win frequencies.
        """
        ms = self.h2h.get((home, away), [])
        if not ms:
            return None
        n = len(ms)
        hw = sum(1 for m in ms if m.result == 'H')
        dr = sum(1 for m in ms if m.result == 'D')
        aw = n - hw - dr
        return {'matches': n, 'home_win': hw/n, 'draw': dr/n, 'away_win': aw/n}

    def recent_form(self, team, last_n=15):
        """Last N results for a team as a W/D/L sequence, in stored order.

        A non-positive ``last_n`` yields an empty list.
        """
        if last_n <= 0:
            # Guard: a bare [-0:] slice would return the entire history.
            return []
        ms = self.all_by_team.get(team, [])[-last_n:]
        home_view = {'H': 'W', 'D': 'D', 'A': 'L'}
        away_view = {'H': 'L', 'D': 'D', 'A': 'W'}
        return [
            (home_view if m.home == team else away_view)[m.result]
            for m in ms
        ]

    def markov_transition(self, team, last_n=20):
        """Build a W/D/L Markov transition matrix from recent form.

        Returns ``{state: {next_state: probability}}``.  With fewer than
        three form entries a fixed default matrix is returned; a state that
        was never left within the window gets a uniform row.
        """
        form = self.recent_form(team, last_n)
        if len(form) < 3:
            return {'W': {'W': 0.4, 'D': 0.3, 'L': 0.3},
                    'D': {'W': 0.35, 'D': 0.3, 'L': 0.35},
                    'L': {'W': 0.3, 'D': 0.3, 'L': 0.4}}
        trans = defaultdict(lambda: defaultdict(int))
        for current, following in zip(form, form[1:]):
            trans[current][following] += 1
        # Normalize
        result = {}
        for state in ['W', 'D', 'L']:
            total = sum(trans[state].values())
            if total == 0:
                result[state] = {'W': 1/3, 'D': 1/3, 'L': 1/3}
            else:
                result[state] = {s: trans[state][s]/total for s in ['W', 'D', 'L']}
        return result

    # ── POISSON MODEL ──────────────────────────

    def poisson_predict(self, home, away):
        """
        Predict match outcome probabilities using Poisson distribution.

        Returns dict with P(H), P(D), P(A) under keys 'H', 'D', 'A', plus
        the expected goals used ('exp_home', 'exp_away').
        """
        _, _, _, h_gs, h_gc = self.team_home_record(home)
        _, _, _, a_gs, a_gc = self.team_away_record(away)

        # Attack & defense strengths relative to league average
        avg = self.league_avg_per_side
        if avg > 0:
            home_attack, home_defense = h_gs / avg, h_gc / avg
            away_attack, away_defense = a_gs / avg, a_gc / avg
        else:
            home_attack = home_defense = away_attack = away_defense = 1.0

        # Expected goals
        exp_home = home_attack * away_defense * avg
        exp_away = away_attack * home_defense * avg
        # Clamp to reasonable range
        exp_home = max(0.3, min(exp_home, 4.0))
        exp_away = max(0.3, min(exp_away, 4.0))

        # Each side's PMF depends only on its own goal count, so evaluate
        # it once per count instead of once per score-line cell.
        goal_counts = range(self._POISSON_GOAL_CAP)
        home_pmf = [poisson_pmf(i, exp_home) for i in goal_counts]
        away_pmf = [poisson_pmf(j, exp_away) for j in goal_counts]

        p_h = p_d = p_a = 0.0
        for i, pi in enumerate(home_pmf):
            for j, pj in enumerate(away_pmf):
                p = pi * pj
                if i > j:
                    p_h += p
                elif i == j:
                    p_d += p
                else:
                    p_a += p

        # The grid is truncated, so renormalise the three outcomes.
        total = p_h + p_d + p_a
        if total > 0:
            p_h /= total
            p_d /= total
            p_a /= total
        return {'H': p_h, 'D': p_d, 'A': p_a, 'exp_home': exp_home, 'exp_away': exp_away}

    # ── ODDS CONVERSION ───────────────────────

    @staticmethod
    def odds_to_probs(home_odds, draw_odds, away_odds):
        """Convert 1X2 odds to true probabilities (remove bookmaker margin).

        The three returned probabilities sum to 1.
        """
        raw_h = 1.0 / home_odds
        raw_d = 1.0 / draw_odds
        raw_a = 1.0 / away_odds
        overround = raw_h + raw_d + raw_a
        return raw_h / overround, raw_d / overround, raw_a / overround

    @staticmethod
    def dc_odds_to_probs(hod, hoa, doa):
        """Convert Double Chance odds to implied probabilities.

        Each 1X2 outcome is covered by exactly two DC options, so
        P(HoD) + P(HoA) + P(DoA) = 2.  The inverse odds are therefore
        rescaled to sum to 2, which puts them on the same scale as the
        model's ``p_h + p_d`` style probabilities they are compared with.
        """
        raw_hod = 1.0 / hod
        raw_hoa = 1.0 / hoa
        raw_doa = 1.0 / doa
        scale = 2.0 / (raw_hod + raw_hoa + raw_doa)
        return raw_hod * scale, raw_hoa * scale, raw_doa * scale

    # ── 1X2 PREDICTION ─────────────────────────

    def _historical_probs(self, home, away):
        """(P(H), P(D), P(A)) from the home side's home record and the away side's away record."""
        h_wr, h_dr, h_lr, _, _ = self.team_home_record(home)
        a_wr, a_dr, a_lr, _, _ = self.team_away_record(away)
        # Average each outcome as seen from both sides: a home win is the
        # home team winning at home and the away team losing away, etc.
        p_h = (h_wr + a_lr) / 2
        p_d = (h_dr + a_dr) / 2
        p_a = (h_lr + a_wr) / 2
        p_d = max(0.05, p_d)  # never rule a draw out entirely
        total = p_h + p_d + p_a
        return p_h / total, p_d / total, p_a / total

    def _next_win_prob(self, team):
        """Markov probability that ``team`` wins next, given its latest result."""
        form = self.recent_form(team, 10)
        if not form:
            return 0.33
        return self.markov_transition(team, 15)[form[-1]]['W']

    def _form_probs(self, home, away):
        """(P(H), P(D), P(A)) derived from both teams' Markov form."""
        h_next_w = self._next_win_prob(home)
        a_next_w = self._next_win_prob(away)
        p_h = h_next_w * 0.6 + (1 - a_next_w) * 0.4
        p_a = a_next_w * 0.6 + (1 - h_next_w) * 0.4
        p_d = max(0.05, 1.0 - p_h - p_a)
        total = p_h + p_d + p_a
        return p_h / total, p_d / total, p_a / total

    def predict_1x2(self, home, away, h_odds=None, d_odds=None, a_odds=None):
        """
        Full Bayesian-style 1X2 prediction.

        Fuses: odds-implied probs, Poisson model, historical rates,
        h2h record, and Markov form as a weighted average.

        Returns: dict with the predicted outcome ('prediction'), its
        probability ('confidence'), a 'level' label, the fused 'probs',
        the raw 'poisson' result and a 'skip' flag for low confidence.
        """
        home = normalize_team(home)
        away = normalize_team(away)

        # 1) Odds-implied probabilities (weight 0 when odds are not supplied)
        if h_odds and d_odds and a_odds:
            p_h_odds, p_d_odds, p_a_odds = self.odds_to_probs(h_odds, d_odds, a_odds)
            w_odds = 0.25
        else:
            p_h_odds = self.league_home_rate
            p_d_odds = self.league_draw_rate
            p_a_odds = self.league_away_rate
            w_odds = 0.0

        # 2) Poisson model
        poisson = self.poisson_predict(home, away)
        w_poisson = 0.25

        # 3) Historical team rates
        p_h_hist, p_d_hist, p_a_hist = self._historical_probs(home, away)
        w_hist = 0.25

        # 4) Head-to-head (needs at least 3 meetings, else reuse history at low weight)
        h2h = self.head_to_head(home, away)
        if h2h and h2h['matches'] >= 3:
            p_h_h2h = h2h['home_win']
            p_d_h2h = h2h['draw']
            p_a_h2h = h2h['away_win']
            w_h2h = 0.15
        else:
            p_h_h2h, p_d_h2h, p_a_h2h = p_h_hist, p_d_hist, p_a_hist
            w_h2h = 0.05

        # 5) Markov form
        p_h_form, p_d_form, p_a_form = self._form_probs(home, away)
        w_form = 0.10

        # Normalize weights
        w_total = w_odds + w_poisson + w_hist + w_h2h + w_form
        w_odds /= w_total
        w_poisson /= w_total
        w_hist /= w_total
        w_h2h /= w_total
        w_form /= w_total

        # Fuse
        p_h = (w_odds * p_h_odds + w_poisson * poisson['H'] +
               w_hist * p_h_hist + w_h2h * p_h_h2h + w_form * p_h_form)
        p_d = (w_odds * p_d_odds + w_poisson * poisson['D'] +
               w_hist * p_d_hist + w_h2h * p_d_h2h + w_form * p_d_form)
        p_a = (w_odds * p_a_odds + w_poisson * poisson['A'] +
               w_hist * p_a_hist + w_h2h * p_a_h2h + w_form * p_a_form)
        total = p_h + p_d + p_a
        p_h /= total
        p_d /= total
        p_a /= total

        best = max(('H', p_h), ('D', p_d), ('A', p_a), key=lambda x: x[1])
        confidence = best[1]
        if confidence >= 0.55:
            level = "HIGH"
        elif confidence >= 0.42:
            level = "MEDIUM"
        else:
            level = "LOW"
        return {
            'prediction': best[0],
            'confidence': confidence,
            'level': level,
            'probs': {'H': p_h, 'D': p_d, 'A': p_a},
            'poisson': poisson,
            'skip': confidence < 0.38
        }

    # ── DOUBLE CHANCE PREDICTION ───────────────

    def predict_dc(self, home, away, hod_odds=None, hoa_odds=None, doa_odds=None,
                   h_odds=None, d_odds=None, a_odds=None):
        """
        Full Double Chance prediction using EDGE-BASED approach.

        Key insight: raw DC probabilities always favor HoA (since P(H)+P(A) dominates
        when home advantage is strong). Instead, we compute each option's EDGE
        relative to the league baseline for that option, identifying matchup-specific
        deviations.

        Returns a dict with the chosen option ('prediction'), its model
        probability ('confidence'), 'level', per-option 'probs', 'edges'
        and 'asymmetry', the 'underlying_1x2' probabilities and 'skip'.
        """
        home = normalize_team(home)
        away = normalize_team(away)

        # Get 1X2 probabilities from full model
        r1x2 = self.predict_1x2(home, away, h_odds, d_odds, a_odds)
        p_h = r1x2['probs']['H']
        p_d = r1x2['probs']['D']
        p_a = r1x2['probs']['A']

        # This match's DC probabilities
        p_hod = p_h + p_d
        p_hoa = p_h + p_a
        p_doa = p_d + p_a

        # League-wide DC baselines
        base_hod = self.league_home_rate + self.league_draw_rate
        base_hoa = self.league_home_rate + self.league_away_rate
        base_doa = self.league_draw_rate + self.league_away_rate

        # EDGE = how much better than baseline this matchup is for each option
        edge_hod = p_hod - base_hod
        edge_hoa = p_hoa - base_hoa
        edge_doa = p_doa - base_doa

        # H2H DC frequency adjustment
        h2h_matches = self.h2h.get((home, away), [])
        if len(h2h_matches) >= 3:
            n = len(h2h_matches)
            hod_freq = sum(1 for m in h2h_matches if m.dc_covers('HoD')) / n
            hoa_freq = sum(1 for m in h2h_matches if m.dc_covers('HoA')) / n
            doa_freq = sum(1 for m in h2h_matches if m.dc_covers('DoA')) / n
            # H2H edge (deviation from base)
            edge_hod = 0.70 * edge_hod + 0.30 * (hod_freq - base_hod)
            edge_hoa = 0.70 * edge_hoa + 0.30 * (hoa_freq - base_hoa)
            edge_doa = 0.70 * edge_doa + 0.30 * (doa_freq - base_doa)

        # DC odds-implied edge (if available)
        asym_hod = asym_hoa = asym_doa = 0.0
        if hod_odds and hoa_odds and doa_odds:
            p_hod_odds, p_hoa_odds, p_doa_odds = self.dc_odds_to_probs(hod_odds, hoa_odds, doa_odds)
            # Odds edge = implied prob minus model prob (positive = bookie thinks more likely)
            asym_hod = p_hod_odds - p_hod
            asym_hoa = p_hoa_odds - p_hoa
            asym_doa = p_doa_odds - p_doa
            # Blend odds signal into edge (40% weight to odds)
            edge_hod = 0.60 * edge_hod + 0.40 * (p_hod_odds - base_hod)
            edge_hoa = 0.60 * edge_hoa + 0.40 * (p_hoa_odds - base_hoa)
            edge_doa = 0.60 * edge_doa + 0.40 * (p_doa_odds - base_doa)

        # Pick by highest edge
        edges = {'HoD': edge_hod, 'HoA': edge_hoa, 'DoA': edge_doa}
        best = max(edges, key=edges.get)

        # Confidence = the actual probability of the chosen option
        probs = {'HoD': p_hod, 'HoA': p_hoa, 'DoA': p_doa}
        confidence = probs[best]

        # Confidence thresholds for DC (base rates are ~66%, so higher bar)
        if confidence >= 0.72:
            level = "HIGH"
        elif confidence >= 0.65:
            level = "MEDIUM"
        else:
            level = "LOW"
        return {
            'prediction': best,
            'confidence': confidence,
            'level': level,
            'probs': probs,
            'edges': edges,
            'asymmetry': {'HoD': asym_hod, 'HoA': asym_hoa, 'DoA': asym_doa},
            'underlying_1x2': r1x2['probs'],
            'skip': confidence < 0.60
        }
# ── QUICK TEST ─────────────────────────────────
# Smoke run: build an engine from the current directory and print a few
# sample 1X2 and Double Chance predictions.
if __name__ == '__main__':
    engine = PredictionEngine()

    print("\n" + "="*60)
    print("SAMPLE 1X2 PREDICTIONS")
    print("="*60)
    for home, away in [('MCI', 'CHE'), ('ARS', 'TOT'), ('LIV', 'MUN'), ('NEW', 'BRE'), ('FUL', 'WOL')]:
        r = engine.predict_1x2(home, away)
        print(f"\n{home} vs {away}:")
        print(f" H={r['probs']['H']:.1%} D={r['probs']['D']:.1%} A={r['probs']['A']:.1%}")
        print(f" Prediction: {r['prediction']} ({r['level']}, {r['confidence']:.1%})")
        print(f" Poisson xG: {r['poisson']['exp_home']:.2f} - {r['poisson']['exp_away']:.2f}")
        if r['skip']:
            # The message previously printed mis-decoded bytes instead of
            # the warning sign and em dash.
            print(" ⚠️ SKIP — low confidence")

    print("\n" + "="*60)
    print("SAMPLE DOUBLE CHANCE PREDICTIONS")
    print("="*60)
    for home, away in [('MCI', 'CHE'), ('ARS', 'TOT'), ('LIV', 'MUN')]:
        r = engine.predict_dc(home, away)
        print(f"\n{home} vs {away}:")
        print(f" HoD={r['probs']['HoD']:.1%} HoA={r['probs']['HoA']:.1%} DoA={r['probs']['DoA']:.1%}")
        print(f" Prediction: {r['prediction']} ({r['level']}, {r['confidence']:.1%})")