Spaces:

babaTEEpe
/

porty

Sleeping

App Files Files Community

porty / prediction_engine.py

babaTEEpe

Upload 6 files

dd75617 verified about 1 month ago

raw

history blame contribute delete

19.9 kB

	"""
	PREDICTION ENGINE — The Brain
	Pure math/statistics module. No Selenium. No browser.
	Loads historical data, builds team profiles, computes predictions.
	Used by both the 1X2 and Double Chance predictors + backtester.
	"""
	import os
	import numpy as np
	from collections import defaultdict
	from math import factorial, exp


	def poisson_pmf(k, lam):
	    """Poisson probability mass function without scipy dependency.

	    Args:
	        k: Number of events (an integer count).
	        lam: Expected number of events (the Poisson rate).

	    Returns:
	        P(X = k) for X ~ Poisson(lam). A negative ``k`` lies outside the
	        support and has probability 0.0. A non-positive ``lam`` is treated
	        as a degenerate distribution with all mass on ``k == 0``.
	    """
	    if k < 0:
	        # Outside the support. Without this guard a negative k returned 0.0
	        # only when lam <= 0 and raised ValueError (from factorial) otherwise.
	        return 0.0
	    if lam <= 0:
	        return 1.0 if k == 0 else 0.0
	    try:
	        return (lam ** k) * exp(-lam) / factorial(k)
	    except OverflowError:
	        # lam ** k or factorial(k) no longer fits in a float (the factorial
	        # overflows from k = 171). Evaluate in log space instead; the result
	        # is tiny but finite, and may underflow to 0.0.
	        from math import lgamma, log
	        return exp(k * log(lam) - lam - lgamma(k + 1))


	class MatchData:
	    """A single finished fixture: both team codes plus the final score."""
	    __slots__ = ('home', 'away', 'hs', 'as_')

	    def __init__(self, home, away, hs, as_):
	        # hs / as_ are the home and away goal counts respectively.
	        self.home, self.away = home, away
	        self.hs, self.as_ = hs, as_

	    @property
	    def result(self):
	        """1X2 outcome: 'H' (home win), 'A' (away win) or 'D' (anything else)."""
	        home_goals, away_goals = self.hs, self.as_
	        if home_goals > away_goals:
	            return 'H'
	        return 'A' if home_goals < away_goals else 'D'

	    @property
	    def dc_outcome(self):
	        """Which Double Chance option wins: HoD, HoA, or DoA.

	        Each result is covered by two options; this reports one fixed
	        representative per result.
	        """
	        representative = {
	            'H': 'HoA',  # Home win covers HoD AND HoA
	            'D': 'HoD',  # Draw covers HoD AND DoA
	        }
	        # Away win covers HoA AND DoA
	        return representative.get(self.result, 'DoA')

	    def dc_covers(self, option):
	        """Does this result cover a given DC option?

	        Unknown option names are never covered.
	        """
	        outcome = self.result
	        coverage = (
	            ('HoD', ('H', 'D')),
	            ('HoA', ('H', 'A')),
	            ('DoA', ('D', 'A')),
	        )
	        for label, winning_results in coverage:
	            if option == label:
	                return outcome in winning_results
	        return False


	# ─────────────────────────────────────────────
	# TEAM ALIASES — BRN (2024 data) = BRE (current)
	# ─────────────────────────────────────────────
	# Maps a superseded team code to the code used everywhere else in the engine.
	TEAM_ALIASES = {'BRN': 'BRE'}


	def normalize_team(name):
	    """Return the canonical code for ``name``; codes without an alias pass through."""
	    if name in TEAM_ALIASES:
	        return TEAM_ALIASES[name]
	    return name


	def load_results_file(path):
	    """Load a results txt file → list[MatchData].

	    Each usable line has exactly three whitespace-separated fields:
	    ``HOME <home_goals>:<away_goals> AWAY``. Team codes are passed through
	    ``normalize_team``. Blank lines, lines with a different field count, and
	    lines whose score is not two integers joined by ':' are skipped silently.
	    A path that does not exist yields an empty list.
	    """
	    if not os.path.exists(path):
	        return []

	    parsed = []
	    with open(path, 'r', encoding='utf-8') as handle:
	        for raw_line in handle:
	            # split() with no arguments already discards surrounding
	            # whitespace, so a blank line simply produces zero fields.
	            fields = raw_line.split()
	            if len(fields) != 3:
	                continue
	            home_code, score_text, away_code = fields
	            goals = score_text.split(':')
	            if len(goals) != 2:
	                continue
	            try:
	                home_goals = int(goals[0])
	                away_goals = int(goals[1])
	            except ValueError:
	                continue
	            parsed.append(MatchData(normalize_team(home_code),
	                                    normalize_team(away_code),
	                                    home_goals, away_goals))
	    return parsed


	class PredictionEngine:
	    """
	    The statistical brain.

	    Loads all historical data and precomputes:
	    - Team win/draw/loss rates (home & away separately)
	    - Head-to-head records
	    - Goal scoring/conceding averages (for Poisson)
	    - Recent form (Markov transition matrices)
	    - League-wide base rates
	    """

	    def __init__(self, data_dir='.'):
	        """Load the result files found in ``data_dir`` and build all indices."""
	        self.data_dir = data_dir
	        self.matches = []
	        self._load_all_data()
	        self._build_indices()

	    # ── DATA LOADING ───────────────────────────
	    def _load_all_data(self):
	        """Load both result files, appending to ``self.matches`` in list order."""
	        for fname in ['2024-england_virtual_results.txt', 'england_virtual_results.txt']:
	            path = os.path.join(self.data_dir, fname)
	            self.matches.extend(load_results_file(path))
	        print(f"[Engine] Loaded {len(self.matches)} total matches")

	    def _build_indices(self):
	        """Build lookup structures and league-wide rates from match data."""
	        self.teams = set()
	        # Per-team stats
	        self.home_matches = defaultdict(list)  # team → [MatchData played at home]
	        self.away_matches = defaultdict(list)  # team → [MatchData played away]
	        self.h2h = defaultdict(list)           # (home, away) → [MatchData]
	        self.all_by_team = defaultdict(list)   # team → [MatchData] in order

	        for m in self.matches:
	            self.teams.add(m.home)
	            self.teams.add(m.away)
	            self.home_matches[m.home].append(m)
	            self.away_matches[m.away].append(m)
	            self.h2h[(m.home, m.away)].append(m)
	            self.all_by_team[m.home].append(m)
	            self.all_by_team[m.away].append(m)

	        # League-wide averages
	        if self.matches:
	            total_goals = sum(m.hs + m.as_ for m in self.matches)
	            self.league_avg_goals = total_goals / len(self.matches)
	            self.league_avg_per_side = self.league_avg_goals / 2
	            results = [m.result for m in self.matches]
	            self.league_home_rate = results.count('H') / len(results)
	            self.league_draw_rate = results.count('D') / len(results)
	            self.league_away_rate = results.count('A') / len(results)
	        else:
	            # No data loaded: fall back to fixed priors so every method
	            # still returns usable numbers.
	            self.league_avg_goals = 2.5
	            self.league_avg_per_side = 1.25
	            self.league_home_rate = 0.40
	            self.league_draw_rate = 0.25
	            self.league_away_rate = 0.35

	        print(f"[Engine] {len(self.teams)} teams indexed")
	        print(f"[Engine] League avg goals/match: {self.league_avg_goals:.2f}")
	        print(f"[Engine] Base rates: H={self.league_home_rate:.1%} D={self.league_draw_rate:.1%} A={self.league_away_rate:.1%}")

	    # ── TEAM STATISTICS ────────────────────────
	    def team_home_record(self, team):
	        """Returns (win_rate, draw_rate, loss_rate, avg_scored, avg_conceded) at home.

	        Falls back to the league-wide rates and per-side goal average when
	        the team has no recorded home matches.
	        """
	        # .get() instead of indexing so an unknown team does not create an
	        # empty entry in the defaultdict.
	        ms = self.home_matches.get(team, [])
	        if not ms:
	            return (self.league_home_rate, self.league_draw_rate, self.league_away_rate,
	                    self.league_avg_per_side, self.league_avg_per_side)
	        n = len(ms)
	        wins = sum(1 for m in ms if m.result == 'H')
	        draws = sum(1 for m in ms if m.result == 'D')
	        losses = n - wins - draws
	        scored = sum(m.hs for m in ms)
	        conceded = sum(m.as_ for m in ms)
	        return wins / n, draws / n, losses / n, scored / n, conceded / n

	    def team_away_record(self, team):
	        """Returns (win_rate, draw_rate, loss_rate, avg_scored, avg_conceded) away.

	        Falls back to the league-wide rates (seen from the away side) and
	        per-side goal average when the team has no recorded away matches.
	        """
	        ms = self.away_matches.get(team, [])
	        if not ms:
	            return (self.league_away_rate, self.league_draw_rate, self.league_home_rate,
	                    self.league_avg_per_side, self.league_avg_per_side)
	        n = len(ms)
	        wins = sum(1 for m in ms if m.result == 'A')
	        draws = sum(1 for m in ms if m.result == 'D')
	        losses = n - wins - draws
	        scored = sum(m.as_ for m in ms)
	        conceded = sum(m.hs for m in ms)
	        return wins / n, draws / n, losses / n, scored / n, conceded / n

	    def head_to_head(self, home, away):
	        """H2H record for this exact matchup (home=home, away=away).

	        Returns None when the pairing has never been played in this
	        orientation, otherwise a dict with keys 'matches', 'home_win',
	        'draw' and 'away_win' (the last three are rates).
	        """
	        ms = self.h2h.get((home, away), [])
	        if not ms:
	            return None
	        n = len(ms)
	        hw = sum(1 for m in ms if m.result == 'H')
	        dr = sum(1 for m in ms if m.result == 'D')
	        aw = n - hw - dr
	        return {'matches': n, 'home_win': hw/n, 'draw': dr/n, 'away_win': aw/n}

	    def recent_form(self, team, last_n=15):
	        """Last N results for a team (W/D/L sequence), oldest first."""
	        ms = self.all_by_team.get(team, [])[-last_n:]
	        form = []
	        for m in ms:
	            r = m.result
	            if m.home == team:
	                # Seen from the home side: H is a win, A is a loss.
	                form.append(r.replace('H', 'W').replace('A', 'L'))
	            else:
	                # Seen from the away side the mapping is mirrored.
	                form.append('W' if r == 'A' else ('L' if r == 'H' else 'D'))
	        return form

	    def markov_transition(self, team, last_n=20):
	        """Build Markov transition matrix from recent form.

	        Returns a nested dict ``matrix[current][next]`` over the states
	        'W', 'D', 'L'. With fewer than three recent results a fixed default
	        matrix is returned; a state that was never left gets a uniform row.
	        """
	        form = self.recent_form(team, last_n)
	        if len(form) < 3:
	            return {'W': {'W': 0.4, 'D': 0.3, 'L': 0.3},
	                    'D': {'W': 0.35, 'D': 0.3, 'L': 0.35},
	                    'L': {'W': 0.3, 'D': 0.3, 'L': 0.4}}
	        trans = defaultdict(lambda: defaultdict(int))
	        for i in range(len(form) - 1):
	            trans[form[i]][form[i+1]] += 1
	        # Normalize
	        result = {}
	        for state in ['W', 'D', 'L']:
	            total = sum(trans[state].values())
	            if total == 0:
	                result[state] = {'W': 1/3, 'D': 1/3, 'L': 1/3}
	            else:
	                result[state] = {s: trans[state][s]/total for s in ['W', 'D', 'L']}
	        return result

	    # ── POISSON MODEL ──────────────────────────
	    def poisson_predict(self, home, away):
	        """
	        Predict match outcome probabilities using Poisson distribution.
	        Returns dict with P(H), P(D), P(A) under keys 'H', 'D', 'A', plus the
	        clamped expected goals under 'exp_home' and 'exp_away'.
	        """
	        _, _, _, h_gs, h_gc = self.team_home_record(home)
	        _, _, _, a_gs, a_gc = self.team_away_record(away)

	        # Attack & defense strengths relative to league average
	        home_attack = h_gs / self.league_avg_per_side if self.league_avg_per_side > 0 else 1.0
	        home_defense = h_gc / self.league_avg_per_side if self.league_avg_per_side > 0 else 1.0
	        away_attack = a_gs / self.league_avg_per_side if self.league_avg_per_side > 0 else 1.0
	        away_defense = a_gc / self.league_avg_per_side if self.league_avg_per_side > 0 else 1.0

	        # Expected goals
	        exp_home = home_attack * away_defense * self.league_avg_per_side
	        exp_away = away_attack * home_defense * self.league_avg_per_side

	        # Clamp to reasonable range
	        exp_home = max(0.3, min(exp_home, 4.0))
	        exp_away = max(0.3, min(exp_away, 4.0))

	        # Score grid is truncated at 7 goals per side. Each side's PMF is
	        # computed once up front instead of once per grid cell.
	        home_pmf = [poisson_pmf(i, exp_home) for i in range(8)]
	        away_pmf = [poisson_pmf(j, exp_away) for j in range(8)]

	        p_h = 0.0
	        p_d = 0.0
	        p_a = 0.0
	        for i, pi in enumerate(home_pmf):
	            for j, pj in enumerate(away_pmf):
	                p = pi * pj
	                if i > j:
	                    p_h += p
	                elif i == j:
	                    p_d += p
	                else:
	                    p_a += p

	        # Renormalize to make up for the truncated tail of the grid.
	        total = p_h + p_d + p_a
	        if total > 0:
	            p_h /= total
	            p_d /= total
	            p_a /= total
	        return {'H': p_h, 'D': p_d, 'A': p_a, 'exp_home': exp_home, 'exp_away': exp_away}

	    # ── ODDS CONVERSION ───────────────────────
	    @staticmethod
	    def odds_to_probs(home_odds, draw_odds, away_odds):
	        """Convert 1X2 odds to true probabilities (remove bookmaker margin).

	        The inverse odds are rescaled proportionally so they sum to 1.
	        Returns a tuple (p_home, p_draw, p_away).
	        """
	        raw_h = 1.0 / home_odds
	        raw_d = 1.0 / draw_odds
	        raw_a = 1.0 / away_odds
	        overround = raw_h + raw_d + raw_a
	        return raw_h / overround, raw_d / overround, raw_a / overround

	    @staticmethod
	    def dc_odds_to_probs(hod, hoa, doa):
	        """Convert Double Chance odds to implied probabilities.

	        Every 1X2 result is covered by exactly two of the three Double
	        Chance options, so the fair probabilities satisfy
	        P(HoD) + P(HoA) + P(DoA) = 2. The bookmaker margin is therefore
	        removed by rescaling the inverse odds to sum to 2 (not 1), which
	        keeps the values comparable with sums of 1X2 probabilities such as
	        P(H) + P(D).

	        Returns a tuple (p_hod, p_hoa, p_doa).
	        """
	        raw_hod = 1.0 / hod
	        raw_hoa = 1.0 / hoa
	        raw_doa = 1.0 / doa
	        overround = raw_hod + raw_hoa + raw_doa
	        scale = 2.0 / overround
	        return raw_hod * scale, raw_hoa * scale, raw_doa * scale

	    # ── 1X2 PREDICTION ─────────────────────────
	    def predict_1x2(self, home, away, h_odds=None, d_odds=None, a_odds=None):
	        """
	        Full Bayesian-style 1X2 prediction.
	        Fuses: odds-implied probs, Poisson model, historical rates,
	        h2h record, and Markov form.
	        Returns: dict with final probabilities and confidence.

	        Result keys: 'prediction' ('H'/'D'/'A'), 'confidence' (probability of
	        the prediction), 'level' ('HIGH'/'MEDIUM'/'LOW'), 'probs', 'poisson'
	        (the raw Poisson output) and 'skip' (True when confidence < 0.38).
	        Odds are only used when all three are supplied and non-zero.
	        """
	        home = normalize_team(home)
	        away = normalize_team(away)

	        # 1) Odds-implied probabilities
	        if h_odds and d_odds and a_odds:
	            p_h_odds, p_d_odds, p_a_odds = self.odds_to_probs(h_odds, d_odds, a_odds)
	            w_odds = 0.25
	        else:
	            # Weight 0 removes this component; the values are placeholders.
	            p_h_odds = self.league_home_rate
	            p_d_odds = self.league_draw_rate
	            p_a_odds = self.league_away_rate
	            w_odds = 0.0

	        # 2) Poisson model
	        poisson = self.poisson_predict(home, away)
	        w_poisson = 0.25

	        # 3) Historical team rates
	        h_wr, h_dr, h_lr, _, _ = self.team_home_record(home)
	        a_wr, a_dr, a_lr, _, _ = self.team_away_record(away)
	        # Pair each outcome with its mirror image in the opponent's record:
	        # a home win is an away loss, a draw is a draw for both sides.
	        # (Averaging h_wr with 1 - a_wr made P(H) + P(A) equal 1 identically,
	        # which pinned the draw probability to the 0.05 floor below.)
	        p_h_hist = (h_wr + a_lr) / 2
	        p_d_hist = (h_dr + a_dr) / 2
	        p_a_hist = (a_wr + h_lr) / 2
	        p_d_hist = max(0.05, p_d_hist)
	        total = p_h_hist + p_d_hist + p_a_hist
	        p_h_hist /= total
	        p_d_hist /= total
	        p_a_hist /= total
	        w_hist = 0.25

	        # 4) Head-to-head
	        h2h = self.head_to_head(home, away)
	        if h2h and h2h['matches'] >= 3:
	            p_h_h2h = h2h['home_win']
	            p_d_h2h = h2h['draw']
	            p_a_h2h = h2h['away_win']
	            w_h2h = 0.15
	        else:
	            # Too few meetings: reuse the historical estimate at low weight.
	            p_h_h2h = p_h_hist
	            p_d_h2h = p_d_hist
	            p_a_h2h = p_a_hist
	            w_h2h = 0.05

	        # 5) Markov form
	        h_form = self.recent_form(home, 10)
	        a_form = self.recent_form(away, 10)
	        h_trans = self.markov_transition(home, 15)
	        a_trans = self.markov_transition(away, 15)

	        # Probability that each side wins next, given its latest result.
	        if h_form:
	            h_state = h_form[-1]
	            h_next_w = h_trans[h_state]['W']
	        else:
	            h_next_w = 0.33

	        if a_form:
	            a_state = a_form[-1]
	            a_next_w = a_trans[a_state]['W']
	        else:
	            a_next_w = 0.33

	        p_h_form = h_next_w * 0.6 + (1 - a_next_w) * 0.4
	        p_a_form = a_next_w * 0.6 + (1 - h_next_w) * 0.4
	        p_d_form = 1.0 - p_h_form - p_a_form
	        p_d_form = max(0.05, p_d_form)
	        total = p_h_form + p_d_form + p_a_form
	        p_h_form /= total
	        p_d_form /= total
	        p_a_form /= total
	        w_form = 0.10

	        # Normalize weights
	        w_total = w_odds + w_poisson + w_hist + w_h2h + w_form
	        w_odds /= w_total
	        w_poisson /= w_total
	        w_hist /= w_total
	        w_h2h /= w_total
	        w_form /= w_total

	        # Fuse
	        p_h = (w_odds * p_h_odds + w_poisson * poisson['H'] +
	               w_hist * p_h_hist + w_h2h * p_h_h2h + w_form * p_h_form)
	        p_d = (w_odds * p_d_odds + w_poisson * poisson['D'] +
	               w_hist * p_d_hist + w_h2h * p_d_h2h + w_form * p_d_form)
	        p_a = (w_odds * p_a_odds + w_poisson * poisson['A'] +
	               w_hist * p_a_hist + w_h2h * p_a_h2h + w_form * p_a_form)

	        total = p_h + p_d + p_a
	        p_h /= total
	        p_d /= total
	        p_a /= total

	        best = max(('H', p_h), ('D', p_d), ('A', p_a), key=lambda x: x[1])
	        confidence = best[1]

	        if confidence >= 0.55:
	            level = "HIGH"
	        elif confidence >= 0.42:
	            level = "MEDIUM"
	        else:
	            level = "LOW"

	        return {
	            'prediction': best[0],
	            'confidence': confidence,
	            'level': level,
	            'probs': {'H': p_h, 'D': p_d, 'A': p_a},
	            'poisson': poisson,
	            'skip': confidence < 0.38
	        }

	    # ── DOUBLE CHANCE PREDICTION ───────────────
	    def predict_dc(self, home, away, hod_odds=None, hoa_odds=None, doa_odds=None,
	                   h_odds=None, d_odds=None, a_odds=None):
	        """
	        Full Double Chance prediction using EDGE-BASED approach.

	        Key insight: raw DC probabilities always favor HoA (since P(H)+P(A) dominates
	        when home advantage is strong). Instead, we compute each option's EDGE
	        relative to the league baseline for that option, identifying matchup-specific
	        deviations.

	        Result keys: 'prediction' ('HoD'/'HoA'/'DoA'), 'confidence' (model
	        probability of the prediction), 'level', 'probs', 'edges',
	        'asymmetry' (odds-implied minus model probability, all 0.0 without
	        DC odds), 'underlying_1x2' and 'skip' (True when confidence < 0.60).
	        """
	        home = normalize_team(home)
	        away = normalize_team(away)

	        # Get 1X2 probabilities from full model
	        r1x2 = self.predict_1x2(home, away, h_odds, d_odds, a_odds)
	        p_h = r1x2['probs']['H']
	        p_d = r1x2['probs']['D']
	        p_a = r1x2['probs']['A']

	        # This match's DC probabilities
	        p_hod = p_h + p_d
	        p_hoa = p_h + p_a
	        p_doa = p_d + p_a

	        # League-wide DC baselines
	        base_hod = self.league_home_rate + self.league_draw_rate
	        base_hoa = self.league_home_rate + self.league_away_rate
	        base_doa = self.league_draw_rate + self.league_away_rate

	        # EDGE = how much better than baseline this matchup is for each option
	        edge_hod = p_hod - base_hod
	        edge_hoa = p_hoa - base_hoa
	        edge_doa = p_doa - base_doa

	        # H2H DC frequency adjustment
	        h2h_matches = self.h2h.get((home, away), [])
	        if len(h2h_matches) >= 3:
	            n = len(h2h_matches)
	            hod_freq = sum(1 for m in h2h_matches if m.dc_covers('HoD')) / n
	            hoa_freq = sum(1 for m in h2h_matches if m.dc_covers('HoA')) / n
	            doa_freq = sum(1 for m in h2h_matches if m.dc_covers('DoA')) / n
	            # H2H edge (deviation from base)
	            edge_hod = 0.70 * edge_hod + 0.30 * (hod_freq - base_hod)
	            edge_hoa = 0.70 * edge_hoa + 0.30 * (hoa_freq - base_hoa)
	            edge_doa = 0.70 * edge_doa + 0.30 * (doa_freq - base_doa)

	        # DC odds-implied edge (if available)
	        asym_hod = asym_hoa = asym_doa = 0.0
	        if hod_odds and hoa_odds and doa_odds:
	            # dc_odds_to_probs returns values that sum to 2, the same scale
	            # as p_hod/p_hoa/p_doa and the baselines they are compared with.
	            p_hod_odds, p_hoa_odds, p_doa_odds = self.dc_odds_to_probs(hod_odds, hoa_odds, doa_odds)
	            # Odds edge = implied prob minus model prob (positive = bookie thinks more likely)
	            asym_hod = p_hod_odds - p_hod
	            asym_hoa = p_hoa_odds - p_hoa
	            asym_doa = p_doa_odds - p_doa
	            # Blend odds signal into edge (40% weight to odds)
	            odds_edge_hod = p_hod_odds - base_hod
	            odds_edge_hoa = p_hoa_odds - base_hoa
	            odds_edge_doa = p_doa_odds - base_doa
	            edge_hod = 0.60 * edge_hod + 0.40 * odds_edge_hod
	            edge_hoa = 0.60 * edge_hoa + 0.40 * odds_edge_hoa
	            edge_doa = 0.60 * edge_doa + 0.40 * odds_edge_doa

	        # Pick by highest edge
	        edges = {'HoD': edge_hod, 'HoA': edge_hoa, 'DoA': edge_doa}
	        best = max(edges, key=edges.get)

	        # Confidence = the actual probability of the chosen option
	        probs = {'HoD': p_hod, 'HoA': p_hoa, 'DoA': p_doa}
	        confidence = probs[best]

	        # Confidence thresholds for DC (base rates are ~66%, so higher bar)
	        if confidence >= 0.72:
	            level = "HIGH"
	        elif confidence >= 0.65:
	            level = "MEDIUM"
	        else:
	            level = "LOW"

	        return {
	            'prediction': best,
	            'confidence': confidence,
	            'level': level,
	            'probs': probs,
	            'edges': edges,
	            'asymmetry': {'HoD': asym_hod, 'HoA': asym_hoa, 'DoA': asym_doa},
	            'underlying_1x2': r1x2['probs'],
	            'skip': confidence < 0.60
	        }


	# ── QUICK TEST ─────────────────────────────────
	if __name__ == '__main__':
	    # Smoke test: build the engine from the current directory and print a
	    # handful of sample predictions for both markets.
	    engine = PredictionEngine()
	    rule = "=" * 60

	    fixtures_1x2 = [('MCI', 'CHE'), ('ARS', 'TOT'), ('LIV', 'MUN'), ('NEW', 'BRE'), ('FUL', 'WOL')]
	    fixtures_dc = [('MCI', 'CHE'), ('ARS', 'TOT'), ('LIV', 'MUN')]

	    print("\n" + rule)
	    print("SAMPLE 1X2 PREDICTIONS")
	    print(rule)
	    for home_team, away_team in fixtures_1x2:
	        outcome = engine.predict_1x2(home_team, away_team)
	        probs = outcome['probs']
	        expected = outcome['poisson']
	        print(f"\n{home_team} vs {away_team}:")
	        print(f" H={probs['H']:.1%} D={probs['D']:.1%} A={probs['A']:.1%}")
	        print(f" Prediction: {outcome['prediction']} ({outcome['level']}, {outcome['confidence']:.1%})")
	        print(f" Poisson xG: {expected['exp_home']:.2f} - {expected['exp_away']:.2f}")
	        if outcome['skip']:
	            print(f" ⚠️ SKIP — low confidence")

	    print("\n" + rule)
	    print("SAMPLE DOUBLE CHANCE PREDICTIONS")
	    print(rule)
	    for home_team, away_team in fixtures_dc:
	        outcome = engine.predict_dc(home_team, away_team)
	        probs = outcome['probs']
	        print(f"\n{home_team} vs {away_team}:")
	        print(f" HoD={probs['HoD']:.1%} HoA={probs['HoA']:.1%} DoA={probs['DoA']:.1%}")
	        print(f" Prediction: {outcome['prediction']} ({outcome['level']}, {outcome['confidence']:.1%})")