Teera committed on
Commit
46b7fa3
·
verified ·
1 Parent(s): e6f6cf2

Upload 8 files

Browse files
feature_columns.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "X_cols": [
3
+ "home_roll_gf",
4
+ "home_roll_ga",
5
+ "home_roll_shots_f",
6
+ "home_roll_shots_a",
7
+ "home_roll_sot_f",
8
+ "home_roll_sot_a",
9
+ "home_roll_corn_f",
10
+ "home_roll_corn_a",
11
+ "home_roll_y_f",
12
+ "home_roll_r_f",
13
+ "home_roll_points",
14
+ "away_roll_gf",
15
+ "away_roll_ga",
16
+ "away_roll_shots_f",
17
+ "away_roll_shots_a",
18
+ "away_roll_sot_f",
19
+ "away_roll_sot_a",
20
+ "away_roll_corn_f",
21
+ "away_roll_corn_a",
22
+ "away_roll_y_f",
23
+ "away_roll_r_f",
24
+ "away_roll_points",
25
+ "elo_home",
26
+ "elo_away",
27
+ "elo_diff",
28
+ "p_odds_H",
29
+ "p_odds_D",
30
+ "p_odds_A"
31
+ ],
32
+ "WINDOW": 7
33
+ }
gradio_app.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ from datetime import date, timedelta, datetime as dt
4
+ from typing import List, Optional, Tuple
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ import gradio as gr
9
+ import requests
10
+ from bs4 import BeautifulSoup
11
+ import json
12
+ import joblib
13
+
14
+ from ingest_data import load_epl_data
15
+ from preprocess_data import prepare_features
16
+ from inference_utils import (
17
+ clean_team,
18
+ implied_from_odds,
19
+ build_features_for_fixture,
20
+ )
21
+
22
+ # --------- Load pipeline artifacts once ---------
23
+
24
+ def _next_saturday_str(today: Optional[date] = None) -> str:
25
+ if today is None:
26
+ today = date.today()
27
+ days_ahead = 5 - today.weekday() # 5=Saturday
28
+ if days_ahead <= 0:
29
+ days_ahead += 7
30
+ return (today + timedelta(days=days_ahead)).strftime("%Y-%m-%d")
31
+
32
+
33
+ def _read_team_list(path: Path) -> List[str]:
34
+ if not path.exists():
35
+ return []
36
+ names: List[str] = []
37
+ with open(path, "r", encoding="utf-8") as f:
38
+ for line in f:
39
+ name = line.strip()
40
+ if not name:
41
+ continue
42
+ names.append(name)
43
+ return names
44
+
45
+
46
+ def _load_feature_meta() -> Tuple[List[str], int]:
47
+ candidates = [Path("feature_columns.json"), Path("data") / "feature_columns.json"]
48
+ for p in candidates:
49
+ if p.exists():
50
+ with open(p, "r", encoding="utf-8") as f:
51
+ meta = json.load(f)
52
+ return meta.get("X_cols", []), int(meta.get("WINDOW", 7))
53
+ return [], 7
54
+
55
+
56
def init_pipeline():
    """Load data, features, model and team list once at startup.

    Returns a dict with keys: ``feat_df`` (feature frame), ``df`` (raw match
    frame), ``X_cols`` (feature column order), ``window`` (rolling window),
    ``model`` (fitted classifier), ``team_list`` (UI dropdown choices).

    Raises:
        FileNotFoundError: when no serialized model is found on disk.
    """
    # Data: download/parse seasons from 2010 onward.
    data_raw = load_epl_data(start_y1=2010, end_y1=None, verbose=False)
    feat_df, X_cols_generated, WINDOW_generated, base_df = prepare_features(data_raw, window=7, verbose=False)

    # Features meta (prefer saved training order so inference matches training).
    X_cols_saved, WINDOW_saved = _load_feature_meta()
    X_cols = X_cols_saved if X_cols_saved else X_cols_generated
    window = WINDOW_saved if X_cols_saved else WINDOW_generated

    # Model: first matching path wins; ./model/ is the packaged location.
    model = None
    for mp in [Path("model") / "model_xgb_isotonic.joblib", Path("model_xgb_isotonic.joblib")]:
        if mp.exists():
            model = joblib.load(str(mp))
            break
    if model is None:
        raise FileNotFoundError("Model not found at ./model/model_xgb_isotonic.joblib")

    # Team list (for UI)
    team_list = _read_team_list(Path("data") / "team name.txt")
    if not team_list:
        # fallback to unique teams from data
        team_list = sorted(set(base_df["home"]).union(set(base_df["away"])))

    return {
        "feat_df": feat_df,
        "df": base_df,
        "X_cols": X_cols,
        "window": window,
        "model": model,
        "team_list": team_list,
    }
89
+
90
+
91
# Module-level singleton: pipeline state (data, model, feature metadata)
# loaded once at import time and shared by all UI callbacks.
PIPE = init_pipeline()
92
+
93
+
94
+ # --------- Inference helpers for UI ---------
95
+
96
def manual_predict(home_team: str, away_team: str, match_date: str,
                   home_odds: str = "", draw_odds: str = "", away_odds: str = ""):
    """Predict H/D/A probabilities for a user-specified fixture.

    Odds are optional: all three must be provided (decimal strings) to be
    used; otherwise the feature builder derives probabilities itself.

    Returns:
        (DataFrame of outcome probabilities, context dict) on success,
        (error message string, None) on any failure.
    """
    if not home_team or not away_team or not match_date:
        return "Please select Home, Away and Date.", None

    odds_tuple: Optional[Tuple[float, float, float]] = None
    if home_odds and draw_odds and away_odds:
        try:
            odds_tuple = (float(home_odds), float(draw_odds), float(away_odds))
        except Exception:
            return "Invalid odds input. Leave blank or enter numeric decimals.", None

    try:
        X_new, ctx = build_features_for_fixture(
            home_team, away_team, match_date,
            df_all=PIPE["df"], X_cols=PIPE["X_cols"], window=PIPE["window"],
            odds_tuple=odds_tuple, feat_df_for_medians=PIPE["feat_df"],
        )
        # NOTE(review): assumes the model's class order is [H, D, A] —
        # confirm against the training-time label encoding.
        proba = PIPE["model"].predict_proba(X_new)[0]
        labels = ["H (Home Win)", "D (Draw)", "A (Away Win)"]
        res = pd.DataFrame({"Outcome": labels, "Probability": [float(p) for p in proba]})
        return res, ctx
    except Exception as e:
        return f"Error: {e}", None
120
+
121
+
122
def fetch_next_week_fixtures_and_predict(api_key: Optional[str] = None):
    """Fetch next week's PL fixtures from football-data.org and predict each.

    Requires an API key; without one a help message is returned instead.

    Returns:
        (DataFrame of per-fixture predictions, None) on success,
        (message string, None) otherwise.
    """
    # Use football-data.org if API key provided, else return message
    if not api_key:
        return "Set FOOTBALL_DATA_API_KEY env or provide API key in the textbox to auto-fetch fixtures.", None

    base_url = "https://api.football-data.org/v4/competitions/PL/matches"
    d_from = date.today()
    d_to = d_from + timedelta(days=7)
    params = {
        "status": "SCHEDULED",
        "dateFrom": d_from.strftime("%Y-%m-%d"),
        "dateTo": d_to.strftime("%Y-%m-%d"),
    }
    headers = {"X-Auth-Token": api_key}
    r = requests.get(base_url, headers=headers, params=params, timeout=30)
    if r.status_code != 200:
        return f"API error {r.status_code}: {r.text}", None
    data = r.json()
    matches = data.get("matches", [])
    if not matches:
        return "No scheduled PL matches in the next 7 days.", None

    rows = []
    for m in matches:
        home = clean_team(m.get("homeTeam", {}).get("name", ""))
        away = clean_team(m.get("awayTeam", {}).get("name", ""))
        when = m.get("utcDate", "")
        try:
            # utcDate is ISO-8601 with a trailing 'Z'; fromisoformat needs +00:00.
            match_date = dt.fromisoformat(when.replace("Z", "+00:00")).date().strftime("%Y-%m-%d")
        except Exception:
            match_date = _next_saturday_str()

        try:
            X_new, ctx = build_features_for_fixture(
                home, away, match_date,
                df_all=PIPE["df"], X_cols=PIPE["X_cols"], window=PIPE["window"],
                odds_tuple=None, feat_df_for_medians=PIPE["feat_df"],
            )
            # NOTE(review): assumes predict_proba class order is [H, D, A].
            proba = PIPE["model"].predict_proba(X_new)[0]
            rows.append({
                "Date": match_date,
                "Home": home,
                "Away": away,
                "P(Home)": float(proba[0]),
                "P(Draw)": float(proba[1]),
                "P(Away)": float(proba[2]),
            })
        except Exception as e:
            # Per-fixture failures are reported inline instead of aborting the batch.
            rows.append({
                "Date": match_date,
                "Home": home,
                "Away": away,
                "Error": str(e),
            })

    df_out = pd.DataFrame(rows)
    if not df_out.empty:
        df_out = df_out.sort_values(["Date", "Home"]).reset_index(drop=True)
    return df_out, None
181
+
182
+
183
def _alias_team_name(name: str) -> str:
    """Translate a scraped club name into the canonical name used by the data.

    Unknown names fall through unchanged after whitespace/accent cleanup.
    Extend the alias table as new discrepancies show up.
    """
    canonical = clean_team(name)
    alias_map = {
        "Man City": "Manchester City",
        "Man Utd": "Manchester United",
        "Nott'm Forest": "Nottingham Forest",
        "Newcastle Utd": "Newcastle",
        "Spurs": "Tottenham",
        "Brighton & Hove Albion": "Brighton",
        "Sheff Utd": "Sheffield United",
        "Sheff Wed": "Sheffield Wednesday",
        "West Bromwich Albion": "West Brom",
        "West Brom": "West Brom",
        "Wolverhampton Wanderers": "Wolves",
        "Queens Park Rangers": "QPR",
    }
    return alias_map.get(canonical, canonical)
203
+
204
+
205
def fetch_next_week_fixtures_and_predict_free():
    """Scrape BBC Sport fixtures for the next 7 days (Premier League) and predict all.
    No API key required. BBC structure may change over time; this parser is best-effort.

    Returns (DataFrame of predictions, None) or (message string, None) when
    nothing could be scraped.
    """
    def _scrape_bbc_for_date(day: date):
        """Return list of (home, away) for given date from BBC."""
        # Multiple URL shapes are tried because BBC has changed paths before.
        urls = [
            f"https://www.bbc.com/sport/football/premier-league/scores-fixtures/{day:%Y-%m-%d}",
            f"https://www.bbc.com/sport/football/scores-fixtures/{day:%Y-%m-%d}?competition=premier-league",
            f"https://www.bbc.co.uk/sport/football/premier-league/scores-fixtures/{day:%Y-%m-%d}",
        ]
        pairs = []
        headers = {"User-Agent": "Mozilla/5.0"}
        for url in urls:
            try:
                r = requests.get(url, timeout=30, headers=headers)
                if r.status_code != 200 or not r.text:
                    continue
                soup = BeautifulSoup(r.text, "html.parser")
                # Several selector strategies
                # 1) sp-c-fixture blocks
                for fx in soup.select(".sp-c-fixture"):
                    tnames = fx.select(".sp-c-fixture__team-name, .sp-c-fixture__team-name-trunc, [data-testid='team-name']")
                    if len(tnames) >= 2:
                        home = _alias_team_name(tnames[0].get_text(strip=True))
                        away = _alias_team_name(tnames[1].get_text(strip=True))
                        if home and away:
                            pairs.append((home, away))
                # 2) generic match-block containers
                for blk in soup.select('[data-component="match-block"], [data-testid="match-block"]'):
                    teams = blk.select('[itemprop="name"], .sp-c-fixture__team-name, [data-testid="team-name"]')
                    # If page bundles many team names, take pairs sequentially
                    buf = [t.get_text(strip=True) for t in teams]
                    for i in range(0, len(buf) - 1, 2):
                        home = _alias_team_name(buf[i])
                        away = _alias_team_name(buf[i+1])
                        if home and away:
                            pairs.append((home, away))
                if pairs:
                    break
            except Exception:
                # Best-effort scraping: any network/parse error just moves on
                # to the next candidate URL.
                continue
        # de-duplicate while preserving first-seen order
        uniq = []
        seen = set()
        for h, a in pairs:
            key = (h, a)
            if key not in seen:
                seen.add(key)
                uniq.append((h, a))
        return uniq

    rows = []
    today = date.today()
    for d in range(0, 7):
        day = today + timedelta(days=d)
        pairs = _scrape_bbc_for_date(day)
        for home, away in pairs:
            match_date = day.strftime("%Y-%m-%d")
            try:
                X_new, ctx = build_features_for_fixture(
                    home, away, match_date,
                    df_all=PIPE["df"], X_cols=PIPE["X_cols"], window=PIPE["window"],
                    odds_tuple=None, feat_df_for_medians=PIPE["feat_df"],
                )
                # NOTE(review): assumes predict_proba class order is [H, D, A].
                proba = PIPE["model"].predict_proba(X_new)[0]
                rows.append({
                    "Date": match_date,
                    "Home": home,
                    "Away": away,
                    "P(Home)": float(proba[0]),
                    "P(Draw)": float(proba[1]),
                    "P(Away)": float(proba[2]),
                })
            except Exception as e:
                # Report per-fixture failures inline rather than aborting.
                rows.append({
                    "Date": match_date,
                    "Home": home,
                    "Away": away,
                    "Error": str(e),
                })

    if not rows:
        return "Could not find PL fixtures from BBC for the next 7 days.", None
    df_out = pd.DataFrame(rows)
    df_out = df_out.sort_values(["Date", "Home"]).reset_index(drop=True)
    return df_out, None
292
+
293
+
294
+ # --------- Build Gradio UI ---------
295
+
296
def make_app():
    """Build and return the Gradio Blocks UI (Manual + Next Week Fixtures tabs)."""
    with gr.Blocks(title="EPL Match Prediction") as demo:
        gr.Markdown("""
# EPL Match Prediction
- Manual mode: pick teams and a date (optionally odds) and get predicted probabilities.
- Auto mode: fetch next week's Premier League fixtures (requires football-data.org API key) and predict all.
""")

        with gr.Tab("Manual"):
            with gr.Row():
                home_dd = gr.Dropdown(choices=PIPE["team_list"], label="Home Team", value=PIPE["team_list"][0] if PIPE["team_list"] else None)
                away_dd = gr.Dropdown(choices=PIPE["team_list"], label="Away Team", value=PIPE["team_list"][1] if len(PIPE["team_list"])>1 else None)
                date_tb = gr.Textbox(label="Match Date (YYYY-MM-DD)", value=_next_saturday_str())
            with gr.Accordion("Optional: Odds (decimal)", open=False):
                home_od = gr.Textbox(label="Home Odds")
                draw_od = gr.Textbox(label="Draw Odds")
                away_od = gr.Textbox(label="Away Odds")
            btn = gr.Button("Predict")
            out_tbl = gr.Dataframe(label="Probabilities", interactive=False)
            out_json = gr.JSON(label="Context")

            # Wrap manual_predict: string results are error messages and are
            # surfaced as a one-cell DataFrame so the output component renders.
            def _on_predict(h, a, d, ho, do, ao):
                res, ctx = manual_predict(h, a, d, ho, do, ao)
                if isinstance(res, str):
                    return pd.DataFrame({"Message":[res]}), ctx
                return res, ctx

            btn.click(_on_predict, inputs=[home_dd, away_dd, date_tb, home_od, draw_od, away_od], outputs=[out_tbl, out_json])

        with gr.Tab("Next Week Fixtures"):
            gr.Markdown("Fetch next week's Premier League fixtures via API or scraping (no API key).")
            api_key_tb = gr.Textbox(label="FOOTBALL_DATA_API_KEY", value=os.getenv("FOOTBALL_DATA_API_KEY", ""), type="password")
            with gr.Row():
                btn2 = gr.Button("Fetch via API and Predict")
                btn3 = gr.Button("Fetch via Scraping (No API Key)")
            out_tbl2 = gr.Dataframe(label="Next 7 days fixtures predictions", interactive=False)
            msg = gr.Markdown(visible=True)

            def _on_fetch(k):
                res, _ = fetch_next_week_fixtures_and_predict(k.strip() or None)
                if isinstance(res, str):
                    return pd.DataFrame(), res
                return res, f"Found {len(res)} fixtures."

            btn2.click(_on_fetch, inputs=[api_key_tb], outputs=[out_tbl2, msg])

            def _on_scrape():
                res, _ = fetch_next_week_fixtures_and_predict_free()
                if isinstance(res, str):
                    return pd.DataFrame(), res
                return res, f"Found {len(res)} fixtures (scraped)."

            btn3.click(_on_scrape, inputs=[], outputs=[out_tbl2, msg])

    return demo
351
+
352
+
353
def main():
    """Build the Gradio UI and start the local web server (blocks until exit)."""
    app = make_app()
    app.launch()
356
+
357
+
358
+ if __name__ == "__main__":
359
+ main()
inference_utils.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ from collections import defaultdict
4
+ from datetime import datetime
5
+ from typing import Optional, Tuple, Dict, List
6
+ from unidecode import unidecode
7
+
8
+ # --- Team name cleaner (shared) ---
9
def clean_team(s: str) -> str:
    """Normalize a team name: transliterate accents to ASCII and collapse whitespace.

    NaN/None inputs are passed through untouched so callers can handle them.
    """
    if pd.isna(s):
        return s
    ascii_name = unidecode(str(s))
    # split() with no arguments removes all runs of whitespace, including
    # leading/trailing, so a single join fully normalizes spacing.
    return " ".join(ascii_name.split())
15
+
16
+ # --- Odds -> implied probabilities (normalized) ---
17
def implied_from_odds(odds_tuple: Tuple[float, float, float]) -> Optional[Tuple[float, float, float]]:
    """Convert decimal (H, D, A) odds into normalized implied probabilities.

    Normalization removes the bookmaker overround so the three values sum
    to 1.  Returns None for invalid odds (any price <= 1.0).
    """
    home, draw, away = odds_tuple
    if min(home, draw, away) <= 1.0:
        return None
    raw = np.array([1.0 / home, 1.0 / draw, 1.0 / away], dtype=float)
    total = raw.sum()
    if total <= 0:
        return None
    norm = raw / total
    return float(norm[0]), float(norm[1]), float(norm[2])
27
+
28
+ # --- Elo snapshot from historical df (up to date-1) ---
29
def compute_elo_snapshot(df_hist: pd.DataFrame, base_elo: float = 1500.0, K: float = 20.0, home_adv: float = 60.0) -> Dict[str, float]:
    """Replay matches in chronological order and return the final Elo per team.

    The home side receives a fixed *home_adv* rating bonus when computing its
    expected score; each update is zero-sum between the two teams.  Teams
    start at *base_elo* the first time they appear.
    """
    ratings: Dict[str, float] = {}
    # Home-team actual score by full-time result; any non-H/D result ("A") is 0.
    home_score = {"H": 1.0, "D": 0.5}
    for _, row in df_hist.sort_values("date").reset_index(drop=True).iterrows():
        home_name, away_name = row["home"], row["away"]
        r_home = ratings.get(home_name, base_elo)
        r_away = ratings.get(away_name, base_elo)
        # Standard logistic expectation with the home-advantage offset applied.
        expected_home = 1.0 / (1.0 + 10 ** (-((r_home + home_adv) - r_away) / 400))
        actual_home = home_score.get(row["ftr"], 0.0)
        delta = K * (actual_home - expected_home)
        ratings[home_name] = r_home + delta
        ratings[away_name] = r_away - delta
    return ratings
45
+
46
+ # --- Build rolling features for a single team from history ---
47
def team_rolling_features(df_hist: pd.DataFrame, team_name: str, window: int = 6):
    """Compute rolling-mean form features for *team_name* from match history.

    Args:
        df_hist: historical matches with home/away stat columns (fthg, ftag,
            ftr, and optional hs/as, hst/ast, hc/ac, hy/ay, hr/ar).
        team_name: team whose form is summarized.
        window: number of most recent matches per role to average over.

    Returns:
        (feats_home, feats_away): dicts of ``roll_<stat>`` means over the
        team's last *window* HOME matches and last *window* AWAY matches
        respectively.  A role with no matches (or an unknown team) yields
        NaN entries.  An empty *df_hist* yields (None, None), matching the
        historical contract of this function.
    """
    agg_cols = ["gf", "ga", "shots_f", "shots_a", "sot_f", "sot_a", "corn_f", "corn_a", "y_f", "r_f", "points"]

    if df_hist.empty:
        # No history at all -> (None, None), as before.
        return None, None

    # Only matches involving the requested team matter for its rolling form.
    # Filtering first avoids building a two-rows-per-match table for every
    # club in history on each call (this function runs twice per fixture).
    involved = df_hist[(df_hist["home"] == team_name) | (df_hist["away"] == team_name)]

    rows: List[dict] = []
    for _, r in involved.iterrows():
        if r["home"] == team_name:
            # Team played at home: "for" stats are the home columns.
            rows.append({
                "date": r["date"], "is_home": 1,
                "gf": r["fthg"], "ga": r["ftag"],
                "shots_f": r.get("hs", np.nan), "shots_a": r.get("as", np.nan),
                "sot_f": r.get("hst", np.nan), "sot_a": r.get("ast", np.nan),
                "corn_f": r.get("hc", np.nan), "corn_a": r.get("ac", np.nan),
                "y_f": r.get("hy", np.nan), "r_f": r.get("hr", np.nan),
                "points": 3 if r["ftr"] == "H" else (1 if r["ftr"] == "D" else 0),
            })
        else:
            # Team played away: "for" stats are the away columns.
            rows.append({
                "date": r["date"], "is_home": 0,
                "gf": r["ftag"], "ga": r["fthg"],
                "shots_f": r.get("as", np.nan), "shots_a": r.get("hs", np.nan),
                "sot_f": r.get("ast", np.nan), "sot_a": r.get("hst", np.nan),
                "corn_f": r.get("ac", np.nan), "corn_a": r.get("hc", np.nan),
                "y_f": r.get("ay", np.nan), "r_f": r.get("ar", np.nan),
                "points": 3 if r["ftr"] == "A" else (1 if r["ftr"] == "D" else 0),
            })

    # Default everything to NaN; fill in per role below when data exists.
    feats_home = {f"roll_{c}": np.nan for c in agg_cols}
    feats_away = {f"roll_{c}": np.nan for c in agg_cols}

    if rows:
        tm = pd.DataFrame(rows).sort_values("date").reset_index(drop=True)
        t_home = tm[tm["is_home"] == 1]
        if not t_home.empty:
            for col in agg_cols:
                feats_home[f"roll_{col}"] = t_home[col].tail(window).mean()
        t_away = tm[tm["is_home"] == 0]
        if not t_away.empty:
            for col in agg_cols:
                feats_away[f"roll_{col}"] = t_away[col].tail(window).mean()

    return feats_home, feats_away
103
+
104
+ # --- Build full feature vector for a fixture ---
105
def build_features_for_fixture(
    home_team: str,
    away_team: str,
    date_str: str,
    df_all: pd.DataFrame,
    X_cols: List[str],
    window: int = 6,
    odds_tuple: Optional[Tuple[float, float, float]] = None,
    feat_df_for_medians: Optional[pd.DataFrame] = None,
):
    """Build the model-ready feature vector for one future fixture.

    Only matches strictly BEFORE *date_str* are used, so there is no leakage
    from the match being predicted.

    Args:
        home_team / away_team: club names (cleaned internally).
        date_str: fixture date, ``YYYY-MM-DD``.
        df_all: full historical match frame.
        X_cols: training-time feature column order; the output array follows it.
        window: rolling-form window size.
        odds_tuple: optional decimal (H, D, A) odds; when absent, implied
            probabilities are proxied from Elo plus the historical draw rate.
        feat_df_for_medians: frame used to impute NaNs with per-column medians
            (zeros are used when not provided).

    Returns:
        (x, context): x is a (1, len(X_cols)) float array; context is a dict
        with the odds probabilities and Elo ratings used.

    Raises:
        ValueError: when no history precedes the date, or the odds are invalid.
    """
    home = clean_team(home_team)
    away = clean_team(away_team)
    match_date = datetime.strptime(date_str, "%Y-%m-%d")

    df_hist = df_all[df_all["date"] < match_date].copy()
    if df_hist.empty:
        raise ValueError("No historical data found before match date. Try a later date or load more seasons.")

    # Elo snapshot replayed over all prior matches; unseen teams get 1500.
    elo_map = compute_elo_snapshot(df_hist)
    elo_home = float(elo_map.get(home, 1500.0))
    elo_away = float(elo_map.get(away, 1500.0))
    elo_diff = elo_home - elo_away

    # Rolling features by role: home team's HOME form, away team's AWAY form.
    feats_home_homeRole, _ = team_rolling_features(df_hist, home, window)
    _, feats_away_awayRole = team_rolling_features(df_hist, away, window)

    if feats_home_homeRole is None:
        feats_home_homeRole = {f"roll_{c}": np.nan for c in ["gf","ga","shots_f","shots_a","sot_f","sot_a","corn_f","corn_a","y_f","r_f","points"]}
    if feats_away_awayRole is None:
        feats_away_awayRole = {f"roll_{c}": np.nan for c in ["gf","ga","shots_f","shots_a","sot_f","sot_a","corn_f","corn_a","y_f","r_f","points"]}

    # Odds -> probs
    if odds_tuple is not None:
        probs = implied_from_odds(odds_tuple)
        if probs is None:
            raise ValueError("Invalid odds provided. Use decimal odds > 1.0")
        pH, pD, pA = probs
    else:
        # Proxy from Elo + average draw rate: split the non-draw mass by the
        # Elo logistic expectation, then renormalize.
        draws = (df_hist["ftr"] == "D").mean()
        draws = float(draws) if not np.isnan(draws) and draws > 0 else 0.25
        k = 400.0
        ph_nodraw = 1.0/(1.0 + 10**(-(elo_diff)/k))
        pa_nodraw = 1.0 - ph_nodraw
        pH = ph_nodraw * (1.0 - draws)
        pA = pa_nodraw * (1.0 - draws)
        pD = draws
        s = pH + pD + pA
        pH, pD, pA = pH/s, pD/s, pA/s

    # Assemble features in X_cols order
    feat_row: Dict[str, float] = {}

    for k, v in feats_home_homeRole.items():
        feat_row[f"home_{k}"] = v
    for k, v in feats_away_awayRole.items():
        feat_row[f"away_{k}"] = v

    feat_row["elo_home"] = elo_home
    feat_row["elo_away"] = elo_away
    feat_row["elo_diff"] = elo_diff

    feat_row["p_odds_H"] = float(pH)
    feat_row["p_odds_D"] = float(pD)
    feat_row["p_odds_A"] = float(pA)

    # Columns the fixture lacks come through as NaN and are imputed below.
    x_vals = [feat_row.get(c, np.nan) for c in X_cols]
    x = np.array(x_vals, dtype=float).reshape(1, -1)

    if feat_df_for_medians is not None:
        medians = {c: float(feat_df_for_medians[c].median()) if c in feat_df_for_medians.columns else 0.0 for c in X_cols}
    else:
        # fallback zeros
        medians = {c: 0.0 for c in X_cols}

    for j, c in enumerate(X_cols):
        if np.isnan(x[0, j]):
            x[0, j] = medians[c]

    context = {
        "p_odds_H": pH,
        "p_odds_D": pD,
        "p_odds_A": pA,
        "elo_home": elo_home,
        "elo_away": elo_away,
    }
    return x, context
ingest_data.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ from collections import defaultdict
3
+ from datetime import datetime
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import requests
8
+ from unidecode import unidecode
9
+ from tqdm import tqdm
10
+
11
+
12
+ # -----------------------------
13
+ # 1) Download EPL seasons
14
+ # -----------------------------
15
+ BASE_URL = "https://www.football-data.co.uk/mmz4281/{scode}/E0.csv"
16
+
17
def season_code(y1):
    """Two-digit season code used by football-data.co.uk, e.g. 1993 -> '9394'."""
    start = str(y1)[-2:]
    end = f"{(y1 + 1) % 100:02d}"
    return start + end
20
+
21
def try_read_csv_bytes(content):
    """Parse CSV bytes, trying a few common encodings in order.

    Returns an empty DataFrame when every encoding fails to parse.
    """
    for encoding in ("utf-8", "latin-1", "ISO-8859-1"):
        try:
            return pd.read_csv(io.BytesIO(content), encoding=encoding)
        except Exception:
            continue
    return pd.DataFrame()
28
+
29
def fetch_season(y1, verbose=True):
    """Download one EPL season CSV from football-data.co.uk.

    Args:
        y1: season starting year (e.g. 2023 for 2023-24).
        verbose: print skip/parse-error messages.

    Returns:
        Parsed DataFrame tagged with SeasonFirstYear/Season/SeasonCode
        columns, or an empty DataFrame when unavailable or unparseable.
    """
    scode = season_code(y1)
    url = BASE_URL.format(scode=scode)
    r = requests.get(url, timeout=30)
    # Tiny bodies are treated as "season not published yet", not data.
    if r.status_code != 200 or len(r.content) < 100:
        if verbose: print(f"[skip] {y1}-{(y1+1)%100:02d} ({scode}) not available")
        return pd.DataFrame()
    df = try_read_csv_bytes(r.content)
    if df.empty:
        if verbose: print(f"[warn] parse error {scode}")
        return pd.DataFrame()
    df["SeasonFirstYear"] = y1
    df["Season"] = f"{y1}-{str(y1+1)[-2:]}"
    df["SeasonCode"] = scode
    return df
44
+
45
def load_epl_data(start_y1: int = 1993, end_y1: int | None = None, verbose: bool = False) -> pd.DataFrame:
    """Download and concatenate EPL seasons into a single DataFrame.

    - start_y1: first season starting year (inclusive), e.g., 1993
    - end_y1: last season starting year (inclusive). Defaults to current year + 1 to include latest.
    - verbose: print per-season logs

    Raises:
        RuntimeError: when no season could be downloaded at all.
    """
    if end_y1 is None:
        end_y1 = datetime.now().year + 1
    frames = []
    # Seasons that are missing/unparseable come back empty and are skipped.
    for y in tqdm(range(start_y1, end_y1 + 1)):
        df = fetch_season(y, verbose=verbose)
        if not df.empty:
            frames.append(df)
    data_raw = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    if data_raw.empty:
        raise RuntimeError("No data downloaded. Try expanding start_y1 backward or check internet.")
    return data_raw
63
+
64
+ if __name__ == "__main__":
65
+ # Example usage when running this file directly
66
+ data_raw = load_epl_data(start_y1=1993, end_y1=datetime.now().year + 1, verbose=True)
67
+ print("Seasons loaded:", sorted(data_raw["Season"].unique()))
model_lgb_isotonic.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ade3ab60965b439f8ef7ca79ba9ab29c6ecaa437d32c79eebb6e757ae15df4c
3
+ size 24789872
model_xgb_isotonic.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da45dfb1d875ccca9f825c363169b81ea8a6aff71333b8f330bc6d1d0ff91939
3
+ size 22864683
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas
2
+ numpy
3
+ scikit-learn
4
+ xgboost
5
+ lightgbm
6
+ unidecode
7
+ requests
8
+ tqdm
9
+ gradio
10
+ beautifulsoup4
team name.txt ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Arsenal
2
+
3
+ Aston Villa
4
+
5
+ Birmingham
6
+
7
+ Blackburn
8
+
9
+ Blackpool
10
+
11
+ Bolton
12
+
13
+ Bournemouth
14
+
15
+ Brentford
16
+
17
+ Brighton
18
+
19
+ Burnley
20
+
21
+ Cardiff
22
+
23
+ Charlton
24
+
25
+ Chelsea
26
+
27
+ Coventry
28
+
29
+ Crystal Palace
30
+
31
+ Derby
32
+
33
+ Everton
34
+
35
+ Fulham
36
+
37
+ Huddersfield
38
+
39
+ Hull
40
+
41
+ Ipswich
42
+
43
+ Leeds
44
+
45
+ Leicester
46
+
47
+ Liverpool
48
+
49
+ Luton
50
+
51
+ Manchester City
52
+
53
+ Manchester United
54
+
55
+ Middlesbrough
56
+
57
+ Newcastle
58
+
59
+ Norwich
60
+
61
+ Nottingham Forest
62
+
63
+ Portsmouth
64
+
65
+ QPR
66
+
67
+ Reading
68
+
69
+ Sheffield United
70
+
71
+ Sheffield Wednesday
72
+
73
+ Southampton
74
+
75
+ Stoke
76
+
77
+ Sunderland
78
+
79
+ Swansea
80
+
81
+ Tottenham
82
+
83
+ Watford
84
+
85
+ West Brom
86
+
87
+ West Ham
88
+
89
+ Wigan
90
+
91
+ Wolves