Spaces:

datamatters24
/

f1-race-predictor

Running

App Files Files Community

f1-race-predictor / app.py

datamatters24

Fix: remove eager live data load at startup to speed up Space init

7f4e6fd verified about 1 month ago

raw

history blame contribute delete

14.5 kB

	"""Gradio app for Pipeline 1: Current Season Race Predictor.

	Loads XGBoost + LightGBM ensemble model + lightweight parquet lookups from HF Hub.
	No SQLite dependency — everything comes from the model repo.
	"""

	import json
	import os
	import tempfile
	from pathlib import Path

	import gradio as gr
	import joblib
	import numpy as np
	import pandas as pd
	import plotly.graph_objects as go
	from huggingface_hub import hf_hub_download

	# --- Config ---
	# Hub repository that holds the trained models and the parquet lookups.
	MODEL_REPO = "datamatters24/f1-race-predictor-model"
	# Optional access token read from the environment (None when unset).
	HF_TOKEN = os.getenv("HF_TOKEN")
	# Downloads are cached in a folder under the system temp directory.
	CACHE_DIR = Path(tempfile.gettempdir(), "racetel_cache")
	CACHE_DIR.mkdir(exist_ok=True)

	# F1 points system (top 10): finishing position -> points awarded.
	POINTS_SYSTEM = dict(zip(range(1, 11), (25, 18, 15, 12, 10, 8, 6, 4, 2, 1)))


	def dl(filename):
	    """Download ``filename`` from ``MODEL_REPO`` and return what the Hub client returns.

	    The download is cached under ``CACHE_DIR`` and authenticated with
	    ``HF_TOKEN``.
	    """
	    local_path = hf_hub_download(
	        repo_id=MODEL_REPO,
	        filename=filename,
	        cache_dir=CACHE_DIR,
	        token=HF_TOKEN,
	    )
	    return local_path


	# --- Load everything at startup ---
	print("Loading model and data...")

	# Try ensemble first (XGB + LGBM blend), fall back to single XGBoost.
	# SECURITY NOTE: joblib.load unpickles the downloaded files, which can execute
	# arbitrary code; this is only acceptable while MODEL_REPO is a trusted repo.
	ENSEMBLE_CONFIG = None
	XGB_MODEL = None
	LGBM_MODEL = None

	try:
	    ENSEMBLE_CONFIG = joblib.load(dl("ensemble_config.joblib"))
	    XGB_MODEL = joblib.load(dl("race_winner_xgb_tuned.joblib"))
	    LGBM_MODEL = joblib.load(dl("race_winner_lgbm_tuned.joblib"))
	    print(f"Loaded ensemble: XGB weight={ENSEMBLE_CONFIG['xgb_weight']:.2f}, "
	          f"LGBM weight={ENSEMBLE_CONFIG['lgbm_weight']:.2f}")
	except Exception as e:
	    # NOTE(review): if the config loaded but one of the models did not,
	    # ENSEMBLE_CONFIG stays set and still selects FEATURE_COLS below —
	    # confirm that is the intended behaviour for the single-model fallback.
	    print(f"Ensemble not available ({e}), falling back to single XGBoost")
	    try:
	        XGB_MODEL = joblib.load(dl("race_winner_xgb_tuned.joblib"))
	    except Exception as tuned_err:
	        # Report why the tuned model was skipped instead of hiding the cause.
	        print(f"Tuned XGBoost not available ({tuned_err}), loading race_winner_xgb.joblib")
	        XGB_MODEL = joblib.load(dl("race_winner_xgb.joblib"))

	# Explicit encoding so the read does not depend on the host locale.
	with open(dl("feature_metadata.json"), encoding="utf-8") as f:
	    META = json.load(f)

	# Use ensemble feature columns if available, else metadata
	if ENSEMBLE_CONFIG and "feature_columns" in ENSEMBLE_CONFIG:
	    FEATURE_COLS = ENSEMBLE_CONFIG["feature_columns"]
	else:
	    FEATURE_COLS = META["feature_columns"]

	RACES = pd.read_parquet(dl("data/races_lookup.parquet"))
	DRIVERS = pd.read_parquet(dl("data/drivers_lookup.parquet"))
	WINNERS = pd.read_parquet(dl("data/actual_winners.parquet"))
	TRAINING = pd.read_parquet(dl("data/training_dataset.parquet"))

	# The constructors lookup is optional: without it the constructor tab
	# reports that its data is unavailable.
	try:
	    CONSTRUCTORS = pd.read_parquet(dl("data/constructors_lookup.parquet"))
	    CONSTRUCTOR_NAMES = dict(zip(CONSTRUCTORS["constructorId"], CONSTRUCTORS["name"]))
	except Exception as constructors_err:
	    print(f"Constructors lookup not available ({constructors_err})")
	    CONSTRUCTORS = pd.DataFrame()
	    CONSTRUCTOR_NAMES = {}

	DRIVER_NAMES = dict(zip(DRIVERS["driverId"], DRIVERS["name"]))
	WINNER_MAP = dict(zip(WINNERS["raceId"], WINNERS["winner_code"]))

	# Build race dropdown choices: display label -> raceId
	RACE_CHOICES = []
	RACE_IDS = {}
	for _, row in RACES.iterrows():
	    label = f"{row['year']} R{row['round']:02d} — {row['race_name']} ({row['country']})"
	    RACE_CHOICES.append(label)
	    RACE_IDS[label] = row["raceId"]

	# Build year choices for constructor tab (most recent season first)
	YEAR_CHOICES = sorted(RACES["year"].unique().tolist(), reverse=True)

	print(f"Loaded: {len(RACE_CHOICES)} races, {len(DRIVER_NAMES)} drivers, "
	      f"{len(CONSTRUCTOR_NAMES)} constructors, {len(TRAINING)} feature rows")


	def _get_probs(X):
	    """Return the positive-class probability for every row of ``X``.

	    Uses the weighted XGB + LGBM blend when the ensemble config and both
	    models are loaded; otherwise uses the XGBoost model alone.
	    """
	    ensemble_ready = ENSEMBLE_CONFIG and XGB_MODEL and LGBM_MODEL
	    if not ensemble_ready:
	        return XGB_MODEL.predict_proba(X)[:, 1]

	    # Column 1 of predict_proba is the positive class.
	    from_xgb = XGB_MODEL.predict_proba(X)[:, 1]
	    from_lgbm = LGBM_MODEL.predict_proba(X)[:, 1]
	    weighted_xgb = ENSEMBLE_CONFIG["xgb_weight"] * from_xgb
	    weighted_lgbm = ENSEMBLE_CONFIG["lgbm_weight"] * from_lgbm
	    return weighted_xgb + weighted_lgbm


	def predict_race(race_choice):
	    """Build the win-probability chart and table for the selected race.

	    Args:
	        race_choice: Dropdown label; must be a key of ``RACE_IDS``.

	    Returns:
	        A ``(figure, table)`` pair. ``figure`` is ``None`` and ``table`` holds
	        a single ``Info`` message when no valid race is selected or the race
	        has no rows in ``TRAINING``.
	    """
	    if not race_choice or race_choice not in RACE_IDS:
	        return None, pd.DataFrame({"Info": ["Select a race to see predictions"]})

	    race_id = RACE_IDS[race_choice]
	    race_data = TRAINING[TRAINING["raceId"] == race_id].copy()

	    if race_data.empty:
	        return None, pd.DataFrame({"Info": ["No feature data for this race"]})

	    # Predict
	    race_data["win_prob"] = _get_probs(race_data[FEATURE_COLS])
	    race_data["driver"] = race_data["driverId"].map(DRIVER_NAMES)
	    race_data = race_data.sort_values("win_prob", ascending=False).reset_index(drop=True)

	    # Normalize to sum to 100%
	    total = race_data["win_prob"].sum()
	    if total > 0:
	        race_data["win_pct"] = race_data["win_prob"] / total * 100
	    else:
	        race_data["win_pct"] = 0.0

	    # Actual winner (driver code), if the race has a recorded result
	    actual = WINNER_MAP.get(race_id)
	    actual_str = f"Winner: {actual}" if actual else "Not yet raced"

	    # Chart
	    top_n = min(10, len(race_data))
	    plot_df = race_data.head(top_n)

	    # Build the driverId -> code lookup once instead of filtering DRIVERS for
	    # every plotted row; the first row per driverId wins, as before.
	    unique_drivers = DRIVERS.drop_duplicates("driverId")
	    code_by_driver = dict(zip(unique_drivers["driverId"], unique_drivers["code"]))

	    colors = []
	    for rank, driver_id in enumerate(plot_df["driverId"]):
	        code = code_by_driver.get(driver_id, "")
	        if actual and code == actual:
	            colors.append("#00d200")  # green for actual winner
	        elif rank == 0:
	            colors.append("#e10600")  # red for predicted favorite
	        else:
	            colors.append("#1e1e1e")

	    fig = go.Figure()
	    fig.add_trace(go.Bar(
	        x=plot_df["win_pct"].values,
	        y=plot_df["driver"].values,
	        orientation="h",
	        marker_color=colors,
	        text=[f"{p:.1f}%" for p in plot_df["win_pct"]],
	        textposition="outside",
	        textfont=dict(size=14),
	    ))
	    fig.update_layout(
	        # Plain "|" separator: "\|" is an invalid escape sequence and left a
	        # literal backslash in the title.
	        title=f"Win Probability | {actual_str}",
	        xaxis_title="Win Probability (%)",
	        yaxis=dict(autorange="reversed"),  # highest probability at the top
	        height=max(400, top_n * 50),
	        margin=dict(l=10, r=80, t=50, b=40),
	        plot_bgcolor="white",
	        font=dict(family="Arial"),
	    )

	    # Table
	    tbl = race_data[["driver", "grid_position", "win_pct"]].head(20).copy()
	    tbl.columns = ["Driver", "Grid", "Win %"]
	    # NOTE(review): astype(int) raises if grid_position contains NaN —
	    # confirm the training data always carries a grid slot.
	    tbl["Grid"] = tbl["Grid"].astype(int)
	    tbl["Win %"] = tbl["Win %"].apply(lambda x: f"{x:.1f}%")
	    tbl.index = range(1, len(tbl) + 1)  # 1-based rank as the index

	    return fig, tbl


	def constructor_standings(year_choice):
	    """Compute constructor championship projections for a season.

	    For every race of the season the drivers are ranked by predicted win
	    probability and ``POINTS_SYSTEM`` points are credited to their
	    constructors. For races present in ``WINNER_MAP`` the points implied by
	    the ``finish`` column are accumulated as well.

	    Args:
	        year_choice: Season as a string or int (the dropdown supplies a string).

	    Returns:
	        A ``(figure, table)`` pair. ``figure`` is ``None`` and ``table`` holds
	        a single ``Info`` message when constructor data is missing, the season
	        is not a valid year, or the season has no rows in ``TRAINING``.
	    """
	    if not CONSTRUCTOR_NAMES:
	        return None, pd.DataFrame({"Info": ["Constructor data not available"]})

	    # The dropdown can be empty (None) or hold arbitrary text; report that
	    # instead of letting int() raise inside the event handler.
	    try:
	        year = int(year_choice)
	    except (TypeError, ValueError):
	        return None, pd.DataFrame({"Info": ["Select a season to see projections"]})

	    in_season = TRAINING["year"] == year
	    known_constructor = TRAINING["constructorId"].isin(CONSTRUCTOR_NAMES)
	    season_data = TRAINING[in_season & known_constructor].copy()

	    if season_data.empty:
	        return None, pd.DataFrame({"Info": [f"No data for {year}"]})

	    race_ids = sorted(season_data["raceId"].unique())

	    # For each race, predict and compute expected points per constructor
	    constructor_points = dict.fromkeys(CONSTRUCTOR_NAMES, 0.0)
	    actual_points = dict.fromkeys(CONSTRUCTOR_NAMES, 0.0)
	    races_with_results = 0
	    has_finish = "finish" in season_data.columns

	    # One pass over the season instead of re-filtering it for every race.
	    for race_id, race_rows in season_data.groupby("raceId"):
	        ranked = race_rows.assign(win_prob=_get_probs(race_rows[FEATURE_COLS]))
	        # Rank by predicted probability
	        ranked = ranked.sort_values("win_prob", ascending=False)

	        # Assign expected points based on predicted finishing order
	        for pos, cid in enumerate(ranked["constructorId"], start=1):
	            if cid in constructor_points and pos in POINTS_SYSTEM:
	                constructor_points[cid] += POINTS_SYSTEM[pos]

	        # Actual points from results
	        if race_id not in WINNER_MAP:
	            continue
	        races_with_results += 1
	        if not has_finish:
	            continue
	        # Coerce so a missing or non-numeric finish is skipped rather than raising.
	        finishes = pd.to_numeric(ranked["finish"], errors="coerce")
	        for cid, finish in zip(ranked["constructorId"], finishes):
	            if cid not in actual_points or pd.isna(finish):
	                continue
	            points = POINTS_SYSTEM.get(int(finish))
	            if points is not None:
	                actual_points[cid] += points

	    # Build results
	    rows = [
	        {
	            "Constructor": name,
	            "Predicted Pts": round(constructor_points.get(cid, 0), 1),
	            "Actual Pts": round(actual_points.get(cid, 0), 1),
	        }
	        for cid, name in CONSTRUCTOR_NAMES.items()
	    ]

	    df = pd.DataFrame(rows).sort_values("Predicted Pts", ascending=False).reset_index(drop=True)

	    # Chart
	    fig = go.Figure()
	    fig.add_trace(go.Bar(
	        name="Predicted",
	        x=df["Constructor"],
	        y=df["Predicted Pts"],
	        marker_color="#e10600",
	        text=[f"{p:.0f}" for p in df["Predicted Pts"]],
	        textposition="outside",
	    ))
	    # Only show the "Actual" series once at least one race has a result.
	    if races_with_results > 0:
	        fig.add_trace(go.Bar(
	            name="Actual",
	            x=df["Constructor"],
	            y=df["Actual Pts"],
	            marker_color="#00d200",
	            text=[f"{p:.0f}" for p in df["Actual Pts"]],
	            textposition="outside",
	        ))

	    fig.update_layout(
	        title=f"{year} Constructor Championship — {len(race_ids)} races",
	        yaxis_title="Points",
	        barmode="group",
	        height=450,
	        margin=dict(t=60, b=40),
	        plot_bgcolor="white",
	        font=dict(family="Arial"),
	    )

	    tbl = df.copy()
	    tbl.index = range(1, len(tbl) + 1)  # 1-based standing as the index

	    return fig, tbl


	def load_live_predictions():
	    """Fetch the latest live race predictions from HF Hub.

	    Returns:
	        The parsed content of ``data/live_predictions.json``, or ``None``
	        when the file cannot be downloaded or parsed.
	    """
	    try:
	        # force_download=True requests a fresh copy on every call so the
	        # display reflects the latest file during a live race; cache_dir is
	        # only where that copy is stored.
	        live_path = hf_hub_download(
	            MODEL_REPO, "data/live_predictions.json",
	            cache_dir=CACHE_DIR, token=HF_TOKEN,
	            force_download=True,
	        )
	        # Explicit encoding so the read does not depend on the host locale.
	        with open(live_path, encoding="utf-8") as f:
	            return json.load(f)
	    except Exception as exc:
	        # Best effort: the file may not exist yet (no race has been tracked).
	        # Log the reason so token, network, or JSON problems are still visible.
	        print(f"Live predictions unavailable ({type(exc).__name__}: {exc})")
	        return None


	def live_race_display():
	    """Show live race predictions.

	    Returns:
	        A ``(figure, table, status_text)`` triple. ``figure`` is ``None`` and
	        ``table`` holds a single ``Info`` message when no live data is
	        available or the predictions list is empty.
	    """
	    data = load_live_predictions()

	    if data is None:
	        return (None,
	                pd.DataFrame({"Info": ["No live data available. Check back during a race weekend."]}),
	                "No live race data available.")

	    # Plain "|" separators are used throughout: "\|" is an invalid escape
	    # sequence and put a literal backslash into the chart title.
	    predictions = data.get("predictions", [])
	    if not predictions:
	        return (None,
	                pd.DataFrame({"Info": ["Waiting for race to start..."]}),
	                f"Race: {data.get('race', '?')} | Status: waiting")

	    race = data.get("race", "?")
	    lap = data.get("current_lap", 0)
	    total = data.get("total_laps", "?")
	    sc = " | SAFETY CAR" if data.get("safety_car") else ""

	    # A JSON null for these keys must not crash the slicing / upper() below.
	    status = data.get("status", "unknown")
	    if status is None:
	        status = "unknown"
	    ts = str(data.get("timestamp") or "")[:19].replace("T", " ")

	    status_text = (f"{race} | Lap {lap}/{total}{sc} | "
	                   f"Status: {str(status).upper()} | Updated: {ts} UTC")

	    # Chart: first ten entries of the predictions list, in file order
	    top = predictions[:10]
	    drivers = [p["driver"] for p in top]
	    probs = [p["win_prob"] for p in top]

	    # First entry in red, DNFs in grey, everything else dark.
	    colors = []
	    for i, p in enumerate(top):
	        if p.get("status") == "DNF":
	            colors.append("#555555")
	        elif i == 0:
	            colors.append("#e10600")
	        else:
	            colors.append("#1e1e1e")

	    fig = go.Figure()
	    fig.add_trace(go.Bar(
	        x=probs, y=drivers, orientation="h",
	        marker_color=colors,
	        text=[f"{p:.1f}%" for p in probs],
	        textposition="outside",
	        textfont=dict(size=14),
	    ))
	    fig.update_layout(
	        title=f"Live Win Probability | Lap {lap}/{total}",
	        xaxis_title="Win Probability (%)",
	        yaxis=dict(autorange="reversed"),  # first entry at the top
	        height=max(400, len(drivers) * 50),
	        margin=dict(l=10, r=80, t=50, b=40),
	        plot_bgcolor="white",
	        font=dict(family="Arial"),
	    )

	    # Table: one row per entry in the predictions list
	    rows = []
	    for p in predictions:
	        gap = p.get("gap")
	        rows.append({
	            "Driver": p["driver"],
	            "Pos": p["position"],
	            "Win %": f"{p['win_prob']:.1f}%",
	            "Gap": f"+{gap:.1f}s" if gap else "-",  # missing or zero gap shows "-"
	            "Tyre": f"{p.get('compound', '?')} L{p.get('tyre_age', '?')}",
	            "Pits": p.get("pit_stops", 0),
	            "Status": p.get("status", ""),
	        })
	    tbl = pd.DataFrame(rows)
	    tbl.index = range(1, len(tbl) + 1)

	    return fig, tbl, status_text


	# --- UI ---
	# Three tabs: live race state, per-race winner prediction, and the
	# constructor championship projection.
	with gr.Blocks(
	    title="F1 Race Predictor",
	    theme=gr.themes.Base(primary_hue="red", neutral_hue="slate"),
	) as app:
	    gr.Markdown("# F1 Race Predictor")
	    # Plain "|" separators: "\|" is an invalid escape sequence in a Python
	    # string literal.
	    gr.Markdown(
	        "XGBoost + LightGBM ensemble trained on 2014-2023 F1 data | 470K+ telemetry laps | "
	        "21 features | Optuna-tuned (200 trials) | "
	        "[telemetrychaos.space](https://telemetrychaos.space)"
	    )

	    with gr.Tabs():
	        with gr.Tab("Live Race"):
	            # Live data is fetched only when Refresh is clicked, so the
	            # placeholder must not claim that a load is in progress.
	            live_status = gr.Markdown("Click Refresh to load live data.")
	            live_chart = gr.Plot(label="Live Win Probabilities")
	            live_table = gr.Dataframe(label="Race State", wrap=True)
	            live_btn = gr.Button("Refresh", variant="primary")

	            live_btn.click(live_race_display, outputs=[live_chart, live_table, live_status])

	        with gr.Tab("Race Winner"):
	            with gr.Row():
	                race_dd = gr.Dropdown(choices=RACE_CHOICES, label="Select Race",
	                                      value=RACE_CHOICES[0] if RACE_CHOICES else None)
	                race_btn = gr.Button("Predict", variant="primary", scale=0)

	            race_chart = gr.Plot(label="Win Probabilities")
	            race_table = gr.Dataframe(label="Full Predictions", wrap=True)

	            # Predict on button click and whenever the selection changes.
	            race_btn.click(predict_race, inputs=[race_dd], outputs=[race_chart, race_table])
	            race_dd.change(predict_race, inputs=[race_dd], outputs=[race_chart, race_table])

	        with gr.Tab("Constructor Championship"):
	            with gr.Row():
	                year_dd = gr.Dropdown(choices=[str(y) for y in YEAR_CHOICES],
	                                      label="Season",
	                                      value=str(YEAR_CHOICES[0]) if YEAR_CHOICES else None)
	                con_btn = gr.Button("Project", variant="primary", scale=0)

	            con_chart = gr.Plot(label="Constructor Points")
	            con_table = gr.Dataframe(label="Standings", wrap=True)

	            con_btn.click(constructor_standings, inputs=[year_dd], outputs=[con_chart, con_table])
	            year_dd.change(constructor_standings, inputs=[year_dd], outputs=[con_chart, con_table])

	    # Populate the Race Winner tab for the default selection on page load.
	    app.load(predict_race, inputs=[race_dd], outputs=[race_chart, race_table])

	if __name__ == "__main__":
	    app.launch()