datamatters24's picture
Fix: remove eager live data load at startup to speed up Space init
7f4e6fd verified
"""Gradio app for Pipeline 1: Current Season Race Predictor.
Loads XGBoost + LightGBM ensemble model + lightweight parquet lookups from HF Hub.
No SQLite dependency — everything comes from the model repo.
"""
import json
import os
import tempfile
from pathlib import Path
import gradio as gr
import joblib
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from huggingface_hub import hf_hub_download
# --- Config ---
# Hugging Face repo that hosts the model artifacts and the parquet lookups.
MODEL_REPO = "datamatters24/f1-race-predictor-model"
# Optional access token taken from the environment (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")
# Downloaded files are cached under the system temp directory.
CACHE_DIR = Path(tempfile.gettempdir(), "racetel_cache")
CACHE_DIR.mkdir(exist_ok=True)

# F1 points system (top 10): finishing position -> points awarded.
POINTS_SYSTEM = dict(zip(range(1, 11), (25, 18, 15, 12, 10, 8, 6, 4, 2, 1)))
def dl(filename):
    """Download ``filename`` from ``MODEL_REPO`` through the HF Hub.

    The call stores the file under ``CACHE_DIR`` and authenticates with
    ``HF_TOKEN``; whatever ``hf_hub_download`` returns is passed straight
    back to the caller (callers in this file open it as a local path).
    """
    return hf_hub_download(
        repo_id=MODEL_REPO,
        filename=filename,
        cache_dir=CACHE_DIR,
        token=HF_TOKEN,
    )
# --- Load everything at startup ---
print("Loading model and data...")

# Try ensemble first (XGB + LGBM blend), fall back to single XGBoost.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so this is
# only safe as long as MODEL_REPO is a trusted repository.
ENSEMBLE_CONFIG = None
XGB_MODEL = None
LGBM_MODEL = None
try:
    ENSEMBLE_CONFIG = joblib.load(dl("ensemble_config.joblib"))
    XGB_MODEL = joblib.load(dl("race_winner_xgb_tuned.joblib"))
    LGBM_MODEL = joblib.load(dl("race_winner_lgbm_tuned.joblib"))
    print(f"Loaded ensemble: XGB weight={ENSEMBLE_CONFIG['xgb_weight']:.2f}, "
          f"LGBM weight={ENSEMBLE_CONFIG['lgbm_weight']:.2f}")
except Exception as e:
    print(f"Ensemble not available ({e}), falling back to single XGBoost")
    # The failure may have happened after LGBM_MODEL was assigned (e.g. the
    # config lacks the weight keys). Clear it so the runtime state matches
    # the fallback announced above and predictions never take a half-loaded
    # ensemble path. ENSEMBLE_CONFIG is deliberately left as loaded because
    # it is still consulted for the feature column list below.
    LGBM_MODEL = None
    try:
        XGB_MODEL = joblib.load(dl("race_winner_xgb_tuned.joblib"))
    except Exception:
        # Tuned model missing as well: use the baseline XGBoost artifact.
        XGB_MODEL = joblib.load(dl("race_winner_xgb.joblib"))

with open(dl("feature_metadata.json"), encoding="utf-8") as f:
    META = json.load(f)

# Use ensemble feature columns if available, else metadata
if ENSEMBLE_CONFIG and "feature_columns" in ENSEMBLE_CONFIG:
    FEATURE_COLS = ENSEMBLE_CONFIG["feature_columns"]
else:
    FEATURE_COLS = META["feature_columns"]

RACES = pd.read_parquet(dl("data/races_lookup.parquet"))
DRIVERS = pd.read_parquet(dl("data/drivers_lookup.parquet"))
WINNERS = pd.read_parquet(dl("data/actual_winners.parquet"))
TRAINING = pd.read_parquet(dl("data/training_dataset.parquet"))

# Try loading constructors lookup (optional: the constructor tab reports
# "not available" when CONSTRUCTOR_NAMES is empty).
try:
    CONSTRUCTORS = pd.read_parquet(dl("data/constructors_lookup.parquet"))
    CONSTRUCTOR_NAMES = dict(zip(CONSTRUCTORS["constructorId"], CONSTRUCTORS["name"]))
except Exception as e:
    print(f"Constructor lookup not available ({e})")
    CONSTRUCTORS = pd.DataFrame()
    CONSTRUCTOR_NAMES = {}

DRIVER_NAMES = dict(zip(DRIVERS["driverId"], DRIVERS["name"]))
WINNER_MAP = dict(zip(WINNERS["raceId"], WINNERS["winner_code"]))

# Build race dropdown choices: label -> raceId
RACE_CHOICES = []
RACE_IDS = {}
for year, rnd, race_name, country, race_id in zip(
    RACES["year"], RACES["round"], RACES["race_name"],
    RACES["country"], RACES["raceId"],
):
    # A space separates the round from the race name ("R01 Bahrain ...").
    label = f"{year} R{rnd:02d} {race_name} ({country})"
    RACE_CHOICES.append(label)
    RACE_IDS[label] = race_id

# Build year choices for constructor tab (most recent season first)
YEAR_CHOICES = sorted(RACES["year"].unique().tolist(), reverse=True)

print(f"Loaded: {len(RACE_CHOICES)} races, {len(DRIVER_NAMES)} drivers, "
      f"{len(CONSTRUCTOR_NAMES)} constructors, {len(TRAINING)} feature rows")
def _get_probs(X):
    """Get win probabilities from the ensemble or the single XGBoost model.

    Args:
        X: Feature rows accepted by the models' ``predict_proba``.

    Returns:
        The second column of ``predict_proba`` for every row of ``X``. When
        both models and both blend weights are available the result is the
        weighted sum of the XGBoost and LightGBM probabilities; otherwise it
        is the XGBoost probability alone.
    """
    # Test the models against None explicitly instead of relying on object
    # truthiness, and require both weights so an incomplete config degrades
    # to the single-model path instead of raising KeyError per request.
    use_ensemble = (
        bool(ENSEMBLE_CONFIG)
        and XGB_MODEL is not None
        and LGBM_MODEL is not None
        and "xgb_weight" in ENSEMBLE_CONFIG
        and "lgbm_weight" in ENSEMBLE_CONFIG
    )
    xgb_probs = XGB_MODEL.predict_proba(X)[:, 1]
    if not use_ensemble:
        return xgb_probs
    lgbm_probs = LGBM_MODEL.predict_proba(X)[:, 1]
    return (ENSEMBLE_CONFIG["xgb_weight"] * xgb_probs
            + ENSEMBLE_CONFIG["lgbm_weight"] * lgbm_probs)
def predict_race(race_choice):
    """Generate win-probability predictions for the selected race.

    Args:
        race_choice: Label picked in the race dropdown (a key of
            ``RACE_IDS``).

    Returns:
        A ``(figure, table)`` pair. The figure is a horizontal bar chart of
        up to ten drivers ranked by win probability; the table lists up to
        20 drivers with grid position and win percentage (probabilities
        rescaled to sum to 100 across the race). When the selection is
        empty/unknown or the race has no feature rows, the figure is
        ``None`` and the table carries a single informational message.
    """
    if not race_choice or race_choice not in RACE_IDS:
        return None, pd.DataFrame({"Info": ["Select a race to see predictions"]})

    race_id = RACE_IDS[race_choice]
    race_data = TRAINING[TRAINING["raceId"] == race_id].copy()
    if race_data.empty:
        return None, pd.DataFrame({"Info": ["No feature data for this race"]})

    # Predict, then rank with the most likely winner first.
    race_data["win_prob"] = _get_probs(race_data[FEATURE_COLS])
    race_data = race_data.sort_values("win_prob", ascending=False).reset_index(drop=True)

    # Drivers missing from the lookup fall back to their raw id so the chart
    # and table never show a NaN label.
    names = race_data["driverId"].map(DRIVER_NAMES)
    race_data["driver"] = names.where(names.notna(), race_data["driverId"].astype(str))

    # Normalize to sum to 100%
    total = race_data["win_prob"].sum()
    if total > 0:
        race_data["win_pct"] = race_data["win_prob"] / total * 100
    else:
        race_data["win_pct"] = 0.0

    # Actual winner
    actual = WINNER_MAP.get(race_id)
    actual_str = f"Winner: {actual}" if actual else "Not yet raced"

    # Chart
    top_n = min(10, len(race_data))
    plot_df = race_data.head(top_n)

    # Resolve driver codes with one pass over DRIVERS instead of filtering
    # the whole frame once per plotted row. drop_duplicates keeps the first
    # match, which is the entry a per-row lookup would have picked.
    shown = DRIVERS[DRIVERS["driverId"].isin(plot_df["driverId"])]
    shown = shown.drop_duplicates("driverId")
    code_by_driver = dict(zip(shown["driverId"], shown["code"]))

    colors = []
    for rank, driver_id in enumerate(plot_df["driverId"]):
        if actual and code_by_driver.get(driver_id, "") == actual:
            colors.append("#00d200")  # green for actual winner
        elif rank == 0:
            colors.append("#e10600")  # red for predicted favorite
        else:
            colors.append("#1e1e1e")

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=plot_df["win_pct"].values,
        y=plot_df["driver"].values,
        orientation="h",
        marker_color=colors,
        text=[f"{p:.1f}%" for p in plot_df["win_pct"]],
        textposition="outside",
        textfont=dict(size=14),
    ))
    fig.update_layout(
        title=f"Win Probability | {actual_str}",
        xaxis_title="Win Probability (%)",
        yaxis=dict(autorange="reversed"),  # keep the favorite at the top
        height=max(400, top_n * 50),
        margin=dict(l=10, r=80, t=50, b=40),
        plot_bgcolor="white",
        font=dict(family="Arial"),
    )

    # Table
    tbl = race_data[["driver", "grid_position", "win_pct"]].head(20).copy()
    tbl.columns = ["Driver", "Grid", "Win %"]
    if tbl["Grid"].isna().any():
        # astype(int) raises on NaN; show a dash for unknown grid slots.
        tbl["Grid"] = ["-" if pd.isna(g) else int(g) for g in tbl["Grid"]]
    else:
        tbl["Grid"] = tbl["Grid"].astype(int)
    tbl["Win %"] = tbl["Win %"].apply(lambda pct: f"{pct:.1f}%")
    tbl.index = range(1, len(tbl) + 1)
    return fig, tbl
def constructor_standings(year_choice):
    """Compute constructor championship projections for a season.

    For every race of the season the drivers are ranked by predicted win
    probability and the ``POINTS_SYSTEM`` points for that predicted order
    are credited to their constructors. For races that have an entry in
    ``WINNER_MAP``, points for the recorded ``finish`` positions are summed
    as well (when the feature data has a ``finish`` column).

    Args:
        year_choice: Season year, as a string or anything ``int()`` accepts.

    Returns:
        A ``(figure, table)`` pair: a grouped bar chart of predicted (and,
        when at least one race has a result, actual) points per constructor
        and the matching table sorted by predicted points. When constructor
        data is missing, the year is not usable, or the season has no rows,
        the figure is ``None`` and the table carries a single message.
    """
    if not CONSTRUCTOR_NAMES:
        return None, pd.DataFrame({"Info": ["Constructor data not available"]})

    # The dropdown can deliver None (cleared / no seasons); don't crash.
    try:
        year = int(year_choice)
    except (TypeError, ValueError):
        return None, pd.DataFrame({"Info": ["Select a season to see projections"]})

    in_season = (TRAINING["year"] == year) & TRAINING["constructorId"].isin(list(CONSTRUCTOR_NAMES))
    season_data = TRAINING[in_season].copy()
    if season_data.empty:
        return None, pd.DataFrame({"Info": [f"No data for {year}"]})

    race_ids = sorted(season_data["raceId"].unique())

    # Only track constructors that actually appear in this season so teams
    # from other years do not show up as zero-point rows. Iterating
    # CONSTRUCTOR_NAMES keeps the lookup's ordering for the result rows.
    season_constructors = set(season_data["constructorId"].unique().tolist())
    constructor_points = {cid: 0.0 for cid in CONSTRUCTOR_NAMES if cid in season_constructors}
    actual_points = dict.fromkeys(constructor_points, 0.0)

    has_finish = "finish" in season_data.columns
    scoring_places = max(POINTS_SYSTEM)
    races_with_results = 0

    for race_id in race_ids:
        race_data = season_data[season_data["raceId"] == race_id].copy()
        race_data["win_prob"] = _get_probs(race_data[FEATURE_COLS])

        # Rank by predicted probability and hand out expected points for the
        # points-paying positions of the predicted finishing order.
        ranked = race_data.sort_values("win_prob", ascending=False)
        scorers = ranked["constructorId"].head(scoring_places)
        for pos, cid in enumerate(scorers, start=1):
            constructor_points[cid] += POINTS_SYSTEM.get(pos, 0)

        # Actual points from results
        if race_id not in WINNER_MAP:
            continue
        races_with_results += 1
        if not has_finish:
            continue
        # Coerce to numbers so text/None entries become NaN instead of
        # raising inside the finiteness check.
        finishes = pd.to_numeric(ranked["finish"], errors="coerce")
        for cid, finish in zip(ranked["constructorId"], finishes):
            if np.isfinite(finish):
                actual_points[cid] += POINTS_SYSTEM.get(int(finish), 0)

    # Build results
    rows = [
        {
            "Constructor": CONSTRUCTOR_NAMES[cid],
            "Predicted Pts": round(constructor_points[cid], 1),
            "Actual Pts": round(actual_points[cid], 1),
        }
        for cid in constructor_points
    ]
    df = pd.DataFrame(rows).sort_values("Predicted Pts", ascending=False).reset_index(drop=True)

    # Chart
    fig = go.Figure()
    fig.add_trace(go.Bar(
        name="Predicted",
        x=df["Constructor"],
        y=df["Predicted Pts"],
        marker_color="#e10600",
        text=[f"{p:.0f}" for p in df["Predicted Pts"]],
        textposition="outside",
    ))
    # The "Actual" series is only drawn once at least one race has a result.
    if races_with_results > 0:
        fig.add_trace(go.Bar(
            name="Actual",
            x=df["Constructor"],
            y=df["Actual Pts"],
            marker_color="#00d200",
            text=[f"{p:.0f}" for p in df["Actual Pts"]],
            textposition="outside",
        ))
    fig.update_layout(
        title=f"{year} Constructor Championship — {len(race_ids)} races",
        yaxis_title="Points",
        barmode="group",
        height=450,
        margin=dict(t=60, b=40),
        plot_bgcolor="white",
        font=dict(family="Arial"),
    )

    tbl = df.copy()
    tbl.index = range(1, len(tbl) + 1)
    return fig, tbl
def load_live_predictions():
    """Load live race predictions from HF Hub.

    Returns:
        The parsed content of ``data/live_predictions.json``, or ``None``
        when the file cannot be downloaded or parsed (for example because
        no race has been tracked yet).
    """
    try:
        # force_download=True re-fetches the file on every call so a refresh
        # during a live race sees the latest predictions; CACHE_DIR is only
        # where the downloaded copy is stored, not a freshness cache.
        live_path = hf_hub_download(
            MODEL_REPO, "data/live_predictions.json",
            cache_dir=CACHE_DIR, token=HF_TOKEN,
            force_download=True,
        )
        with open(live_path, encoding="utf-8") as f:
            return json.load(f)
    except Exception as exc:
        # Best effort by design: the file may not exist yet. Log the reason
        # so real failures (auth, network, bad JSON) remain diagnosable.
        print(f"Live predictions unavailable ({exc})")
        return None
def live_race_display():
    """Show live race predictions.

    Returns:
        A ``(figure, table, status_markdown)`` triple. The figure charts the
        first ten entries of the payload's ``predictions`` list, the table
        has one row per prediction, and the status line summarises race,
        lap, safety car, status and update time. When no usable payload is
        available, or it holds no predictions yet, the figure is ``None``
        and the table/status carry an informational message.
    """
    data = load_live_predictions()
    # Anything that is not a JSON object (None, list, ...) is unusable here.
    if not isinstance(data, dict):
        return (None,
                pd.DataFrame({"Info": ["No live data available. Check back during a race weekend."]}),
                "No live race data available.")

    predictions = data.get("predictions") or []
    if not predictions:
        return (None,
                pd.DataFrame({"Info": ["Waiting for race to start..."]}),
                f"Race: {data.get('race', '?')} | Status: waiting")

    race = data.get("race", "?")
    lap = data.get("current_lap", 0)
    total = data.get("total_laps", "?")
    # `or` also covers keys that are present but null, which would otherwise
    # break .upper() and the slice below.
    status = str(data.get("status") or "unknown")
    sc = " | SAFETY CAR" if data.get("safety_car") else ""
    # Keep "YYYY-MM-DDTHH:MM:SS" and swap the ISO "T" for a space.
    ts = str(data.get("timestamp") or "")[:19].replace("T", " ")
    status_text = (f"**{race}** | Lap {lap}/{total}{sc} | "
                   f"Status: {status.upper()} | Updated: {ts} UTC")

    # Chart
    top = predictions[:10]
    drivers = [p["driver"] for p in top]
    probs = [p["win_prob"] for p in top]
    # DNFs are grey; otherwise the first entry is red and the rest dark.
    colors = [
        "#555555" if p.get("status") == "DNF"
        else ("#e10600" if i == 0 else "#1e1e1e")
        for i, p in enumerate(top)
    ]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=probs, y=drivers, orientation="h",
        marker_color=colors,
        text=[f"{p:.1f}%" for p in probs],
        textposition="outside",
        textfont=dict(size=14),
    ))
    fig.update_layout(
        title=f"Live Win Probability | Lap {lap}/{total}",
        xaxis_title="Win Probability (%)",
        yaxis=dict(autorange="reversed"),  # first entry at the top
        height=max(400, len(drivers) * 50),
        margin=dict(l=10, r=80, t=50, b=40),
        plot_bgcolor="white",
        font=dict(family="Arial"),
    )

    # Table
    rows = [
        {
            "Driver": p["driver"],
            "Pos": p["position"],
            "Win %": f"{p['win_prob']:.1f}%",
            # A missing or zero gap is rendered as "-".
            "Gap": f"+{p['gap']:.1f}s" if p.get("gap") else "-",
            "Tyre": f"{p.get('compound', '?')} L{p.get('tyre_age', '?')}",
            "Pits": p.get("pit_stops", 0),
            "Status": p.get("status", ""),
        }
        for p in predictions
    ]
    tbl = pd.DataFrame(rows)
    tbl.index = range(1, len(tbl) + 1)
    return fig, tbl, status_text
# --- UI ---
# Three tabs: live race state, per-race winner prediction, and season-long
# constructor projection. All event handlers are the functions defined above.
with gr.Blocks(
    title="F1 Race Predictor",
    theme=gr.themes.Base(primary_hue="red", neutral_hue="slate"),
) as app:
    gr.Markdown("# F1 Race Predictor")
    gr.Markdown(
        "XGBoost + LightGBM ensemble trained on 2014-2023 F1 data | 470K+ telemetry laps | "
        "21 features | Optuna-tuned (200 trials) | "
        "[telemetrychaos.space](https://telemetrychaos.space)"
    )
    with gr.Tabs():
        with gr.Tab("Live Race"):
            # Live data is only fetched when Refresh is clicked (nothing is
            # loaded at startup), so the placeholder must say so instead of
            # claiming a load is in progress.
            live_status = gr.Markdown("Click **Refresh** to load live race data.")
            live_chart = gr.Plot(label="Live Win Probabilities")
            live_table = gr.Dataframe(label="Race State", wrap=True)
            live_btn = gr.Button("Refresh", variant="primary")
            live_btn.click(live_race_display, outputs=[live_chart, live_table, live_status])
        with gr.Tab("Race Winner"):
            with gr.Row():
                race_dd = gr.Dropdown(choices=RACE_CHOICES, label="Select Race",
                                      value=RACE_CHOICES[0] if RACE_CHOICES else None)
                race_btn = gr.Button("Predict", variant="primary", scale=0)
            race_chart = gr.Plot(label="Win Probabilities")
            race_table = gr.Dataframe(label="Full Predictions", wrap=True)
            # Predict on button press and whenever the selection changes.
            race_btn.click(predict_race, inputs=[race_dd], outputs=[race_chart, race_table])
            race_dd.change(predict_race, inputs=[race_dd], outputs=[race_chart, race_table])
        with gr.Tab("Constructor Championship"):
            with gr.Row():
                year_dd = gr.Dropdown(choices=[str(y) for y in YEAR_CHOICES],
                                      label="Season",
                                      value=str(YEAR_CHOICES[0]) if YEAR_CHOICES else None)
                con_btn = gr.Button("Project", variant="primary", scale=0)
            con_chart = gr.Plot(label="Constructor Points")
            con_table = gr.Dataframe(label="Standings", wrap=True)
            con_btn.click(constructor_standings, inputs=[year_dd], outputs=[con_chart, con_table])
            year_dd.change(constructor_standings, inputs=[year_dd], outputs=[con_chart, con_table])
    # Fill the Race Winner tab for the default selection when the page loads.
    app.load(predict_race, inputs=[race_dd], outputs=[race_chart, race_table])

if __name__ == "__main__":
    app.launch()