hyrox-predictor / app.py
MichaelGelshtein's picture
Upload 2 files
b1e04c6 verified
"""
Hyrox Race Time Predictor — Gradio app for Hugging Face Spaces.
Loads the demo regressor + demo classifier (trained without cluster features,
so a real prospective athlete can fill out the form without having raced before)
and predicts:
1. Expected total finish time in minutes
2. Probability of finishing under 90 minutes
Required files in the Space repo:
- app.py (this file)
- requirements.txt
- hyrox_demo_regressor.pkl (from notebook Part 6.5)
- hyrox_demo_classifier.pkl (from notebook Part 6.5)
- hyrox_demo_scaler.pkl (from notebook Part 6.5)
- hyrox_demo_features.pkl (from notebook Part 6.5)
- hyrox_demo_options.pkl (from notebook Part 6.5)
"""
import pickle
from pathlib import Path
import gradio as gr
import numpy as np
import pandas as pd
# --------------------------------------------------------------------------
# Load model artifacts at startup
# --------------------------------------------------------------------------
# All five artifacts are pickle files that sit in the same directory as this
# script. NOTE: unpickling executes code embedded in the file, so these must
# only ever be the trusted files produced by the training notebook.
HERE = Path(__file__).parent

regressor = pickle.loads((HERE / "hyrox_demo_regressor.pkl").read_bytes())
classifier = pickle.loads((HERE / "hyrox_demo_classifier.pkl").read_bytes())
scaler = pickle.loads((HERE / "hyrox_demo_scaler.pkl").read_bytes())
feature_columns = pickle.loads((HERE / "hyrox_demo_features.pkl").read_bytes())
options = pickle.loads((HERE / "hyrox_demo_options.pkl").read_bytes())
# Recreate the helpers the notebook used so a single-row prediction is encoded
# the same way as the training rows.
def age_midpoint(s):
    """Collapse an age-group label into one representative number.

    Any "+" is removed and the label is split on "-":

    * two pieces (e.g. "25-29") -> mean of the two integer bounds (27.0)
    * one piece (e.g. "70+")    -> that piece as a float (70.0)

    Labels that fail to parse, or that contain more than one "-", yield NaN.
    """
    midpoint = np.nan
    try:
        pieces = str(s).replace("+", "").split("-")
        piece_count = len(pieces)
        if piece_count == 1:
            midpoint = float(pieces[0])
        elif piece_count == 2:
            low, high = map(int, pieces)
            midpoint = (low + high) / 2
    except Exception:
        # Deliberately best-effort: an unparseable label is reported as NaN.
        midpoint = np.nan
    return midpoint
def format_minutes(total_minutes: float) -> str:
    """Convert a float number of minutes into 'Xh YYm ZZs'.

    The value is rounded to the nearest whole second. The hours part is
    omitted when it is zero, giving 'Xm ZZs'. Negative durations are rendered
    as the magnitude with a leading "-" (e.g. -1.5 -> '-1m 30s').

    Args:
        total_minutes: Duration in minutes; may be fractional or negative.

    Returns:
        The formatted duration string.

    Raises:
        ValueError: If ``total_minutes`` is NaN.
        OverflowError: If ``total_minutes`` is infinite.
    """
    total_seconds = int(round(total_minutes * 60))
    # Split the magnitude, not the signed value: floor division on a negative
    # number would otherwise produce nonsense such as "-1h 59m 00s" for -1 min.
    sign = "-" if total_seconds < 0 else ""
    hours, remainder = divmod(abs(total_seconds), 3600)
    minutes, seconds = divmod(remainder, 60)
    if hours:
        return f"{sign}{hours}h {minutes:02d}m {seconds:02d}s"
    return f"{sign}{minutes}m {seconds:02d}s"
# --------------------------------------------------------------------------
# Prediction function
# --------------------------------------------------------------------------
def predict(gender, age_group, division, region, year, event_size):
    """Run a single prediction and return a Markdown summary card.

    Args:
        gender: Selected gender label; "male" sets the ``is_male`` flag.
        age_group: Age-group label, converted to a number by ``age_midpoint``.
        division: Selected division label.
        region: Selected region label.
        year: Race year (cast to float).
        event_size: Number of athletes at the event (cast to float).

    Returns:
        A Markdown/HTML string with the predicted finish time, the
        probability of finishing under 90 minutes, the margin against the
        90-minute target and a colour-coded verdict.
    """
    # Build a one-row DataFrame matching what X_demo looked like before one-hot
    row = pd.DataFrame([{
        "age_numeric": age_midpoint(age_group),
        "is_male": int(gender == "male"),
        "year": float(year),
        "event_size": float(event_size),
        "gender": gender,
        "age_group": age_group,
        "division": division,
        "region": region,
    }])

    numeric_part = row[["age_numeric", "is_male", "year", "event_size"]]
    # Do NOT pass drop_first=True here. With a single row every categorical
    # column has exactly one level, so drop_first would remove *all* dummy
    # columns and the reindex below would zero-fill every one-hot feature --
    # the categorical selections would then never reach the models. Encoding
    # the full set and reindexing to the training columns keeps the selected
    # level and silently discards the baseline level that training dropped.
    # Assumes the training dummies use pandas' default "<column>_<value>"
    # names -- confirm against the notebook.
    categorical_part = pd.get_dummies(
        row[["gender", "age_group", "division", "region"]],
        dtype=float,  # keep the frame purely numeric for the scaler/models
    )
    X_one = pd.concat([numeric_part, categorical_part], axis=1)
    X_one = X_one.reindex(columns=feature_columns, fill_value=0)

    # Regression: predicted total time in seconds -> minutes
    pred_seconds = float(regressor.predict(X_one)[0])
    pred_minutes = pred_seconds / 60

    # Classification: probability of finishing under 90 minutes
    X_one_scaled = scaler.transform(X_one)
    prob_under_90 = float(classifier.predict_proba(X_one_scaled)[0, 1])

    # Headline verdict
    if pred_minutes < 90:
        verdict = "🏃 On pace to break 90 minutes"
        verdict_color = "#16a34a"  # green
    elif pred_minutes < 100:
        verdict = "⏱️ Borderline — within striking distance of 90"
        verdict_color = "#ca8a04"  # amber
    else:
        verdict = "🐢 Likely above 90 minutes"
        verdict_color = "#dc2626"  # red

    # Report the margin as a magnitude; the wording carries the direction, so
    # "5.0 min under 90" rather than the confusing "-5.0 min under 90".
    delta = pred_minutes - 90
    if delta >= 0:
        delta_str = f"+{delta:.1f} min over 90"
    else:
        delta_str = f"{abs(delta):.1f} min under 90"

    # Build the output card as Markdown. The text is kept at column 0 so no
    # indentation leaks into the rendered string, and blank lines separate the
    # Markdown paragraphs from the raw-HTML lines so the bold text is parsed
    # as Markdown instead of being swallowed into the HTML block.
    card = f"""
### Prediction

<div style="padding:14px 18px; border-radius:10px; background:#F9FAFB; border:1px solid #E5E7EB;">

**Predicted finish time:** &nbsp;<span style="font-size:1.4em;"><b>{format_minutes(pred_minutes)}</b></span> &nbsp; ({pred_minutes:.1f} min)

**Probability of finishing under 90 minutes:** &nbsp;<span style="font-size:1.4em;"><b>{prob_under_90 * 100:.0f}%</b></span>

**Margin vs. 90 min target:** {delta_str}

<div style="margin-top:10px; padding:10px 14px; border-radius:8px; background:white; border-left:5px solid {verdict_color}; font-size:1.05em;">
<b>{verdict}</b>
</div>

</div>

<sub>Predictions come from a Gradient Boosting regressor and a Logistic Regression classifier trained on ~92,000 Hyrox results. The demo models use demographics + event metadata only (no race-split features), so anyone can use this — but the production assignment regressor in the linked notebook is more accurate because it also uses athlete-archetype clusters derived from past race splits.</sub>
"""
    return card
# --------------------------------------------------------------------------
# UI
# --------------------------------------------------------------------------
custom_css = """
.gradio-container {max-width: 920px; margin: auto;}
"""

# Page copy lives in module-level constants so the Markdown text stays at
# column 0 while the layout code below is free to be indented.
_INTRO_MARKDOWN = """
# Hyrox Race Time Predictor 🏋️‍♂️🏃
Will you finish a Hyrox race in under 90 minutes? Fill in the form below — the model
will predict your expected total time and the probability you'll cross the line under
the 90-minute mark.
"""

_ABOUT_MARKDOWN = """
---
### How this works
1. Your inputs are encoded the same way the training data was (one-hot + numeric).
2. A **Gradient Boosting regressor** estimates total race time in seconds.
3. A **Logistic Regression classifier**, trained directly on the binary "did this athlete finish under 90 min?" label, estimates the probability shown.
4. The two models were trained on ~92,000 cleaned Hyrox results from [jgug05/hyrox-results](https://www.kaggle.com/datasets/jgug05/hyrox-results).
### Caveats
- The probability calibration is only as good as the training data — Hyrox times have changed event-to-event and year-to-year.
- Cluster-based features (which capture athlete archetype from past race splits) were intentionally removed for this demo so the form stays usable.
"""

with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Hyrox Race Time Predictor") as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # Left column: the input form and the submit button.
        with gr.Column():
            gender = gr.Dropdown(
                choices=options["gender"],
                # Prefer "male" as the default when it is an available choice.
                value="male" if "male" in options["gender"] else options["gender"][0],
                label="Gender",
            )
            age_group = gr.Dropdown(
                choices=options["age_group"],
                # Default to the entry in the middle of the list.
                value=options["age_group"][len(options["age_group"]) // 2],
                label="Age group",
            )
            division = gr.Dropdown(
                choices=options["division"],
                value=options["division"][0],
                label="Division",
                info="Open / Pro / Doubles / Relay etc.",
            )
            region = gr.Dropdown(
                choices=options["region"],
                # Prefer "Europe" as the default when it is an available choice.
                value="Europe" if "Europe" in options["region"] else options["region"][0],
                label="Region",
            )
            year = gr.Slider(
                minimum=options["year_min"],
                maximum=options["year_max"],
                value=options["year_max"],
                step=1,
                label="Race year",
            )
            event_size = gr.Slider(
                minimum=100,
                maximum=5000,
                value=options["event_size_median"],
                step=50,
                label="Event size (number of athletes)",
                info="Bigger flagship events tend to have slightly different field profiles.",
            )
            predict_btn = gr.Button("Predict my finish time", variant="primary")

        # Right column: the rendered prediction card.
        with gr.Column():
            output = gr.Markdown()

    # The input order must match the parameter order of predict().
    predict_btn.click(
        fn=predict,
        inputs=[gender, age_group, division, region, year, event_size],
        outputs=output,
    )

    gr.Markdown(_ABOUT_MARKDOWN)
# Start the Gradio app only when this file is executed as a script; importing
# the module builds `demo` without launching it.
if __name__ == "__main__":
    demo.launch()