""" Hyrox Race Time Predictor — Gradio app for Hugging Face Spaces. Loads the demo regressor + demo classifier (trained without cluster features, so a real prospective athlete can fill out the form without having raced before) and predicts: 1. Expected total finish time in minutes 2. Probability of finishing under 90 minutes Required files in the Space repo: - app.py (this file) - requirements.txt - hyrox_demo_regressor.pkl (from notebook Part 6.5) - hyrox_demo_classifier.pkl (from notebook Part 6.5) - hyrox_demo_scaler.pkl (from notebook Part 6.5) - hyrox_demo_features.pkl (from notebook Part 6.5) - hyrox_demo_options.pkl (from notebook Part 6.5) """ import pickle from pathlib import Path import gradio as gr import numpy as np import pandas as pd # -------------------------------------------------------------------------- # Load model artifacts at startup # -------------------------------------------------------------------------- HERE = Path(__file__).parent with open(HERE / "hyrox_demo_regressor.pkl", "rb") as f: regressor = pickle.load(f) with open(HERE / "hyrox_demo_classifier.pkl", "rb") as f: classifier = pickle.load(f) with open(HERE / "hyrox_demo_scaler.pkl", "rb") as f: scaler = pickle.load(f) with open(HERE / "hyrox_demo_features.pkl", "rb") as f: feature_columns = pickle.load(f) with open(HERE / "hyrox_demo_options.pkl", "rb") as f: options = pickle.load(f) # Recreate the helpers the notebook used so a single-row prediction lines up def age_midpoint(s): try: parts = str(s).replace("+", "").split("-") if len(parts) == 2: return (int(parts[0]) + int(parts[1])) / 2 if len(parts) == 1: return float(parts[0]) except Exception: return np.nan return np.nan def format_minutes(total_minutes: float) -> str: """Convert a float number of minutes into 'Xh YYm ZZs'.""" total_seconds = int(round(total_minutes * 60)) h = total_seconds // 3600 m = (total_seconds % 3600) // 60 s = total_seconds % 60 if h: return f"{h}h {m:02d}m {s:02d}s" return f"{m}m {s:02d}s" # 
# --------------------------------------------------------------------------
# Prediction function
# --------------------------------------------------------------------------
def predict(gender, age_group, division, region, year, event_size):
    """Run a single prediction and return a Markdown summary card.

    Args:
        gender: Selected gender label; ``"male"`` sets the ``is_male`` flag.
        age_group: Age-group label, also converted to a number via
            ``age_midpoint``.
        division: Selected division label.
        region: Selected region label.
        year: Race year (converted to ``float``).
        event_size: Number of athletes at the event (converted to ``float``).

    Returns:
        A Markdown string with the predicted finish time, the probability of
        finishing under 90 minutes, the margin to 90 minutes, a verdict line
        and a short disclaimer.
    """
    # Build a one-row DataFrame matching what X_demo looked like before one-hot
    row = pd.DataFrame([{
        "age_numeric": age_midpoint(age_group),
        "is_male": int(gender == "male"),
        "year": float(year),
        "event_size": float(event_size),
        "gender": gender,
        "age_group": age_group,
        "division": division,
        "region": region,
    }])

    # One-hot encode and reindex to the exact training column set.
    #
    # drop_first must be False here: a one-row frame has exactly one level
    # per categorical column, so drop_first=True would drop that only level
    # and every dummy column would end up 0 after the reindex — the selected
    # categories would never reach the models. Encoding all levels and then
    # reindexing keeps the columns present in feature_columns and discards
    # the rest (e.g. a reference level that is not a training column).
    row_numeric = row[["age_numeric", "is_male", "year", "event_size"]]
    row_categorical = pd.get_dummies(
        row[["gender", "age_group", "division", "region"]],
        drop_first=False,
    )
    X_one = pd.concat([row_numeric, row_categorical], axis=1)
    X_one = X_one.reindex(columns=feature_columns, fill_value=0)

    # Regression: predicted total time in seconds → minutes
    pred_seconds = float(regressor.predict(X_one)[0])
    pred_minutes = pred_seconds / 60

    # Classification: probability of finishing under 90 minutes
    # (the classifier is fed scaled features, the regressor raw ones)
    X_one_scaled = scaler.transform(X_one)
    prob_under_90 = float(classifier.predict_proba(X_one_scaled)[0, 1])

    # Headline verdict
    # NOTE(review): verdict_color is assigned but never interpolated into the
    # card below — confirm whether colored styling was meant to be applied.
    if pred_minutes < 90:
        verdict = "🏃 On pace to break 90 minutes"
        verdict_color = "#16a34a"  # green
    elif pred_minutes < 100:
        verdict = "⏱️ Borderline — within striking distance of 90"
        verdict_color = "#ca8a04"  # amber
    else:
        verdict = "🐢 Likely above 90 minutes"
        verdict_color = "#dc2626"  # red

    delta = pred_minutes - 90
    delta_str = f"+{delta:.1f} min over 90" if delta >= 0 else f"{delta:.1f} min under 90"

    # Build the output card as Markdown. Parts are joined with blank lines so
    # each one renders as its own paragraph, and nothing is indented (leading
    # indentation could be rendered as a Markdown code block).
    card = "\n\n".join([
        "### Prediction",
        f"**Predicted finish time:**  {format_minutes(pred_minutes)}   ({pred_minutes:.1f} min)",
        f"**Probability of finishing under 90 minutes:**  {prob_under_90 * 100:.0f}%",
        f"**Margin vs. 90 min target:** {delta_str}",
        f"{verdict}",
        "Predictions come from a Gradient Boosting regressor and a Logistic Regression "
        "classifier trained on ~92,000 Hyrox results. The demo models use demographics + "
        "event metadata only (no race-split features), so anyone can use this — but the "
        "production assignment regressor in the linked notebook is more accurate because "
        "it also uses athlete-archetype clusters derived from past race splits.",
    ])
    return card


# --------------------------------------------------------------------------
# UI
# --------------------------------------------------------------------------
custom_css = """
.gradio-container {max-width: 920px; margin: auto;}
"""

# Long Markdown texts live at column 0 so code indentation never leaks into
# the rendered output.
_INTRO_MD = """
# Hyrox Race Time Predictor 🏋️‍♂️🏃

Will you finish a Hyrox race in under 90 minutes? Fill in the form below — the model will predict your expected total time and the probability you'll cross the line under the 90-minute mark.
"""

_HOW_IT_WORKS_MD = """
---

### How this works

1. Your inputs are encoded the same way the training data was (one-hot + numeric).
2. A **Gradient Boosting regressor** estimates total race time in seconds.
3. A **Logistic Regression classifier**, trained directly on the binary "did this athlete finish under 90 min?" label, estimates the probability shown.
4. The two models were trained on ~92,000 cleaned Hyrox results from [jgug05/hyrox-results](https://www.kaggle.com/datasets/jgug05/hyrox-results).

### Caveats

- The probability calibration is only as good as the training data — Hyrox times have changed event-to-event and year-to-year.
- Cluster-based features (which capture athlete archetype from past race splits) were intentionally removed for this demo so the form stays usable.
"""

with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Hyrox Race Time Predictor") as demo:
    gr.Markdown(_INTRO_MD)

    with gr.Row():
        # Left column: the input form
        with gr.Column():
            gender = gr.Dropdown(
                choices=options["gender"],
                value="male" if "male" in options["gender"] else options["gender"][0],
                label="Gender",
            )
            age_group = gr.Dropdown(
                choices=options["age_group"],
                # default to the middle entry of the age-group list
                value=options["age_group"][len(options["age_group"]) // 2],
                label="Age group",
            )
            division = gr.Dropdown(
                choices=options["division"],
                value=options["division"][0],
                label="Division",
                info="Open / Pro / Doubles / Relay etc.",
            )
            region = gr.Dropdown(
                choices=options["region"],
                value="Europe" if "Europe" in options["region"] else options["region"][0],
                label="Region",
            )
            year = gr.Slider(
                minimum=options["year_min"],
                maximum=options["year_max"],
                value=options["year_max"],
                step=1,
                label="Race year",
            )
            event_size = gr.Slider(
                minimum=100,
                maximum=5000,
                value=options["event_size_median"],
                step=50,
                label="Event size (number of athletes)",
                info="Bigger flagship events tend to have slightly different field profiles.",
            )
            predict_btn = gr.Button("Predict my finish time", variant="primary")

        # Right column: the rendered prediction card
        with gr.Column():
            output = gr.Markdown()

    # Input order must match the parameter order of predict()
    predict_btn.click(
        fn=predict,
        inputs=[gender, age_group, division, region, year, event_size],
        outputs=output,
    )

    gr.Markdown(_HOW_IT_WORKS_MD)


if __name__ == "__main__":
    demo.launch()