# Source: Hugging Face Space "MichaelGelshtein/hyrox-predictor"
# (Space status at capture: Sleeping; file size: 8,601 bytes; revision b1e04c6)

"""
Hyrox Race Time Predictor — Gradio app for Hugging Face Spaces.

Loads the demo regressor + demo classifier (trained without cluster features,
so a real prospective athlete can fill out the form without having raced before)
and predicts:
  1. Expected total finish time in minutes
  2. Probability of finishing under 90 minutes

Required files in the Space repo:
  - app.py                       (this file)
  - requirements.txt
  - hyrox_demo_regressor.pkl     (from notebook Part 6.5)
  - hyrox_demo_classifier.pkl    (from notebook Part 6.5)
  - hyrox_demo_scaler.pkl        (from notebook Part 6.5)
  - hyrox_demo_features.pkl      (from notebook Part 6.5)
  - hyrox_demo_options.pkl       (from notebook Part 6.5)
"""

import pickle
from pathlib import Path

import gradio as gr
import numpy as np
import pandas as pd

# --------------------------------------------------------------------------
# Load model artifacts at startup
# --------------------------------------------------------------------------
HERE = Path(__file__).parent


def _load_artifact(filename):
    """Unpickle and return one artifact stored in the same directory as this file."""
    with open(HERE / filename, "rb") as handle:
        return pickle.load(handle)


# Every artifact is read once at import time so predictions do not touch disk.
regressor = _load_artifact("hyrox_demo_regressor.pkl")
classifier = _load_artifact("hyrox_demo_classifier.pkl")
scaler = _load_artifact("hyrox_demo_scaler.pkl")
feature_columns = _load_artifact("hyrox_demo_features.pkl")
options = _load_artifact("hyrox_demo_options.pkl")

# Recreate the helpers the notebook used so a single-row prediction lines up
def age_midpoint(s: object) -> float:
    """Convert an age-group label into a single representative age.

    The label is stringified, any ``+`` is removed, and the result is split
    on ``-``:

    * two parts (``"25-29"``) -> the mean of the two integer bounds (27.0)
    * one part  (``"70+"``)   -> that number as a float (70.0)
    * anything else, or parts that are not numeric -> ``np.nan``

    Args:
        s: Age-group label; non-string values are passed through ``str()``.

    Returns:
        The representative age as a float, or ``np.nan`` when the label
        cannot be parsed.
    """
    parts = str(s).replace("+", "").split("-")
    # Only the numeric conversions can fail, and on string input they fail
    # with ValueError -- catch just that so unrelated errors are not hidden.
    try:
        if len(parts) == 2:
            return (int(parts[0]) + int(parts[1])) / 2
        if len(parts) == 1:
            return float(parts[0])
    except ValueError:
        return np.nan
    # Three or more dash-separated parts: not a recognised label shape.
    return np.nan


def format_minutes(total_minutes: float) -> str:
    """Format a duration given in minutes as a human-readable string.

    The duration is rounded to the nearest whole second and rendered as
    ``'Hh MMm SSs'`` when it is at least one hour, otherwise as ``'Mm SSs'``.
    Negative durations are rendered as the formatted magnitude with a
    leading ``-``.

    Args:
        total_minutes: Duration in (possibly fractional) minutes.

    Returns:
        The formatted duration, e.g. ``'1h 30m 00s'`` or ``'45m 30s'``.
    """
    total_seconds = int(round(total_minutes * 60))
    # Split the magnitude, not the signed value: floor division on a negative
    # number would borrow from the next unit (-30 s would become -1h 59m 30s).
    sign = "-" if total_seconds < 0 else ""
    minutes, seconds = divmod(abs(total_seconds), 60)
    hours, minutes = divmod(minutes, 60)
    if hours:
        return f"{sign}{hours}h {minutes:02d}m {seconds:02d}s"
    return f"{sign}{minutes}m {seconds:02d}s"


# --------------------------------------------------------------------------
# Prediction function
# --------------------------------------------------------------------------
def predict(gender, age_group, division, region, year, event_size):
    """Run a single prediction and return a Markdown summary card.

    Args:
        gender: Selected gender label; ``"male"`` sets the ``is_male`` flag.
        age_group: Age-group label, also converted to a numeric midpoint.
        division: Selected division label.
        region: Selected region label.
        year: Race year (converted to float).
        event_size: Number of athletes at the event (converted to float).

    Returns:
        A Markdown/HTML string with the predicted finish time, the
        probability of finishing under 90 minutes, the margin relative to
        90 minutes, and a colour-coded verdict.
    """
    # Build a one-row DataFrame matching what X_demo looked like before one-hot
    row = pd.DataFrame([{
        "age_numeric": age_midpoint(age_group),
        "is_male": int(gender == "male"),
        "year": float(year),
        "event_size": float(event_size),
        "gender": gender,
        "age_group": age_group,
        "division": division,
        "region": region,
    }])

    numeric_part = row[["age_numeric", "is_male", "year", "event_size"]]
    # Do NOT pass drop_first=True here: with a single row every categorical has
    # exactly one level, so drop_first would discard it and no indicator column
    # would ever be set. Encode every level instead; the reindex below drops
    # indicators the model was not trained on (the reference levels) and
    # zero-fills the ones this row does not have.
    categorical_part = pd.get_dummies(
        row[["gender", "age_group", "division", "region"]],
        dtype=float,
    )
    X_one = pd.concat([numeric_part, categorical_part], axis=1)
    X_one = X_one.reindex(columns=feature_columns, fill_value=0)

    # Regression: predicted total time in seconds → minutes
    pred_seconds = float(regressor.predict(X_one)[0])
    pred_minutes = pred_seconds / 60

    # Classification: probability of finishing under 90 minutes.
    # Only the classifier is fed scaled features; column 1 of predict_proba is
    # taken as the "under 90" class.
    X_one_scaled = scaler.transform(X_one)
    prob_under_90 = float(classifier.predict_proba(X_one_scaled)[0, 1])

    # Headline verdict
    if pred_minutes < 90:
        verdict = "🏃 On pace to break 90 minutes"
        verdict_color = "#16a34a"  # green
    elif pred_minutes < 100:
        verdict = "⏱️ Borderline — within striking distance of 90"
        verdict_color = "#ca8a04"  # amber
    else:
        verdict = "🐢 Likely above 90 minutes"
        verdict_color = "#dc2626"  # red

    delta = pred_minutes - 90
    if delta >= 0:
        delta_str = f"+{delta:.1f} min over 90"
    else:
        # Show the magnitude: "-5.0 min under 90" would be a double negative.
        delta_str = f"{abs(delta):.1f} min under 90"

    # Build the output card as Markdown
    card = f"""
### Prediction

<div style="padding:14px 18px; border-radius:10px; background:#F9FAFB; border:1px solid #E5E7EB;">

**Predicted finish time:** &nbsp;<span style="font-size:1.4em;"><b>{format_minutes(pred_minutes)}</b></span> &nbsp; ({pred_minutes:.1f} min)

**Probability of finishing under 90 minutes:** &nbsp;<span style="font-size:1.4em;"><b>{prob_under_90 * 100:.0f}%</b></span>

**Margin vs. 90 min target:** {delta_str}

<div style="margin-top:10px; padding:10px 14px; border-radius:8px; background:white; border-left:5px solid {verdict_color}; font-size:1.05em;">
<b>{verdict}</b>
</div>

</div>

<sub>Predictions come from a Gradient Boosting regressor and a Logistic Regression classifier trained on ~92,000 Hyrox results. The demo models use demographics + event metadata only (no race-split features), so anyone can use this — but the production assignment regressor in the linked notebook is more accurate because it also uses athlete-archetype clusters derived from past race splits.</sub>
"""
    return card


# --------------------------------------------------------------------------
# UI
# --------------------------------------------------------------------------
# Extra CSS passed to gr.Blocks: caps the app container width and centres it.
custom_css = """
.gradio-container {max-width: 920px; margin: auto;}
"""

# Build the whole interface. `demo` is the module-level Blocks object that the
# launch guard at the bottom of the file starts.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Hyrox Race Time Predictor") as demo:
    # Page heading and short introduction.
    gr.Markdown(
        """
        # Hyrox Race Time Predictor 🏋️‍♂️🏃
        Will you finish a Hyrox race in under 90 minutes? Fill in the form below — the model
        will predict your expected total time and the probability you'll cross the line under
        the 90-minute mark.
        """
    )

    # Two columns: the input form first, the prediction card second.
    with gr.Row():
        with gr.Column():
            # Choices, ranges and defaults are read from the `options` mapping
            # (keys used: gender, age_group, division, region, year_min,
            # year_max, event_size_median).
            gender = gr.Dropdown(
                choices=options["gender"],
                # Default to "male" when offered, otherwise the first choice.
                value="male" if "male" in options["gender"] else options["gender"][0],
                label="Gender",
            )
            age_group = gr.Dropdown(
                choices=options["age_group"],
                # Default to the entry in the middle of the list.
                value=options["age_group"][len(options["age_group"]) // 2],
                label="Age group",
            )
            division = gr.Dropdown(
                choices=options["division"],
                value=options["division"][0],
                label="Division",
                info="Open / Pro / Doubles / Relay etc.",
            )
            region = gr.Dropdown(
                choices=options["region"],
                # Default to "Europe" when offered, otherwise the first choice.
                value="Europe" if "Europe" in options["region"] else options["region"][0],
                label="Region",
            )
            year = gr.Slider(
                minimum=options["year_min"],
                maximum=options["year_max"],
                value=options["year_max"],
                step=1,
                label="Race year",
            )
            # NOTE(review): the slider bounds (100-5000, step 50) are hard-coded
            # while the default comes from options["event_size_median"] --
            # confirm the median always falls inside this range.
            event_size = gr.Slider(
                minimum=100,
                maximum=5000,
                value=options["event_size_median"],
                step=50,
                label="Event size (number of athletes)",
                info="Bigger flagship events tend to have slightly different field profiles.",
            )
            predict_btn = gr.Button("Predict my finish time", variant="primary")

        with gr.Column():
            # Empty Markdown component that receives the card returned by predict().
            output = gr.Markdown()

    # Wire the button: the inputs list is in the same order as predict()'s
    # positional parameters.
    predict_btn.click(
        fn=predict,
        inputs=[gender, age_group, division, region, year, event_size],
        outputs=output,
    )

    # Static explanatory footer shown below the form.
    gr.Markdown(
        """
        ---
        ### How this works
        1. Your inputs are encoded the same way the training data was (one-hot + numeric).
        2. A **Gradient Boosting regressor** estimates total race time in seconds.
        3. A **Logistic Regression classifier**, trained directly on the binary "did this athlete finish under 90 min?" label, estimates the probability shown.
        4. The two models were trained on ~92,000 cleaned Hyrox results from [jgug05/hyrox-results](https://www.kaggle.com/datasets/jgug05/hyrox-results).

        ### Caveats
        - The probability calibration is only as good as the training data — Hyrox times have changed event-to-event and year-to-year.
        - Cluster-based features (which capture athlete archetype from past race splits) were intentionally removed for this demo so the form stays usable.
        """
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()