File size: 8,601 Bytes
b1e04c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
"""
Hyrox Race Time Predictor — Gradio app for Hugging Face Spaces.

Loads the demo regressor + demo classifier (trained without cluster features,
so a real prospective athlete can fill out the form without having raced before)
and predicts:
  1. Expected total finish time in minutes
  2. Probability of finishing under 90 minutes

Required files in the Space repo:
  - app.py                       (this file)
  - requirements.txt
  - hyrox_demo_regressor.pkl     (from notebook Part 6.5)
  - hyrox_demo_classifier.pkl    (from notebook Part 6.5)
  - hyrox_demo_scaler.pkl        (from notebook Part 6.5)
  - hyrox_demo_features.pkl      (from notebook Part 6.5)
  - hyrox_demo_options.pkl       (from notebook Part 6.5)
"""

import pickle
from pathlib import Path

import gradio as gr
import numpy as np
import pandas as pd

# --------------------------------------------------------------------------
# Load model artifacts at startup
# --------------------------------------------------------------------------
HERE = Path(__file__).parent


def _load_artifact(filename):
    """Unpickle and return one artifact stored next to this file."""
    # NOTE: pickle.load can execute arbitrary code, so only artifacts that
    # ship with this repo should ever be placed beside this file.
    with open(HERE / filename, "rb") as handle:
        return pickle.load(handle)


# Loaded once at import time, in the same order as before.
regressor = _load_artifact("hyrox_demo_regressor.pkl")    # used for the finish-time estimate
classifier = _load_artifact("hyrox_demo_classifier.pkl")  # used for the under-90 probability
scaler = _load_artifact("hyrox_demo_scaler.pkl")          # applied before the classifier
feature_columns = _load_artifact("hyrox_demo_features.pkl")  # column set/order for reindexing
options = _load_artifact("hyrox_demo_options.pkl")        # choices/ranges for the form widgets

# Recreate the helpers the notebook used so a single-row prediction lines up
def age_midpoint(s):
    """Convert an age-group label into a single representative age.

    Every '+' is stripped from ``str(s)`` and the remainder is split on '-':

    * one piece    -> that piece parsed as a float (e.g. "60+" -> 60.0)
    * two pieces   -> the mean of the two integer bounds (e.g. "25-29" -> 27.0)
    * anything else, or any parsing failure -> ``np.nan``
    """
    midpoint = np.nan
    try:
        pieces = str(s).replace("+", "").split("-")
        piece_count = len(pieces)
        if piece_count == 1:
            midpoint = float(pieces[0])
        elif piece_count == 2:
            lower, upper = int(pieces[0]), int(pieces[1])
            midpoint = (lower + upper) / 2
    except Exception:
        # Deliberately best-effort: an unparseable label yields NaN, not a crash.
        midpoint = np.nan
    return midpoint


def format_minutes(total_minutes: float) -> str:
    """Format a duration given in minutes as 'Xh YYm ZZs' (or 'Ym ZZs' under an hour).

    The duration is first rounded to the nearest whole second.

    Args:
        total_minutes: Duration in minutes; may be fractional or negative.

    Returns:
        The formatted duration. Negative durations are rendered as their
        magnitude with a leading '-', e.g. ``-1.5`` -> ``'-1m 30s'``.
    """
    total_seconds = int(round(total_minutes * 60))
    # Floor division / modulo on a negative total would "borrow" from the next
    # unit (-90 s used to render as "-1h 58m 30s"), so split the magnitude and
    # re-attach the sign afterwards.
    sign = "-" if total_seconds < 0 else ""
    minutes, seconds = divmod(abs(total_seconds), 60)
    hours, minutes = divmod(minutes, 60)
    if hours:
        return f"{sign}{hours}h {minutes:02d}m {seconds:02d}s"
    return f"{sign}{minutes}m {seconds:02d}s"


# --------------------------------------------------------------------------
# Prediction function
# --------------------------------------------------------------------------
def predict(gender, age_group, division, region, year, event_size):
    """Run a single prediction and return a Markdown summary card.

    Args:
        gender: Selected gender label; ``"male"`` sets the ``is_male`` flag.
        age_group: Selected age-group label; also converted to a numeric
            midpoint via ``age_midpoint``.
        division: Selected division label.
        region: Selected region label.
        year: Race year (converted to float).
        event_size: Number of athletes at the event (converted to float).

    Returns:
        A Markdown/HTML string showing the predicted finish time, the
        probability of finishing under 90 minutes, the margin versus the
        90-minute target, and a colour-coded verdict.
    """
    # Build a one-row DataFrame matching what X_demo looked like before one-hot
    row = pd.DataFrame([{
        "age_numeric": age_midpoint(age_group),
        "is_male": int(gender == "male"),
        "year": float(year),
        "event_size": float(event_size),
        "gender": gender,
        "age_group": age_group,
        "division": division,
        "region": region,
    }])

    # One-hot encode and reindex to the exact training column set.
    # drop_first must be False here: a single row has exactly one level per
    # categorical column, so drop_first=True would drop that only level and
    # every dummy feature would end up 0 after the reindex. Encoding all
    # levels and letting reindex discard columns that are not in
    # feature_columns (i.e. the baseline levels dropped at training time)
    # reproduces the training encoding for one row.
    row_numeric = row[["age_numeric", "is_male", "year", "event_size"]]
    row_categorical = pd.get_dummies(row[["gender", "age_group", "division", "region"]],
                                     drop_first=False)
    X_one = pd.concat([row_numeric, row_categorical], axis=1)
    X_one = X_one.reindex(columns=feature_columns, fill_value=0)

    # Regression: predicted total time in seconds → minutes
    pred_seconds = float(regressor.predict(X_one)[0])
    pred_minutes = pred_seconds / 60

    # Classification: probability of finishing under 90 minutes.
    # Only the classifier receives scaled features.
    X_one_scaled = scaler.transform(X_one)
    prob_under_90 = float(classifier.predict_proba(X_one_scaled)[0, 1])

    # Headline verdict (based on the regressor's point estimate)
    if pred_minutes < 90:
        verdict = "🏃 On pace to break 90 minutes"
        verdict_color = "#16a34a"  # green
    elif pred_minutes < 100:
        verdict = "⏱️ Borderline — within striking distance of 90"
        verdict_color = "#ca8a04"  # amber
    else:
        verdict = "🐒 Likely above 90 minutes"
        verdict_color = "#dc2626"  # red

    delta = pred_minutes - 90
    delta_str = f"+{delta:.1f} min over 90" if delta >= 0 else f"{delta:.1f} min under 90"

    # Build the output card as Markdown
    card = f"""
### Prediction

<div style="padding:14px 18px; border-radius:10px; background:#F9FAFB; border:1px solid #E5E7EB;">

**Predicted finish time:** &nbsp;<span style="font-size:1.4em;"><b>{format_minutes(pred_minutes)}</b></span> &nbsp; ({pred_minutes:.1f} min)

**Probability of finishing under 90 minutes:** &nbsp;<span style="font-size:1.4em;"><b>{prob_under_90 * 100:.0f}%</b></span>

**Margin vs. 90 min target:** {delta_str}

<div style="margin-top:10px; padding:10px 14px; border-radius:8px; background:white; border-left:5px solid {verdict_color}; font-size:1.05em;">
<b>{verdict}</b>
</div>

</div>

<sub>Predictions come from a Gradient Boosting regressor and a Logistic Regression classifier trained on ~92,000 Hyrox results. The demo models use demographics + event metadata only (no race-split features), so anyone can use this — but the production assignment regressor in the linked notebook is more accurate because it also uses athlete-archetype clusters derived from past race splits.</sub>
"""
    return card


# --------------------------------------------------------------------------
# UI
# --------------------------------------------------------------------------
# Constrain and centre the app container.
custom_css = """
.gradio-container {max-width: 920px; margin: auto;}
"""

# Layout: intro text, a two-column row (input form on the left, prediction
# card on the right), then an explanatory footer.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Hyrox Race Time Predictor") as demo:
    gr.Markdown(
        """
        # Hyrox Race Time Predictor 🏋️‍♂️🏃
        Will you finish a Hyrox race in under 90 minutes? Fill in the form below — the model
        will predict your expected total time and the probability you'll cross the line under
        the 90-minute mark.
        """
    )

    with gr.Row():
        with gr.Column():
            # Choices and defaults come from the pickled `options` dict; each
            # default falls back to the first choice when the preferred value
            # is not available.
            gender = gr.Dropdown(
                choices=options["gender"],
                value="male" if "male" in options["gender"] else options["gender"][0],
                label="Gender",
            )
            age_group = gr.Dropdown(
                choices=options["age_group"],
                # Default to the middle entry of the age-group list.
                value=options["age_group"][len(options["age_group"]) // 2],
                label="Age group",
            )
            division = gr.Dropdown(
                choices=options["division"],
                value=options["division"][0],
                label="Division",
                info="Open / Pro / Doubles / Relay etc.",
            )
            region = gr.Dropdown(
                choices=options["region"],
                value="Europe" if "Europe" in options["region"] else options["region"][0],
                label="Region",
            )
            year = gr.Slider(
                minimum=options["year_min"],
                maximum=options["year_max"],
                value=options["year_max"],
                step=1,
                label="Race year",
            )
            event_size = gr.Slider(
                minimum=100,
                maximum=5000,
                value=options["event_size_median"],
                step=50,
                label="Event size (number of athletes)",
                info="Bigger flagship events tend to have slightly different field profiles.",
            )
            predict_btn = gr.Button("Predict my finish time", variant="primary")

        with gr.Column():
            output = gr.Markdown()

    # The input order must match the positional parameters of `predict`.
    predict_btn.click(
        fn=predict,
        inputs=[gender, age_group, division, region, year, event_size],
        outputs=output,
    )

    gr.Markdown(
        """
        ---
        ### How this works
        1. Your inputs are encoded the same way the training data was (one-hot + numeric).
        2. A **Gradient Boosting regressor** estimates total race time in seconds.
        3. A **Logistic Regression classifier**, trained directly on the binary "did this athlete finish under 90 min?" label, estimates the probability shown.
        4. The two models were trained on ~92,000 cleaned Hyrox results from [jgug05/hyrox-results](https://www.kaggle.com/datasets/jgug05/hyrox-results).

        ### Caveats
        - The probability calibration is only as good as the training data — Hyrox times have changed event-to-event and year-to-year.
        - Cluster-based features (which capture athlete archetype from past race splits) were intentionally removed for this demo so the form stays usable.
        """
    )

# Start the Gradio server only when run as a script.
if __name__ == "__main__":
    demo.launch()