Spaces:

MichaelGelshtein
/

hyrox-predictor

Running

App Files Files Community

hyrox-predictor / app.py

MichaelGelshtein

Upload 2 files

b1e04c6 verified 10 days ago

raw

history blame contribute delete

8.6 kB

	"""
	Hyrox Race Time Predictor — Gradio app for Hugging Face Spaces.

	Loads the demo regressor + demo classifier (trained without cluster features,
	so a real prospective athlete can fill out the form without having raced before)
	and predicts:
	1. Expected total finish time in minutes
	2. Probability of finishing under 90 minutes

	Required files in the Space repo:
	- app.py (this file)
	- requirements.txt
	- hyrox_demo_regressor.pkl (from notebook Part 6.5)
	- hyrox_demo_classifier.pkl (from notebook Part 6.5)
	- hyrox_demo_scaler.pkl (from notebook Part 6.5)
	- hyrox_demo_features.pkl (from notebook Part 6.5)
	- hyrox_demo_options.pkl (from notebook Part 6.5)
	"""

	import pickle
	from pathlib import Path

	import gradio as gr
	import numpy as np
	import pandas as pd

	# --------------------------------------------------------------------------
	# Load model artifacts at startup
	# --------------------------------------------------------------------------
	# Directory that holds this script; every model artifact is read from it.
	HERE = Path(__file__).parent


	def _load_artifact(filename):
	    """Unpickle and return the object stored in `filename` next to this file."""
	    # NOTE: pickle.load can execute arbitrary code while deserialising, so
	    # only artifacts produced by a trusted source should be placed here.
	    with open(HERE / filename, "rb") as handle:
	        return pickle.load(handle)


	# Loaded once at import time so each prediction reuses the same objects.
	regressor = _load_artifact("hyrox_demo_regressor.pkl")
	classifier = _load_artifact("hyrox_demo_classifier.pkl")
	scaler = _load_artifact("hyrox_demo_scaler.pkl")
	feature_columns = _load_artifact("hyrox_demo_features.pkl")
	options = _load_artifact("hyrox_demo_options.pkl")

	# Recreate the helpers the notebook used so a single-row prediction lines up
	def age_midpoint(s):
	    """Turn an age-group label into a single representative numeric age.

	    The label is converted to text, any "+" is removed, and the remainder is
	    split on "-". Two pieces (e.g. "25-29") give the mean of the two integer
	    bounds; one piece (e.g. "70+") gives that number as a float. Anything
	    else, or anything that fails to parse, gives NaN.
	    """
	    try:
	        bounds = str(s).replace("+", "").split("-")
	        count = len(bounds)
	        if count == 1:
	            return float(bounds[0])
	        if count == 2:
	            low, high = bounds
	            return (int(low) + int(high)) / 2
	    except Exception:
	        # Any parsing failure is reported as NaN by the fall-through below.
	        pass
	    return np.nan


	def format_minutes(total_minutes: float) -> str:
	    """Render a duration given in minutes as 'Xh YYm ZZs'.

	    The value is rounded to the nearest whole second. Durations shorter than
	    one hour omit the hour part and are rendered as 'Ym ZZs'.

	    Args:
	        total_minutes: Duration in minutes; may be fractional or negative.

	    Returns:
	        The formatted duration. Negative durations are rendered as the
	        magnitude with a leading "-".
	    """
	    total_seconds = int(round(total_minutes * 60))
	    # Split the magnitude, not the signed value: floor division and modulo on
	    # a negative number would otherwise yield e.g. "-1h 58m 30s" for -90 s.
	    sign = "-" if total_seconds < 0 else ""
	    minutes, seconds = divmod(abs(total_seconds), 60)
	    hours, minutes = divmod(minutes, 60)
	    if hours:
	        return f"{sign}{hours}h {minutes:02d}m {seconds:02d}s"
	    return f"{sign}{minutes}m {seconds:02d}s"


	# --------------------------------------------------------------------------
	# Prediction function
	# --------------------------------------------------------------------------
	def predict(gender, age_group, division, region, year, event_size):
	    """Run a single prediction and return a Markdown summary card.

	    Args:
	        gender: Selected gender label; "male" sets the `is_male` flag to 1.
	        age_group: Age-group label, also converted to a number via
	            `age_midpoint`.
	        division: Selected division label.
	        region: Selected region label.
	        year: Race year; converted to float.
	        event_size: Number of athletes at the event; converted to float.

	    Returns:
	        A Markdown/HTML string showing the predicted finish time, the
	        probability of finishing under 90 minutes, the margin against the
	        90-minute target, and a colour-coded verdict.
	    """
	    # Build a one-row DataFrame matching what X_demo looked like before one-hot
	    row = pd.DataFrame([{
	        "age_numeric": age_midpoint(age_group),
	        "is_male": int(gender == "male"),
	        "year": float(year),
	        "event_size": float(event_size),
	        "gender": gender,
	        "age_group": age_group,
	        "division": division,
	        "region": region,
	    }])

	    row_numeric = row[["age_numeric", "is_male", "year", "event_size"]]
	    # Do NOT pass drop_first=True here: with a single row every categorical
	    # column has exactly one level, so drop_first would remove all dummy
	    # columns and the categorical inputs would never reach the models.
	    # Encoding every level and then reindexing to the training columns
	    # discards the baseline levels that are absent from feature_columns.
	    row_categorical = pd.get_dummies(
	        row[["gender", "age_group", "division", "region"]]
	    )
	    X_one = pd.concat([row_numeric, row_categorical], axis=1)
	    # Columns missing from this row are filled with 0; the cast gives the
	    # models one uniform float dtype instead of a bool/int/float mix.
	    X_one = X_one.reindex(columns=feature_columns, fill_value=0).astype(float)

	    # Regression: predicted total time in seconds → minutes
	    pred_seconds = float(regressor.predict(X_one)[0])
	    pred_minutes = pred_seconds / 60

	    # Classification: the classifier is fed the scaled features and column 1
	    # of predict_proba is read as the probability of finishing under 90 min.
	    X_one_scaled = scaler.transform(X_one)
	    prob_under_90 = float(classifier.predict_proba(X_one_scaled)[0, 1])

	    # Headline verdict
	    if pred_minutes < 90:
	        verdict = "🏃 On pace to break 90 minutes"
	        verdict_color = "#16a34a"  # green
	    elif pred_minutes < 100:
	        verdict = "⏱️ Borderline — within striking distance of 90"
	        verdict_color = "#ca8a04"  # amber
	    else:
	        verdict = "🐢 Likely above 90 minutes"
	        verdict_color = "#dc2626"  # red

	    delta = pred_minutes - 90
	    if delta >= 0:
	        delta_str = f"+{delta:.1f} min over 90"
	    else:
	        # Show the magnitude: "-3.2 min under 90" would be a double negative.
	        delta_str = f"{abs(delta):.1f} min under 90"

	    # Build the output card as Markdown
	    card = f"""
	### Prediction

	<div style="padding:14px 18px; border-radius:10px; background:#F9FAFB; border:1px solid #E5E7EB;">

	Predicted finish time:  <span style="font-size:1.4em;"><b>{format_minutes(pred_minutes)}</b></span>   ({pred_minutes:.1f} min)

	Probability of finishing under 90 minutes:  <span style="font-size:1.4em;"><b>{prob_under_90 * 100:.0f}%</b></span>

	Margin vs. 90 min target: {delta_str}

	<div style="margin-top:10px; padding:10px 14px; border-radius:8px; background:white; border-left:5px solid {verdict_color}; font-size:1.05em;">
	<b>{verdict}</b>
	</div>

	</div>

	<sub>Predictions come from a Gradient Boosting regressor and a Logistic Regression classifier trained on ~92,000 Hyrox results. The demo models use demographics + event metadata only (no race-split features), so anyone can use this — but the production assignment regressor in the linked notebook is more accurate because it also uses athlete-archetype clusters derived from past race splits.</sub>
	"""
	    return card


	# --------------------------------------------------------------------------
	# UI
	# --------------------------------------------------------------------------
	# Stylesheet override: cap the page width at 920px and centre the container.
	custom_css = """
	.gradio-container {max-width: 920px; margin: auto;}
	"""

	# Heading and introduction rendered at the top of the page.
	_INTRO_MD = """
	# Hyrox Race Time Predictor 🏋️‍♂️🏃
	Will you finish a Hyrox race in under 90 minutes? Fill in the form below — the model
	will predict your expected total time and the probability you'll cross the line under
	the 90-minute mark.
	"""

	# Explanatory notes rendered underneath the form.
	_EXPLAINER_MD = """
	---
	### How this works
	1. Your inputs are encoded the same way the training data was (one-hot + numeric).
	2. A Gradient Boosting regressor estimates total race time in seconds.
	3. A Logistic Regression classifier, trained directly on the binary "did this athlete finish under 90 min?" label, estimates the probability shown.
	4. The two models were trained on ~92,000 cleaned Hyrox results from [jgug05/hyrox-results](https://www.kaggle.com/datasets/jgug05/hyrox-results).

	### Caveats
	- The probability calibration is only as good as the training data — Hyrox times have changed event-to-event and year-to-year.
	- Cluster-based features (which capture athlete archetype from past race splits) were intentionally removed for this demo so the form stays usable.
	"""


	def _preferred_or_first(choices, preferred):
	    """Return `preferred` when it is one of `choices`, else the first choice."""
	    if preferred in choices:
	        return preferred
	    return choices[0]


	with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Hyrox Race Time Predictor") as demo:
	    gr.Markdown(_INTRO_MD)

	    with gr.Row():
	        # Left column: the input form and the submit button.
	        with gr.Column():
	            gender_choices = options["gender"]
	            gender = gr.Dropdown(
	                label="Gender",
	                choices=gender_choices,
	                value=_preferred_or_first(gender_choices, "male"),
	            )

	            age_choices = options["age_group"]
	            age_group = gr.Dropdown(
	                label="Age group",
	                choices=age_choices,
	                # Start on the entry in the middle of the list.
	                value=age_choices[len(age_choices) // 2],
	            )

	            division_choices = options["division"]
	            division = gr.Dropdown(
	                label="Division",
	                info="Open / Pro / Doubles / Relay etc.",
	                choices=division_choices,
	                value=division_choices[0],
	            )

	            region_choices = options["region"]
	            region = gr.Dropdown(
	                label="Region",
	                choices=region_choices,
	                value=_preferred_or_first(region_choices, "Europe"),
	            )

	            latest_year = options["year_max"]
	            year = gr.Slider(
	                label="Race year",
	                minimum=options["year_min"],
	                maximum=latest_year,
	                value=latest_year,
	                step=1,
	            )

	            event_size = gr.Slider(
	                label="Event size (number of athletes)",
	                info="Bigger flagship events tend to have slightly different field profiles.",
	                minimum=100,
	                maximum=5000,
	                value=options["event_size_median"],
	                step=50,
	            )

	            predict_btn = gr.Button("Predict my finish time", variant="primary")

	        # Right column: the rendered prediction card.
	        with gr.Column():
	            output = gr.Markdown()

	    # Inputs are listed in the same order as the parameters of predict().
	    form_inputs = [gender, age_group, division, region, year, event_size]
	    predict_btn.click(fn=predict, inputs=form_inputs, outputs=output)

	    gr.Markdown(_EXPLAINER_MD)

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()