"""OpenMHC leaderboard — self-rendering website (Track 2, imputation).
Computes the leaderboard live from the per-user substrate in the
``MyHeartCounts/OpenMHC-leaderboard-data`` HF dataset (see
``leaderboard_compute.py``) and serves it as an HTML page. The same data is
also exposed as JSON at ``/api/data`` (CORS-enabled) for the public site.
The dataset is public, so no token is required. Styling mirrors the public
MyHeartCounts / OpenMHC site (light theme, red accent). The table is grouped by
sub-track (single-day / long-context) and can be filtered by method type.
"""
from __future__ import annotations
import html
import math
from pathlib import Path
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
from leaderboard_compute import (
compute_downstream_rows,
compute_forecasting_rows,
compute_imputation_rows,
)
# ASGI application object. Passing None for docs_url / redoc_url turns off
# FastAPI's auto-generated documentation pages.
app = FastAPI(
    title="OpenMHC Leaderboard",
    docs_url=None,
    redoc_url=None,
)

# The public site (myheartcounts.stanford.edu) reads /api/data cross-origin,
# so GET requests are accepted from any origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["GET"],
    allow_headers=["*"],
)
# Computing a leaderboard is mildly expensive (download + reduce), so results
# are cached in-process, one slot per track. Each slot starts empty: no rows
# and no recorded error. Every track gets its own independent slot dict.
_CACHE: dict = {
    track_key: {"rows": None, "error": None}
    for track_key in ("downstream", "imputation", "forecasting")
}
# Table layout for the imputation track as (row key, header label) pairs.
# Column order and the ↑ / ↓ direction arrows mirror the paper tables.
# "_pos" is the rank-standing badge column (gold/silver/bronze for the top 3).
_IMPUTATION_COLUMNS = [
    # Identity columns.
    ("_pos", "#"),
    ("method", "Method"),
    ("mtype", "Type"),
    # Headline metrics.
    ("rank", "R ↓"),
    ("skill", "S ↑"),
    ("fair_skill", "S_fair ↑"),
    # Per-category skills.
    ("activity", "Activity ↑"),
    ("physiology", "Physio. ↑"),
    ("sleep", "Sleep ↑"),
    ("workout", "Workout ↑"),
    ("semantic", "Semantic ↑"),
    # Bookkeeping columns.
    ("fallback", "Fallback ↓"),
    ("submitter", "Submitter"),
]

# Forecasting reuses the imputation layout, dropping only the
# imputation-specific "semantic" scope.
_FORECASTING_COLUMNS = [
    column for column in _IMPUTATION_COLUMNS if column[0] != "semantic"
]
# Track 1 (predictive tasks) table layout as (row key, header label) pairs:
# the headline metrics followed by the five outcome-category skills (the task
# domains; same category-balanced bootstrap mean as the headline S).
_DOWNSTREAM_COLUMNS = [
    # Identity columns.
    ("_pos", "#"),
    ("method", "Method"),
    ("mtype", "Type"),
    # Headline metrics.
    ("rank", "R ↓"),
    ("skill", "S ↑"),
    ("fair_skill", "S_fair ↑"),
    # Outcome-category skills.
    ("demographics", "Demo. ↑"),
    ("conditions", "Medical ↑"),
    ("vitals", "Vitals ↑"),
    ("mental", "Mental ↑"),
    ("lifestyle", "Lifestyle ↑"),
    # Bookkeeping columns.
    ("fallback", "Fallback ↓"),
    ("submitter", "Submitter"),
]
# Metric legend shown with the imputation (Track 2a) table: one entry per
# metric, built from adjacent string literals that Python concatenates.
#
# Every literal must open and close on the same physical line; a string that
# runs over a line end is a SyntaxError. Entries are therefore separated by an
# explicit "<br>" at the end of each entry's last literal.
# NOTE(review): "<br>" is assumed as the separator because the note appears to
# be interpolated into the page markup without escaping — confirm against the
# renderer before changing it.
_IMPUTATION_NOTE = (
    "Metric legend — scores are computed live vs the LOCF "
    "(last-observation-carried-forward) baseline.<br>"
    "R (Average Rank) — mean cross-method rank across all masking-scenario × channel "
    "tasks; 1 = best (lower is better).<br>"
    "S (Skill Score) — overall % reduction in reconstruction error vs LOCF "
    "(paired per-user geometric mean across tasks); higher is better.<br>"
    "S_fair (Fairness skill) — % reduction in the cross-subgroup error disparity "
    "(age group + sex, MAPD ratio vs LOCF); higher = more equitable.<br>"
    "Activity / Physio. / Sleep / Workout — per-category skill on that sensor group's "
    "channels (activity = steps, distance, flights; physiology = heart rate, active energy; "
    "sleep = asleep / in-bed; workout = the 10 workout-type channels).<br>"
    "Semantic — skill on the three structured-gap masking scenarios "
    "(sleep gap, workout gap, intensity failure).<br>"
    "Fallback — % of imputed values substituted by the LOCF baseline when the "
    "method produced no valid output (lower is better).<br>"
    "Source: MyHeartCounts/OpenMHC-leaderboard-data."
)
# Metric legend shown with the forecasting (Track 2b) table: one entry per
# metric, built from adjacent string literals that Python concatenates.
#
# Every literal must open and close on the same physical line; a string that
# runs over a line end is a SyntaxError. Entries are therefore separated by an
# explicit "<br>" at the end of each entry's last literal.
# NOTE(review): "<br>" is assumed as the separator because the note appears to
# be interpolated into the page markup without escaping — confirm against the
# renderer before changing it.
_FORECASTING_NOTE = (
    "Metric legend — scores are computed live vs the Seasonal Naive baseline "
    "(24-hour-ahead forecasting; MAE on continuous channels, AUROC on binary).<br>"
    "R (Average Rank) — mean cross-method rank across channel tasks; "
    "1 = best (lower is better).<br>"
    "S (Skill Score) — overall category-balanced % reduction in forecast error vs "
    "Seasonal Naive (paired per-user geometric mean); higher is better.<br>"
    "S_fair (Fairness skill) — % reduction in the cross-subgroup error disparity "
    "(age group + sex, MAPD ratio vs Seasonal Naive); higher = more equitable.<br>"
    "Activity / Physio. / Sleep / Workout — per-category skill on that sensor group's "
    "channels (activity = steps, distance, flights; physiology = heart rate, active energy; "
    "sleep = asleep / in-bed; workout = the 10 workout-type channels).<br>"
    "Fallback — % of forecasts substituted by the Seasonal Naive baseline when "
    "the model produced no valid output (lower is better).<br>"
    "Source: MyHeartCounts/OpenMHC-leaderboard-data."
)
# Metric legend shown with the Track 1 (predictive tasks) table: one entry per
# metric, built from adjacent string literals that Python concatenates.
#
# Every literal must open and close on the same physical line; a string that
# runs over a line end is a SyntaxError. Entries are therefore separated by an
# explicit "<br>" at the end of each entry's last literal.
# NOTE(review): "<br>" is assumed as the separator because the note appears to
# be interpolated into the page markup without escaping — confirm against the
# renderer before changing it.
_DOWNSTREAM_NOTE = (
    "Metric legend — Track 1 predicts weekly health outcomes from 168-hour "
    "sensor embeddings; scores are computed vs the Linear baseline.<br>"
    "R (Average Rank) — mean cross-method rank across the outcome tasks; "
    "1 = best (lower is better).<br>"
    "S (Skill Score) — category-balanced % improvement over Linear across tasks "
    "(per-task AUPRC / Spearman / Pearson, paired-bootstrap mean); higher is better.<br>"
    "S_fair (Fairness skill) — % reduction in the cross-subgroup error disparity "
    "(age group + sex, MAPD ratio vs Linear); higher = more equitable.<br>"
    "Demo. / Medical / Vitals / Mental / Lifestyle — per-category skill on that outcome "
    "group's tasks (Demographics; Medical Conditions & Risk; Vitals & Blood Biomarkers; "
    "Mental Well-Being; Sleep & Lifestyle), category-balanced like S.<br>"
    "Fallback — % of test predictions substituted by the Linear baseline when the "
    "method produced no valid output (lower is better).<br>"
    "Source: MyHeartCounts/OpenMHC-leaderboard-data."
)
def _column_tips(baseline: str) -> dict[str, str]:
    """Build the per-column hover-tooltip text for one track.

    The mapping holds a tip for every column key used by any track's table, so
    a single call serves whichever column layout the track uses.

    Args:
        baseline: Display name of the track's reference method (for example
            ``"LOCF"``); it is interpolated verbatim into the tips that
            compare against the baseline.

    Returns:
        A new dict mapping column key to tooltip text. Tips that do not
        mention the baseline are the same for every track.
    """
    return {
        "_pos": "Leaderboard standing by average rank — gold / silver / bronze mark the top 3.",
        "method": "Model name; links to its published checkpoint on Hugging Face where one exists.",
        "mtype": "Method family — click to filter the table.",
        "rank": "Average Rank (R): mean cross-method rank across tasks; 1 = best (lower is better).",
        "skill": (
            f"Skill Score (S): overall % reduction in error vs the {baseline} baseline "
            "(paired per-user geometric mean); higher is better."
        ),
        "fair_skill": (
            # Only the second fragment interpolates, so the first is a plain literal.
            "Fairness skill (S_fair): % reduction in cross-subgroup (age + sex) error "
            f"disparity vs {baseline} (MAPD ratio); higher = more equitable."
        ),
        # Sensor-group skills (imputation / forecasting tables).
        "activity": f"Skill on activity channels — steps, distance, flights — vs {baseline}.",
        "physiology": f"Skill on physiology channels — heart rate, active energy — vs {baseline}.",
        "sleep": f"Skill on sleep channels — asleep / in-bed — vs {baseline}.",
        "workout": f"Skill on workout channels — the 10 workout-type channels — vs {baseline}.",
        "semantic": (
            "Skill on the three structured-gap masking scenarios "
            "(sleep gap, workout gap, intensity failure)."
        ),
        # Outcome-category skills (predictive-tasks table).
        "demographics": f"Skill on the Demographics tasks (age, sex, BMI) vs {baseline}.",
        "conditions": f"Skill on the Medical Conditions & Risk tasks vs {baseline}.",
        "vitals": f"Skill on the Vitals & Blood Biomarker tasks vs {baseline}.",
        "mental": f"Skill on the Mental Well-Being tasks vs {baseline}.",
        "lifestyle": f"Skill on the Sleep & Lifestyle tasks vs {baseline}.",
        "fallback": (
            f"Fallback rate: % of predictions substituted by the {baseline} fallback "
            "when the model produced no valid output; lower is better."
        ),
        "submitter": "Submitting team.",
    }
# Leaderboard sections in the order they are rendered on the page (one table
# each). Every entry bundles the track's cache/lookup key, the function that
# computes its rows, its column layout, optional (key, label) sub-track
# groupings, the tab and title text, the legend note, and the column tooltips
# built for the track's baseline method.
TRACKS = [
    # Track 1 — predictive tasks; scored against the Linear baseline.
    {
        "key": "downstream",
        "compute": compute_downstream_rows,
        "columns": _DOWNSTREAM_COLUMNS,
        "subtracks": [],
        "tab": "Predictive Tasks",
        "title": "Track 1 · Predictive Tasks",
        "note": _DOWNSTREAM_NOTE,
        "tips": _column_tips("Linear"),
    },
    # Track 2a — imputation; the only track split into sub-tracks.
    {
        "key": "imputation",
        "compute": compute_imputation_rows,
        "columns": _IMPUTATION_COLUMNS,
        "subtracks": [
            ("single-day", "Single-day imputation"),
            ("long-context", "Long-context imputation (≥ 7×1440 time steps)"),
        ],
        "tab": "Imputation",
        "title": "Track 2a · Imputation",
        "note": _IMPUTATION_NOTE,
        "tips": _column_tips("LOCF"),
    },
    # Track 2b — forecasting; scored against the Seasonal Naive baseline.
    {
        "key": "forecasting",
        "compute": compute_forecasting_rows,
        "columns": _FORECASTING_COLUMNS,
        "subtracks": [],
        "tab": "Forecasting",
        "title": "Track 2b · Forecasting",
        "note": _FORECASTING_NOTE,
        "tips": _column_tips("Seasonal Naive"),
    },
]
# External links used by the page.
CODE_URL = "https://github.com/AshleyLab/myheartcounts-dataset"
MAIN_URL = "https://myheartcounts.stanford.edu"
MODELS_URL = "https://huggingface.co/MyHeartCounts/models"

# Where submissions go (PRs) + the step-by-step guide. The guide is a section
# anchor inside the code repository's README.
DATA_URL = "https://huggingface.co/datasets/MyHeartCounts/OpenMHC-leaderboard-data"
SUBMIT_URL = f"{CODE_URL}#submit-to-the-leaderboard"
PAGE = """
OpenMHC Benchmark
Wearable & mobile health benchmark on MyHeartCounts. Track 1 (predictive tasks) predicts weekly health outcomes — demographics, medical risk, vitals, mental well-being, and lifestyle — from 168-hour sensor embeddings. The generative tasks operate on the raw signals: Track 2a (imputation) reconstructs masked daily, minute-level signals, and Track 2b (forecasting) predicts future hourly signals. Each method is ranked by skill score vs a track baseline, computed live from the per-user evaluation substrate.
Submit your model
Add a method by opening a pull request on the
OpenMHC leaderboard dataset
that adds your per-user evaluation substrate
(<track>/<method>.parquet) plus a small
<method>.meta.json sidecar. Produce the substrate by running the OpenMHC
eval with output_dir=…; the maintainers recompute the skill, fairness,
and rank scores from it. See the
step-by-step submission guide
for the exact file schema.
{html.escape(cfg["title"])}
' f"{body}" f'{cfg["note"]}
' "