"""OpenMHC leaderboard — self-rendering website.

Computes the leaderboard live from the per-user substrate in the
``MyHeartCounts/OpenMHC-leaderboard-data`` HF dataset (see
``leaderboard_compute.py``) and serves it as an HTML page, one table per
track: Track 1 (predictive tasks), Track 2a (imputation) and Track 2b
(forecasting). The same data is also exposed as JSON at ``/api/data``
(CORS-enabled) for the public site. The dataset is public, so no token is
required.

Styling mirrors the public MyHeartCounts / OpenMHC site (light theme, red
accent). The imputation table is grouped by sub-track (single-day /
long-context), and every table can be filtered by method type.
"""

from __future__ import annotations

import html
import math
from pathlib import Path

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse

from leaderboard_compute import (
    compute_downstream_rows,
    compute_forecasting_rows,
    compute_imputation_rows,
)

app = FastAPI(title="OpenMHC Leaderboard", docs_url=None, redoc_url=None)

# The public site (myheartcounts.stanford.edu) reads /api/data cross-origin.
# Any origin may issue GET requests; no other HTTP method is allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["GET"],
    allow_headers=["*"],
)

# Compute is mildly expensive (download + reduce); cache per track in-process.
# One slot per track key. "rows" holds the computed leaderboard rows and
# "error" a formatted failure message; both are None until the track has been
# computed for the first time.
_CACHE: dict = {
    "downstream": {"rows": None, "error": None},
    "imputation": {"rows": None, "error": None},
    "forecasting": {"rows": None, "error": None},
}

# (row key, header label). Order + arrows mirror the paper tables.
# "_pos" is the rank-standing badge column (gold/silver/bronze for the top 3).
# Arrows in the labels give the direction of "better" (↓ lower, ↑ higher).
_IMPUTATION_COLUMNS = [
    ("_pos", "#"),
    ("method", "Method"),
    ("mtype", "Type"),
    ("rank", "R ↓"),
    ("skill", "S ↑"),
    ("fair_skill", "S_fair ↑"),
    ("activity", "Activity ↑"),
    ("physiology", "Physio. ↑"),
    ("sleep", "Sleep ↑"),
    ("workout", "Workout ↑"),
    ("semantic", "Semantic ↑"),
    ("fallback", "Fallback ↓"),
    ("submitter", "Submitter"),
]

# Forecasting has the same shape minus the imputation-only "semantic" scope.
_FORECASTING_COLUMNS = [
    (col_key, col_label)
    for col_key, col_label in _IMPUTATION_COLUMNS
    if col_key != "semantic"
]

# Track 1 (predictive tasks): the headline metrics followed by one skill column
# per outcome category (the task domains), using the same category-balanced
# bootstrap mean as the headline S.
_DOWNSTREAM_COLUMNS = [
    ("_pos", "#"),
    ("method", "Method"),
    ("mtype", "Type"),
    ("rank", "R ↓"),
    ("skill", "S ↑"),
    ("fair_skill", "S_fair ↑"),
    ("demographics", "Demo. ↑"),
    ("conditions", "Medical ↑"),
    ("vitals", "Vitals ↑"),
    ("mental", "Mental ↑"),
    ("lifestyle", "Lifestyle ↑"),
    ("fallback", "Fallback ↓"),
    ("submitter", "Submitter"),
]

# Legend text shown under each table: one entry per metric, entries separated
# by a line break.
# NOTE(review): the separator between entries is a plain newline here, yet the
# legends are emitted as HTML — confirm whether `<br>` (and any emphasis
# markup around the metric names) was intended.
_IMPUTATION_NOTE = "\n".join(
    (
        (
            "Metric legend — scores are computed live vs the LOCF "
            "(last-observation-carried-forward) baseline."
        ),
        (
            "R (Average Rank) — mean cross-method rank across all masking-scenario × channel "
            "tasks; 1 = best (lower is better)."
        ),
        (
            "S (Skill Score) — overall % reduction in reconstruction error vs LOCF "
            "(paired per-user geometric mean across tasks); higher is better."
        ),
        (
            "S_fair (Fairness skill) — % reduction in the cross-subgroup error disparity "
            "(age group + sex, MAPD ratio vs LOCF); higher = more equitable."
        ),
        (
            "Activity / Physio. / Sleep / Workout — per-category skill on that sensor group's "
            "channels (activity = steps, distance, flights; physiology = heart rate, active energy; "
            "sleep = asleep / in-bed; workout = the 10 workout-type channels)."
        ),
        (
            "Semantic — skill on the three structured-gap masking scenarios "
            "(sleep gap, workout gap, intensity failure)."
        ),
        (
            "Fallback — % of imputed values substituted by the LOCF baseline when the "
            "method produced no valid output (lower is better)."
        ),
        "Source: MyHeartCounts/OpenMHC-leaderboard-data.",
    )
)

_FORECASTING_NOTE = "\n".join(
    (
        (
            "Metric legend — scores are computed live vs the Seasonal Naive baseline "
            "(24-hour-ahead forecasting; MAE on continuous channels, AUROC on binary)."
        ),
        (
            "R (Average Rank) — mean cross-method rank across channel tasks; "
            "1 = best (lower is better)."
        ),
        (
            "S (Skill Score) — overall category-balanced % reduction in forecast error vs "
            "Seasonal Naive (paired per-user geometric mean); higher is better."
        ),
        (
            "S_fair (Fairness skill) — % reduction in the cross-subgroup error disparity "
            "(age group + sex, MAPD ratio vs Seasonal Naive); higher = more equitable."
        ),
        (
            "Activity / Physio. / Sleep / Workout — per-category skill on that sensor group's "
            "channels (activity = steps, distance, flights; physiology = heart rate, active energy; "
            "sleep = asleep / in-bed; workout = the 10 workout-type channels)."
        ),
        (
            "Fallback — % of forecasts substituted by the Seasonal Naive baseline when "
            "the model produced no valid output (lower is better)."
        ),
        "Source: MyHeartCounts/OpenMHC-leaderboard-data.",
    )
)

_DOWNSTREAM_NOTE = "\n".join(
    (
        (
            "Metric legend — Track 1 predicts weekly health outcomes from 168-hour "
            "sensor embeddings; scores are computed vs the Linear baseline."
        ),
        (
            "R (Average Rank) — mean cross-method rank across the outcome tasks; "
            "1 = best (lower is better)."
        ),
        (
            "S (Skill Score) — category-balanced % improvement over Linear across tasks "
            "(per-task AUPRC / Spearman / Pearson, paired-bootstrap mean); higher is better."
        ),
        (
            "S_fair (Fairness skill) — % reduction in the cross-subgroup error disparity "
            "(age group + sex, MAPD ratio vs Linear); higher = more equitable."
        ),
        (
            "Demo. / Medical / Vitals / Mental / Lifestyle — per-category skill on that outcome "
            "group's tasks (Demographics; Medical Conditions & Risk; Vitals & Blood Biomarkers; "
            "Mental Well-Being; Sleep & Lifestyle), category-balanced like S."
        ),
        (
            "Fallback — % of test predictions substituted by the Linear baseline when the "
            "method produced no valid output (lower is better)."
        ),
        "Source: MyHeartCounts/OpenMHC-leaderboard-data.",
    )
)


def _column_tips(baseline: str) -> dict[str, str]:
    """Build the hover-tooltip text for every column key.

    Args:
        baseline: Display name of the track's reference method; it is
            interpolated into the tips that compare against it.

    Returns:
        A mapping from column key to tooltip text. It covers the column keys
        of all tracks, so a track simply ignores keys it does not display.
    """
    # Sensor-group columns share one sentence shape: key -> channels in the group.
    sensor_groups = (
        ("activity", "steps, distance, flights"),
        ("physiology", "heart rate, active energy"),
        ("sleep", "asleep / in-bed"),
        ("workout", "the 10 workout-type channels"),
    )
    # Outcome-category columns (Track 1) share another: key -> task group.
    outcome_groups = (
        ("demographics", "Demographics tasks (age, sex, BMI)"),
        ("conditions", "Medical Conditions & Risk tasks"),
        ("vitals", "Vitals & Blood Biomarker tasks"),
        ("mental", "Mental Well-Being tasks"),
        ("lifestyle", "Sleep & Lifestyle tasks"),
    )

    tips: dict[str, str] = {
        "_pos": "Leaderboard standing by average rank — gold / silver / bronze mark the top 3.",
        "method": "Model name; links to its published checkpoint on Hugging Face where one exists.",
        "mtype": "Method family — click to filter the table.",
        "rank": "Average Rank (R): mean cross-method rank across tasks; 1 = best (lower is better).",
    }
    tips["skill"] = (
        f"Skill Score (S): overall % reduction in error vs the {baseline} baseline "
        "(paired per-user geometric mean); higher is better."
    )
    tips["fair_skill"] = (
        "Fairness skill (S_fair): % reduction in cross-subgroup (age + sex) error "
        f"disparity vs {baseline} (MAPD ratio); higher = more equitable."
    )
    for group, channels in sensor_groups:
        tips[group] = f"Skill on {group} channels — {channels} — vs {baseline}."
    tips["semantic"] = (
        "Skill on the three structured-gap masking scenarios "
        "(sleep gap, workout gap, intensity failure)."
    )
    for category, tasks in outcome_groups:
        tips[category] = f"Skill on the {tasks} vs {baseline}."
    tips["fallback"] = (
        f"Fallback rate: % of predictions substituted by the {baseline} fallback "
        "when the model produced no valid output; lower is better."
    )
    tips["submitter"] = "Submitting team."
    return tips


# Ordered leaderboard sections rendered on the page (one table each).
# Each entry describes one track: cache key, row-computing callable, column
# layout, optional sub-track grouping, tab/title labels, legend and tooltips.
TRACKS = [
    {
        "key": "downstream",
        "compute": compute_downstream_rows,
        "columns": _DOWNSTREAM_COLUMNS,
        "subtracks": [],
        "tab": "Predictive Tasks",
        "title": "Track 1 · Predictive Tasks",
        "note": _DOWNSTREAM_NOTE,
        "tips": _column_tips("Linear"),
    },
    {
        "key": "imputation",
        "compute": compute_imputation_rows,
        "columns": _IMPUTATION_COLUMNS,
        "subtracks": [
            ("single-day", "Single-day imputation"),
            ("long-context", "Long-context imputation (≥ 7×1440 time steps)"),
        ],
        "tab": "Imputation",
        "title": "Track 2a · Imputation",
        "note": _IMPUTATION_NOTE,
        "tips": _column_tips("LOCF"),
    },
    {
        "key": "forecasting",
        "compute": compute_forecasting_rows,
        "columns": _FORECASTING_COLUMNS,
        "subtracks": [],
        "tab": "Forecasting",
        "title": "Track 2b · Forecasting",
        "note": _FORECASTING_NOTE,
        "tips": _column_tips("Seasonal Naive"),
    },
]

CODE_URL = "https://github.com/AshleyLab/myheartcounts-dataset"
MAIN_URL = "https://myheartcounts.stanford.edu"
MODELS_URL = "https://huggingface.co/MyHeartCounts/models"
# Where submissions go (PRs) + the step-by-step guide.
DATA_URL = "https://huggingface.co/datasets/MyHeartCounts/OpenMHC-leaderboard-data"
SUBMIT_URL = "https://github.com/AshleyLab/myheartcounts-dataset#submit-to-the-leaderboard"

# Page template; `index` fills in the %%…%% placeholders.
# NOTE(review): as it appears here the template (and the HTML-producing
# f-strings further down) carries no markup, and only %%CONTENT%% is present,
# so the URL substitutions in `index` are no-ops and several locals
# (`best`, `cls`, `dv`, `typ`, `tipattr`, `boxes`, `default`, `marker`,
# `ncols`, `hidden`) are computed but never interpolated. This looks like tag
# loss in transit — confirm against the deployed file before relying on it.
PAGE = """ OpenMHC Leaderboard

OpenMHC Benchmark

OpenMHC Leaderboard

Wearable & mobile health benchmark on MyHeartCounts. Track 1 (predictive tasks) predicts weekly health outcomes — demographics, medical risk, vitals, mental well-being, and lifestyle — from 168-hour sensor embeddings. The generative tasks operate on the raw signals: Track 2a (imputation) reconstructs masked daily, minute-level signals, and Track 2b (forecasting) predicts future hourly signals. Each method is ranked by skill score vs a track baseline, computed live from the per-user evaluation substrate.

📤 Submit a model ⚙️ Code 📊 Dataset · coming soon 📄 Paper · coming soon 🏠 MyHeartCounts 🤗 Models

%%CONTENT%%

Submit your model

Add a method by opening a pull request on the OpenMHC leaderboard dataset that adds your per-user evaluation substrate (<track>/<method>.parquet) plus a small <method>.meta.json sidecar. Produce the substrate by running the OpenMHC eval with output_dir=…; the maintainers recompute the skill, fairness, and rank scores from it. See the step-by-step submission guide for the exact file schema.

"""

# How long a cached compute failure is served before the compute is retried.
_ERROR_RETRY_SECONDS = 60.0


def _rows(track_key: str, compute) -> tuple[list[dict] | None, str | None]:
    """Return the cached ``(rows, error)`` pair for a track, computing on demand.

    A successful result is cached for the life of the process. A failure is
    cached too, so the page can show it, but only for
    ``_ERROR_RETRY_SECONDS``: after that the next call retries, so a transient
    download error does not break the track until the process restarts.

    Args:
        track_key: Key of the track's slot in ``_CACHE``.
        compute: Zero-argument callable producing the track's rows.

    Returns:
        ``(rows, None)`` on success, or ``(None, message)`` while a failure is
        cached.
    """
    import time  # function-scope import keeps this block self-contained

    cache = _CACHE[track_key]
    if cache["rows"] is None:
        failed_at = cache.get("error_at")
        retry_due = (
            cache["error"] is not None
            and failed_at is not None
            and time.monotonic() - failed_at >= _ERROR_RETRY_SECONDS
        )
        if cache["error"] is None or retry_due:
            try:
                cache["rows"] = compute()
            except Exception as e:  # surface the failure on the page; don't hide it
                cache["error"] = f"{type(e).__name__}: {e}"
                cache["error_at"] = time.monotonic()
            else:
                cache["error"] = None
    return cache["rows"], cache["error"]


# Per-column heatmap (paper-style blue gradient): better -> more saturated blue.
# `fallback` is a diagnostic, not a score, so it is left out and renders plain on every
# track. (Heatmapping it shades the column only when there's a range of values — a track
# with a non-zero fallback would get a coloured column while all-zero tracks would not.)
HIGHER_BETTER = {
    "skill",
    "fair_skill",
    "activity",
    "physiology",
    "sleep",
    "workout",
    "semantic",
    "demographics",
    "conditions",
    "vitals",
    "mental",
    "lifestyle",
}
LOWER_BETTER = {"rank"}
_HEAT_RGB = (59, 130, 246)
_HEAT_MAX_ALPHA = 0.55


def _is_finite_number(v) -> bool:
    """Return True when ``v`` is an int/float that is neither NaN nor infinite."""
    return isinstance(v, (int, float)) and math.isfinite(v)


def _col_stats(rows: list[dict]) -> dict:
    """Return ``{column key: (min, max)}`` over the finite values of each scored column.

    Non-finite values are skipped because a NaN makes ``min``/``max`` depend on
    row order. A column with no usable value maps to ``(0.0, 0.0)``.
    """
    stats = {}
    for k in HIGHER_BETTER | LOWER_BETTER:
        vals = [r[k] for r in rows if _is_finite_number(r.get(k))]
        stats[k] = (min(vals), max(vals)) if vals else (0.0, 0.0)
    return stats


def _heat_alpha(key: str, v, stats: dict) -> float:
    """Return the heatmap opacity for value ``v`` of column ``key``.

    The result lies in ``[0, _HEAT_MAX_ALPHA]``; missing and non-finite values
    get ``0.0`` (unshaded).
    """
    if not _is_finite_number(v):
        return 0.0
    lo, hi = stats.get(key, (0.0, 0.0))
    if key in LOWER_BETTER:
        t = (hi - v) / (hi - lo) if hi > lo else 0.0  # lower is better
    else:
        t = v / hi if (v > 0 and hi > 0) else 0.0  # non-positive unshaded (paper)
    return round(max(0.0, min(1.0, t)) * _HEAT_MAX_ALPHA, 3)


def _bg_style(key: str, v, stats: dict) -> str:
    """Return an inline ``style`` attribute for the cell shading, or ``""`` if unshaded."""
    a = _heat_alpha(key, v, stats)
    if a <= 0:
        return ""
    r, g, b = _HEAT_RGB
    return f' style="background:rgba({r},{g},{b},{a})"'


def _is_best(key: str, v, stats: dict) -> bool:
    """Return True when ``v`` equals the column's best value (min for lower-is-better)."""
    if not _is_finite_number(v):
        return False
    lo, hi = stats.get(key, (0.0, 0.0))
    target = lo if key in LOWER_BETTER else hi
    return abs(v - target) < 1e-9


def _num_span(key: str, v, stats: dict) -> str:
    """Format one numeric cell value.

    Missing and non-finite values render as an em dash. ``rank`` is printed
    as-is with one decimal, ``fallback`` as an unsigned percentage, and every
    other column as a signed percentage.
    """
    if not _is_finite_number(v):
        return '—'
    if key != "rank" and abs(v) < 1e-9:
        return '0.0'
    if key == "rank":
        txt = f"{v:.1f}"
    elif key == "fallback":
        txt = f"{v * 100:.1f}"  # a rate (% of predictions), unsigned
    else:
        txt = f"{v * 100:+.1f}"
    best = " best" if _is_best(key, v, stats) else ""
    return f'{txt}'


def _rank_key(r: dict):
    """Sort key ordering rows by ``rank``, with missing/non-finite ranks last."""
    v = r.get("rank")
    missing = not _is_finite_number(v)
    return (missing, 0.0 if missing else v)


def _rank_badge(pos: int) -> str:
    """Render the standing badge for a 1-based position (medal class for the top 3)."""
    medal = {1: "gold", 2: "silver", 3: "bronze"}.get(pos, "")
    cls = f"rankbadge {medal}".strip()
    return f'{pos}'


def _method_row(r: dict, section_key: str, stats: dict, columns: list, pos: int) -> str:
    """Render one method's table row.

    Args:
        r: The method's row dict.
        section_key: Key of the table section the row belongs to.
        stats: Per-column ``(min, max)`` from ``_col_stats``.
        columns: ``(key, label)`` pairs, in display order.
        pos: 1-based standing within the section.
    """
    cells = []
    for k, _ in columns:
        if k == "_pos":
            cells.append(f'{_rank_badge(pos)}')
        elif k == "method":
            name = html.escape(str(r.get(k)))
            url = r.get("model_url")
            inner = (
                f'{name}'
                if url
                else name
            )
            cells.append(f'{inner}')
        elif k in ("mtype", "submitter"):
            cells.append(f'{html.escape(str(r.get(k)))}')
        else:
            # numeric — heatmap background + value + sortable raw value
            v = r.get(k)
            dv = repr(float(v)) if _is_finite_number(v) else ""
            cells.append(f'{_num_span(k, v, stats)}')
    typ = html.escape(str(r.get("mtype", "")))
    return f'{"".join(cells)}'


def _th(
    key: str, label: str, type_values: list[str], sort_default_key: str = "", tip: str = ""
) -> str:
    """Render one header cell.

    Args:
        key: Column key; ``"_pos"`` and ``"mtype"`` get special headers.
        label: Visible header label.
        type_values: Distinct method types, used for the Type filter.
        sort_default_key: Key of the column the rows are pre-sorted by.
        tip: Tooltip text; omitted when empty.
    """
    tipattr = f' data-tip="{html.escape(tip)}"' if tip else ""  # custom JS hover tooltip
    if key == "_pos":
        # rank-standing column = not sortable / not filterable
        return f'#'
    if key == "mtype":
        # Type column = checkbox filter (not sortable)
        boxes = "".join(
            f' {html.escape(t)}'
            for t in type_values
        )
        return (
            f'{html.escape(label)} '
            f'▾'
            f''
        )
    # default sort direction the first time a column is clicked: best-first.
    default = "asc" if (key in LOWER_BETTER or key not in HIGHER_BETTER) else "desc"
    # Mark the default-sorted column so its arrow shows on load (rows are
    # already pre-sorted server-side to match).
    marker = ' class="s-asc"' if key == sort_default_key else ""
    return f'{html.escape(label)}'


def _table(rows: list[dict], columns: list, subtracks: list, tips: dict) -> str:
    """Render a track's table, grouped by sub-track when any are configured.

    Rows whose ``subtrack`` matches none of the configured keys are collected
    in an extra "Other" section; sections without rows are skipped.
    """
    ncols = len(columns)
    type_values = sorted({str(r["mtype"]) for r in rows if r.get("mtype")})
    head = "".join(_th(k, label, type_values, "rank", tips.get(k, "")) for k, label in columns)
    stats = _col_stats(rows)

    def group_body(section_key: str, sub_rows: list[dict]) -> str:
        # Default order: best average rank first; positions (1..n) drive the
        # gold/silver/bronze badge. JS re-sorts on header click.
        ordered = sorted(sub_rows, key=_rank_key)
        return "".join(
            _method_row(r, section_key, stats, columns, pos)
            for pos, r in enumerate(ordered, start=1)
        )

    if not subtracks:
        tbodies = f'{group_body("all", rows)}'
        return f'\n\n{head}{tbodies}\n\n'

    known = {k for k, _ in subtracks}
    sections = list(subtracks)
    if any(r.get("subtrack") not in known for r in rows):
        sections = sections + [("other", "Other")]
    tbodies = ""
    for key, label in sections:
        if key in known:
            sub_rows = [r for r in rows if r.get("subtrack") == key]
        else:
            sub_rows = [r for r in rows if r.get("subtrack") not in known]
        if not sub_rows:
            continue
        sec = f'{html.escape(label)}'
        tbodies += f'{sec}{group_body(key, sub_rows)}'
    table = f'\n\n{head}{tbodies}\n\n'
    return table


def _render_section(cfg: dict, active: bool) -> str:
    """Render one track's section: title, table (or an error/empty notice), legend.

    Args:
        cfg: The track's entry from ``TRACKS``.
        active: Whether this is the initially selected track.
    """
    rows, error = _rows(cfg["key"], cfg["compute"])
    if error is not None:
        body = f'\n\nFailed to compute leaderboard:\n{html.escape(error)}\n\n'
    elif not rows:
        body = '\n\nNo methods found in the substrate dataset.\n\n'
    else:
        body = _table(rows, cfg["columns"], cfg["subtracks"], cfg["tips"])
    hidden = "" if active else " hidden"
    return (
        f'\n\n'
        f'\n\n{html.escape(cfg["title"])}\n\n'
        f"{body}"
        # The legend is inserted unescaped; it is a module-level constant.
        f'\n\n{cfg["note"]}\n\n'
        "\n\n"
    )


@app.get("/", response_class=HTMLResponse)
def index() -> str:
    """Serve the leaderboard page with one section per track; the first is active."""
    tabs = "".join(
        f''
        for i, cfg in enumerate(TRACKS)
    )
    sections = "".join(_render_section(cfg, active=(i == 0)) for i, cfg in enumerate(TRACKS))
    content = f'\n\n{tabs}\n\n\n{sections}'
    # Fill the fixed placeholders first and the generated content last, so
    # placeholder-like text inside the content (it includes dataset-provided
    # names) is never rewritten.
    return (
        PAGE.replace("%%CODE%%", CODE_URL)
        .replace("%%MAIN%%", MAIN_URL)
        .replace("%%MODELS%%", MODELS_URL)
        .replace("%%SUBMIT%%", SUBMIT_URL)
        .replace("%%DATA%%", DATA_URL)
        .replace("%%CONTENT%%", content)
    )


@app.get("/health")
def health() -> dict:
    """Report, per track, the number of methods and any compute error.

    ``status`` is ``"error"`` as soon as one track has an error, else ``"ok"``.
    """
    tracks: dict = {}
    ok = True
    for cfg in TRACKS:
        rows, error = _rows(cfg["key"], cfg["compute"])
        tracks[cfg["key"]] = {"methods": len(rows or []), "error": error}
        if error is not None:
            ok = False
    return {"status": "ok" if ok else "error", "tracks": tracks}


# ---------------------------------------------------------------------------
# JSON API — consumed cross-origin by the public MyHeartCounts site.
# ---------------------------------------------------------------------------
def _sanitize(row: dict) -> dict:
    """Replace non-finite floats (NaN/inf) with None so the row is valid JSON.

    Starlette's JSONResponse serialises with ``allow_nan=False``; an unsanitised
    NaN would 500 the endpoint and break the client's ``.json()``.
    """
    return {
        k: (None if isinstance(v, float) and not math.isfinite(v) else v)
        for k, v in row.items()
    }


def _api_subtracks(cfg: dict, rows: list[dict]) -> list[tuple[str, str]]:
    """Subtracks for the payload; mirrors the HTML's dynamic "Other" bucket.

    The frontend silently drops rows whose ``subtrack`` matches no subtrack key,
    so when any row falls outside the configured set we append
    ("other", "Other") — exactly as ``_table`` does for the HTML page.
    """
    subs = list(cfg["subtracks"])
    if subs:
        known = {k for k, _ in subs}
        if any(r.get("subtrack") not in known for r in rows):
            subs = subs + [("other", "Other")]
    return subs


def _track_payload(cfg: dict, rows: list[dict] | None, error: str | None) -> dict:
    """Build the JSON payload for one track.

    Args:
        cfg: The track's entry from ``TRACKS``.
        rows: Computed rows, or None when the compute failed.
        error: Failure message, or None.

    Returns:
        A dict with the track's title, tab label, columns, subtracks, legend,
        sanitised rows and error.
    """
    # `fallback` is a downstream-only column in the public API contract.
    columns = [
        {"key": k, "label": label}
        for k, label in cfg["columns"]
        if not (k == "fallback" and cfg["key"] != "downstream")
    ]
    rows = rows or []
    return {
        "title": cfg["title"],
        "tab": cfg["tab"],
        "columns": columns,
        "subtracks": [{"key": k, "label": label} for k, label in _api_subtracks(cfg, rows)],
        "legend_html": cfg["note"],
        "rows": [_sanitize(r) for r in rows],
        "error": error,
    }


@app.get("/api/data")
def api_data() -> JSONResponse:
    """Serve every track's payload as one JSON object keyed by track key."""
    payload = {
        cfg["key"]: _track_payload(cfg, *_rows(cfg["key"], cfg["compute"]))
        for cfg in TRACKS
    }
    return JSONResponse(payload)


@app.get("/logo.png")
def logo() -> FileResponse:
    """Serve ``logo.png`` from the directory containing this module."""
    return FileResponse(Path(__file__).parent / "logo.png", media_type="image/png")