"""MoveTSA dataset builder — Hugging Face Space (gated via HF login + allowlist). Generates a **parametrised** windows parquet (HRV / BSI / RSP / simulator aggregates / DATEX / subjective labels) from the PRIVATE raw recordings, entirely server-side. Authorised users only ever download the generated parquet — they never get access to the raw ECG/SIMU files. Backing repos (private, read with the ``HF_TOKEN`` Space secret): - ``MoveTSA/movetsa-raw`` : raw recordings (stay inside the Space) - ``MoveTSA/MoveTSA`` : pipeline code (downloaded at startup, importable as the ``MoveTSA`` package) Access control: the Space is public, but generation requires signing in with Hugging Face and being on the allowlist (the ``ALLOWLIST`` Space variable, a comma-separated list of usernames). Access is requested from ``@thbndi``. Programmatic access (gradio_client / the MovetsaDataset loading script): - named API endpoint ``/generate`` with 6 inputs, in this order: window_size, overlap, normalize, include_baselines, include_familiarization, hf_token - identity: in the browser via ``gr.OAuthProfile`` (the LoginButton); over the API via the explicit ``hf_token`` argument, resolved with ``whoami``. ``gr.OAuthToken`` is NOT populated over the gradio_client API — it only works through the browser OAuth login — so headless callers must pass their token as the 6th argument. """ import os import sys import tempfile import gradio as gr from huggingface_hub import snapshot_download, whoami TOKEN = os.environ.get("HF_TOKEN") CODE_REPO = os.environ.get("CODE_REPO", "MoveTSA/MoveTSA") RAW_REPO = os.environ.get("RAW_REPO", "MoveTSA/movetsa-raw") OWNER = os.environ.get("OWNER_HANDLE", "thbndi") # Comma-separated HF usernames allowed to generate. Edit via the ALLOWLIST # Space *variable* (Settings → Variables and secrets) — no code change needed. 
# Entries are lower-cased here because generate() lower-cases the caller's
# username before the membership test, making the check case-insensitive.
ALLOWLIST = {
    u.strip().lower()
    for u in os.environ.get("ALLOWLIST", OWNER).split(",")
    if u.strip()
}

# Optional server-side cache: identical parameters reuse the same parquet
# instead of re-running the (heavy) pipeline. Keyed on ALL five parameters.
CACHE_DIR = os.environ.get(
    "MOVETSA_CACHE", os.path.join(tempfile.gettempdir(), "movetsa_out"))
os.makedirs(CACHE_DIR, exist_ok=True)

# ----------------------------------------------------------------- startup
# 1) Pipeline code → importable as the `MoveTSA` package (cwd on sys.path).
snapshot_download(CODE_REPO, repo_type="dataset", token=TOKEN,
                  local_dir="MoveTSA", allow_patterns=["*.py", "*.yaml"])
sys.path.insert(0, os.getcwd())

# 2) Raw recordings — private, kept inside the Space, never served to users.
RAW_DIR = snapshot_download(RAW_REPO, repo_type="dataset", token=TOKEN,
                            allow_patterns=["S*/**", "subjective_scores.csv"])

from MoveTSA.export_hf_dataset import build_windows  # noqa: E402 (needs sys.path)


def _resolve_username(profile, oauth_token, hf_token):
    """Identity from the browser OAuth profile, or from an explicit HF token.

    In the browser, Gradio injects ``profile`` after the OAuth login. Headless
    callers (gradio_client / the MovetsaDataset loading script) cannot go
    through the OAuth flow, so they pass their token explicitly and we resolve
    the username with ``whoami``. ``oauth_token`` is kept as a fallback but is
    ``None`` over the API.

    Args:
        profile: the injected OAuth profile, or ``None`` when not signed in.
        oauth_token: the injected OAuth token object, or ``None``.
        hf_token: token string supplied explicitly by an API caller (may be
            empty).

    Returns:
        The username, or ``None`` when no identity could be established
        (no profile, no usable token, or the ``whoami`` lookup failed).
    """
    if profile is not None:
        return profile.username

    # An explicit token takes precedence over the injected OAuth token.
    token = None
    if hf_token:
        token = hf_token
    elif oauth_token is not None and getattr(oauth_token, "token", None):
        token = oauth_token.token

    if token:
        try:
            return whoami(token=token).get("name")
        except Exception:  # noqa: BLE001
            # Deliberate best effort: any lookup failure (bad token, network
            # error) is treated as "not signed in" rather than crashing.
            return None
    return None


def _overlap_tag(ov):
    """Return a filename-safe tag identifying the overlap ``ov``.

    Whole-percent overlaps give the plain percentage (``0.5`` -> ``"50"``).
    Other values keep their fractional part, with ``p`` as the decimal mark
    (``0.285`` -> ``"28p5"``), so that distinct overlaps do not share a cache
    entry. Formatting (rounding) is used instead of ``int(ov * 100)``, which
    truncates and maps e.g. ``0.29`` (``28.999...``) onto the tag of ``0.28``.
    """
    # abs() folds -0.0 (which passes the 0..0.9 range check) into 0.0.
    pct = f"{abs(ov) * 100:.6f}".rstrip("0").rstrip(".")
    return pct.replace(".", "p")


def _write_parquet_atomically(df, dest):
    """Write ``df`` as parquet to ``dest`` without exposing a partial file.

    The data is written to a temporary file in the same directory and then
    moved into place with ``os.replace``. A concurrent request therefore sees
    either no cache entry or a complete one, and a failed write leaves nothing
    behind under the final name.
    """
    fd, tmp_path = tempfile.mkstemp(
        dir=os.path.dirname(dest) or ".",
        prefix=os.path.basename(dest) + ".",
        suffix=".tmp",
    )
    os.close(fd)  # to_parquet reopens the file by path
    try:
        df.to_parquet(tmp_path, index=False)
        os.replace(tmp_path, dest)
    finally:
        # Still present only if the write or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def generate(window_size, overlap, normalize, include_baselines,
             include_familiarization, hf_token="",
             profile: gr.OAuthProfile | None = None,
             oauth_token: gr.OAuthToken | None = None,
             progress=gr.Progress(track_tqdm=True)):
    """Run the pipeline with the chosen parameters and return a parquet file.

    ``hf_token`` is the 6th API input: headless callers pass their HF token so
    the Space can identify them (the browser leaves it empty and authenticates
    via the LoginButton). ``profile``/``oauth_token`` are injected by Gradio
    and are NOT part of the API signature.

    Args:
        window_size: window length in seconds; converted with ``int`` and
            required to lie in 15..180.
        overlap: window overlap; converted with ``float`` and required to lie
            in 0..0.9.
        normalize: one of ``"zscore"``, ``"center"`` or ``"none"``.
        include_baselines: truthy to include the baseline segments.
        include_familiarization: truthy to include the familiarization segment.
        hf_token: explicit HF token for headless callers.
        profile: OAuth profile injected by Gradio (browser only).
        oauth_token: OAuth token injected by Gradio (browser only).
        progress: Gradio progress tracker (tqdm-tracking).

    Returns:
        ``(path, message)``: the path of the generated (or cached) parquet and
        a Markdown status message, or ``(None, message)`` when the caller is
        not authorised or a parameter is invalid.
    """
    username = _resolve_username(profile, oauth_token, hf_token)
    if username is None:
        return None, "Sign in with your Hugging Face account to generate."
    if username.lower() not in ALLOWLIST:
        return None, (
            f"Access not granted for **@{username}**.\n\n"
            f"Request access from **@{OWNER}** (to be added to the allowlist)."
        )

    # --- validate parameters (API callers are not bound by the sliders) ---
    try:
        ws = int(window_size)
        ov = float(overlap)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int() of an infinite float.
        return None, "Invalid parameters."
    if not (15 <= ws <= 180):
        return None, "window_size must be between 15 and 180 s."
    if not (0.0 <= ov <= 0.9):  # also rejects NaN (every comparison is False)
        return None, "overlap must be between 0 and 0.9."
    if normalize not in ("zscore", "center", "none"):
        return None, "normalize must be 'zscore', 'center' or 'none'."
    include_baselines = bool(include_baselines)
    include_familiarization = bool(include_familiarization)

    # Stable filename = also the cache key (all five parameters). The overlap
    # goes through _overlap_tag so that distinct overlaps never collide.
    fname = (f"movetsa_w{ws}_ov{_overlap_tag(ov)}_{normalize}"
             f"_b{int(include_baselines)}_f{int(include_familiarization)}.parquet")
    cached = os.path.join(CACHE_DIR, fname)
    if os.path.exists(cached):
        return cached, f"Served from cache — `{fname}`"

    df = build_windows(
        RAW_DIR,
        window_size=ws,
        window_overlap=ov,
        normalize=(None if normalize == "none" else normalize),
        include_baselines=include_baselines,
        include_familiarization=include_familiarization,
        verbose=False,
    )
    # Atomic write: a concurrent or later request must never be served a
    # half-written cache entry.
    _write_parquet_atomically(df, cached)
    msg = (f"**{len(df)} windows × {df.shape[1]} columns** "
           f"({df['subject'].nunique()} subjects) — `{fname}`")
    return cached, msg


with gr.Blocks(title="MoveTSA dataset builder") as demo:
    gr.Markdown(
        "# MoveTSA dataset builder\n"
        "Generate a **parametrised HRV / BSI / RSP / simulator** parquet from the "
        "(private) raw recordings. You download **only** the generated parquet — "
        "never the raw data.\n\n"
        "1. Sign in with Hugging Face. 2. Set the parameters. 3. Generate."
    )
    gr.LoginButton()
    with gr.Row():
        with gr.Column():
            window_size = gr.Slider(15, 180, value=60, step=5,
                                    label="window_size (s)")
            overlap = gr.Slider(0.0, 0.9, value=0.5, step=0.05,
                                label="overlap")
            normalize = gr.Dropdown(["zscore", "center", "none"],
                                    value="zscore", label="normalize")
            include_baselines = gr.Checkbox(
                value=True, label="include baselines (B1–B4)")
            include_familiarization = gr.Checkbox(
                value=True, label="include familiarization (F)")
            # 6th API input: headless callers pass their HF token here; the
            # browser leaves it empty and authenticates via the LoginButton.
            hf_token = gr.Textbox(value="", visible=False, label="hf_token")
            btn = gr.Button("Generate parquet", variant="primary")
        with gr.Column():
            status = gr.Markdown()
            out_file = gr.File(label="generated parquet")
    btn.click(
        generate,
        inputs=[window_size, overlap, normalize,
                include_baselines, include_familiarization, hf_token],
        outputs=[out_file, status],
        api_name="generate",  # => client.predict(..., api_name="/generate")
    )

if __name__ == "__main__":
    demo.launch()