Spaces:
Sleeping
Sleeping
"""MoveTSA dataset builder — Hugging Face Space (gated via HF login + allowlist).

Generates a **parametrised** windows parquet (HRV / BSI / RSP / simulator
aggregates / DATEX / subjective labels) from the PRIVATE raw recordings,
entirely server-side. Authorised users only ever download the generated
parquet — they never get access to the raw ECG/SIMU files.

Backing repos (private, read with the ``HF_TOKEN`` Space secret):
  - ``MoveTSA/movetsa-raw`` : raw recordings (stay inside the Space)
  - ``MoveTSA/MoveTSA``     : pipeline code (downloaded at startup, importable
                              as the ``MoveTSA`` package)

Access control: the Space is public, but generation requires signing in with
Hugging Face and being on the allowlist (the ``ALLOWLIST`` Space variable, a
comma-separated list of usernames). Access is requested from ``@thbndi``.

Programmatic access (gradio_client / the MovetsaDataset loading script):
  - named API endpoint ``/generate`` with 6 inputs, in this order:
        window_size, overlap, normalize, include_baselines,
        include_familiarization, hf_token
  - identity: in the browser via ``gr.OAuthProfile`` (the LoginButton); over
    the API via the explicit ``hf_token`` argument, resolved with ``whoami``.
    ``gr.OAuthToken`` is NOT populated over the gradio_client API — it only
    works through the browser OAuth login — so headless callers must pass their
    token as the 6th argument.
"""
| import os | |
| import sys | |
| import tempfile | |
| import gradio as gr | |
| from huggingface_hub import snapshot_download, whoami | |
def _parse_allowlist(raw):
    """Turn a comma-separated string of HF handles into a set of usernames.

    Each entry is trimmed, stripped of a leading ``@`` (handles are shown as
    ``@name`` in the UI, so admins tend to type them that way) and lowercased
    so that membership tests can be case-insensitive. Empty entries are
    dropped.
    """
    handles = (part.strip().lstrip("@").strip().lower()
               for part in raw.split(","))
    return {handle for handle in handles if handle}


# Space secret used to read the private backing repos.
TOKEN = os.environ.get("HF_TOKEN")
CODE_REPO = os.environ.get("CODE_REPO", "MoveTSA/MoveTSA")
RAW_REPO = os.environ.get("RAW_REPO", "MoveTSA/movetsa-raw")
# Stored without a leading "@": the UI messages add it themselves.
OWNER = os.environ.get("OWNER_HANDLE", "thbndi").strip().lstrip("@")

# Comma-separated HF usernames allowed to generate. Edit via the ALLOWLIST
# Space *variable* (Settings → Variables and secrets) — no code change needed.
# When the variable is unset only OWNER is allowed; when it is set but empty
# the allowlist is empty and nobody can generate.
ALLOWLIST = _parse_allowlist(os.environ.get("ALLOWLIST", OWNER))

# Optional server-side cache: identical parameters reuse the same parquet
# instead of re-running the (heavy) pipeline. Keyed on ALL five parameters.
CACHE_DIR = os.environ.get("MOVETSA_CACHE",
                           os.path.join(tempfile.gettempdir(), "movetsa_out"))
os.makedirs(CACHE_DIR, exist_ok=True)
# ----------------------------------------------------------------- startup
# Step 1: fetch the pipeline sources into ./MoveTSA and put the working
# directory first on sys.path, so the folder is importable as `MoveTSA`.
snapshot_download(
    repo_id=CODE_REPO,
    repo_type="dataset",
    token=TOKEN,
    local_dir="MoveTSA",
    allow_patterns=["*.py", "*.yaml"],
)
sys.path.insert(0, os.getcwd())

# Step 2: fetch the private raw recordings. They stay on the Space's disk and
# are only ever read by the pipeline, never handed to users.
RAW_DIR = snapshot_download(
    repo_id=RAW_REPO,
    repo_type="dataset",
    token=TOKEN,
    allow_patterns=["S*/**", "subjective_scores.csv"],
)

# Must come after the sys.path tweak above, hence the late import.
from MoveTSA.export_hf_dataset import build_windows  # noqa: E402 (needs sys.path)
def _resolve_username(profile, oauth_token, hf_token):
    """Return the caller's Hugging Face username, or ``None`` if unknown.

    In the browser, Gradio injects ``profile`` after the OAuth login. Headless
    callers (gradio_client / the MovetsaDataset loading script) cannot go
    through the OAuth flow, so they pass their token explicitly and the
    username is resolved with ``whoami``. ``oauth_token`` is kept as a
    fallback but is ``None`` over the API.

    Args:
        profile: object exposing ``.username`` (the OAuth profile), or ``None``.
        oauth_token: object exposing ``.token``, or ``None``.
        hf_token: explicit token string supplied by an API caller; surrounding
            whitespace is ignored, and a blank or non-string value counts as
            "not provided".

    Returns:
        The username, or ``None`` when no identity is available or the token
        cannot be resolved.
    """
    # Browser path: the OAuth profile already carries the username.
    if profile is not None:
        return profile.username

    # Tokens read from files or env vars often carry a trailing newline, which
    # would make the lookup fail; normalise before using the value.
    token = hf_token.strip() if isinstance(hf_token, str) else ""
    if not token and oauth_token is not None:
        token = getattr(oauth_token, "token", None)
    if not token:
        return None

    try:
        return whoami(token=token).get("name")
    except Exception:  # noqa: BLE001
        # Best effort on purpose: any failure to resolve the token is treated
        # as "not signed in" instead of surfacing an error to the caller.
        return None
def generate(window_size, overlap, normalize, include_baselines,
             include_familiarization, hf_token="",
             profile: gr.OAuthProfile | None = None,
             oauth_token: gr.OAuthToken | None = None,
             progress=gr.Progress(track_tqdm=True)):
    """Run the pipeline with the chosen parameters and return a parquet file.

    ``hf_token`` is the 6th API input: headless callers pass their HF token so
    the Space can identify them (the browser leaves it empty and authenticates
    via the LoginButton). ``profile``/``oauth_token`` are injected by Gradio and
    are NOT part of the API signature.

    Args:
        window_size: window length in seconds; converted with ``int`` and
            required to be in [15, 180].
        overlap: window overlap fraction in [0, 0.9]. Rounded to two decimals
            so the value given to the pipeline and the value encoded in the
            cache filename always agree.
        normalize: one of ``"zscore"``, ``"center"`` or ``"none"``.
        include_baselines: coerced to ``bool``.
        include_familiarization: coerced to ``bool``.
        hf_token: explicit token for headless callers.
        profile, oauth_token: identity objects injected by Gradio.
        progress: Gradio progress tracker (declared so tqdm output is relayed).

    Returns:
        ``(path, message)``: the parquet path and a Markdown status line, or
        ``(None, message)`` when access is refused or a parameter is invalid.
    """
    # --- access control ---
    username = _resolve_username(profile, oauth_token, hf_token)
    if username is None:
        return None, "Sign in with your Hugging Face account to generate."
    if username.lower() not in ALLOWLIST:
        return None, (
            f"Access not granted for **@{username}**.\n\n"
            f"Request access from **@{OWNER}** (to be added to the allowlist)."
        )

    # --- validate parameters (API callers are not bound by the sliders) ---
    try:
        ws = int(window_size)
        ov = float(overlap)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")) — reachable through the API.
        return None, "Invalid parameters."
    if not (15 <= ws <= 180):
        return None, "window_size must be between 15 and 180 s."
    if not (0.0 <= ov <= 0.9):
        return None, "overlap must be between 0 and 0.9."
    if normalize not in ("zscore", "center", "none"):
        return None, "normalize must be 'zscore', 'center' or 'none'."
    include_baselines = bool(include_baselines)
    include_familiarization = bool(include_familiarization)

    # Snap the overlap to the resolution of the cache key. Truncating with
    # int(ov * 100) mapped e.g. 0.29 to "28" (0.29 * 100 == 28.999...), so
    # distinct overlaps shared one cache entry and got each other's file.
    ov = round(ov, 2)

    # Stable filename = also the cache key (all five parameters).
    fname = (f"movetsa_w{ws}_ov{round(ov * 100)}_{normalize}"
             f"_b{int(include_baselines)}_f{int(include_familiarization)}.parquet")
    cached = os.path.join(CACHE_DIR, fname)
    if os.path.exists(cached):
        return cached, f"Served from cache — `{fname}`"

    df = build_windows(
        RAW_DIR,
        window_size=ws,
        window_overlap=ov,
        normalize=(None if normalize == "none" else normalize),
        include_baselines=include_baselines,
        include_familiarization=include_familiarization,
        verbose=False,
    )

    # Write to a private temp file in the same directory, then rename it into
    # place: a concurrent request must never find a half-written parquet
    # under the cache name.
    fd, tmp_path = tempfile.mkstemp(dir=CACHE_DIR, suffix=".parquet.tmp")
    os.close(fd)
    try:
        df.to_parquet(tmp_path, index=False)
        os.replace(tmp_path, cached)
    finally:
        # Only still present when the write or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    msg = (f"**{len(df)} windows × {df.shape[1]} columns** "
           f"({df['subject'].nunique()} subjects) — `{fname}`")
    return cached, msg
# ---------------------------------------------------------------------- UI
# Layout: intro text and login button on top, then two columns — parameters
# plus the trigger button on the left, status line and download on the right.
with gr.Blocks(title="MoveTSA dataset builder") as demo:
    gr.Markdown(
        value=(
            "# MoveTSA dataset builder\n"
            "Generate a **parametrised HRV / BSI / RSP / simulator** parquet from the "
            "(private) raw recordings. You download **only** the generated parquet — "
            "never the raw data.\n\n"
            "1. Sign in with Hugging Face. 2. Set the parameters. 3. Generate."
        )
    )
    gr.LoginButton()

    with gr.Row():
        with gr.Column():
            window_size = gr.Slider(
                minimum=15, maximum=180, value=60, step=5,
                label="window_size (s)",
            )
            overlap = gr.Slider(
                minimum=0.0, maximum=0.9, value=0.5, step=0.05,
                label="overlap",
            )
            normalize = gr.Dropdown(
                choices=["zscore", "center", "none"],
                value="zscore",
                label="normalize",
            )
            include_baselines = gr.Checkbox(
                value=True, label="include baselines (B1–B4)")
            include_familiarization = gr.Checkbox(
                value=True, label="include familiarization (F)")
            # Hidden field that exists only for the API: it is the 6th input,
            # where headless callers put their HF token. Browser users never
            # see it and authenticate with the LoginButton instead.
            hf_token = gr.Textbox(value="", visible=False, label="hf_token")
            btn = gr.Button(value="Generate parquet", variant="primary")
        with gr.Column():
            status = gr.Markdown()
            out_file = gr.File(label="generated parquet")

    # The input order below is the public API contract of `/generate`.
    btn.click(
        fn=generate,
        inputs=[
            window_size,
            overlap,
            normalize,
            include_baselines,
            include_familiarization,
            hf_token,
        ],
        outputs=[out_file, status],
        api_name="generate",  # => client.predict(..., api_name="/generate")
    )


if __name__ == "__main__":
    demo.launch()