movetsa-builder / app.py
thbndi's picture
Update app.py
451c004 verified
Raw
History Blame Contribute Delete
8.13 kB
"""MoveTSA dataset builder — Hugging Face Space (gated via HF login + allowlist).
Generates a **parametrised** windows parquet (HRV / BSI / RSP / simulator
aggregates / DATEX / subjective labels) from the PRIVATE raw recordings,
entirely server-side. Authorised users only ever download the generated
parquet — they never get access to the raw ECG/SIMU files.
Backing repos (private, read with the ``HF_TOKEN`` Space secret):
- ``MoveTSA/movetsa-raw`` : raw recordings (stay inside the Space)
- ``MoveTSA/MoveTSA`` : pipeline code (downloaded at startup, importable
as the ``MoveTSA`` package)
Access control: the Space is public, but generation requires signing in with
Hugging Face and being on the allowlist (the ``ALLOWLIST`` Space variable, a
comma-separated list of usernames). Access is requested from ``@thbndi``.
Programmatic access (gradio_client / the MovetsaDataset loading script):
- named API endpoint ``/generate`` with 6 inputs, in this order:
window_size, overlap, normalize, include_baselines,
include_familiarization, hf_token
- identity: in the browser via ``gr.OAuthProfile`` (the LoginButton); over
the API via the explicit ``hf_token`` argument, resolved with ``whoami``.
``gr.OAuthToken`` is NOT populated over the gradio_client API — it only
works through the browser OAuth login — so headless callers must pass their
token as the 6th argument.
"""
import os
import sys
import tempfile
import gradio as gr
from huggingface_hub import snapshot_download, whoami
# ------------------------------------------------------------ configuration
# Everything here is read from the environment once, at import time.
TOKEN = os.getenv("HF_TOKEN")
CODE_REPO = os.getenv("CODE_REPO", "MoveTSA/MoveTSA")
RAW_REPO = os.getenv("RAW_REPO", "MoveTSA/movetsa-raw")
OWNER = os.getenv("OWNER_HANDLE", "thbndi")

# HF usernames allowed to generate, taken from the comma-separated ALLOWLIST
# Space *variable* (Settings → Variables and secrets) — editing it requires no
# code change. Entries are trimmed and lower-cased, blank entries are dropped,
# and when the variable is unset the allowlist contains only OWNER.
ALLOWLIST = {
    handle.lower()
    for handle in map(str.strip, os.getenv("ALLOWLIST", OWNER).split(","))
    if handle
}

# Optional server-side cache: a request whose parameters match an earlier one
# reuses the stored parquet instead of re-running the (heavy) pipeline. The
# cache key covers ALL five generation parameters.
CACHE_DIR = os.getenv(
    "MOVETSA_CACHE",
    os.path.join(tempfile.gettempdir(), "movetsa_out"),
)
os.makedirs(CACHE_DIR, exist_ok=True)
# ----------------------------------------------------------------- startup
# These statements run once, at import time, and their order matters: the
# pipeline code must be on disk and the cwd on sys.path BEFORE the
# `from MoveTSA...` import at the end of this section.
#
# 1) Pipeline code → importable as the `MoveTSA` package (cwd on sys.path).
#    Only *.py / *.yaml files are fetched, into the relative directory
#    "MoveTSA"; putting os.getcwd() first on sys.path is what makes that
#    directory importable (this assumes the cwd does not change in between).
snapshot_download(CODE_REPO, repo_type="dataset", token=TOKEN,
                  local_dir="MoveTSA", allow_patterns=["*.py", "*.yaml"])
sys.path.insert(0, os.getcwd())
# 2) Raw recordings — private, kept inside the Space, never served to users.
#    Restricted to the `S*/**` entries plus subjective_scores.csv. RAW_DIR is
#    the path returned by snapshot_download (no local_dir is given, so
#    presumably the hub cache location — confirm) and is later handed to
#    build_windows.
RAW_DIR = snapshot_download(RAW_REPO, repo_type="dataset", token=TOKEN,
                            allow_patterns=["S*/**", "subjective_scores.csv"])
# Deliberately late import: the package only exists after step 1.
from MoveTSA.export_hf_dataset import build_windows  # noqa: E402 (needs sys.path)
def _resolve_username(profile, oauth_token, hf_token):
"""Identity from the browser OAuth profile, or from an explicit HF token.
In the browser, Gradio injects ``profile`` after the OAuth login. Headless
callers (gradio_client / the MovetsaDataset loading script) cannot go
through the OAuth flow, so they pass their token explicitly and we resolve
the username with ``whoami``. ``oauth_token`` is kept as a fallback but is
``None`` over the API.
"""
if profile is not None:
return profile.username
token = None
if hf_token:
token = hf_token
elif oauth_token is not None and getattr(oauth_token, "token", None):
token = oauth_token.token
if token:
try:
return whoami(token=token).get("name")
except Exception: # noqa: BLE001
return None
return None
def _overlap_tag(ov_pct):
    """Return the filename tag for an overlap given in percent.

    Whole percents keep the historical form (``50.0`` -> ``"50"``); fractional
    percents stay distinct and filename-safe (``55.5`` -> ``"55p5"``).
    """
    return f"{ov_pct:g}".replace(".", "p")


def _write_parquet_atomically(df, dest):
    """Write ``df`` to ``dest`` so that readers never see a partial file.

    The frame is written to a temporary file in the same directory and then
    moved into place with ``os.replace``. If the write fails, the temporary
    file is removed and ``dest`` is left untouched.
    """
    fd, tmp_path = tempfile.mkstemp(
        dir=os.path.dirname(dest) or ".",
        prefix=os.path.basename(dest) + ".",
        suffix=".tmp",
    )
    os.close(fd)  # to_parquet reopens the file by path
    try:
        df.to_parquet(tmp_path, index=False)
        os.replace(tmp_path, dest)
    finally:
        # Only still present when the write or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def generate(window_size, overlap, normalize, include_baselines,
             include_familiarization, hf_token="",
             profile: gr.OAuthProfile | None = None,
             oauth_token: gr.OAuthToken | None = None,
             progress=gr.Progress(track_tqdm=True)):
    """Run the pipeline with the chosen parameters and return a parquet file.

    ``hf_token`` is the 6th API input: headless callers pass their HF token so
    the Space can identify them (the browser leaves it empty and authenticates
    via the LoginButton). ``profile``/``oauth_token`` are injected by Gradio
    and are NOT part of the API signature.

    Returns a ``(path, message)`` pair for the File and Markdown outputs.
    ``path`` is ``None`` whenever the request is refused (caller not
    identified, not on the allowlist, or invalid parameters); ``message``
    then explains why.

    Accepted parameters: ``window_size`` is converted with ``int()`` and must
    lie in 15..180; ``overlap`` must lie in 0..0.9 and is rounded to 1e-4 of a
    percent so the cache key and the value given to the pipeline always
    agree; ``normalize`` is one of ``"zscore"``, ``"center"``, ``"none"``.
    """
    username = _resolve_username(profile, oauth_token, hf_token)
    if username is None:
        return None, "Sign in with your Hugging Face account to generate."
    if username.lower() not in ALLOWLIST:
        return None, (
            f"Access not granted for **@{username}**.\n\n"
            f"Request access from **@{OWNER}** (to be added to the allowlist)."
        )

    # --- validate parameters (API callers are not bound by the sliders) ---
    try:
        ws = int(window_size)
        ov = float(overlap)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")) — reachable through the API.
        return None, "Invalid parameters."
    if not (15 <= ws <= 180):
        return None, "window_size must be between 15 and 180 s."
    if not (0.0 <= ov <= 0.9):  # also rejects NaN
        return None, "overlap must be between 0 and 0.9."
    if normalize not in ("zscore", "center", "none"):
        return None, "normalize must be 'zscore', 'center' or 'none'."
    include_baselines = bool(include_baselines)
    include_familiarization = bool(include_familiarization)

    # Express the overlap in percent, ROUNDED rather than truncated:
    # int(0.29 * 100) is 28, which made 0.29 share a cache entry with 0.28,
    # and every sub-percent value collapsed onto its whole percent. abs()
    # folds -0.0 into 0.0 so it does not get a key of its own.
    ov_pct = abs(round(ov * 100, 4))
    ov = ov_pct / 100  # the pipeline gets exactly what the key encodes

    # Stable filename = also the cache key (all five parameters).
    fname = (f"movetsa_w{ws}_ov{_overlap_tag(ov_pct)}_{normalize}"
             f"_b{int(include_baselines)}_f{int(include_familiarization)}.parquet")
    cached = os.path.join(CACHE_DIR, fname)
    if os.path.exists(cached):
        return cached, f"Served from cache — `{fname}`"

    df = build_windows(
        RAW_DIR,
        window_size=ws,
        window_overlap=ov,
        normalize=(None if normalize == "none" else normalize),
        include_baselines=include_baselines,
        include_familiarization=include_familiarization,
        verbose=False,
    )
    # Publish into the cache atomically: a concurrent request must never be
    # handed a half-written parquet, and a failed write must not leave a
    # corrupt file behind to be served "from cache" forever.
    _write_parquet_atomically(df, cached)
    msg = (f"**{len(df)} windows × {df.shape[1]} columns** "
           f"({df['subject'].nunique()} subjects) — `{fname}`")
    return cached, msg
# ---------------------------------------------------------------------- UI
# Components are created in the same order as before: left column holds the
# five parameters, the hidden token field and the button; right column holds
# the status text and the downloadable file.
with gr.Blocks(title="MoveTSA dataset builder") as demo:
    gr.Markdown(
        value=(
            "# MoveTSA dataset builder\n"
            "Generate a **parametrised HRV / BSI / RSP / simulator** parquet from the "
            "(private) raw recordings. You download **only** the generated parquet — "
            "never the raw data.\n\n"
            "1. Sign in with Hugging Face. 2. Set the parameters. 3. Generate."
        )
    )
    gr.LoginButton()

    with gr.Row():
        # Left column: generation parameters.
        with gr.Column():
            window_size = gr.Slider(
                minimum=15,
                maximum=180,
                value=60,
                step=5,
                label="window_size (s)",
            )
            overlap = gr.Slider(
                minimum=0.0,
                maximum=0.9,
                value=0.5,
                step=0.05,
                label="overlap",
            )
            normalize = gr.Dropdown(
                choices=["zscore", "center", "none"],
                value="zscore",
                label="normalize",
            )
            include_baselines = gr.Checkbox(
                value=True,
                label="include baselines (B1–B4)",
            )
            include_familiarization = gr.Checkbox(
                value=True,
                label="include familiarization (F)",
            )
            # Hidden 6th API input: headless callers send their HF token
            # through it, while the browser leaves it empty and relies on the
            # LoginButton for authentication.
            hf_token = gr.Textbox(value="", visible=False, label="hf_token")
            btn = gr.Button(value="Generate parquet", variant="primary")

        # Right column: outcome of the generation.
        with gr.Column():
            status = gr.Markdown()
            out_file = gr.File(label="generated parquet")

    # The order of `inputs` is the documented API argument order.
    btn.click(
        fn=generate,
        inputs=[
            window_size,
            overlap,
            normalize,
            include_baselines,
            include_familiarization,
            hf_token,
        ],
        outputs=[out_file, status],
        api_name="generate",  # => client.predict(..., api_name="/generate")
    )

if __name__ == "__main__":
    demo.launch()