File size: 8,134 Bytes
aa3ada1
 
 
 
 
 
 
 
dfdc0a5
 
aa3ada1
 
 
 
dfdc0a5
f32bb9c
 
3ec3248
 
 
 
 
 
 
 
aa3ada1
 
 
 
 
 
 
f32bb9c
aa3ada1
 
dfdc0a5
 
aa3ada1
 
 
 
 
 
 
 
 
 
f32bb9c
 
 
 
 
 
aa3ada1
 
 
 
 
 
 
 
 
 
 
 
 
3ec3248
 
 
 
 
 
 
 
 
f32bb9c
 
3ec3248
 
 
 
 
 
f32bb9c
3ec3248
f32bb9c
 
 
 
 
aa3ada1
3ec3248
f32bb9c
451c004
 
f32bb9c
 
3ec3248
 
 
 
f32bb9c
3ec3248
f32bb9c
a3e15f3
f32bb9c
aa3ada1
f32bb9c
b4f8c8c
aa3ada1
 
f32bb9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa3ada1
 
f32bb9c
 
aa3ada1
f32bb9c
 
aa3ada1
 
 
f32bb9c
a3e15f3
b4f8c8c
f32bb9c
aa3ada1
 
 
 
b4f8c8c
 
 
 
 
aa3ada1
 
 
 
 
 
 
 
 
 
b4f8c8c
aa3ada1
b4f8c8c
3ec3248
 
 
b4f8c8c
aa3ada1
 
b4f8c8c
aa3ada1
 
 
 
3ec3248
aa3ada1
f32bb9c
aa3ada1
 
 
dfdc0a5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
"""MoveTSA dataset builder — Hugging Face Space (gated via HF login + allowlist).

Generates a **parametrised** windows parquet (HRV / BSI / RSP / simulator
aggregates / DATEX / subjective labels) from the PRIVATE raw recordings,
entirely server-side. Authorised users only ever download the generated
parquet — they never get access to the raw ECG/SIMU files.

Backing repos (private, read with the ``HF_TOKEN`` Space secret):
  - ``MoveTSA/movetsa-raw`` : raw recordings (stay inside the Space)
  - ``MoveTSA/MoveTSA``     : pipeline code (downloaded at startup, importable
                             as the ``MoveTSA`` package)

Access control: the Space is public, but generation requires signing in with
Hugging Face and being on the allowlist (the ``ALLOWLIST`` Space variable, a
comma-separated list of usernames). Access is requested from ``@thbndi``.

Programmatic access (gradio_client / the MovetsaDataset loading script):
  - named API endpoint ``/generate`` with 6 inputs, in this order:
        window_size, overlap, normalize, include_baselines,
        include_familiarization, hf_token
  - identity: in the browser via ``gr.OAuthProfile`` (the LoginButton); over
    the API via the explicit ``hf_token`` argument, resolved with ``whoami``.
    ``gr.OAuthToken`` is NOT populated over the gradio_client API — it only
    works through the browser OAuth login — so headless callers must pass their
    token as the 6th argument.
"""

import os
import sys
import tempfile

import gradio as gr
from huggingface_hub import snapshot_download, whoami

# Deployment settings. Each one is read from the environment (Space secrets /
# variables) and falls back to the project default when it is not set.
TOKEN = os.getenv("HF_TOKEN")
CODE_REPO = os.getenv("CODE_REPO", "MoveTSA/MoveTSA")
RAW_REPO = os.getenv("RAW_REPO", "MoveTSA/movetsa-raw")
OWNER = os.getenv("OWNER_HANDLE", "thbndi")

# HF usernames allowed to generate, parsed from the comma-separated ALLOWLIST
# Space *variable* (Settings -> Variables and secrets), so the list can be
# edited without a code change. Entries are trimmed and lower-cased; blank
# entries are dropped. When the variable is unset only OWNER is allowed.
ALLOWLIST = set(
    filter(
        None,
        (entry.strip().lower()
         for entry in os.getenv("ALLOWLIST", OWNER).split(",")),
    )
)

# Server-side cache directory: a request whose parameters match an earlier
# one reuses the parquet stored here instead of re-running the (heavy)
# pipeline. Defaults to "movetsa_out" under the system temp directory.
CACHE_DIR = os.getenv(
    "MOVETSA_CACHE",
    os.path.join(tempfile.gettempdir(), "movetsa_out"),
)
os.makedirs(CACHE_DIR, exist_ok=True)

# ----------------------------------------------------------------- startup
# Everything below runs once, at import time, and the order matters: the
# pipeline code has to be on disk and reachable through sys.path before the
# `MoveTSA` import at the end of this section can succeed.
#
# 1) Pipeline code → importable as the `MoveTSA` package (cwd on sys.path).
#    Only *.py and *.yaml files are fetched, into the local "MoveTSA" folder.
snapshot_download(CODE_REPO, repo_type="dataset", token=TOKEN,
                  local_dir="MoveTSA", allow_patterns=["*.py", "*.yaml"])
sys.path.insert(0, os.getcwd())

# 2) Raw recordings — private, kept inside the Space, never served to users.
#    RAW_DIR holds the value returned by snapshot_download and is what
#    `generate` hands to the pipeline. Only paths matching "S*/**"
#    (presumably one folder per subject — confirm against the raw repo) and
#    subjective_scores.csv are fetched.
RAW_DIR = snapshot_download(RAW_REPO, repo_type="dataset", token=TOKEN,
                            allow_patterns=["S*/**", "subjective_scores.csv"])

# Deliberately late import: the package only exists after step 1.
from MoveTSA.export_hf_dataset import build_windows  # noqa: E402  (needs sys.path)


def _resolve_username(profile, oauth_token, hf_token):
    """Work out who is calling, or return ``None`` when that is not possible.

    Sources are tried in this order:

    1. ``profile`` - the OAuth profile Gradio injects after a browser login;
       its ``username`` is returned directly.
    2. ``hf_token`` - the explicit token passed by headless callers
       (gradio_client / the MovetsaDataset loading script), which cannot go
       through the OAuth flow.
    3. ``oauth_token.token`` - kept as a fallback; it is ``None`` over the API.

    A token from (2) or (3) is turned into a username with ``whoami``. Any
    failure of that lookup is treated as "not identified" and yields ``None``.
    """
    # Browser session: the identity is already known.
    if profile is not None:
        return profile.username

    # Headless call: the explicit token wins, the injected OAuth token is the
    # fallback. getattr() also covers ``oauth_token`` being None.
    candidate = hf_token or getattr(oauth_token, "token", None)
    if not candidate:
        return None

    try:
        account = whoami(token=candidate)
        return account.get("name")
    except Exception:  # noqa: BLE001
        # Best effort on purpose: an invalid or expired token, or a network
        # error, simply means the caller is not authenticated.
        return None


def generate(window_size, overlap, normalize, include_baselines,
             include_familiarization, hf_token="",
             profile: gr.OAuthProfile | None = None,
             oauth_token: gr.OAuthToken | None = None,
             progress=gr.Progress(track_tqdm=True)):
    """Run the pipeline with the chosen parameters and return a parquet file.

    Args:
        window_size: Window length in seconds; coerced with ``int`` and
            required to be within 15..180.
        overlap: Window overlap as a fraction within 0..0.9. It is quantised
            to a whole percent, which is the resolution of the cache key, so
            one file name always maps to exactly one overlap value.
        normalize: ``"zscore"``, ``"center"`` or ``"none"`` (``"none"`` is
            passed to the pipeline as ``None``).
        include_baselines: Truthy to include baselines.
        include_familiarization: Truthy to include familiarization.
        hf_token: 6th API input. Headless callers pass their HF token so the
            Space can identify them; the browser leaves it empty and
            authenticates via the LoginButton.
        profile: Injected by Gradio; NOT part of the API signature.
        oauth_token: Injected by Gradio; NOT part of the API signature.
        progress: Gradio progress tracker (tqdm tracking enabled); not used
            directly in the body.

    Returns:
        A ``(path, message)`` pair. ``path`` is the generated (or cached)
        parquet, or ``None`` when the caller is not identified, not on the
        allowlist, or passed invalid parameters; ``message`` is Markdown for
        the status panel.
    """
    # --- access control ---
    username = _resolve_username(profile, oauth_token, hf_token)
    if username is None:
        return None, "Sign in with your Hugging Face account to generate."
    if username.lower() not in ALLOWLIST:
        return None, (
            f"Access not granted for **@{username}**.\n\n"
            f"Request access from **@{OWNER}** (to be added to the allowlist)."
        )

    # --- validate parameters (API callers are not bound by the sliders) ---
    try:
        ws = int(window_size)
        ov = float(overlap)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int() of an infinite float.
        return None, "Invalid parameters."
    if not (15 <= ws <= 180):
        return None, "window_size must be between 15 and 180 s."
    if not (0.0 <= ov <= 0.9):
        # NaN fails both comparisons, so it is rejected here as well.
        return None, "overlap must be between 0 and 0.9."
    if normalize not in ("zscore", "center", "none"):
        return None, "normalize must be 'zscore', 'center' or 'none'."
    include_baselines = bool(include_baselines)
    include_familiarization = bool(include_familiarization)

    # Quantise the overlap to the cache-key resolution and use that SAME value
    # for the pipeline. Truncating with int() (0.29 * 100 -> 28) let different
    # overlaps share one file name, so a caller could be served a parquet that
    # was built with another overlap.
    ov_pct = round(ov * 100)
    ov = ov_pct / 100

    # Stable filename = also the cache key (includes all five parameters).
    fname = (f"movetsa_w{ws}_ov{ov_pct}_{normalize}"
             f"_b{int(include_baselines)}_f{int(include_familiarization)}.parquet")
    cached = os.path.join(CACHE_DIR, fname)
    if os.path.exists(cached):
        return cached, f"Served from cache — `{fname}`"

    df = build_windows(
        RAW_DIR,
        window_size=ws,
        window_overlap=ov,
        normalize=(None if normalize == "none" else normalize),
        include_baselines=include_baselines,
        include_familiarization=include_familiarization,
        verbose=False,
    )

    # Write to a temporary file in the cache directory and rename it into
    # place, so a failed or concurrent write never leaves a partial parquet
    # under the final name (which the exists() check above would then serve).
    fd, tmp_path = tempfile.mkstemp(dir=CACHE_DIR, suffix=".parquet.tmp")
    os.close(fd)
    try:
        df.to_parquet(tmp_path, index=False)
        os.replace(tmp_path, cached)
    except BaseException:
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

    msg = (f"**{len(df)} windows × {df.shape[1]} columns** "
           f"({df['subject'].nunique()} subjects) — `{fname}`")
    return cached, msg


# Browser UI. Components are created inside nested layout contexts (a row
# holding two columns): parameters on one side, status and download on the
# other. The user-facing strings use real dash characters instead of the
# mis-encoded sequences.
with gr.Blocks(title="MoveTSA dataset builder") as demo:
    gr.Markdown(
        "# MoveTSA dataset builder\n"
        "Generate a **parametrised HRV / BSI / RSP / simulator** parquet from the "
        "(private) raw recordings. You download **only** the generated parquet — "
        "never the raw data.\n\n"
        "1. Sign in with Hugging Face. 2. Set the parameters. 3. Generate."
    )
    gr.LoginButton()
    with gr.Row():
        with gr.Column():
            # Slider / dropdown bounds mirror the validation done in
            # `generate`, which still re-checks them for API callers.
            window_size = gr.Slider(15, 180, value=60, step=5,
                                    label="window_size (s)")
            overlap = gr.Slider(0.0, 0.9, value=0.5, step=0.05, label="overlap")
            normalize = gr.Dropdown(["zscore", "center", "none"],
                                    value="zscore", label="normalize")
            include_baselines = gr.Checkbox(
                value=True, label="include baselines (B1–B4)")
            include_familiarization = gr.Checkbox(
                value=True, label="include familiarization (F)")
            # 6th API input: headless callers pass their HF token here; the
            # browser leaves it empty and authenticates via the LoginButton.
            hf_token = gr.Textbox(value="", visible=False, label="hf_token")
            btn = gr.Button("Generate parquet", variant="primary")
        with gr.Column():
            status = gr.Markdown()
            out_file = gr.File(label="generated parquet")

    # The input order below IS the public API contract of `/generate`.
    btn.click(
        generate,
        inputs=[window_size, overlap, normalize, include_baselines,
                include_familiarization, hf_token],
        outputs=[out_file, status],
        api_name="generate",   # => client.predict(..., api_name="/generate")
    )

if __name__ == "__main__":
    demo.launch()