Andrew committed on
Commit
8bdd018
·
1 Parent(s): 72f8b14

Consolidate AF3/Qwen pipelines, endpoint templates, and setup docs

Browse files
Files changed (45) hide show
  1. .env.example +8 -0
  2. .gitignore +7 -0
  3. README.md +262 -2
  4. af3_chatgpt_pipeline.py +584 -0
  5. af3_gui_app.py +17 -0
  6. docs/deploy/AF3_ENDPOINT.md +79 -0
  7. docs/deploy/AF3_NVIDIA_ENDPOINT.md +64 -0
  8. docs/deploy/QWEN_SPACE.md +26 -0
  9. docs/deploy/SPACE.md +1 -1
  10. docs/guides/README.md +1 -0
  11. docs/guides/af3-chatgpt-pipeline.md +155 -0
  12. docs/guides/qwen2-audio-train.md +171 -0
  13. qwen_audio_captioning.py +996 -0
  14. qwen_caption_app.py +506 -0
  15. react-ui/index.html +12 -0
  16. react-ui/package-lock.json +1674 -0
  17. react-ui/package.json +19 -0
  18. react-ui/src/App.jsx +223 -0
  19. react-ui/src/main.jsx +11 -0
  20. react-ui/src/styles.css +189 -0
  21. react-ui/vite.config.js +15 -0
  22. requirements.txt +6 -1
  23. scripts/annotations/qwen_annotate_file.py +122 -0
  24. scripts/annotations/qwen_caption_dataset.py +203 -0
  25. scripts/dev/run_af3_gui.ps1 +21 -0
  26. scripts/dev/run_af3_gui.py +76 -0
  27. scripts/endpoint/test_af3_caption_endpoint.py +155 -0
  28. scripts/endpoint/test_qwen_caption_endpoint.py +132 -0
  29. scripts/hf_clone.py +185 -1
  30. scripts/jobs/submit_hf_qwen_caption_job.ps1 +133 -0
  31. scripts/pipeline/refine_dataset_json_with_openai.py +291 -0
  32. scripts/pipeline/run_af3_chatgpt_pipeline.py +158 -0
  33. services/pipeline_api.py +242 -0
  34. summaries/findings.md +160 -80
  35. templates/hf-af3-caption-endpoint/README.md +58 -0
  36. templates/hf-af3-caption-endpoint/handler.py +305 -0
  37. templates/hf-af3-caption-endpoint/requirements.txt +2 -0
  38. templates/hf-af3-nvidia-endpoint/README.md +54 -0
  39. templates/hf-af3-nvidia-endpoint/handler.py +204 -0
  40. templates/hf-af3-nvidia-endpoint/requirements.txt +23 -0
  41. templates/hf-qwen-caption-endpoint/README.md +62 -0
  42. templates/hf-qwen-caption-endpoint/handler.py +112 -0
  43. templates/hf-qwen-caption-endpoint/requirements.txt +6 -0
  44. utils/__init__.py +1 -0
  45. utils/env_config.py +52 -0
.env.example CHANGED
@@ -1,5 +1,13 @@
 
1
  HF_TOKEN=hf_xxx_your_token_here
2
  HF_ENDPOINT_URL=https://your-endpoint-url.endpoints.huggingface.cloud
 
 
 
 
 
 
 
3
 
4
  # Optional defaults used by scripts/hf_clone.py
5
  HF_USERNAME=your-hf-username
 
1
+ # Copy this file to `.env` locally. Do not commit real secrets.
2
  HF_TOKEN=hf_xxx_your_token_here
3
  HF_ENDPOINT_URL=https://your-endpoint-url.endpoints.huggingface.cloud
4
+ HF_QWEN_ENDPOINT_URL=https://your-qwen-endpoint-url.endpoints.huggingface.cloud
5
+ HF_AF3_ENDPOINT_URL=https://your-af3-endpoint-url.endpoints.huggingface.cloud
6
+ QWEN_MODEL_ID=Qwen/Qwen2-Audio-7B-Instruct
7
+ AF3_MODEL_ID=nvidia/audio-flamingo-3-hf
8
+ AF3_NV_DEFAULT_MODE=think
9
+ OPENAI_API_KEY=sk-proj-xxx
10
+ OPENAI_MODEL=gpt-5-mini
11
 
12
  # Optional defaults used by scripts/hf_clone.py
13
  HF_USERNAME=your-hf-username
.gitignore CHANGED
@@ -17,6 +17,9 @@ htmlcov/
17
  build/
18
  dist/
19
  *.egg-info/
 
 
 
20
 
21
  # Virtual environments
22
  .venv/
@@ -27,6 +30,7 @@ env/
27
  .cache/
28
  .huggingface/
29
  .gradio/
 
30
 
31
  # Logs/temp
32
  *.log
@@ -37,6 +41,7 @@ env/
37
  # Model/data/runtime artifacts
38
  checkpoints/
39
  lora_output/
 
40
  outputs/
41
  artifacts/
42
  models/
@@ -64,3 +69,5 @@ Thumbs.db
64
  # Optional local working copies
65
  Lora-ace-step/
66
  song_summaries_llm*.md
 
 
 
17
  build/
18
  dist/
19
  *.egg-info/
20
+ node_modules/
21
+ react-ui/node_modules/
22
+ react-ui/dist/
23
 
24
  # Virtual environments
25
  .venv/
 
30
  .cache/
31
  .huggingface/
32
  .gradio/
33
+ .tmp_tf*/
34
 
35
  # Logs/temp
36
  *.log
 
41
  # Model/data/runtime artifacts
42
  checkpoints/
43
  lora_output/
44
+ qwen_annotations/
45
  outputs/
46
  artifacts/
47
  models/
 
69
  # Optional local working copies
70
  Lora-ace-step/
71
  song_summaries_llm*.md
72
+
73
+ train-dataset/*
README.md CHANGED
@@ -21,6 +21,9 @@ Train ACE-Step 1.5 LoRA adapters, deploy your own Hugging Face Space, and run pr
21
 
22
  - LoRA training UI and workflow: `app.py`, `lora_ui.py`
23
  - CLI LoRA trainer for local/HF datasets: `lora_train.py`
 
 
 
24
  - Custom endpoint runtime: `handler.py`, `acestep/`
25
  - Bootstrap automation for cloning into your HF account: `scripts/hf_clone.py`
26
  - Endpoint test clients and HF job launcher: `scripts/endpoint/`, `scripts/jobs/`
@@ -35,6 +38,103 @@ python app.py
35
 
36
  Open `http://localhost:7860`.
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  ## Clone to your HF account
39
 
40
  Use the two buttons near the top of this README to create target repos in your HF account, then run:
@@ -61,6 +161,28 @@ Clone your own Endpoint repo:
61
  python scripts/hf_clone.py endpoint --repo-id YOUR_USERNAME/YOUR_ENDPOINT_REPO
62
  ```
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  Clone both in one run:
65
 
66
  ```bash
@@ -76,10 +198,21 @@ python scripts/hf_clone.py all \
76
  |- app.py
77
  |- lora_ui.py
78
  |- lora_train.py
 
 
 
 
79
  |- handler.py
80
  |- acestep/
81
  |- scripts/
82
  | |- hf_clone.py
 
 
 
 
 
 
 
83
  | |- endpoint/
84
  | | |- generate_interactive.py
85
  | | |- test.ps1
@@ -88,6 +221,12 @@ python scripts/hf_clone.py all \
88
  | | `- test_rnb_2min.bat
89
  | `- jobs/
90
  | `- submit_hf_lora_job.ps1
 
 
 
 
 
 
91
  |- docs/
92
  | |- deploy/
93
  | `- guides/
@@ -119,6 +258,102 @@ Optional sidecar metadata per track:
119
  }
120
  ```
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  ## Endpoint testing
123
 
124
  ```bash
@@ -139,12 +374,37 @@ Current baseline analysis and improvement ideas are tracked in:
139
  ## Docs
140
 
141
  - Space deployment: `docs/deploy/SPACE.md`
 
142
  - Endpoint deployment: `docs/deploy/ENDPOINT.md`
143
- - Additional guides: `docs/guides/qwen2-audio-train.md`
 
 
144
 
145
  ## Open-source readiness checklist
146
 
147
- - Secrets are env-driven (`HF_TOKEN`, `HF_ENDPOINT_URL`, `.env`).
148
  - Local artifacts are ignored via `.gitignore`.
149
  - MIT license included.
150
  - Reproducible clone/deploy paths documented.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  - LoRA training UI and workflow: `app.py`, `lora_ui.py`
23
  - CLI LoRA trainer for local/HF datasets: `lora_train.py`
24
+ - Qwen2-Audio captioning/annotation pipeline: `qwen_caption_app.py`, `qwen_audio_captioning.py`, `scripts/annotations/`
25
+ - Audio Flamingo 3 + ChatGPT cleanup pipeline: `af3_chatgpt_pipeline.py`, `scripts/pipeline/`, `services/pipeline_api.py`
26
+ - React orchestration UI for AF3+ChatGPT: `react-ui/`
27
  - Custom endpoint runtime: `handler.py`, `acestep/`
28
  - Bootstrap automation for cloning into your HF account: `scripts/hf_clone.py`
29
  - Endpoint test clients and HF job launcher: `scripts/endpoint/`, `scripts/jobs/`
 
38
 
39
  Open `http://localhost:7860`.
40
 
41
+ ## End-to-end setup (recommended)
42
+
43
+ Use this sequence when setting up from scratch.
44
+
45
+ 1. Install dependencies
46
+
47
+ ```bash
48
+ python -m pip install --upgrade pip
49
+ python -m pip install -r requirements.txt
50
+ ```
51
+
52
+ 2. Create local `.env` from `.env.example` and fill secrets
53
+
54
+ ```env
55
+ HF_TOKEN=hf_xxx
56
+ HF_AF3_ENDPOINT_URL=https://YOUR_AF3_ENDPOINT.endpoints.huggingface.cloud
57
+ OPENAI_API_KEY=sk-...
58
+ OPENAI_MODEL=gpt-5-mini
59
+ AF3_MODEL_ID=nvidia/audio-flamingo-3-hf
60
+ ```
61
+
62
+ 3. Bootstrap your Hugging Face repos (Space + endpoint templates)
63
+
64
+ ```bash
65
+ python scripts/hf_clone.py space --repo-id YOUR_USERNAME/YOUR_SPACE_NAME
66
+ python scripts/hf_clone.py af3-nvidia-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_NVIDIA_ENDPOINT_REPO
67
+ ```
68
+
69
+ 4. Deploy endpoint from the cloned AF3 NVIDIA endpoint repo
70
+
71
+ - Set endpoint task to `custom`.
72
+ - Confirm top-level `handler.py` exists in the endpoint repo.
73
+ - Set endpoint env vars if needed (`HF_TOKEN`, `AF3_NV_DEFAULT_MODE=think`).
74
+
75
+ 5. Generate analysis sidecars from audio
76
+
77
+ ```bash
78
+ python scripts/pipeline/run_af3_chatgpt_pipeline.py \
79
+ --dataset-dir ./train-dataset \
80
+ --backend hf_endpoint \
81
+ --endpoint-url "$HF_AF3_ENDPOINT_URL" \
82
+ --openai-api-key "$OPENAI_API_KEY"
83
+ ```
84
+
85
+ 6. Normalize existing JSONs into LoRA-ready shape (optional but recommended)
86
+
87
+ ```bash
88
+ python scripts/pipeline/refine_dataset_json_with_openai.py \
89
+ --dataset-dir ./train-dataset \
90
+ --enable-web-search
91
+ ```
92
+
93
+ This script keeps core fields needed by ACE-Step LoRA training and preserves rich analysis context in `source.rich_details`.
94
+
95
+ 7. Train LoRA
96
+
97
+ ```bash
98
+ python app.py
99
+ ```
100
+
101
+ Then in UI:
102
+ - Load model.
103
+ - Scan/upload dataset.
104
+ - Start LoRA training.
105
+
106
+ 8. Test generation with your new adapter
107
+
108
+ - Use the endpoint scripts in `scripts/endpoint/`.
109
+ - Or test through the Gradio UI flow.
110
+
111
+ ## AF3 GUI one-command startup
112
+
113
+ 1. Configure `.env` (never commit this file):
114
+
115
+ ```env
116
+ HF_TOKEN=hf_xxx
117
+ HF_AF3_ENDPOINT_URL=https://YOUR_AF3_ENDPOINT.endpoints.huggingface.cloud
118
+ OPENAI_API_KEY=sk-...
119
+ OPENAI_MODEL=gpt-5-mini
120
+ AF3_MODEL_ID=nvidia/audio-flamingo-3-hf
121
+ ```
122
+
123
+ 2. Launch API + GUI together:
124
+
125
+ ```bash
126
+ python af3_gui_app.py
127
+ ```
128
+
129
+ PowerShell alternative:
130
+
131
+ ```powershell
132
+ .\scripts\dev\run_af3_gui.ps1
133
+ ```
134
+
135
+ This command builds the React UI and serves it from the FastAPI backend.
136
+ Open `http://127.0.0.1:8008`.
137
+
138
  ## Clone to your HF account
139
 
140
  Use the two buttons near the top of this README to create target repos in your HF account, then run:
 
161
  python scripts/hf_clone.py endpoint --repo-id YOUR_USERNAME/YOUR_ENDPOINT_REPO
162
  ```
163
 
164
+ Clone a Qwen2-Audio caption endpoint repo:
165
+
166
+ ```bash
167
+ python scripts/hf_clone.py qwen-endpoint --repo-id YOUR_USERNAME/YOUR_QWEN_ENDPOINT_REPO
168
+ ```
169
+
170
+ Clone an Audio Flamingo 3 caption endpoint repo:
171
+
172
+ ```bash
173
+ python scripts/hf_clone.py af3-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_ENDPOINT_REPO
174
+ ```
175
+
176
+ When creating that endpoint, set task to `custom` so it loads the custom `handler.py`.
177
+
178
+ Clone an AF3 NVIDIA-stack endpoint repo (matches NVIDIA Space stack better):
179
+
180
+ ```bash
181
+ python scripts/hf_clone.py af3-nvidia-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_NVIDIA_ENDPOINT_REPO
182
+ ```
183
+
184
+ Use this path when you want think/long quality behavior closer to NVIDIA's public demo.
185
+
186
  Clone both in one run:
187
 
188
  ```bash
 
198
  |- app.py
199
  |- lora_ui.py
200
  |- lora_train.py
201
+ |- qwen_caption_app.py
202
+ |- qwen_audio_captioning.py
203
+ |- af3_chatgpt_pipeline.py
204
+ |- af3_gui_app.py
205
  |- handler.py
206
  |- acestep/
207
  |- scripts/
208
  | |- hf_clone.py
209
+ | |- dev/
210
+ | | |- run_af3_gui.py
211
+ | | `- run_af3_gui.ps1
212
+ | |- annotations/
213
+ | | `- qwen_caption_dataset.py
214
+ | |- pipeline/
215
+ | | `- run_af3_chatgpt_pipeline.py
216
  | |- endpoint/
217
  | | |- generate_interactive.py
218
  | | |- test.ps1
 
221
  | | `- test_rnb_2min.bat
222
  | `- jobs/
223
  | `- submit_hf_lora_job.ps1
224
+ | `- submit_hf_qwen_caption_job.ps1
225
+ |- services/
226
+ | `- pipeline_api.py
227
+ |- react-ui/
228
+ |- utils/
229
+ | `- env_config.py
230
  |- docs/
231
  | |- deploy/
232
  | `- guides/
 
258
  }
259
  ```
260
 
261
+ ## Qwen2-Audio annotation pipeline (music captioning)
262
+
263
+ Run the dedicated annotation UI:
264
+
265
+ ```bash
266
+ python qwen_caption_app.py
267
+ ```
268
+
269
+ Batch caption from CLI:
270
+
271
+ ```bash
272
+ python scripts/annotations/qwen_caption_dataset.py \
273
+ --dataset-dir ./dataset_inbox \
274
+ --backend local \
275
+ --model-id Qwen/Qwen2-Audio-7B-Instruct \
276
+ --output-dir ./qwen_annotations \
277
+ --copy-audio
278
+ ```
279
+
280
+ This also writes `.json` sidecars next to source audio by default for direct ACE-Step LoRA training.
281
+
282
+ Then train LoRA on the exported dataset:
283
+
284
+ ```bash
285
+ python lora_train.py --dataset-dir ./qwen_annotations/dataset --model-config acestep-v15-base
286
+ ```
287
+
288
+ ## Audio Flamingo 3 + ChatGPT pipeline (analysis -> normalized sidecar JSON)
289
+
290
+ This stack runs:
291
+
292
+ 1. Audio Flamingo 3 for raw music analysis prose.
293
+ 2. ChatGPT for cleanup/normalization into LoRA-ready fields.
294
+ 3. Sidecar JSON export next to each audio file (or in a custom output folder).
295
+
296
+ CLI single track:
297
+
298
+ ```bash
299
+ python scripts/pipeline/run_af3_chatgpt_pipeline.py \
300
+ --audio "./train-dataset/Andrew Spacey - Wonder (Prod Beat It AT).mp3" \
301
+ --backend hf_endpoint \
302
+ --endpoint-url "$HF_AF3_ENDPOINT_URL" \
303
+ --hf-token "$HF_TOKEN" \
304
+ --openai-api-key "$OPENAI_API_KEY" \
305
+ --artist-name "Andrew Spacey" \
306
+ --track-name "Wonder"
307
+ ```
308
+
309
+ CLI dataset batch:
310
+
311
+ ```bash
312
+ python scripts/pipeline/run_af3_chatgpt_pipeline.py \
313
+ --dataset-dir ./train-dataset \
314
+ --backend hf_endpoint \
315
+ --endpoint-url "$HF_AF3_ENDPOINT_URL" \
316
+ --openai-api-key "$OPENAI_API_KEY"
317
+ ```
318
+
319
+ Refine already-generated JSON files in place:
320
+
321
+ ```bash
322
+ python scripts/pipeline/refine_dataset_json_with_openai.py \
323
+ --dataset-dir ./train-dataset \
324
+ --enable-web-search
325
+ ```
326
+
327
+ Write refined files to a separate folder:
328
+
329
+ ```bash
330
+ python scripts/pipeline/refine_dataset_json_with_openai.py \
331
+ --dataset-dir ./train-dataset \
332
+ --recursive \
333
+ --enable-web-search \
334
+ --output-dir ./train-dataset-refined
335
+ ```
336
+
337
+ Single-command GUI (recommended):
338
+
339
+ ```bash
340
+ python af3_gui_app.py
341
+ ```
342
+
343
+ Manual API + React UI:
344
+
345
+ ```bash
346
+ uvicorn services.pipeline_api:app --host 0.0.0.0 --port 8008 --reload
347
+ ```
348
+
349
+ ```bash
350
+ cd react-ui
351
+ npm install
352
+ npm run dev
353
+ ```
354
+
355
+ Open `http://localhost:5173` (manual) or `http://127.0.0.1:8008` (single-command).
356
+
357
  ## Endpoint testing
358
 
359
  ```bash
 
374
  ## Docs
375
 
376
  - Space deployment: `docs/deploy/SPACE.md`
377
+ - Qwen caption Space deployment: `docs/deploy/QWEN_SPACE.md`
378
  - Endpoint deployment: `docs/deploy/ENDPOINT.md`
379
+ - AF3 endpoint deployment: `docs/deploy/AF3_ENDPOINT.md`
380
+ - AF3 NVIDIA-stack endpoint deployment: `docs/deploy/AF3_NVIDIA_ENDPOINT.md`
381
+ - Additional guides: `docs/guides/qwen2-audio-train.md`, `docs/guides/af3-chatgpt-pipeline.md`
382
 
383
  ## Open-source readiness checklist
384
 
385
+ - Secrets are env-driven (`HF_TOKEN`, `HF_AF3_ENDPOINT_URL`, `OPENAI_API_KEY`, `.env`).
386
  - Local artifacts are ignored via `.gitignore`.
387
  - MIT license included.
388
  - Reproducible clone/deploy paths documented.
389
+ - `.env` is git-ignored; keep real credentials only in local `.env`.
390
+
391
+ ## GitHub publish flow
392
+
393
+ 1. Check status
394
+
395
+ ```bash
396
+ git status
397
+ ```
398
+
399
+ 2. Stage and commit
400
+
401
+ ```bash
402
+ git add .
403
+ git commit -m "Consolidate AF3/Qwen pipelines, endpoint templates, and docs"
404
+ ```
405
+
406
+ 3. Push to GitHub remote
407
+
408
+ ```bash
409
+ git push github main
410
+ ```
af3_chatgpt_pipeline.py ADDED
@@ -0,0 +1,584 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio Flamingo 3 -> ChatGPT cleanup pipeline for Ace Step 1.5 LoRA metadata.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import base64
8
+ import io
9
+ import json
10
+ import os
11
+ import urllib.request
12
+ from dataclasses import dataclass
13
+ from datetime import datetime, timezone
14
+ from pathlib import Path
15
+ from typing import Any, ClassVar, Dict, List, Optional
16
+
17
+ import soundfile as sf
18
+
19
+ from qwen_audio_captioning import AUDIO_EXTENSIONS, load_audio_mono
20
+
21
+
22
# Default HF model id for the converted Audio Flamingo 3 checkpoint.
DEFAULT_AF3_MODEL_ID = "nvidia/audio-flamingo-3-hf"
# Standard (concise) analysis prompt sent to AF3.
DEFAULT_AF3_PROMPT = (
    "Analyze this full song and provide concise, timestamped sections describing vocals, "
    "instrumentation, production effects, mix changes, energy flow, and genre cues. End with "
    "a short overall summary."
)
# Denser "think/long" prompt variant requesting full-duration, many-section coverage.
DEFAULT_AF3_PROMPT_THINK_LONG = (
    "Analyze the entire song from start to finish and produce a detailed, timestamped breakdown. "
    "Cover the full duration with many sections, describing vocals, instrumentation, effects, mix, "
    "arrangement, and energy transitions. Include notable moments and end with a concise overall summary."
)
# Default OpenAI model used for the cleanup/normalization step.
DEFAULT_OPENAI_MODEL = "gpt-5-mini"


# JSON Schema enforced on the ChatGPT cleanup output (used with strict
# structured output). All top-level keys are required and unknown keys are
# rejected via additionalProperties, so the cleanup response always has the
# exact shape expected by build_lora_sidecar.
LUNA_OUTPUT_SCHEMA: Dict[str, Any] = {
    "type": "object",
    "properties": {
        "caption": {"type": "string"},
        "lyrics": {"type": "string"},
        # bpm may legitimately be unknown, hence nullable.
        "bpm": {"type": ["integer", "null"]},
        "keyscale": {"type": "string"},
        "timesignature": {"type": "string"},
        "vocal_language": {"type": "string"},
        "duration": {"type": "number"},
        "analysis_short": {"type": "string"},
        "analysis_long": {"type": "string"},
        # Timestamped per-section breakdown of the track.
        "sections": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "start_sec": {"type": "number"},
                    "end_sec": {"type": "number"},
                    "summary": {"type": "string"},
                    "vocal_notes": {"type": "string"},
                    "instrument_notes": {"type": "string"},
                    "effects": {"type": "array", "items": {"type": "string"}},
                    "mix_notes": {"type": "array", "items": {"type": "string"}},
                },
                "required": [
                    "start_sec",
                    "end_sec",
                    "summary",
                    "vocal_notes",
                    "instrument_notes",
                    "effects",
                    "mix_notes",
                ],
                "additionalProperties": False,
            },
        },
        "tags": {"type": "array", "items": {"type": "string"}},
    },
    "required": [
        "caption",
        "lyrics",
        "bpm",
        "keyscale",
        "timesignature",
        "vocal_language",
        "duration",
        "analysis_short",
        "analysis_long",
        "sections",
        "tags",
    ],
    "additionalProperties": False,
}
90
+
91
+
92
+ def _extract_json_object(text: str) -> Dict[str, Any]:
93
+ text = (text or "").strip()
94
+ if not text:
95
+ raise ValueError("Empty model output")
96
+ try:
97
+ data = json.loads(text)
98
+ if isinstance(data, dict):
99
+ return data
100
+ except Exception:
101
+ pass
102
+
103
+ start = text.find("{")
104
+ if start < 0:
105
+ raise ValueError("No JSON object found in model output")
106
+ depth = 0
107
+ for i in range(start, len(text)):
108
+ ch = text[i]
109
+ if ch == "{":
110
+ depth += 1
111
+ elif ch == "}":
112
+ depth -= 1
113
+ if depth == 0:
114
+ candidate = text[start : i + 1]
115
+ data = json.loads(candidate)
116
+ if isinstance(data, dict):
117
+ return data
118
+ break
119
+ raise ValueError("Failed to parse JSON object from model output")
120
+
121
+
122
+ def _ensure_str(value: Any, default: str = "") -> str:
123
+ if value is None:
124
+ return default
125
+ return str(value).strip()
126
+
127
+
128
+ def _ensure_float(value: Any, default: float = 0.0) -> float:
129
+ try:
130
+ return float(value)
131
+ except Exception:
132
+ return default
133
+
134
+
135
+ def _ensure_int_or_none(value: Any) -> Optional[int]:
136
+ if value is None:
137
+ return None
138
+ try:
139
+ iv = int(float(value))
140
+ except Exception:
141
+ return None
142
+ if iv <= 0:
143
+ return None
144
+ return iv
145
+
146
+
147
+ def _ensure_str_list(value: Any) -> List[str]:
148
+ if value is None:
149
+ return []
150
+ if isinstance(value, str):
151
+ s = value.strip()
152
+ return [s] if s else []
153
+ if not isinstance(value, list):
154
+ return []
155
+ out: List[str] = []
156
+ seen = set()
157
+ for item in value:
158
+ s = _ensure_str(item)
159
+ if not s:
160
+ continue
161
+ k = s.lower()
162
+ if k in seen:
163
+ continue
164
+ seen.add(k)
165
+ out.append(s)
166
+ return out
167
+
168
+
169
def _normalize_sections(sections: Any, duration: float) -> List[Dict[str, Any]]:
    """Sanitize model-produced section entries.

    Skips non-dict entries, coerces all fields, ensures end >= start, and —
    when a positive *duration* is known — clamps both timestamps into
    [0, duration]. Each surviving section gains an "index" field.
    """
    if not isinstance(sections, list):
        return []
    normalized: List[Dict[str, Any]] = []
    for index, raw in enumerate(sections):
        if not isinstance(raw, dict):
            continue
        begin = _ensure_float(raw.get("start_sec"), default=0.0)
        finish = _ensure_float(raw.get("end_sec"), default=begin)
        if finish < begin:
            finish = begin
        if duration > 0:
            begin = max(0.0, min(begin, duration))
            finish = max(begin, min(finish, duration))
        entry = {
            "index": index,
            "start_sec": round(begin, 3),
            "end_sec": round(finish, 3),
            "summary": _ensure_str(raw.get("summary")),
            "vocal_notes": _ensure_str(raw.get("vocal_notes")),
            "instrument_notes": _ensure_str(raw.get("instrument_notes")),
            "effects": _ensure_str_list(raw.get("effects")),
            "mix_notes": _ensure_str_list(raw.get("mix_notes")),
        }
        normalized.append(entry)
    return normalized
196
+
197
+
198
def _audio_to_wav_base64(audio_path: str, sample_rate: int = 16000) -> str:
    """Load *audio_path* as mono at *sample_rate* and return base64-encoded WAV bytes."""
    samples, rate = load_audio_mono(audio_path, target_sr=sample_rate)
    buffer = io.BytesIO()
    sf.write(buffer, samples, rate, format="WAV")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
203
+
204
+
205
@dataclass
class AF3EndpointClient:
    """AF3 analysis backend that POSTs base64 WAV audio to a custom HF Inference Endpoint."""

    # Backend label recorded in sidecar pipeline metadata.
    backend_name: ClassVar[str] = "hf_endpoint"
    # Full endpoint URL, e.g. https://xxx.endpoints.huggingface.cloud
    endpoint_url: str
    # HF bearer token; the Authorization header is omitted when falsy.
    token: str
    model_id: str = DEFAULT_AF3_MODEL_ID
    timeout_seconds: int = 300

    def analyze(
        self,
        audio_path: str,
        prompt: str,
        max_new_tokens: int = 1024,
        temperature: float = 0.1,
    ) -> str:
        """Send the audio plus *prompt* to the endpoint and return the generated text.

        The audio is resampled to 16 kHz mono WAV and inlined as base64 in the
        JSON payload (matches the custom handler.py templates in this repo).
        Raises urllib errors on network/HTTP failure and json.JSONDecodeError
        if the endpoint returns non-JSON.
        """
        audio_b64 = _audio_to_wav_base64(audio_path, sample_rate=16000)
        payload = {
            "inputs": {
                "prompt": prompt,
                "audio_base64": audio_b64,
                "sample_rate": 16000,
                "max_new_tokens": int(max_new_tokens),
                "temperature": float(temperature),
                "model_id": self.model_id,
            }
        }
        req = urllib.request.Request(
            self.endpoint_url,
            data=json.dumps(payload).encode("utf-8"),
            method="POST",
            headers={
                "Content-Type": "application/json",
                # Only attach auth when a token was provided.
                **({"Authorization": f"Bearer {self.token}"} if self.token else {}),
            },
        )
        with urllib.request.urlopen(req, timeout=self.timeout_seconds) as resp:
            raw = resp.read().decode("utf-8")
        data = json.loads(raw)
        # Endpoint handlers vary: accept {"generated_text": ...}, {"text": ...},
        # or a list whose first element uses either key.
        if isinstance(data, dict) and isinstance(data.get("generated_text"), str):
            return data["generated_text"].strip()
        if isinstance(data, dict) and isinstance(data.get("text"), str):
            return data["text"].strip()
        if isinstance(data, list) and data:
            first = data[0]
            if isinstance(first, dict) and isinstance(first.get("generated_text"), str):
                return first["generated_text"].strip()
            if isinstance(first, dict) and isinstance(first.get("text"), str):
                return first["text"].strip()
        # Last resort: stringify whatever came back.
        return str(data).strip()
254
+
255
+
256
@dataclass
class AF3LocalClient:
    """AF3 analysis backend that runs the model locally via transformers.

    The model/processor are loaded lazily on first analyze() call so that
    constructing the client is cheap.
    """

    # Backend label recorded in sidecar pipeline metadata.
    backend_name: ClassVar[str] = "local"
    model_id: str = DEFAULT_AF3_MODEL_ID
    # "auto" -> device_map="auto"; any other value -> explicit .to(device).
    device: str = "auto"
    # One of "auto" | "bfloat16" | "float16" | anything-else (-> float32).
    torch_dtype: str = "auto"
    trust_remote_code: bool = True

    def __post_init__(self):
        # Lazy-loaded in _load(); None until first use.
        self._processor = None
        self._model = None

    def _load(self):
        """Load processor + model once; no-op on subsequent calls."""
        if self._model is not None and self._processor is not None:
            return
        import torch

        # Prefer the dedicated AF3 class (newer transformers); fall back to the
        # generic image-text-to-text auto class for older releases.
        try:
            from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
            model_cls = AudioFlamingo3ForConditionalGeneration
        except Exception as exc:
            try:
                from transformers import AutoModelForImageTextToText, AutoProcessor
                model_cls = AutoModelForImageTextToText
            except Exception:
                raise RuntimeError(
                    "Audio Flamingo 3 classes are unavailable. Install transformers>=4.57.0."
                ) from exc

        # "auto" picks fp16 on CUDA, fp32 on CPU (fp16 is unreliable on CPU).
        if self.torch_dtype == "auto":
            dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        elif self.torch_dtype == "bfloat16":
            dtype = torch.bfloat16
        elif self.torch_dtype == "float16":
            dtype = torch.float16
        else:
            dtype = torch.float32

        self._processor = AutoProcessor.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
        self._model = model_cls.from_pretrained(
            self.model_id,
            torch_dtype=dtype,
            device_map="auto" if self.device == "auto" else None,
            trust_remote_code=self.trust_remote_code,
        )
        if self.device != "auto":
            self._model.to(self.device)

    def analyze(
        self,
        audio_path: str,
        prompt: str,
        max_new_tokens: int = 1024,
        temperature: float = 0.1,
    ) -> str:
        """Run AF3 on *audio_path* with *prompt* and return the decoded completion.

        temperature <= 0 selects greedy decoding (do_sample=False).
        """
        self._load()
        import torch

        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "audio", "path": audio_path},
                ],
            }
        ]
        inputs = self._processor.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
        )
        # Move every tensor-like input onto the model's device.
        device = next(self._model.parameters()).device
        for k, v in list(inputs.items()):
            if hasattr(v, "to"):
                inputs[k] = v.to(device)

        gen_kwargs = {
            "max_new_tokens": int(max_new_tokens),
            "do_sample": bool(temperature > 0),
        }
        if temperature > 0:
            # Guard against temperature=0 slipping through as a sampling temp.
            gen_kwargs["temperature"] = max(temperature, 1e-5)
        with torch.no_grad():
            outputs = self._model.generate(**inputs, **gen_kwargs)
        # Strip the prompt tokens so only newly generated text is decoded.
        start = int(inputs["input_ids"].shape[1])
        text = self._processor.batch_decode(outputs[:, start:], skip_special_tokens=True)[0].strip()
        if not text:
            # Fallback: decode the full sequence if slicing produced nothing.
            text = self._processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        return text
347
+
348
+
349
def cleanup_with_chatgpt(
    af3_text: str,
    *,
    openai_api_key: str,
    model: str = DEFAULT_OPENAI_MODEL,
    duration: float = 0.0,
    user_context: str = "",
    artist_name: str = "",
    track_name: str = "",
    enable_web_search: bool = False,
) -> Dict[str, Any]:
    """Normalize raw AF3 analysis prose into LUNA_OUTPUT_SCHEMA-shaped metadata.

    Prefers the OpenAI Responses API with strict structured output; falls back
    to chat.completions (json_schema, then json_object) on older SDKs.
    enable_web_search only works on the Responses path.

    Returns the parsed JSON dict.
    Raises ValueError (missing key / unparseable output) or RuntimeError
    (openai package missing, or web search requested without Responses API).
    """
    if not openai_api_key:
        raise ValueError("openai_api_key is required for ChatGPT cleanup.")
    try:
        from openai import OpenAI
    except Exception as exc:
        raise RuntimeError("openai package is not installed. Add `openai` to dependencies.") from exc

    client = OpenAI(api_key=openai_api_key)

    system = (
        "You transform raw audio-analysis prose into high-quality LoRA training metadata for Ace Step 1.5. "
        "Always return compact, truthful JSON. Never invent precise music facts not supported by input."
    )
    user = (
        f"Raw AF3 analysis:\n{af3_text}\n\n"
        f"Track duration seconds: {duration}\n"
        f"Artist (optional): {artist_name or 'unknown'}\n"
        f"Track name (optional): {track_name or 'unknown'}\n"
        f"User context (optional): {user_context or 'none'}\n\n"
        "Return output matching the JSON schema exactly. "
        "Keep caption concise and useful for LoRA conditioning."
    )
    # Preferred path: Responses API with strict structured output.
    if hasattr(client, "responses"):
        req_kwargs: Dict[str, Any] = {
            "model": model,
            "input": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "text": {
                "format": {
                    "type": "json_schema",
                    "name": "ace_step_luna_metadata",
                    "schema": LUNA_OUTPUT_SCHEMA,
                    "strict": True,
                }
            },
        }
        if enable_web_search:
            req_kwargs["tools"] = [{"type": "web_search"}]

        try:
            response = client.responses.create(**req_kwargs)
        except Exception:
            if enable_web_search:
                # Fallback for SDK/runtime variants that still use the preview tool id.
                req_kwargs["tools"] = [{"type": "web_search_preview"}]
                response = client.responses.create(**req_kwargs)
            else:
                raise

        output_text = getattr(response, "output_text", "") or ""
        # Some SDK versions don't populate output_text; walk the raw output items.
        if not output_text and hasattr(response, "output"):
            chunks: List[str] = []
            for item in getattr(response, "output", []):
                for content in getattr(item, "content", []):
                    text_val = getattr(content, "text", None)
                    if text_val:
                        chunks.append(str(text_val))
            output_text = "\n".join(chunks).strip()
    else:
        # Legacy path: chat.completions only (no web-search tool support).
        if enable_web_search:
            raise RuntimeError(
                "enable_web_search requires an OpenAI SDK/runtime with Responses API support. "
                "Upgrade openai package to a recent version."
            )
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                response_format={
                    "type": "json_schema",
                    "json_schema": {
                        "name": "ace_step_luna_metadata",
                        "schema": LUNA_OUTPUT_SCHEMA,
                        "strict": True,
                    },
                },
            )
        except Exception:
            # Final fallback: plain json_object mode with the key list spelled
            # out in the prompt instead of a schema.
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system},
                    {
                        "role": "user",
                        "content": (
                            user
                            + "\n\nReturn valid JSON with keys exactly matching this set: "
                            "caption, lyrics, bpm, keyscale, timesignature, vocal_language, "
                            "duration, analysis_short, analysis_long, sections, tags."
                        ),
                    },
                ],
                response_format={"type": "json_object"},
            )
        output_text = ""
        if getattr(response, "choices", None):
            message = response.choices[0].message
            output_text = getattr(message, "content", "") or ""

    cleaned = _extract_json_object(output_text)
    return cleaned
466
+
467
+
468
def build_lora_sidecar(
    cleaned: Dict[str, Any],
    *,
    af3_text: str,
    af3_prompt: str,
    af3_backend: str,
    af3_model_id: str,
    source_audio: str,
    duration: float,
    chatgpt_model: str,
    web_search_used: bool,
) -> Dict[str, Any]:
    """Assemble the final LoRA sidecar dict.

    Coerces every cleaned-output field to a safe value (with sensible
    defaults), clamps section timestamps, and attaches full pipeline
    provenance (prompt, backend, raw AF3 text, models, UTC timestamp).
    """
    caption_text = _ensure_str(cleaned.get("caption"), "music track with evolving arrangement")
    lyric_text = _ensure_str(cleaned.get("lyrics"), "")
    tempo = _ensure_int_or_none(cleaned.get("bpm"))
    key_text = _ensure_str(cleaned.get("keyscale"), "")
    meter = _ensure_str(cleaned.get("timesignature"), "4/4") or "4/4"
    language = _ensure_str(cleaned.get("vocal_language"), "unknown") or "unknown"
    # Prefer the model-reported duration; fall back to the measured one.
    length_sec = _ensure_float(cleaned.get("duration"), duration)
    short_summary = _ensure_str(cleaned.get("analysis_short"), caption_text)
    long_summary = _ensure_str(cleaned.get("analysis_long"), af3_text)
    timeline = _normalize_sections(cleaned.get("sections"), duration=length_sec)
    tag_list = _ensure_str_list(cleaned.get("tags"))

    return {
        "caption": caption_text,
        "lyrics": lyric_text,
        "bpm": tempo,
        "keyscale": key_text,
        "timesignature": meter,
        "vocal_language": language,
        "duration": round(length_sec, 3),
        "analysis_short": short_summary,
        "analysis_long": long_summary,
        "source_audio": source_audio,
        "annotation_version": "af3_chatgpt_luna_v1",
        "music_analysis": {
            "timeline": timeline,
            "tags": tag_list,
            "summary_long": long_summary,
            "segment_count": len(timeline),
        },
        "pipeline": {
            "af3_prompt": af3_prompt,
            "af3_backend": af3_backend,
            "af3_model_id": af3_model_id,
            "af3_raw_analysis": af3_text,
            "chatgpt_model": chatgpt_model,
            "chatgpt_web_search_used": bool(web_search_used),
            "generated_at": datetime.now(timezone.utc).isoformat(),
        },
    }
521
+
522
+
523
def run_af3_chatgpt_pipeline(
    *,
    audio_path: str,
    af3_client: Any,
    af3_prompt: str = DEFAULT_AF3_PROMPT,
    af3_max_new_tokens: int = 1400,
    af3_temperature: float = 0.1,
    openai_api_key: str = "",
    openai_model: str = DEFAULT_OPENAI_MODEL,
    user_context: str = "",
    artist_name: str = "",
    track_name: str = "",
    enable_web_search: bool = False,
) -> Dict[str, Any]:
    """Run the full AF3 -> ChatGPT pipeline for one audio file.

    Steps: validate the file, measure duration (16 kHz mono load), get raw AF3
    analysis from *af3_client* (endpoint or local backend — anything with an
    analyze() method), clean it with ChatGPT, then build the sidecar dict.

    Returns a dict with keys "af3_analysis", "cleaned", "sidecar".
    Raises FileNotFoundError / ValueError on bad input paths, plus whatever
    the backend or cleanup step raises.
    """
    audio_path = str(Path(audio_path))
    if not Path(audio_path).is_file():
        raise FileNotFoundError(f"Audio file not found: {audio_path}")
    if Path(audio_path).suffix.lower() not in AUDIO_EXTENSIONS:
        raise ValueError(f"Unsupported audio extension: {Path(audio_path).suffix}")

    # Duration measured from the decoded samples (used to clamp sections).
    audio, sr = load_audio_mono(audio_path, target_sr=16000)
    duration = (float(audio.shape[0]) / float(sr)) if sr > 0 else 0.0

    af3_text = af3_client.analyze(
        audio_path=audio_path,
        prompt=af3_prompt,
        max_new_tokens=af3_max_new_tokens,
        temperature=af3_temperature,
    )
    cleaned = cleanup_with_chatgpt(
        af3_text,
        openai_api_key=openai_api_key,
        model=openai_model,
        duration=duration,
        user_context=user_context,
        artist_name=artist_name,
        track_name=track_name,
        enable_web_search=enable_web_search,
    )
    sidecar = build_lora_sidecar(
        cleaned,
        af3_text=af3_text,
        af3_prompt=af3_prompt,
        # Fall back to the class name if the client doesn't declare a backend.
        af3_backend=getattr(af3_client, "backend_name", type(af3_client).__name__),
        af3_model_id=getattr(af3_client, "model_id", DEFAULT_AF3_MODEL_ID),
        source_audio=audio_path,
        duration=duration,
        chatgpt_model=openai_model,
        web_search_used=enable_web_search,
    )
    return {
        "af3_analysis": af3_text,
        "cleaned": cleaned,
        "sidecar": sidecar,
    }
578
+
579
+
580
def save_sidecar(sidecar: Dict[str, Any], output_json: str) -> str:
    """Write *sidecar* as pretty UTF-8 JSON to *output_json*, creating parent dirs.

    Returns the written path as a string.
    """
    target = Path(output_json)
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(sidecar, indent=2, ensure_ascii=False)
    target.write_text(payload, encoding="utf-8")
    return str(target)
af3_gui_app.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python
"""Convenience entrypoint for AF3 GUI stack.

Delegates to the real launcher in ``scripts/dev/run_af3_gui.py`` after
making that directory importable.
"""

from __future__ import annotations

import sys
from pathlib import Path

# Repo root, derived from this file's on-disk location.
ROOT = Path(__file__).resolve().parent
# Directory holding the actual launcher module (run_af3_gui.py).
DEV_SCRIPTS = ROOT / "scripts" / "dev"
if str(DEV_SCRIPTS) not in sys.path:
    # Prepend so the launcher wins over any same-named module elsewhere.
    sys.path.insert(0, str(DEV_SCRIPTS))

# NOTE: must stay below the sys.path mutation above or the import fails.
from run_af3_gui import main

if __name__ == "__main__":
    # Propagate the launcher's exit code to the shell.
    raise SystemExit(main())
docs/deploy/AF3_ENDPOINT.md ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Deploy Audio Flamingo 3 Caption Endpoint (Dedicated Endpoint)
2
+
3
+ Note: this guide is for the HF-converted `audio-flamingo-3-hf` runtime path.
4
+ For NVIDIA Space stack parity (`llava` + `stage35` think adapter), use:
5
+ `docs/deploy/AF3_NVIDIA_ENDPOINT.md`.
6
+
7
+ ## 1) Create endpoint runtime repo
8
+
9
+ ```bash
10
+ python scripts/hf_clone.py af3-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_ENDPOINT_REPO
11
+ ```
12
+
13
+ This pushes:
14
+
15
+ - `handler.py`
16
+ - `requirements.txt`
17
+ - `README.md`
18
+
19
+ from `templates/hf-af3-caption-endpoint/`.
20
+
21
+ ## 2) Create endpoint from that model repo
22
+
23
+ In Hugging Face Endpoints:
24
+
25
+ 1. Create endpoint from `YOUR_USERNAME/YOUR_AF3_ENDPOINT_REPO`.
26
+ 2. Choose a GPU instance.
27
+ 3. Set task to `custom`.
28
+ 4. Set env vars:
29
+ - `AF3_MODEL_ID=nvidia/audio-flamingo-3-hf`
30
+ - `AF3_BOOTSTRAP_RUNTIME=1`
31
+ - `AF3_TRANSFORMERS_SPEC=transformers==5.1.0`
32
+
33
+ ## 3) Validate startup
34
+
35
+ If logs contain:
36
+
37
+ - `No custom pipeline found at /repository/handler.py`
38
+
39
+ then `handler.py` is not in repo root. Re-upload the runtime template files.
40
+
41
+ If logs contain:
42
+
43
+ - `Failed to load AF3 processor classes after runtime bootstrap`
44
+
45
+ keep the endpoint task set to `custom`, then verify that startup was able to install the runtime dependencies (network access and disk space are required). The first cold start can take several minutes.
46
+
47
+ ## 4) Connect from local pipeline
48
+
49
+ Set:
50
+
51
+ - `HF_AF3_ENDPOINT_URL`
52
+ - `HF_TOKEN`
53
+ - `OPENAI_API_KEY`
54
+
55
+ Recommended local `.env`:
56
+
57
+ ```env
58
+ HF_AF3_ENDPOINT_URL=https://YOUR_ENDPOINT_ID.us-east-1.aws.endpoints.huggingface.cloud
59
+ HF_TOKEN=hf_xxx
60
+ OPENAI_API_KEY=sk-...
61
+ ```
62
+
63
+ `.env` is git-ignored in this repo. Do not commit real credentials.
64
+
65
+ Then run:
66
+
67
+ ```bash
68
+ python scripts/pipeline/run_af3_chatgpt_pipeline.py \
69
+ --audio ./train-dataset/track.mp3 \
70
+ --backend hf_endpoint \
71
+ --endpoint-url "$HF_AF3_ENDPOINT_URL" \
72
+ --openai-api-key "$OPENAI_API_KEY"
73
+ ```
74
+
75
+ Or launch full GUI stack:
76
+
77
+ ```bash
78
+ python af3_gui_app.py
79
+ ```
docs/deploy/AF3_NVIDIA_ENDPOINT.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Deploy AF3 NVIDIA-Stack Endpoint (Space-Parity Runtime)
2
+
3
+ This path uses NVIDIA's `llava` stack + `stage35` think adapter, which matches the quality profile of:
4
+ - `https://huggingface.co/spaces/nvidia/audio-flamingo-3`
5
+
6
+ ## 1) Create endpoint runtime repo
7
+
8
+ ```bash
9
+ python scripts/hf_clone.py af3-nvidia-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_NVIDIA_ENDPOINT_REPO
10
+ ```
11
+
12
+ This pushes:
13
+ - `handler.py`
14
+ - `requirements.txt`
15
+ - `README.md`
16
+
17
+ from `templates/hf-af3-nvidia-endpoint/`.
18
+
19
+ ## 2) Create Dedicated Endpoint
20
+
21
+ 1. Create endpoint from `YOUR_USERNAME/YOUR_AF3_NVIDIA_ENDPOINT_REPO`.
22
+ 2. Set task to `custom`.
23
+ 3. Use a GPU instance.
24
+ 4. Add secret:
25
+ - `HF_TOKEN=hf_xxx`
26
+
27
+ ## 3) Recommended endpoint env vars
28
+
29
+ - `AF3_NV_DEFAULT_MODE=think`
30
+ - `AF3_NV_LOAD_THINK=1`
31
+ - `AF3_NV_LOAD_SINGLE=0`
32
+ - `AF3_NV_CODE_REPO_ID=nvidia/audio-flamingo-3`
33
+ - `AF3_NV_MODEL_REPO_ID=nvidia/audio-flamingo-3`
34
+
35
+ ## 4) Request shape from local scripts
36
+
37
+ Current scripts send:
38
+
39
+ ```json
40
+ {
41
+ "inputs": {
42
+ "prompt": "...",
43
+ "audio_base64": "...",
44
+ "max_new_tokens": 3200,
45
+ "temperature": 0.2
46
+ }
47
+ }
48
+ ```
49
+
50
+ Optional extra flag for this endpoint:
51
+
52
+ ```json
53
+ {
54
+ "inputs": {
55
+ "think_mode": true
56
+ }
57
+ }
58
+ ```
59
+
60
+ ## 5) Notes
61
+
62
+ - First boot is slow because runtime deps + model artifacts must load.
63
+ - Keep at least one warm replica if you want consistent latency.
64
+ - This runtime is heavier than the HF-converted `audio-flamingo-3-hf` endpoint path.
docs/deploy/QWEN_SPACE.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Deploy Qwen Captioning UI To HF Space
2
+
3
+ This deploys the music-captioning app (`qwen_caption_app.py`) as its own Space.
4
+
5
+ ## Prerequisites
6
+
7
+ - Hugging Face account
8
+ - `HF_TOKEN` with write access
9
+
10
+ ## Steps
11
+
12
+ 1. Create a new Hugging Face Space (SDK: `Gradio`).
13
+ 2. Push this repo content to that Space.
14
+ 3. In Space `README.md` front matter, set:
15
+ - `sdk: gradio`
16
+ - `app_file: qwen_caption_app.py`
17
+ 4. Pick GPU hardware (A10G or better recommended for local backend).
18
+ 5. Optional secrets/env:
19
+ - `HF_TOKEN` (if accessing private datasets or endpoint backend)
20
+
21
+ ## Runtime notes
22
+
23
+ - `local` backend loads `Qwen/Qwen2-Audio-7B-Instruct` in the Space runtime.
24
+ - `hf_endpoint` backend can call a dedicated endpoint URL instead.
25
+ - Export defaults to `/data/qwen_annotations` on Spaces when persistent storage is enabled.
26
+
docs/deploy/SPACE.md CHANGED
@@ -1,6 +1,7 @@
1
  # Deploy LoRA Studio To Your Own HF Space
2
 
3
  This guide deploys the full LoRA Studio UI to your own Hugging Face Space.
 
4
 
5
  ## Prerequisites
6
 
@@ -37,4 +38,3 @@ python scripts/hf_clone.py space --repo-id YOUR_USERNAME/YOUR_SPACE_NAME --priva
37
  - Space output defaults to `/data/lora_output` on Hugging Face Spaces.
38
  - Enable persistent storage if you need checkpoint retention across restarts.
39
  - For long-running non-interactive training, HF Jobs may be more cost-efficient than keeping a Space running.
40
-
 
1
  # Deploy LoRA Studio To Your Own HF Space
2
 
3
  This guide deploys the full LoRA Studio UI to your own Hugging Face Space.
4
+ For the dedicated Qwen captioning UI, see `docs/deploy/QWEN_SPACE.md`.
5
 
6
  ## Prerequisites
7
 
 
38
  - Space output defaults to `/data/lora_output` on Hugging Face Spaces.
39
  - Enable persistent storage if you need checkpoint retention across restarts.
40
  - For long-running non-interactive training, HF Jobs may be more cost-efficient than keeping a Space running.
 
docs/guides/README.md CHANGED
@@ -3,3 +3,4 @@
3
  Additional step-by-step guides that are useful but not required for the core LoRA Studio flow.
4
 
5
  - `qwen2-audio-train.md`
 
 
3
  Additional step-by-step guides that are useful but not required for the core LoRA Studio flow.
4
 
5
  - `qwen2-audio-train.md`
6
+ - `af3-chatgpt-pipeline.md`
docs/guides/af3-chatgpt-pipeline.md ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audio Flamingo 3 + ChatGPT Pipeline (Local Orchestration)
2
+
3
+ This guide sets up a cloud-first annotation workflow:
4
+
5
+ 1. **Audio Flamingo 3** generates raw audio analysis text.
6
+ 2. **ChatGPT** cleans and structures that output into Ace Step 1.5 LoRA sidecar JSON.
7
+ 3. Optional human edits are applied before LoRA training.
8
+
9
+ ## Endpoint vs Space
10
+
11
+ For 100+ tracks, use an **HF Dedicated Endpoint** for AF3 inference.
12
+
13
+ - Endpoint: production API, autoscaling options, stable URL, easier local integration.
14
+ - Space: better for interactive demos/tools, less ideal for bulk API workloads.
15
+
16
+ Use a Space only if you want a hosted UI. Keep heavy batch inference on Endpoint.
17
+
18
+ ## Files in this repo
19
+
20
+ - Pipeline core: `af3_chatgpt_pipeline.py`
21
+ - Batch CLI: `scripts/pipeline/run_af3_chatgpt_pipeline.py`
22
+ - Local API: `services/pipeline_api.py`
23
+ - React UI: `react-ui/`
24
+ - AF3 endpoint template: `templates/hf-af3-caption-endpoint/`
25
+
26
+ ## 1) Deploy AF3 endpoint
27
+
28
+ Create/push endpoint runtime repo:
29
+
30
+ ```bash
31
+ python scripts/hf_clone.py af3-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_ENDPOINT_REPO
32
+ ```
33
+
34
+ If you want NVIDIA Space parity (llava + stage35 think adapter), use:
35
+
36
+ ```bash
37
+ python scripts/hf_clone.py af3-nvidia-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_NVIDIA_ENDPOINT_REPO
38
+ ```
39
+
40
+ Then create a Hugging Face Dedicated Endpoint from that model repo.
41
+
42
+ If startup logs show:
43
+
44
+ - `No custom pipeline found at /repository/handler.py`
45
+
46
+ your repo root is missing `handler.py` (copy from `templates/hf-af3-caption-endpoint/handler.py`).
47
+
48
+ ## 2) Configure env
49
+
50
+ Set values in `.env` (or shell env vars):
51
+
52
+ ```env
53
+ HF_TOKEN=hf_xxx
54
+ HF_AF3_ENDPOINT_URL=https://bc3r76slij67lskb.us-east-1.aws.endpoints.huggingface.cloud
55
+ AF3_MODEL_ID=nvidia/audio-flamingo-3-hf
56
+ OPENAI_API_KEY=sk-...
57
+ OPENAI_MODEL=gpt-5-mini
58
+ ```
59
+
60
+ `.env` is git-ignored by default. Keep all real secrets in local `.env` only.
61
+
62
+ ## 3) Run one track from CLI
63
+
64
+ ```bash
65
+ python scripts/pipeline/run_af3_chatgpt_pipeline.py \
66
+ --audio "E:/Coding/hf-music-gen/train-dataset/Andrew Spacey - Wonder (Prod Beat It AT).mp3" \
67
+ --backend hf_endpoint \
68
+ --endpoint-url "$HF_AF3_ENDPOINT_URL" \
69
+ --hf-token "$HF_TOKEN" \
70
+ --openai-api-key "$OPENAI_API_KEY" \
71
+ --artist-name "Andrew Spacey" \
72
+ --track-name "Wonder"
73
+ ```
74
+
75
+ Default behavior writes JSON next to the audio file (`same_stem.json`).
76
+
77
+ ## 4) Batch all tracks
78
+
79
+ ```bash
80
+ python scripts/pipeline/run_af3_chatgpt_pipeline.py \
81
+ --dataset-dir ./train-dataset \
82
+ --backend hf_endpoint \
83
+ --endpoint-url "$HF_AF3_ENDPOINT_URL" \
84
+ --openai-api-key "$OPENAI_API_KEY" \
85
+ --enable-web-search
86
+ ```
87
+
88
+ Use `--output-dir` if you want sidecars in a separate folder.
89
+
90
+ ## 5) Run GUI stack
91
+
92
+ One command (recommended):
93
+
94
+ ```bash
95
+ python af3_gui_app.py
96
+ ```
97
+
98
+ This builds React and serves it from FastAPI. Open `http://127.0.0.1:8008`.
99
+
100
+ PowerShell:
101
+
102
+ ```powershell
103
+ .\scripts\dev\run_af3_gui.ps1
104
+ ```
105
+
106
+ Manual mode:
107
+
108
+ ```bash
109
+ uvicorn services.pipeline_api:app --host 0.0.0.0 --port 8008 --reload
110
+
111
+ cd react-ui
112
+ npm install
113
+ npm run dev
114
+ ```
115
+
116
+ Open `http://localhost:5173`.
117
+
118
+ UI supports:
119
+
120
+ - Local file path mode or upload mode
121
+ - AF3 backend toggle (`hf_endpoint` or `local`)
122
+ - Optional user context
123
+ - Optional web-search-enhanced ChatGPT cleanup
124
+ - Artist/track hints for better metadata normalization
125
+
126
+ ## 6) Human-in-the-loop refinement
127
+
128
+ Recommended loop:
129
+
130
+ 1. Generate sidecars with AF3+ChatGPT.
131
+ 2. Review/edit core fields (`caption`, `bpm`, `keyscale`, `timesignature`, `duration`).
132
+ 3. Keep rich analysis fields for traceability.
133
+ 4. Train LoRA with `lora_train.py` on the folder containing audio + JSON sidecars.
134
+
135
+ ## Output compatibility
136
+
137
+ The pipeline keeps Ace Step core sidecar fields:
138
+
139
+ - `caption`
140
+ - `lyrics`
141
+ - `bpm`
142
+ - `keyscale`
143
+ - `timesignature`
144
+ - `vocal_language`
145
+ - `duration`
146
+
147
+ And adds richer analysis fields in `music_analysis` + `pipeline` for auditability.
148
+
149
+ ## Note on "guarantee"
150
+
151
+ No model can guarantee perfect music metadata. This pipeline improves reliability by:
152
+
153
+ - Schema-constrained ChatGPT output
154
+ - Normalization/defaulting in `build_lora_sidecar(...)`
155
+ - Optional human review pass before training
docs/guides/qwen2-audio-train.md CHANGED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Qwen2-Audio Captioning -> Human Refinement -> ACE-Step LoRA Dataset
2
+
3
+ This guide adds a full annotation pipeline around `Qwen/Qwen2-Audio-7B-Instruct` so you can:
4
+
5
+ 1. Caption full songs with timestamped segment analysis.
6
+ 2. Refine/expand annotations manually.
7
+ 3. Export LoRA-ready sidecar JSON for ACE-Step 1.5 training.
8
+
9
+ ## What was added
10
+
11
+ - Reusable captioning module: `qwen_audio_captioning.py`
12
+ - Gradio UI for upload/analyze/edit/export: `qwen_caption_app.py`
13
+ - Batch CLI for local/HF jobs: `scripts/annotations/qwen_caption_dataset.py`
14
+ - HF Job launcher for batch captioning: `scripts/jobs/submit_hf_qwen_caption_job.ps1`
15
+ - Optional endpoint handler template: `templates/hf-qwen-caption-endpoint/handler.py`
16
+
17
+ ## Why use `Qwen2-Audio-7B-Instruct`
18
+
19
+ Use `Qwen/Qwen2-Audio-7B-Instruct` for this task because your prompt is instruction-heavy and structured (musical elements, mix/effects, vocals, and timestamped interactions).
20
+
21
+ ## Default analysis prompt
22
+
23
+ The pipeline defaults to:
24
+
25
+ > Analyze and detail the musical elements, tones, instruments, genre and effects. Describe the effects and mix of instruments and vocals. Vocals may use modern production techniques such as pitch correction and tuning effects. Explain how musical elements interact throughout the song with timestamps. Go in depth on vocal performance and musical writing. Be concise but detail-rich.
26
+
27
+ You can override this in the UI or CLI.
28
+
29
+ ## Run locally (recommended first)
30
+
31
+ Install dependencies:
32
+
33
+ ```bash
34
+ python -m pip install --upgrade pip
35
+ python -m pip install -r requirements.txt
36
+ ```
37
+
38
+ Start the captioning UI:
39
+
40
+ ```bash
41
+ python qwen_caption_app.py
42
+ ```
43
+
44
+ Open `http://localhost:7860`.
45
+
46
+ ### UI flow
47
+
48
+ 1. **Load Audio** tab:
49
+ - Scan a folder and/or upload files.
50
+ 2. **Run Qwen Captioning** tab:
51
+ - Backend:
52
+ - `local` (model runs in same app process), or
53
+ - `hf_endpoint` (calls a remote endpoint URL).
54
+ - Tune segmentation (`segment_seconds`, `overlap_seconds`) for timestamp granularity.
55
+ 3. **Human Annotation + Export** tab:
56
+ - Load JSON per track.
57
+ - Manually refine timelines, instrument/mix notes, caption text.
58
+ - Export sidecars + manifest.
59
+
60
+ ## Run batch from CLI
61
+
62
+ Example local batch:
63
+
64
+ ```bash
65
+ python scripts/annotations/qwen_caption_dataset.py \
66
+ --dataset-dir ./dataset_inbox \
67
+ --backend local \
68
+ --model-id Qwen/Qwen2-Audio-7B-Instruct \
69
+ --segment-seconds 30 \
70
+ --overlap-seconds 2 \
71
+ --max-new-tokens 384 \
72
+ --temperature 0.1 \
73
+ --output-dir ./qwen_annotations \
74
+ --copy-audio
75
+ ```
76
+
77
+ Sidecars are written next to each source audio file by default.
78
+ Disable with `--no-write-inplace-sidecars`.
79
+
80
+ Outputs:
81
+
82
+ - `qwen_annotations/dataset/*.audio` (if `--copy-audio`)
83
+ - `qwen_annotations/dataset/*.json` (LoRA sidecars)
84
+ - `qwen_annotations/annotations_manifest.jsonl`
85
+ - `qwen_annotations/annotations_index.json`
86
+
87
+ ## Run batch on Hugging Face Jobs
88
+
89
+ PowerShell:
90
+
91
+ ```powershell
92
+ .\scripts\jobs\submit_hf_qwen_caption_job.ps1 `
93
+ -CodeRepo "YOUR_USERNAME/ace-step-lora-studio" `
94
+ -DatasetRepo "YOUR_USERNAME/YOUR_AUDIO_DATASET" `
95
+ -ModelId "Qwen/Qwen2-Audio-7B-Instruct" `
96
+ -Flavor "a10g-large" `
97
+ -Timeout "8h" `
98
+ -CopyAudio `
99
+ -UploadRepo "YOUR_USERNAME/YOUR_ANNOTATED_DATASET"
100
+ ```
101
+
102
+ ## Use on Hugging Face Space
103
+
104
+ To run this UI as a dedicated Space app, set Space `README.md` front matter:
105
+
106
+ - `sdk: gradio`
107
+ - `app_file: qwen_caption_app.py`
108
+
109
+ Then push this repo content to that Space.
110
+
111
+ ## Optional: remote endpoint backend
112
+
113
+ If you want local UI to call a remote endpoint:
114
+
115
+ 1. Deploy dedicated endpoint runtime from this template:
116
+ - `python scripts/hf_clone.py qwen-endpoint --repo-id YOUR_USERNAME/YOUR_QWEN_ENDPOINT_REPO`
117
+ 2. In UI select `backend=hf_endpoint`.
118
+ 3. Set endpoint URL + token.
119
+
120
+ ## Sidecar schema and ACE-Step compatibility
121
+
122
+ The exported JSON keeps ACE-Step core fields:
123
+
124
+ - `caption`
125
+ - `lyrics`
126
+ - `bpm`
127
+ - `keyscale`
128
+ - `timesignature`
129
+ - `vocal_language`
130
+ - `duration`
131
+
132
+ And adds rich fields:
133
+
134
+ - `music_analysis.timeline` (timestamped segment notes)
135
+ - `music_analysis.instruments`, `effects`, `vocal_characteristics`, `mix_notes`
136
+ - `analysis_prompt`, `analysis_model`, `analysis_generated_at`
137
+
138
+ ACE-Step loader ignores unknown keys, so rich fields stay available for later refinement while training still works with core fields.
139
+
140
+ ## Train ACE-Step LoRA from exported dataset
141
+
142
+ Local:
143
+
144
+ ```bash
145
+ python lora_train.py \
146
+ --dataset-dir ./qwen_annotations/dataset \
147
+ --model-config acestep-v15-base \
148
+ --device auto \
149
+ --num-epochs 20 \
150
+ --batch-size 1 \
151
+ --grad-accum 1 \
152
+ --output-dir ./lora_output
153
+ ```
154
+
155
+ HF Job (existing script):
156
+
157
+ ```powershell
158
+ .\scripts\jobs\submit_hf_lora_job.ps1 `
159
+ -CodeRepo "YOUR_USERNAME/ace-step-lora-studio" `
160
+ -DatasetRepo "YOUR_USERNAME/YOUR_ANNOTATED_DATASET" `
161
+ -ModelConfig "acestep-v15-base"
162
+ ```
163
+
164
+ ## Recommended iterative loop
165
+
166
+ 1. Auto-caption with segment timestamps.
167
+ 2. Human refine 10-20% highest-impact tracks first.
168
+ 3. Export updated sidecars.
169
+ 4. Train LoRA.
170
+ 5. Evaluate structural/timing control.
171
+ 6. Feed findings back into prompt + schema refinements.
qwen_audio_captioning.py ADDED
@@ -0,0 +1,996 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Qwen2-Audio captioning utilities for music annotation workflows.
3
+
4
+ This module supports:
5
+ 1) Local inference with Qwen2-Audio models via transformers.
6
+ 2) Remote inference via a Hugging Face Endpoint with a simple JSON contract.
7
+ 3) Segment-based analysis with timestamped aggregation.
8
+ 4) Export helpers for ACE-Step LoRA sidecars and manifest files.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import base64
14
+ import io
15
+ import json
16
+ import os
17
+ import re
18
+ import shutil
19
+ import subprocess
20
+ import tempfile
21
+ import urllib.request
22
+ from dataclasses import dataclass
23
+ from datetime import datetime, timezone
24
+ from pathlib import Path
25
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
26
+
27
+ import numpy as np
28
+ import soundfile as sf
29
+ import torchaudio
30
+
31
+
32
+ AUDIO_EXTENSIONS = {".wav", ".flac", ".mp3", ".ogg", ".opus", ".m4a", ".aac"}
33
+
34
+
35
+ DEFAULT_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
36
+
37
+
38
+ DEFAULT_ANALYSIS_PROMPT = (
39
+ "Analyze and detail the musical elements, tones, instruments, genre and effects. "
40
+ "Describe the effects and mix of instruments and vocals. Vocals may use modern production "
41
+ "techniques such as pitch correction and tuning effects. Explain how musical elements interact "
42
+ "throughout the song with timestamps. Go in depth on vocal performance and musical writing. "
43
+ "Be concise but detail-rich."
44
+ )
45
+
46
+ DEFAULT_LONG_ANALYSIS_PROMPT = (
47
+ "Analyze the full song and return a concise but detailed timestamped prose breakdown. "
48
+ "Use sections every 10 to 20 seconds (or major arrangement changes). For each section, "
49
+ "describe vocals, instrumentation, genre cues, effects, mix/energy changes, and how elements "
50
+ "interact. End with a short overall summary paragraph."
51
+ )
52
+
53
+
54
+ SEGMENT_JSON_SCHEMA_HINT = (
55
+ 'Return JSON only with keys: "segment_summary" (string), "section_label" (string), '
56
+ '"genre" (array of strings), "instruments" (array of strings), "effects" (array of strings), '
57
+ '"vocal_characteristics" (array of strings), "mix_notes" (array of strings), '
58
+ '"interaction_notes" (string), "bpm_guess" (number or null), "key_guess" (string or ""), '
59
+ '"notable_moments" (array of objects with "timestamp_sec" and "note").'
60
+ )
61
+
62
+
63
+ @dataclass
64
+ class SegmentResult:
65
+ index: int
66
+ start_sec: float
67
+ end_sec: float
68
+ prompt: str
69
+ raw_response: str
70
+ parsed: Dict[str, Any]
71
+
72
+
73
def list_audio_files(folder: str) -> List[str]:
    """Recursively collect supported audio file paths under *folder*.

    Paths are returned sorted; an empty list is returned when *folder*
    is not an existing directory.
    """
    base = Path(folder)
    if not base.is_dir():
        return []
    return [
        str(candidate)
        for candidate in sorted(base.rglob("*"))
        if candidate.suffix.lower() in AUDIO_EXTENSIONS
    ]
82
+
83
+
84
def _load_audio_with_fallback(path: str) -> Tuple[np.ndarray, int]:
    """Load audio to mono float32 numpy array with fallback decode path.

    Decode order: torchaudio, then soundfile (libsndfile), then an ffmpeg
    subprocess as a last resort. Returns ``(mono_waveform, sample_rate)``.
    Raises RuntimeError (with all three decoder errors) if every path fails.
    """
    # Preferred decoder: torchaudio.
    try:
        wav, sr = torchaudio.load(path)
        wav = wav.float().numpy()
        if wav.ndim == 1:
            mono = wav
        else:
            # Downmix multi-channel audio to mono by averaging channels.
            mono = wav.mean(axis=0)
        return mono.astype(np.float32), int(sr)
    except Exception as torchaudio_exc:
        # Second choice: libsndfile via soundfile; always_2d keeps a
        # uniform (frames, channels) shape for the channel average below.
        try:
            audio_np, sr = sf.read(path, dtype="float32", always_2d=True)
            mono = audio_np.mean(axis=1)
            return mono.astype(np.float32), int(sr)
        except Exception as sf_exc:
            # Last fallback: ffmpeg decode (works when local libsndfile lacks mp3 codec).
            # NOTE(review): this path forces mono 16 kHz output (-ac 1 -ar 16000),
            # so the returned sample rate can differ from the source file's rate.
            try:
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                    tmp_wav = tmp.name
                cmd = [
                    "ffmpeg",
                    "-y",
                    "-i",
                    str(path),
                    "-vn",
                    "-ac",
                    "1",
                    "-ar",
                    "16000",
                    tmp_wav,
                ]
                proc = subprocess.run(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    text=True,
                )
                if proc.returncode != 0:
                    # Keep only the tail of stderr so the error stays readable.
                    tail = (proc.stderr or "")[-800:]
                    raise RuntimeError(f"ffmpeg decode failed: {tail}")
                audio_np, sr = sf.read(tmp_wav, dtype="float32", always_2d=True)
                mono = audio_np.mean(axis=1)
                return mono.astype(np.float32), int(sr)
            except Exception as ffmpeg_exc:
                # Surface every decoder's failure reason for debuggability.
                raise RuntimeError(
                    f"Audio decode failed for '{path}'. "
                    f"torchaudio_error={torchaudio_exc}; "
                    f"soundfile_error={sf_exc}; "
                    f"ffmpeg_error={ffmpeg_exc}"
                ) from ffmpeg_exc
            finally:
                # Best-effort cleanup of the temp wav; tmp_wav may be unbound
                # if NamedTemporaryFile itself raised, hence the locals() check.
                try:
                    if "tmp_wav" in locals():
                        Path(tmp_wav).unlink(missing_ok=True)
                except Exception:
                    pass
141
+
142
+
143
def load_audio_mono(path: str, target_sr: int = 16000) -> Tuple[np.ndarray, int]:
    """Decode *path* to a mono float32 waveform at *target_sr* Hz.

    Resamples with torchaudio only when the native rate differs.
    """
    waveform, native_sr = _load_audio_with_fallback(path)
    if native_sr != target_sr:
        tensor = torch_audio_from_numpy(waveform)
        tensor = torchaudio.functional.resample(tensor, native_sr, target_sr)
        waveform = tensor.squeeze(0).cpu().numpy().astype(np.float32)
    return waveform, target_sr
151
+
152
+
153
def torch_audio_from_numpy(audio: np.ndarray):
    """Wrap a 1-D numpy waveform as a torch tensor shaped [1, T].

    Raises ValueError for anything other than a mono [T] array.
    """
    import torch

    if audio.ndim == 1:
        return torch.from_numpy(audio).unsqueeze(0)
    raise ValueError(f"Expected mono waveform [T], got shape={audio.shape}")
159
+
160
+
161
def split_audio_segments(
    audio: np.ndarray,
    sample_rate: int,
    segment_seconds: float,
    overlap_seconds: float,
) -> List[Tuple[float, float, np.ndarray]]:
    """Cut a mono waveform into (start_sec, end_sec, samples) windows.

    Consecutive windows advance by ``segment_seconds - overlap_seconds``;
    the final window is truncated at the end of the audio. Raises
    ValueError for non-positive segment length, negative overlap, or
    overlap >= segment length.
    """
    if segment_seconds <= 0:
        raise ValueError("segment_seconds must be > 0")
    if overlap_seconds < 0:
        raise ValueError("overlap_seconds must be >= 0")
    if overlap_seconds >= segment_seconds:
        raise ValueError("overlap_seconds must be smaller than segment_seconds")

    total_samples = int(audio.shape[0])
    window = max(1, int(round(segment_seconds * sample_rate)))
    hop = max(1, int(round((segment_seconds - overlap_seconds) * sample_rate)))

    chunks: List[Tuple[float, float, np.ndarray]] = []
    index = 0
    offset = 0
    while offset < total_samples:
        stop = min(total_samples, offset + window)
        chunks.append((offset / sample_rate, stop / sample_rate, audio[offset:stop]))
        index += 1
        if stop >= total_samples:
            break
        # Next window start is derived from the window index so rounding
        # error does not accumulate across many hops.
        offset = index * hop
    return chunks
192
+
193
+
194
+ def _extract_json_from_text(text: str) -> Optional[Dict[str, Any]]:
195
+ text = (text or "").strip()
196
+ if not text:
197
+ return None
198
+
199
+ # Direct parse first.
200
+ try:
201
+ obj = json.loads(text)
202
+ if isinstance(obj, dict):
203
+ return obj
204
+ except Exception:
205
+ pass
206
+
207
+ # Parse markdown code fence if present.
208
+ fence_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, flags=re.S | re.I)
209
+ if fence_match:
210
+ block = fence_match.group(1)
211
+ try:
212
+ obj = json.loads(block)
213
+ if isinstance(obj, dict):
214
+ return obj
215
+ except Exception:
216
+ pass
217
+
218
+ # Fallback: first brace-balanced object.
219
+ start = text.find("{")
220
+ if start < 0:
221
+ return None
222
+ depth = 0
223
+ for i in range(start, len(text)):
224
+ ch = text[i]
225
+ if ch == "{":
226
+ depth += 1
227
+ elif ch == "}":
228
+ depth -= 1
229
+ if depth == 0:
230
+ candidate = text[start : i + 1]
231
+ try:
232
+ obj = json.loads(candidate)
233
+ if isinstance(obj, dict):
234
+ return obj
235
+ except Exception:
236
+ return None
237
+ return None
238
+
239
+
240
+ def _ensure_string_list(value: Any) -> List[str]:
241
+ if value is None:
242
+ return []
243
+ if isinstance(value, str):
244
+ v = value.strip()
245
+ return [v] if v else []
246
+ out: List[str] = []
247
+ if isinstance(value, Sequence):
248
+ for item in value:
249
+ if item is None:
250
+ continue
251
+ s = str(item).strip()
252
+ if s:
253
+ out.append(s)
254
+ deduped: List[str] = []
255
+ seen = set()
256
+ for item in out:
257
+ key = item.lower()
258
+ if key in seen:
259
+ continue
260
+ seen.add(key)
261
+ deduped.append(item)
262
+ return deduped
263
+
264
+
265
+ def _float_or_none(value: Any) -> Optional[float]:
266
+ if value is None:
267
+ return None
268
+ try:
269
+ return float(value)
270
+ except Exception:
271
+ return None
272
+
273
+
274
# Lowercase keyword vocabularies consumed by _extract_keyword_hits for cheap
# substring tagging of model prose. Matching is case-insensitive and
# substring-based, so spelling variants (e.g. "hip hop" / "hip-hop",
# "autotune" / "auto tune") are listed as separate entries.
_GENRE_KEYWORDS = [
    "pop",
    "rock",
    "hip-hop",
    "hip hop",
    "rap",
    "r&b",
    "rnb",
    "electronic",
    "edm",
    "trap",
    "house",
    "techno",
    "ambient",
    "indie",
    "soul",
    "jazz",
    "metal",
    "punk",
    "country",
    "lo-fi",
    "lofi",
    "drill",
]

# Instrument / sound-source names to surface from segment descriptions.
_INSTRUMENT_KEYWORDS = [
    "drums",
    "kick",
    "snare",
    "hihat",
    "hi-hat",
    "808",
    "bass",
    "synth",
    "piano",
    "guitar",
    "electric guitar",
    "acoustic guitar",
    "strings",
    "pad",
    "lead",
    "pluck",
    "vocal",
    "choir",
]

# Production / mixing effect terms.
_EFFECT_KEYWORDS = [
    "reverb",
    "delay",
    "distortion",
    "saturation",
    "autotune",
    "auto tune",
    "pitch correction",
    "compression",
    "eq",
    "sidechain",
    "chorus",
    "flanger",
    "phaser",
    "stereo widening",
]

# Vocal-delivery and vocal-processing descriptors.
_VOCAL_KEYWORDS = [
    "autotune",
    "auto tune",
    "pitch correction",
    "harmonies",
    "ad-libs",
    "ad libs",
    "falsetto",
    "breathy",
    "raspy",
    "processed vocals",
]
349
+
350
+
351
+ def _clean_model_text(text: str) -> str:
352
+ s = (text or "").strip()
353
+ if not s:
354
+ return ""
355
+ # Remove repetitive leading boilerplate often produced when JSON is requested.
356
+ s = re.sub(r"^\s*The output should be a JSON object with these fields\.?\s*", "", s, flags=re.I)
357
+ s = re.sub(r"^\s*This is the requested information for the given song segment:?\s*", "", s, flags=re.I)
358
+ s = re.sub(r"^\s*From\s+\d+(\.\d+)?s\s+to\s+\d+(\.\d+)?s\s*", "", s, flags=re.I)
359
+ return s.strip()
360
+
361
+
362
+ def _extract_bpm_guess(text: str) -> Optional[float]:
363
+ for pat in [r"\b(\d{2,3}(?:\.\d+)?)\s*bpm\b", r"\btempo\s*(?:of|is|:)?\s*(\d{2,3}(?:\.\d+)?)\b"]:
364
+ m = re.search(pat, text, flags=re.I)
365
+ if m:
366
+ try:
367
+ val = float(m.group(1))
368
+ if 30 <= val <= 300:
369
+ return val
370
+ except Exception:
371
+ continue
372
+ return None
373
+
374
+
375
+ def _extract_key_guess(text: str) -> str:
376
+ patterns = [
377
+ r"\b([A-G](?:#|b)?\s*(?:major|minor))\b",
378
+ r"\b([A-G](?:#|b)?m)\b",
379
+ ]
380
+ for pat in patterns:
381
+ m = re.search(pat, text, flags=re.I)
382
+ if m:
383
+ key = m.group(1).strip()
384
+ return key[0].upper() + key[1:]
385
+ return ""
386
+
387
+
388
+ def _extract_keyword_hits(text: str, keywords: List[str]) -> List[str]:
389
+ lower = text.lower()
390
+ found: List[str] = []
391
+ for kw in keywords:
392
+ if kw.lower() in lower:
393
+ label = kw.replace("rnb", "R&B").replace("hip-hop", "hip-hop")
394
+ if label.lower() not in {x.lower() for x in found}:
395
+ found.append(label)
396
+ return found
397
+
398
+
399
class BaseCaptioner:
    """Common interface for captioning backends.

    Subclasses implement generate() to turn one audio segment plus a
    prompt into model text. backend_name/model_id identify the backend
    in exported metadata.
    """

    backend_name = "base"
    model_id = DEFAULT_MODEL_ID

    def generate(
        self,
        audio: np.ndarray,
        sample_rate: int,
        prompt: str,
        max_new_tokens: int,
        temperature: float,
    ) -> str:
        """Produce a text response for one audio segment; must be overridden."""
        raise NotImplementedError
412
+
413
+
414
class LocalQwen2AudioCaptioner(BaseCaptioner):
    """Runs Qwen2-Audio in-process via transformers.

    The processor/model are loaded lazily on first use so constructing the
    captioner (e.g. for UI wiring) stays cheap.
    """

    backend_name = "local"

    def __init__(
        self,
        model_id: str = DEFAULT_MODEL_ID,
        device: str = "auto",
        torch_dtype: str = "auto",
        trust_remote_code: bool = True,
    ):
        self.model_id = model_id
        self.device = device
        self.torch_dtype = torch_dtype
        self.trust_remote_code = trust_remote_code
        # Populated by _load() on first generate() call.
        self._processor = None
        self._model = None

    def _load(self):
        """Load processor and model once; no-op on subsequent calls."""
        if self._processor is not None and self._model is not None:
            return

        import torch
        try:
            from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
        except Exception as exc:
            raise RuntimeError(
                "Qwen2-Audio classes are unavailable. Install a recent transformers build "
                "(for example transformers>=4.53.0) and retry."
            ) from exc

        # "auto" picks fp16 on GPU (memory savings), fp32 on CPU (fp16 is
        # typically unsupported/slow on CPU).
        if self.torch_dtype == "auto":
            dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        elif self.torch_dtype == "bfloat16":
            dtype = torch.bfloat16
        elif self.torch_dtype == "float16":
            dtype = torch.float16
        else:
            dtype = torch.float32

        # device_map="auto" lets accelerate shard/place the model; otherwise
        # we move the whole model to an explicit device below.
        device_map = "auto" if self.device == "auto" else None
        self._processor = AutoProcessor.from_pretrained(
            self.model_id,
            trust_remote_code=self.trust_remote_code,
        )
        self._model = Qwen2AudioForConditionalGeneration.from_pretrained(
            self.model_id,
            torch_dtype=dtype,
            device_map=device_map,
            trust_remote_code=self.trust_remote_code,
        )
        if device_map is None:
            if self.device == "auto":
                target_device = "cuda" if torch.cuda.is_available() else "cpu"
            else:
                target_device = self.device
            self._model.to(target_device)

    def _model_device(self):
        """Return the device of the loaded model ('cpu' when not yet loaded)."""
        import torch

        if self._model is None:
            return torch.device("cpu")
        # Assumes all parameters live on one device; with device_map="auto"
        # sharding this reports only the first parameter's device.
        return next(self._model.parameters()).device

    def generate(
        self,
        audio: np.ndarray,
        sample_rate: int,
        prompt: str,
        max_new_tokens: int,
        temperature: float,
    ) -> str:
        """Run one chat-style generation over an audio segment and return text."""
        self._load()
        import torch

        # The audio_url here is a placeholder token for the chat template; the
        # actual waveform is passed to the processor below.
        conversation = [
            {"role": "system", "content": "You are a precise music analysis assistant."},
            {
                "role": "user",
                "content": [
                    {"type": "audio", "audio_url": "local://segment.wav"},
                    {"type": "text", "text": prompt},
                ],
            },
        ]
        text = self._processor.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=False,
        )
        # NOTE(review): some transformers releases expect `audios=` instead of
        # `audio=` for Qwen2-Audio processors — confirm against the pinned version.
        inputs = self._processor(
            text=text,
            audio=[audio],
            sampling_rate=sample_rate,
            return_tensors="pt",
            padding=True,
        )
        device = self._model_device()
        for key, value in list(inputs.items()):
            if hasattr(value, "to"):
                inputs[key] = value.to(device)

        # Zero/falsy temperature means greedy decoding; otherwise sample with a
        # floor to avoid passing temperature=0 to generate().
        do_sample = bool(temperature and temperature > 0)
        gen_kwargs = {
            "max_new_tokens": int(max_new_tokens),
            "do_sample": do_sample,
        }
        if do_sample:
            gen_kwargs["temperature"] = max(float(temperature), 1e-5)

        with torch.no_grad():
            generated = self._model.generate(**inputs, **gen_kwargs)
        # Strip the echoed prompt tokens, keeping only newly generated ones.
        prompt_tokens = inputs["input_ids"].size(1)
        generated_new = generated[:, prompt_tokens:]
        text_out = self._processor.batch_decode(
            generated_new,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
        # Fallback: if slicing produced nothing, decode the full sequence.
        if not text_out.strip():
            text_out = self._processor.batch_decode(
                generated,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )[0]
        return text_out.strip()
540
+
541
+
542
class HFEndpointCaptioner(BaseCaptioner):
    """Sends audio segments to a remote Hugging Face Inference Endpoint.

    The segment is serialized as base64 WAV inside a JSON payload; several
    common response shapes are accepted (see generate()).
    """

    backend_name = "hf_endpoint"

    def __init__(
        self,
        endpoint_url: str,
        token: Optional[str] = None,
        model_id: str = DEFAULT_MODEL_ID,
        timeout_seconds: int = 180,
    ):
        if not endpoint_url:
            raise ValueError("endpoint_url is required for HFEndpointCaptioner")
        self.endpoint_url = endpoint_url.strip()
        # Fall back to the HF_TOKEN env var when no token is passed explicitly.
        self.token = token or os.getenv("HF_TOKEN", "")
        self.model_id = model_id
        self.timeout_seconds = timeout_seconds

    def generate(
        self,
        audio: np.ndarray,
        sample_rate: int,
        prompt: str,
        max_new_tokens: int,
        temperature: float,
    ) -> str:
        """POST one segment to the endpoint and return the generated text.

        Network/HTTP failures propagate as urllib exceptions (no retry or
        error-body translation is done here).
        """
        # Serialize to wav bytes for endpoint transport.
        buffer = io.BytesIO()
        sf.write(buffer, audio, sample_rate, format="WAV")
        wav_bytes = buffer.getvalue()
        audio_b64 = base64.b64encode(wav_bytes).decode("utf-8")

        # Payload shape mirrors the handler templates shipped with this repo
        # (templates/hf-qwen-caption-endpoint/handler.py).
        payload = {
            "inputs": {
                "prompt": prompt,
                "audio_base64": audio_b64,
                "sample_rate": sample_rate,
                "max_new_tokens": int(max_new_tokens),
                "temperature": float(temperature),
                "model_id": self.model_id,
            }
        }

        req = urllib.request.Request(
            self.endpoint_url,
            data=json.dumps(payload).encode("utf-8"),
            headers={
                "Content-Type": "application/json",
                # Authorization header only when a token is available.
                **({"Authorization": f"Bearer {self.token}"} if self.token else {}),
            },
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=self.timeout_seconds) as resp:
            body = resp.read().decode("utf-8")
        data = json.loads(body)

        # Accept common endpoint output shapes.
        if isinstance(data, dict):
            if isinstance(data.get("generated_text"), str):
                return data["generated_text"].strip()
            if isinstance(data.get("text"), str):
                return data["text"].strip()
            if isinstance(data.get("output_text"), str):
                return data["output_text"].strip()
        if isinstance(data, list) and data:
            first = data[0]
            if isinstance(first, dict) and isinstance(first.get("generated_text"), str):
                return first["generated_text"].strip()
        # Last resort: stringify whatever came back so callers still get text.
        return str(data).strip()
610
+
611
+
612
def build_segment_prompt(
    base_prompt: str,
    start_sec: float,
    end_sec: float,
) -> str:
    """Append segment-window instructions and the JSON schema hint to a prompt."""
    window_instructions = (
        f"Analyze only the song segment from {start_sec:.2f}s to {end_sec:.2f}s.\n"
        "Use timestamp references in absolute song seconds.\n"
    )
    return f"{base_prompt}\n\n{window_instructions}{SEGMENT_JSON_SCHEMA_HINT}"
623
+
624
+
625
def _make_fallback_segment_dict(raw_text: str) -> Dict[str, Any]:
    """Build a segment record via keyword heuristics when JSON parsing failed.

    The cleaned raw text doubles as summary and interaction notes; tag lists
    are recovered by case-insensitive keyword scanning.
    """
    summary = _clean_model_text(raw_text) or "No analysis generated."
    return {
        "segment_summary": summary,
        "section_label": "",
        "genre": _extract_keyword_hits(summary, _GENRE_KEYWORDS),
        "instruments": _extract_keyword_hits(summary, _INSTRUMENT_KEYWORDS),
        "effects": _extract_keyword_hits(summary, _EFFECT_KEYWORDS),
        "vocal_characteristics": _extract_keyword_hits(summary, _VOCAL_KEYWORDS),
        "mix_notes": [],
        "interaction_notes": summary,
        "bpm_guess": _extract_bpm_guess(summary),
        "key_guess": _extract_key_guess(summary),
        "notable_moments": [],
    }
648
+
649
+
650
def _parse_segment_output(raw_text: str) -> Dict[str, Any]:
    """Normalize a model segment response into the expected schema.

    Falls back to keyword extraction when no JSON object can be recovered
    from the raw text.
    """
    parsed = _extract_json_from_text(raw_text)
    if not parsed:
        return _make_fallback_segment_dict(raw_text)

    result = dict(parsed)
    # Field assignment order matches the canonical sidecar layout so that
    # missing keys are appended in a stable order.
    for field in ("segment_summary", "section_label"):
        result[field] = str(result.get(field, "")).strip()
    for field in ("genre", "instruments", "effects", "vocal_characteristics", "mix_notes"):
        result[field] = _ensure_string_list(result.get(field))
    result["interaction_notes"] = str(result.get("interaction_notes", "")).strip()
    result["bpm_guess"] = _float_or_none(result.get("bpm_guess"))
    result["key_guess"] = str(result.get("key_guess", "")).strip()

    # Keep only well-formed notable moments: dicts carrying a timestamp or note.
    moments: List[Dict[str, Any]] = []
    raw_moments = result.get("notable_moments")
    if isinstance(raw_moments, Sequence):
        for entry in raw_moments:
            if not isinstance(entry, dict):
                continue
            timestamp = _float_or_none(entry.get("timestamp_sec"))
            note = str(entry.get("note", "")).strip()
            if timestamp is not None or note:
                moments.append({"timestamp_sec": timestamp, "note": note})
    result["notable_moments"] = moments
    return result
680
+
681
+
682
+ def _pick_common_key(values: List[str]) -> str:
683
+ counts: Dict[str, int] = {}
684
+ first_original: Dict[str, str] = {}
685
+ for v in values:
686
+ s = (v or "").strip()
687
+ if not s:
688
+ continue
689
+ k = s.lower()
690
+ counts[k] = counts.get(k, 0) + 1
691
+ if k not in first_original:
692
+ first_original[k] = s
693
+ if not counts:
694
+ return ""
695
+ best = sorted(counts.items(), key=lambda x: (-x[1], x[0]))[0][0]
696
+ return first_original[best]
697
+
698
+
699
+ def _collect_unique(items: List[List[str]], limit: int = 12) -> List[str]:
700
+ out: List[str] = []
701
+ seen = set()
702
+ for group in items:
703
+ for item in group:
704
+ key = item.strip().lower()
705
+ if not key or key in seen:
706
+ continue
707
+ seen.add(key)
708
+ out.append(item.strip())
709
+ if len(out) >= limit:
710
+ return out
711
+ return out
712
+
713
+
714
+ def _derive_caption(genres: List[str], instruments: List[str], vocals: List[str]) -> str:
715
+ parts: List[str] = []
716
+ if genres:
717
+ parts.append(", ".join(genres[:2]))
718
+ if instruments:
719
+ parts.append("with " + ", ".join(instruments[:3]))
720
+ if vocals:
721
+ parts.append("and modern processed vocals")
722
+ if not parts:
723
+ return "music track with detailed arrangement and production dynamics"
724
+ return " ".join(parts)
725
+
726
+
727
def generate_track_annotation(
    audio_path: str,
    captioner: BaseCaptioner,
    prompt: str = DEFAULT_ANALYSIS_PROMPT,
    segment_seconds: float = 30.0,
    overlap_seconds: float = 2.0,
    max_new_tokens: int = 384,
    temperature: float = 0.1,
    keep_raw_outputs: bool = True,
    include_long_analysis: bool = False,
    long_analysis_prompt: str = DEFAULT_LONG_ANALYSIS_PROMPT,
    long_analysis_max_new_tokens: int = 1200,
    long_analysis_temperature: float = 0.1,
) -> Dict[str, Any]:
    """Analyze a track segment-by-segment and build a sidecar annotation dict.

    Loads the audio (resampled to 16 kHz mono), splits it into overlapping
    windows, runs the captioner per window, then aggregates per-segment tags
    into track-level genres/instruments/effects/vocals, an averaged BPM, the
    most common key, a derived caption, and a per-segment timeline.

    Args:
        audio_path: path to the source audio file.
        captioner: backend implementing BaseCaptioner.generate.
        prompt: base analysis prompt extended per segment.
        segment_seconds / overlap_seconds: windowing parameters.
        max_new_tokens / temperature: generation settings per segment.
        keep_raw_outputs: include each segment's raw model text in the timeline.
        include_long_analysis: additionally run one whole-track generation.
        long_analysis_*: prompt/settings for that whole-track pass; failures
            are recorded in the sidecar instead of raising.

    Returns:
        A sidecar dict ready to be written as JSON next to the audio.
    """
    audio, sr = load_audio_mono(audio_path, target_sr=16000)
    duration_sec = float(audio.shape[0]) / float(sr) if sr > 0 else 0.0
    segments = split_audio_segments(
        audio=audio,
        sample_rate=sr,
        segment_seconds=segment_seconds,
        overlap_seconds=overlap_seconds,
    )

    # Per-segment pass: prompt the model for each window and parse its output.
    results: List[SegmentResult] = []
    for idx, (start_sec, end_sec, seg_audio) in enumerate(segments):
        seg_prompt = build_segment_prompt(prompt, start_sec=start_sec, end_sec=end_sec)
        raw = captioner.generate(
            audio=seg_audio,
            sample_rate=sr,
            prompt=seg_prompt,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
        )
        parsed = _parse_segment_output(raw)
        results.append(
            SegmentResult(
                index=idx,
                start_sec=start_sec,
                end_sec=end_sec,
                prompt=seg_prompt,
                raw_response=raw,
                parsed=parsed,
            )
        )

    # Aggregation pass: collect per-segment tags for track-level rollups.
    timeline: List[Dict[str, Any]] = []
    all_genres: List[List[str]] = []
    all_instruments: List[List[str]] = []
    all_effects: List[List[str]] = []
    all_vocals: List[List[str]] = []
    all_mix_notes: List[List[str]] = []
    bpm_values: List[float] = []
    keys: List[str] = []
    interaction_summary: List[str] = []

    for seg in results:
        p = seg.parsed
        all_genres.append(_ensure_string_list(p.get("genre")))
        all_instruments.append(_ensure_string_list(p.get("instruments")))
        all_effects.append(_ensure_string_list(p.get("effects")))
        all_vocals.append(_ensure_string_list(p.get("vocal_characteristics")))
        all_mix_notes.append(_ensure_string_list(p.get("mix_notes")))

        bpm = _float_or_none(p.get("bpm_guess"))
        if bpm is not None and bpm > 0:
            bpm_values.append(bpm)
        key_guess = str(p.get("key_guess", "")).strip()
        if key_guess:
            keys.append(key_guess)
        if p.get("interaction_notes"):
            interaction_summary.append(str(p["interaction_notes"]).strip())

        timeline_entry = {
            "segment_index": seg.index,
            "start_sec": round(seg.start_sec, 3),
            "end_sec": round(seg.end_sec, 3),
            "section_label": str(p.get("section_label", "")).strip(),
            "segment_summary": str(p.get("segment_summary", "")).strip(),
            "instruments": _ensure_string_list(p.get("instruments")),
            "effects": _ensure_string_list(p.get("effects")),
            "vocal_characteristics": _ensure_string_list(p.get("vocal_characteristics")),
            "interaction_notes": str(p.get("interaction_notes", "")).strip(),
            "mix_notes": _ensure_string_list(p.get("mix_notes")),
            "notable_moments": p.get("notable_moments", []),
        }
        if keep_raw_outputs:
            timeline_entry["raw_response"] = seg.raw_response
        timeline.append(timeline_entry)

    genres = _collect_unique(all_genres, limit=10)
    instruments = _collect_unique(all_instruments, limit=16)
    effects = _collect_unique(all_effects, limit=16)
    vocal_traits = _collect_unique(all_vocals, limit=12)
    mix_notes = _collect_unique(all_mix_notes, limit=24)
    keyscale = _pick_common_key(keys)
    # Simple mean of per-segment BPM guesses, rounded to an int.
    bpm = int(round(sum(bpm_values) / len(bpm_values))) if bpm_values else None
    caption = _derive_caption(genres=genres, instruments=instruments, vocals=vocal_traits)

    # "4/4" and "unknown" are fixed placeholders — nothing in the analysis
    # currently estimates time signature or vocal language.
    sidecar: Dict[str, Any] = {
        "caption": caption,
        "lyrics": "",
        "bpm": bpm,
        "keyscale": keyscale,
        "timesignature": "4/4",
        "vocal_language": "unknown",
        "duration": round(duration_sec, 3),
        "annotation_version": "qwen2_audio_music_v1",
        "source_audio": str(audio_path),
        "analysis_prompt": prompt,
        "analysis_backend": captioner.backend_name,
        "analysis_model": captioner.model_id,
        "analysis_generated_at": datetime.now(timezone.utc).isoformat(),
        "music_analysis": {
            "genres": genres,
            "instruments": instruments,
            "effects": effects,
            "vocal_characteristics": vocal_traits,
            "mix_notes": mix_notes,
            "interaction_summary": interaction_summary,
            "timeline": timeline,
            "segment_seconds": segment_seconds,
            "overlap_seconds": overlap_seconds,
            "segment_count": len(timeline),
        },
    }

    # Optional whole-track pass; best-effort, errors are recorded not raised.
    if include_long_analysis:
        long_prompt = (long_analysis_prompt or "").strip() or DEFAULT_LONG_ANALYSIS_PROMPT
        try:
            long_raw = captioner.generate(
                audio=audio,
                sample_rate=sr,
                prompt=long_prompt,
                max_new_tokens=int(long_analysis_max_new_tokens),
                temperature=float(long_analysis_temperature),
            )
            long_text = _clean_model_text(long_raw)
            sidecar["analysis_long_prompt"] = long_prompt
            sidecar["analysis_long"] = long_text
            sidecar["music_analysis"]["summary_long"] = long_text
        except Exception as exc:
            sidecar["analysis_long_prompt"] = long_prompt
            sidecar["analysis_long"] = ""
            sidecar["analysis_long_error"] = str(exc)

    return sidecar
873
+
874
+
875
def build_captioner(
    backend: str,
    model_id: str = DEFAULT_MODEL_ID,
    endpoint_url: str = "",
    token: str = "",
    device: str = "auto",
    torch_dtype: str = "auto",
) -> BaseCaptioner:
    """Instantiate the captioner matching *backend*.

    Accepted aliases: "local"/"hf_space_local" for the in-process model and
    "endpoint"/"hf_endpoint" for a remote Inference Endpoint.

    Raises:
        ValueError: when *backend* is not a recognized alias.
    """
    backend = (backend or "").strip().lower()
    resolved_model = model_id or DEFAULT_MODEL_ID
    if backend in ("local", "hf_space_local"):
        return LocalQwen2AudioCaptioner(
            model_id=resolved_model,
            device=device,
            torch_dtype=torch_dtype,
        )
    if backend in ("endpoint", "hf_endpoint"):
        # device/torch_dtype are local-only knobs and intentionally unused here.
        return HFEndpointCaptioner(
            endpoint_url=endpoint_url,
            token=token,
            model_id=resolved_model,
        )
    raise ValueError(f"Unsupported backend: {backend}")
897
+
898
+
899
def export_annotation_records(
    records: List[Dict[str, Any]],
    output_dir: str,
    copy_audio: bool = True,
    write_inplace_sidecars: bool = True,
) -> Dict[str, Any]:
    """
    Export analyzed tracks as LoRA-ready sidecars + manifest.

    records item schema:
        {
            "audio_path": "...",
            "sidecar": {...annotation json...}
        }

    Args:
        output_dir: export root; created if missing. A "dataset/" subfolder
            receives audio copies when copy_audio is True.
        copy_audio: copy each source audio next to its exported sidecar.
        write_inplace_sidecars: also write the sidecar next to the source file.

    Returns:
        Dict with written_count, manifest_path, index_path and dataset_root
        ('' when audio copying is disabled).
    """
    out_root = Path(output_dir)
    out_root.mkdir(parents=True, exist_ok=True)
    dataset_root = out_root / "dataset"
    if copy_audio:
        dataset_root.mkdir(parents=True, exist_ok=True)

    manifest_path = out_root / "annotations_manifest.jsonl"
    index_path = out_root / "annotations_index.json"

    manifest_lines: List[str] = []
    index_items: List[Dict[str, Any]] = []
    written_count = 0

    for rec in records:
        src_audio = Path(rec["audio_path"])
        sidecar = dict(rec["sidecar"])
        # Missing source files are silently skipped (they simply don't appear
        # in the manifest/index).
        if not src_audio.exists():
            continue

        if copy_audio:
            dst_audio = dataset_root / src_audio.name
            # Guard against copying a file onto itself when the source already
            # lives inside the dataset folder.
            if src_audio.resolve() != dst_audio.resolve():
                shutil.copy2(src_audio, dst_audio)
            dst_sidecar = dst_audio.with_suffix(".json")
        else:
            dst_sidecar = (out_root / src_audio.name).with_suffix(".json")

        dst_sidecar.write_text(json.dumps(sidecar, indent=2, ensure_ascii=False), encoding="utf-8")
        written_count += 1

        if write_inplace_sidecars:
            inplace_sidecar = src_audio.with_suffix(".json")
            inplace_sidecar.write_text(
                json.dumps(sidecar, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )

        # When audio was copied, reconstruct the copied audio's path from the
        # sidecar path by restoring the original suffix; otherwise point at
        # the source file.
        manifest_row = {
            "audio_path": str(dst_sidecar.with_suffix(src_audio.suffix).as_posix()) if copy_audio else str(src_audio),
            "sidecar_path": str(dst_sidecar),
            "caption": sidecar.get("caption", ""),
            "duration": sidecar.get("duration"),
            "bpm": sidecar.get("bpm"),
            "keyscale": sidecar.get("keyscale", ""),
        }
        manifest_lines.append(json.dumps(manifest_row, ensure_ascii=False))
        index_items.append(
            {
                "source_audio": str(src_audio),
                "exported_sidecar": str(dst_sidecar),
                "caption": sidecar.get("caption", ""),
            }
        )

    # NOTE(review): the manifest is written without a trailing newline — some
    # JSONL consumers expect one; confirm downstream readers tolerate this.
    manifest_path.write_text("\n".join(manifest_lines), encoding="utf-8")
    index_path.write_text(
        json.dumps(
            {
                "generated_at": datetime.now(timezone.utc).isoformat(),
                "records": index_items,
            },
            indent=2,
            ensure_ascii=False,
        ),
        encoding="utf-8",
    )

    return {
        "written_count": written_count,
        "manifest_path": str(manifest_path),
        "index_path": str(index_path),
        "dataset_root": str(dataset_root) if copy_audio else "",
    }
987
+
988
+
989
def read_prompt_file(prompt_file: str) -> str:
    """Read and return a non-empty, stripped prompt from *prompt_file*.

    Raises:
        FileNotFoundError: when the path does not point at an existing file.
        ValueError: when the file contains only whitespace.
    """
    prompt_path = Path(prompt_file)
    if not prompt_path.is_file():
        raise FileNotFoundError(f"Prompt file not found: {prompt_file}")
    content = prompt_path.read_text(encoding="utf-8").strip()
    if not content:
        raise ValueError(f"Prompt file is empty: {prompt_file}")
    return content
qwen_caption_app.py ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ import gradio as gr
7
+ import torchaudio
8
+
9
+ # On Hugging Face Spaces Zero, `spaces` must be imported before CUDA-related modules.
10
+ if os.getenv("SPACE_ID"):
11
+ try:
12
+ import spaces # noqa: F401
13
+ except Exception:
14
+ pass
15
+
16
+ from qwen_audio_captioning import (
17
+ DEFAULT_ANALYSIS_PROMPT,
18
+ DEFAULT_MODEL_ID,
19
+ build_captioner,
20
+ export_annotation_records,
21
+ generate_track_annotation,
22
+ list_audio_files,
23
+ )
24
+
25
+
26
# True when running inside a Hugging Face Space (the platform sets SPACE_ID).
IS_SPACE = bool(os.getenv("SPACE_ID"))
# Spaces expose a writable /data volume; fall back to a relative dir locally.
DEFAULT_EXPORT_DIR = "/data/qwen_annotations" if IS_SPACE else "qwen_annotations"

# Single-slot cache holding the most recently built captioner and its config
# key (see _get_captioner) so changing any setting rebuilds the backend.
_captioner_cache: Dict[str, Any] = {"key": None, "obj": None}
30
+
31
+
32
+ def _audio_duration_sec(path: str) -> Optional[float]:
33
+ try:
34
+ info = torchaudio.info(path)
35
+ if info.sample_rate <= 0:
36
+ return None
37
+ return float(info.num_frames) / float(info.sample_rate)
38
+ except Exception:
39
+ return None
40
+
41
+
42
+ def _dedupe_paths(paths: List[str]) -> List[str]:
43
+ seen = set()
44
+ out: List[str] = []
45
+ for p in paths:
46
+ if not isinstance(p, str):
47
+ continue
48
+ pp = p.strip()
49
+ if not pp:
50
+ continue
51
+ key = str(Path(pp).resolve()) if Path(pp).exists() else pp
52
+ if key in seen:
53
+ continue
54
+ seen.add(key)
55
+ out.append(pp)
56
+ return out
57
+
58
+
59
+ def _files_table(paths: List[str]) -> List[List[str]]:
60
+ rows: List[List[str]] = []
61
+ for p in paths:
62
+ duration = _audio_duration_sec(p)
63
+ rows.append([
64
+ Path(p).name,
65
+ f"{duration:.2f}" if duration is not None else "?",
66
+ p,
67
+ ])
68
+ return rows
69
+
70
+
71
+ def _records_table(records: List[Dict[str, Any]]) -> List[List[str]]:
72
+ rows: List[List[str]] = []
73
+ for rec in records:
74
+ sidecar = rec.get("sidecar", {})
75
+ analysis = sidecar.get("music_analysis", {})
76
+ rows.append([
77
+ Path(rec.get("audio_path", "")).name,
78
+ f"{sidecar.get('duration', '?')}",
79
+ str(analysis.get("segment_count", "?")),
80
+ str(sidecar.get("bpm", "")),
81
+ str(sidecar.get("keyscale", "")),
82
+ str(sidecar.get("caption", ""))[:160],
83
+ str(rec.get("status", "ok")),
84
+ ])
85
+ return rows
86
+
87
+
88
def _get_captioner(
    backend: str,
    model_id: str,
    endpoint_url: str,
    token: str,
    device: str,
    dtype: str,
):
    """Return a cached captioner, rebuilding when any relevant setting changed."""
    # The token only matters for remote endpoints, so it is excluded from the
    # cache key for local backends (avoids needless model reloads).
    token_key = token if backend == "hf_endpoint" else ""
    cache_key = (backend, model_id, endpoint_url, device, dtype, token_key)
    if _captioner_cache["obj"] is not None and _captioner_cache["key"] == cache_key:
        return _captioner_cache["obj"]

    captioner = build_captioner(
        backend=backend,
        model_id=model_id,
        endpoint_url=endpoint_url,
        token=token,
        device=device,
        torch_dtype=dtype,
    )
    _captioner_cache["obj"] = captioner
    _captioner_cache["key"] = cache_key
    return captioner
111
+
112
+
113
def scan_folder(folder_path: str, current_paths: List[str]):
    """Merge audio files found under *folder_path* into the loaded list."""
    current_paths = current_paths or []
    if not folder_path or not Path(folder_path).is_dir():
        return "Provide a valid folder path.", current_paths, _files_table(current_paths)
    merged = _dedupe_paths(current_paths + list_audio_files(folder_path))
    return f"Loaded {len(merged)} audio files.", merged, _files_table(merged)
119
+
120
+
121
def add_uploaded(uploaded_paths: List[str], current_paths: List[str]):
    """Merge uploaded file paths into the loaded list, de-duplicated."""
    merged = _dedupe_paths((current_paths or []) + (uploaded_paths or []))
    if not merged:
        return "Upload one or more audio files first.", merged, _files_table(merged)
    return f"Loaded {len(merged)} audio files.", merged, _files_table(merged)
128
+
129
+
130
def clear_files():
    """Reset the loaded-file UI: status message, empty path state, empty table."""
    message = "Cleared file list."
    return message, [], []
132
+
133
+
134
def load_existing_sidecars(audio_paths: List[str], records: List[Dict[str, Any]]):
    """Load sidecar JSON files sitting next to each audio path into records."""
    audio_paths = audio_paths or []
    by_path = {r.get("audio_path"): r for r in (records or [])}
    loaded = 0
    for audio_path in audio_paths:
        sidecar_path = Path(audio_path).with_suffix(".json")
        if not sidecar_path.exists():
            continue
        try:
            sidecar = json.loads(sidecar_path.read_text(encoding="utf-8"))
        except Exception:
            # Unparseable sidecars are skipped rather than aborting the load.
            continue
        by_path[audio_path] = {
            "audio_path": audio_path,
            "sidecar": sidecar,
            "status": "loaded-existing",
        }
        loaded += 1

    merged_records = list(by_path.values())
    choices = [r.get("audio_path", "") for r in merged_records]
    return (
        f"Loaded {loaded} existing sidecar(s). Total editable records: {len(merged_records)}.",
        merged_records,
        _records_table(merged_records),
        gr.update(choices=choices, value=choices[0] if choices else None),
    )
162
+
163
+
164
def run_analysis(
    audio_paths: List[str],
    backend: str,
    model_id: str,
    endpoint_url: str,
    token: str,
    device: str,
    dtype: str,
    prompt: str,
    segment_seconds: float,
    overlap_seconds: float,
    max_new_tokens: int,
    temperature: float,
    keep_raw_outputs: bool,
    existing_records: List[Dict[str, Any]],
):
    """Gradio handler: annotate every loaded file and persist sidecars.

    Runs generate_track_annotation per file, writes the sidecar JSON next to
    the source audio immediately, and merges results into the record state.
    Per-file failures are collected (not raised) and reported in the status
    message.

    Returns:
        (status message, merged records, table rows, dropdown update) — the
        outputs wired to this handler in build_ui.
    """
    audio_paths = audio_paths or []
    existing_records = existing_records or []
    if not audio_paths:
        return (
            "No audio files loaded.",
            existing_records,
            _records_table(existing_records),
            gr.update(choices=[], value=None),
        )
    prompt = (prompt or "").strip() or DEFAULT_ANALYSIS_PROMPT

    # Cached per config tuple, so repeated runs reuse the loaded model.
    captioner = _get_captioner(
        backend=backend,
        model_id=model_id or DEFAULT_MODEL_ID,
        endpoint_url=endpoint_url,
        token=token,
        device=device,
        dtype=dtype,
    )

    existing_by_path = {r.get("audio_path"): r for r in existing_records}
    failures: List[str] = []

    for audio_path in audio_paths:
        try:
            sidecar = generate_track_annotation(
                audio_path=audio_path,
                captioner=captioner,
                prompt=prompt,
                segment_seconds=float(segment_seconds),
                overlap_seconds=float(overlap_seconds),
                max_new_tokens=int(max_new_tokens),
                temperature=float(temperature),
                keep_raw_outputs=bool(keep_raw_outputs),
            )
            # Persist immediately so dataset folder stays LoRA-ready.
            Path(audio_path).with_suffix(".json").write_text(
                json.dumps(sidecar, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )
            existing_by_path[audio_path] = {
                "audio_path": audio_path,
                "sidecar": sidecar,
                "status": "analyzed+saved",
            }
        except Exception as exc:
            # Keep any previously loaded record but flag it as failed.
            failures.append(f"{Path(audio_path).name}: {exc}")
            fallback = existing_by_path.get(audio_path, {"audio_path": audio_path, "sidecar": {}})
            fallback["status"] = f"failed: {exc}"
            existing_by_path[audio_path] = fallback

    merged_records = list(existing_by_path.values())
    choices = [r.get("audio_path", "") for r in merged_records]
    message = (
        f"Analyzed {len(audio_paths)} file(s). "
        f"Failures: {len(failures)}."
    )
    # Show at most the first 12 failure details to keep the status box short.
    if failures:
        message += "\n" + "\n".join(failures[:12])
    return (
        message,
        merged_records,
        _records_table(merged_records),
        gr.update(choices=choices, value=choices[0] if choices else None),
    )
245
+
246
+
247
def load_record_json(selected_audio_path: str, records: List[Dict[str, Any]]):
    """Return the selected record's sidecar JSON plus key fields for editing.

    Yields an empty 7-tuple when nothing is selected or no record matches.
    """
    empty = ("{}", "", "", "", "", "", "")
    if not selected_audio_path:
        return empty
    for record in records or []:
        if record.get("audio_path") != selected_audio_path:
            continue
        sidecar = record.get("sidecar", {})
        return (
            json.dumps(sidecar, indent=2, ensure_ascii=False),
            str(sidecar.get("caption", "")),
            str(sidecar.get("lyrics", "")),
            str(sidecar.get("bpm", "")),
            str(sidecar.get("keyscale", "")),
            str(sidecar.get("vocal_language", "")),
            str(sidecar.get("duration", "")),
        )
    return empty
264
+
265
+
266
def save_record_json(
    selected_audio_path: str,
    edited_json: str,
    records: List[Dict[str, Any]],
):
    """Validate edited JSON, update the matching record, and persist the sidecar."""
    records = records or []
    if not selected_audio_path:
        return "Select a track first.", records, _records_table(records)

    try:
        payload = json.loads(edited_json)
    except Exception as exc:
        return f"Invalid JSON: {exc}", records, _records_table(records)
    if not isinstance(payload, dict):
        return "Edited payload must be a JSON object.", records, _records_table(records)

    # Update the existing record in place, or append a new one.
    target = None
    for record in records:
        if record.get("audio_path") == selected_audio_path:
            target = record
            break
    if target is None:
        target = {"audio_path": selected_audio_path}
        records.append(target)
    target["sidecar"] = payload
    target["status"] = "edited+saved"

    # Persist edits next to source audio for LoRA-ready folder layout.
    Path(selected_audio_path).with_suffix(".json").write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    return "Saved edits and wrote sidecar next to source audio.", records, _records_table(records)
297
+
298
+
299
def export_records(
    records: List[Dict[str, Any]],
    output_dir: str,
    copy_audio: bool,
    write_inplace_sidecars: bool,
):
    """Export every valid record via export_annotation_records and report paths."""
    valid = [
        {"audio_path": rec["audio_path"], "sidecar": rec["sidecar"]}
        for rec in (records or [])
        if rec.get("audio_path") and isinstance(rec.get("sidecar"), dict)
    ]
    if not valid:
        return "No valid analyzed/edited records to export."

    destination = (output_dir or "").strip() or DEFAULT_EXPORT_DIR
    result = export_annotation_records(
        records=valid,
        output_dir=destination,
        copy_audio=bool(copy_audio),
        write_inplace_sidecars=bool(write_inplace_sidecars),
    )
    return (
        f"Exported {result['written_count']} sidecar(s).\n"
        f"Manifest: {result['manifest_path']}\n"
        f"Index: {result['index_path']}\n"
        f"Dataset root: {result['dataset_root'] or '(audio copy disabled)'}"
    )
327
+
328
+
329
def build_ui():
    """Build and return the Gradio Blocks app for Qwen2-Audio music captioning.

    The UI is organized as three tabs that mirror the workflow:
      1) Load Audio      — scan a folder and/or upload files into shared state.
      2) Run Captioning  — run Qwen analysis (local or HF endpoint) per track.
      3) Annotate/Export — edit per-track JSON sidecars and export a
                           LoRA-ready dataset.

    Cross-tab data flows through two `gr.State` holders:
      * audio_paths_state — list of loaded audio file paths.
      * records_state     — list of annotation records (audio_path + sidecar).

    Returns:
        The configured `gr.Blocks` application (queued with concurrency 1).
    """
    with gr.Blocks(title="Qwen2-Audio Music Captioning", theme=gr.themes.Soft()) as app:
        gr.Markdown(
            "# Qwen2-Audio Music Captioning + Annotation Export\n"
            "Upload songs, run structured timestamped music analysis, optionally edit annotations, "
            "then export ACE-Step LoRA sidecars."
        )

        # Shared state across tabs: loaded audio paths and analysis records.
        audio_paths_state = gr.State([])
        records_state = gr.State([])

        with gr.Tab("1) Load Audio"):
            with gr.Row():
                folder_input = gr.Textbox(label="Dataset Folder", placeholder="e.g. ./dataset_inbox")
                scan_btn = gr.Button("Scan Folder")
            with gr.Row():
                upload_files = gr.Files(
                    label="Upload Audio Files",
                    file_count="multiple",
                    file_types=["audio"],
                    type="filepath",
                )
                add_upload_btn = gr.Button("Add Uploaded Files")
                clear_btn = gr.Button("Clear")
            files_status = gr.Textbox(label="Load Status", interactive=False)
            files_table = gr.Dataframe(
                headers=["File", "Duration(s)", "Path"],
                datatype=["str", "str", "str"],
                label="Loaded Audio",
                interactive=False,
            )

            # Folder scan and uploads both merge into audio_paths_state and
            # refresh the same status box + table.
            scan_btn.click(
                scan_folder,
                [folder_input, audio_paths_state],
                [files_status, audio_paths_state, files_table],
            )
            add_upload_btn.click(
                add_uploaded,
                [upload_files, audio_paths_state],
                [files_status, audio_paths_state, files_table],
            )
            clear_btn.click(
                clear_files,
                outputs=[files_status, audio_paths_state, files_table],
            )

        with gr.Tab("2) Run Qwen Captioning"):
            # Backend selection: run the model locally or call a deployed
            # HF Inference Endpoint (endpoint_url + hf_token used then).
            with gr.Row():
                backend_dd = gr.Dropdown(
                    choices=["local", "hf_endpoint"],
                    value="local",
                    label="Backend",
                )
                model_id = gr.Textbox(label="Model ID", value=DEFAULT_MODEL_ID)
                endpoint_url = gr.Textbox(label="HF Endpoint URL (for hf_endpoint backend)", value="")
            with gr.Row():
                hf_token = gr.Textbox(label="HF Token (optional)", type="password", value="")
                device_dd = gr.Dropdown(
                    choices=["auto", "cuda", "cpu", "mps"],
                    value="auto",
                    label="Local Device",
                )
                dtype_dd = gr.Dropdown(
                    choices=["auto", "float16", "bfloat16", "float32"],
                    value="auto",
                    label="Torch DType",
                )
            prompt_box = gr.Textbox(
                label="Analysis Prompt",
                lines=6,
                value=DEFAULT_ANALYSIS_PROMPT,
            )
            # Chunking + generation controls for the segment-wise analysis.
            with gr.Row():
                segment_seconds = gr.Slider(10, 120, value=30, step=1, label="Segment Seconds")
                overlap_seconds = gr.Slider(0, 20, value=2, step=1, label="Overlap Seconds")
                max_new_tokens = gr.Slider(64, 2048, value=384, step=32, label="Max New Tokens")
            with gr.Row():
                temperature = gr.Slider(0.0, 1.2, value=0.1, step=0.05, label="Temperature")
                keep_raw = gr.Checkbox(value=True, label="Keep Raw Segment Responses In JSON")
            analyze_btn = gr.Button("Run Captioning", variant="primary")
            with gr.Row():
                load_existing_btn = gr.Button("Load Existing Sidecars")
            analysis_status = gr.Textbox(label="Analysis Status", lines=5, interactive=False)
            gr.Markdown("Sidecars are auto-saved next to each source audio file during analysis.")
            records_table = gr.Dataframe(
                headers=["File", "Duration", "Segments", "BPM", "Key", "Caption", "Status"],
                datatype=["str", "str", "str", "str", "str", "str", "str"],
                interactive=False,
                label="Annotation Records",
            )
            # Populated by run_analysis / load_existing_sidecars; drives the
            # per-track JSON editor in tab 3.
            track_selector = gr.Dropdown(choices=[], label="Select Track For Editing")

            analyze_btn.click(
                run_analysis,
                [
                    audio_paths_state,
                    backend_dd,
                    model_id,
                    endpoint_url,
                    hf_token,
                    device_dd,
                    dtype_dd,
                    prompt_box,
                    segment_seconds,
                    overlap_seconds,
                    max_new_tokens,
                    temperature,
                    keep_raw,
                    records_state,
                ],
                [analysis_status, records_state, records_table, track_selector],
            )
            # Re-hydrate records from sidecar files written in earlier runs.
            load_existing_btn.click(
                load_existing_sidecars,
                [audio_paths_state, records_state],
                [analysis_status, records_state, records_table, track_selector],
            )

        with gr.Tab("3) Human Annotation + Export"):
            with gr.Row():
                load_record_btn = gr.Button("Load Selected JSON")
                save_record_btn = gr.Button("Save JSON Edits")
            json_editor = gr.Textbox(label="Editable Annotation JSON", lines=24)
            # Read-only previews of the key sidecar fields for the loaded track.
            with gr.Row():
                caption_preview = gr.Textbox(label="Caption", interactive=False)
                bpm_preview = gr.Textbox(label="BPM", interactive=False)
                key_preview = gr.Textbox(label="Key/Scale", interactive=False)
            with gr.Row():
                lang_preview = gr.Textbox(label="Vocal Language", interactive=False)
                duration_preview = gr.Textbox(label="Duration", interactive=False)
                lyrics_preview = gr.Textbox(label="Lyrics", interactive=False)
            edit_status = gr.Textbox(label="Edit Status", interactive=False)
            gr.Markdown("Saving JSON edits also writes the sidecar next to the source audio file.")

            load_record_btn.click(
                load_record_json,
                [track_selector, records_state],
                [
                    json_editor,
                    caption_preview,
                    lyrics_preview,
                    bpm_preview,
                    key_preview,
                    lang_preview,
                    duration_preview,
                ],
            )
            save_record_btn.click(
                save_record_json,
                [track_selector, json_editor, records_state],
                [edit_status, records_state, records_table],
            )

            gr.Markdown("### Export LoRA-Ready Dataset")
            with gr.Row():
                export_dir = gr.Textbox(label="Export Directory", value=DEFAULT_EXPORT_DIR)
                copy_audio_cb = gr.Checkbox(value=True, label="Copy Audio Into Export Dataset")
                inplace_cb = gr.Checkbox(value=True, label="Also Write Sidecars Next To Source Audio")
            export_btn = gr.Button("Export", variant="primary")
            export_status = gr.Textbox(label="Export Status", lines=5, interactive=False)

            export_btn.click(
                export_records,
                [records_state, export_dir, copy_audio_cb, inplace_cb],
                export_status,
            )

    # Serialize jobs: model inference is heavyweight, so only one request at a time.
    app.queue(default_concurrency_limit=1)
    return app
499
+
500
+
501
# Module-level app instance so hosting runtimes (e.g. HF Spaces) can import it.
app = build_ui()


def _launch() -> None:
    """Start the Gradio server, honoring the PORT environment variable."""
    server_port = int(os.environ.get("PORT", "7860"))
    app.launch(server_name="0.0.0.0", server_port=server_port, share=False)


if __name__ == "__main__":
    _launch()
react-ui/index.html ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!doctype html>
<!-- Vite entry page for the AF3 + ChatGPT pipeline React UI.
     The React app is mounted into #root by src/main.jsx. -->
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>AF3 + ChatGPT Pipeline</title>
  </head>
  <body>
    <!-- React mount point -->
    <div id="root"></div>
    <!-- Vite resolves this module and injects the bundled script in production builds. -->
    <script type="module" src="/src/main.jsx"></script>
  </body>
</html>
react-ui/package-lock.json ADDED
@@ -0,0 +1,1674 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "af3-chatgpt-pipeline-ui",
3
+ "version": "0.1.0",
4
+ "lockfileVersion": 3,
5
+ "requires": true,
6
+ "packages": {
7
+ "": {
8
+ "name": "af3-chatgpt-pipeline-ui",
9
+ "version": "0.1.0",
10
+ "dependencies": {
11
+ "react": "^18.3.1",
12
+ "react-dom": "^18.3.1"
13
+ },
14
+ "devDependencies": {
15
+ "@vitejs/plugin-react": "^4.3.4",
16
+ "vite": "^5.4.11"
17
+ }
18
+ },
19
+ "node_modules/@babel/code-frame": {
20
+ "version": "7.29.0",
21
+ "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.29.0.tgz",
22
+ "integrity": "sha512-9NhCeYjq9+3uxgdtp20LSiJXJvN0FeCtNGpJxuMFZ1Kv3cWUNb6DOhJwUvcVCzKGR66cw4njwM6hrJLqgOwbcw==",
23
+ "dev": true,
24
+ "license": "MIT",
25
+ "dependencies": {
26
+ "@babel/helper-validator-identifier": "^7.28.5",
27
+ "js-tokens": "^4.0.0",
28
+ "picocolors": "^1.1.1"
29
+ },
30
+ "engines": {
31
+ "node": ">=6.9.0"
32
+ }
33
+ },
34
+ "node_modules/@babel/compat-data": {
35
+ "version": "7.29.0",
36
+ "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.29.0.tgz",
37
+ "integrity": "sha512-T1NCJqT/j9+cn8fvkt7jtwbLBfLC/1y1c7NtCeXFRgzGTsafi68MRv8yzkYSapBnFA6L3U2VSc02ciDzoAJhJg==",
38
+ "dev": true,
39
+ "license": "MIT",
40
+ "engines": {
41
+ "node": ">=6.9.0"
42
+ }
43
+ },
44
+ "node_modules/@babel/core": {
45
+ "version": "7.29.0",
46
+ "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.29.0.tgz",
47
+ "integrity": "sha512-CGOfOJqWjg2qW/Mb6zNsDm+u5vFQ8DxXfbM09z69p5Z6+mE1ikP2jUXw+j42Pf1XTYED2Rni5f95npYeuwMDQA==",
48
+ "dev": true,
49
+ "license": "MIT",
50
+ "dependencies": {
51
+ "@babel/code-frame": "^7.29.0",
52
+ "@babel/generator": "^7.29.0",
53
+ "@babel/helper-compilation-targets": "^7.28.6",
54
+ "@babel/helper-module-transforms": "^7.28.6",
55
+ "@babel/helpers": "^7.28.6",
56
+ "@babel/parser": "^7.29.0",
57
+ "@babel/template": "^7.28.6",
58
+ "@babel/traverse": "^7.29.0",
59
+ "@babel/types": "^7.29.0",
60
+ "@jridgewell/remapping": "^2.3.5",
61
+ "convert-source-map": "^2.0.0",
62
+ "debug": "^4.1.0",
63
+ "gensync": "^1.0.0-beta.2",
64
+ "json5": "^2.2.3",
65
+ "semver": "^6.3.1"
66
+ },
67
+ "engines": {
68
+ "node": ">=6.9.0"
69
+ },
70
+ "funding": {
71
+ "type": "opencollective",
72
+ "url": "https://opencollective.com/babel"
73
+ }
74
+ },
75
+ "node_modules/@babel/generator": {
76
+ "version": "7.29.1",
77
+ "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.29.1.tgz",
78
+ "integrity": "sha512-qsaF+9Qcm2Qv8SRIMMscAvG4O3lJ0F1GuMo5HR/Bp02LopNgnZBC/EkbevHFeGs4ls/oPz9v+Bsmzbkbe+0dUw==",
79
+ "dev": true,
80
+ "license": "MIT",
81
+ "dependencies": {
82
+ "@babel/parser": "^7.29.0",
83
+ "@babel/types": "^7.29.0",
84
+ "@jridgewell/gen-mapping": "^0.3.12",
85
+ "@jridgewell/trace-mapping": "^0.3.28",
86
+ "jsesc": "^3.0.2"
87
+ },
88
+ "engines": {
89
+ "node": ">=6.9.0"
90
+ }
91
+ },
92
+ "node_modules/@babel/helper-compilation-targets": {
93
+ "version": "7.28.6",
94
+ "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.28.6.tgz",
95
+ "integrity": "sha512-JYtls3hqi15fcx5GaSNL7SCTJ2MNmjrkHXg4FSpOA/grxK8KwyZ5bubHsCq8FXCkua6xhuaaBit+3b7+VZRfcA==",
96
+ "dev": true,
97
+ "license": "MIT",
98
+ "dependencies": {
99
+ "@babel/compat-data": "^7.28.6",
100
+ "@babel/helper-validator-option": "^7.27.1",
101
+ "browserslist": "^4.24.0",
102
+ "lru-cache": "^5.1.1",
103
+ "semver": "^6.3.1"
104
+ },
105
+ "engines": {
106
+ "node": ">=6.9.0"
107
+ }
108
+ },
109
+ "node_modules/@babel/helper-globals": {
110
+ "version": "7.28.0",
111
+ "resolved": "https://registry.npmjs.org/@babel/helper-globals/-/helper-globals-7.28.0.tgz",
112
+ "integrity": "sha512-+W6cISkXFa1jXsDEdYA8HeevQT/FULhxzR99pxphltZcVaugps53THCeiWA8SguxxpSp3gKPiuYfSWopkLQ4hw==",
113
+ "dev": true,
114
+ "license": "MIT",
115
+ "engines": {
116
+ "node": ">=6.9.0"
117
+ }
118
+ },
119
+ "node_modules/@babel/helper-module-imports": {
120
+ "version": "7.28.6",
121
+ "resolved": "https://registry.npmjs.org/@babel/helper-module-imports/-/helper-module-imports-7.28.6.tgz",
122
+ "integrity": "sha512-l5XkZK7r7wa9LucGw9LwZyyCUscb4x37JWTPz7swwFE/0FMQAGpiWUZn8u9DzkSBWEcK25jmvubfpw2dnAMdbw==",
123
+ "dev": true,
124
+ "license": "MIT",
125
+ "dependencies": {
126
+ "@babel/traverse": "^7.28.6",
127
+ "@babel/types": "^7.28.6"
128
+ },
129
+ "engines": {
130
+ "node": ">=6.9.0"
131
+ }
132
+ },
133
+ "node_modules/@babel/helper-module-transforms": {
134
+ "version": "7.28.6",
135
+ "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.28.6.tgz",
136
+ "integrity": "sha512-67oXFAYr2cDLDVGLXTEABjdBJZ6drElUSI7WKp70NrpyISso3plG9SAGEF6y7zbha/wOzUByWWTJvEDVNIUGcA==",
137
+ "dev": true,
138
+ "license": "MIT",
139
+ "dependencies": {
140
+ "@babel/helper-module-imports": "^7.28.6",
141
+ "@babel/helper-validator-identifier": "^7.28.5",
142
+ "@babel/traverse": "^7.28.6"
143
+ },
144
+ "engines": {
145
+ "node": ">=6.9.0"
146
+ },
147
+ "peerDependencies": {
148
+ "@babel/core": "^7.0.0"
149
+ }
150
+ },
151
+ "node_modules/@babel/helper-plugin-utils": {
152
+ "version": "7.28.6",
153
+ "resolved": "https://registry.npmjs.org/@babel/helper-plugin-utils/-/helper-plugin-utils-7.28.6.tgz",
154
+ "integrity": "sha512-S9gzZ/bz83GRysI7gAD4wPT/AI3uCnY+9xn+Mx/KPs2JwHJIz1W8PZkg2cqyt3RNOBM8ejcXhV6y8Og7ly/Dug==",
155
+ "dev": true,
156
+ "license": "MIT",
157
+ "engines": {
158
+ "node": ">=6.9.0"
159
+ }
160
+ },
161
+ "node_modules/@babel/helper-string-parser": {
162
+ "version": "7.27.1",
163
+ "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.27.1.tgz",
164
+ "integrity": "sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==",
165
+ "dev": true,
166
+ "license": "MIT",
167
+ "engines": {
168
+ "node": ">=6.9.0"
169
+ }
170
+ },
171
+ "node_modules/@babel/helper-validator-identifier": {
172
+ "version": "7.28.5",
173
+ "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.28.5.tgz",
174
+ "integrity": "sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==",
175
+ "dev": true,
176
+ "license": "MIT",
177
+ "engines": {
178
+ "node": ">=6.9.0"
179
+ }
180
+ },
181
+ "node_modules/@babel/helper-validator-option": {
182
+ "version": "7.27.1",
183
+ "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.27.1.tgz",
184
+ "integrity": "sha512-YvjJow9FxbhFFKDSuFnVCe2WxXk1zWc22fFePVNEaWJEu8IrZVlda6N0uHwzZrUM1il7NC9Mlp4MaJYbYd9JSg==",
185
+ "dev": true,
186
+ "license": "MIT",
187
+ "engines": {
188
+ "node": ">=6.9.0"
189
+ }
190
+ },
191
+ "node_modules/@babel/helpers": {
192
+ "version": "7.28.6",
193
+ "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.28.6.tgz",
194
+ "integrity": "sha512-xOBvwq86HHdB7WUDTfKfT/Vuxh7gElQ+Sfti2Cy6yIWNW05P8iUslOVcZ4/sKbE+/jQaukQAdz/gf3724kYdqw==",
195
+ "dev": true,
196
+ "license": "MIT",
197
+ "dependencies": {
198
+ "@babel/template": "^7.28.6",
199
+ "@babel/types": "^7.28.6"
200
+ },
201
+ "engines": {
202
+ "node": ">=6.9.0"
203
+ }
204
+ },
205
+ "node_modules/@babel/parser": {
206
+ "version": "7.29.0",
207
+ "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.29.0.tgz",
208
+ "integrity": "sha512-IyDgFV5GeDUVX4YdF/3CPULtVGSXXMLh1xVIgdCgxApktqnQV0r7/8Nqthg+8YLGaAtdyIlo2qIdZrbCv4+7ww==",
209
+ "dev": true,
210
+ "license": "MIT",
211
+ "dependencies": {
212
+ "@babel/types": "^7.29.0"
213
+ },
214
+ "bin": {
215
+ "parser": "bin/babel-parser.js"
216
+ },
217
+ "engines": {
218
+ "node": ">=6.0.0"
219
+ }
220
+ },
221
+ "node_modules/@babel/plugin-transform-react-jsx-self": {
222
+ "version": "7.27.1",
223
+ "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-self/-/plugin-transform-react-jsx-self-7.27.1.tgz",
224
+ "integrity": "sha512-6UzkCs+ejGdZ5mFFC/OCUrv028ab2fp1znZmCZjAOBKiBK2jXD1O+BPSfX8X2qjJ75fZBMSnQn3Rq2mrBJK2mw==",
225
+ "dev": true,
226
+ "license": "MIT",
227
+ "dependencies": {
228
+ "@babel/helper-plugin-utils": "^7.27.1"
229
+ },
230
+ "engines": {
231
+ "node": ">=6.9.0"
232
+ },
233
+ "peerDependencies": {
234
+ "@babel/core": "^7.0.0-0"
235
+ }
236
+ },
237
+ "node_modules/@babel/plugin-transform-react-jsx-source": {
238
+ "version": "7.27.1",
239
+ "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-source/-/plugin-transform-react-jsx-source-7.27.1.tgz",
240
+ "integrity": "sha512-zbwoTsBruTeKB9hSq73ha66iFeJHuaFkUbwvqElnygoNbj/jHRsSeokowZFN3CZ64IvEqcmmkVe89OPXc7ldAw==",
241
+ "dev": true,
242
+ "license": "MIT",
243
+ "dependencies": {
244
+ "@babel/helper-plugin-utils": "^7.27.1"
245
+ },
246
+ "engines": {
247
+ "node": ">=6.9.0"
248
+ },
249
+ "peerDependencies": {
250
+ "@babel/core": "^7.0.0-0"
251
+ }
252
+ },
253
+ "node_modules/@babel/template": {
254
+ "version": "7.28.6",
255
+ "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.28.6.tgz",
256
+ "integrity": "sha512-YA6Ma2KsCdGb+WC6UpBVFJGXL58MDA6oyONbjyF/+5sBgxY/dwkhLogbMT2GXXyU84/IhRw/2D1Os1B/giz+BQ==",
257
+ "dev": true,
258
+ "license": "MIT",
259
+ "dependencies": {
260
+ "@babel/code-frame": "^7.28.6",
261
+ "@babel/parser": "^7.28.6",
262
+ "@babel/types": "^7.28.6"
263
+ },
264
+ "engines": {
265
+ "node": ">=6.9.0"
266
+ }
267
+ },
268
+ "node_modules/@babel/traverse": {
269
+ "version": "7.29.0",
270
+ "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.29.0.tgz",
271
+ "integrity": "sha512-4HPiQr0X7+waHfyXPZpWPfWL/J7dcN1mx9gL6WdQVMbPnF3+ZhSMs8tCxN7oHddJE9fhNE7+lxdnlyemKfJRuA==",
272
+ "dev": true,
273
+ "license": "MIT",
274
+ "dependencies": {
275
+ "@babel/code-frame": "^7.29.0",
276
+ "@babel/generator": "^7.29.0",
277
+ "@babel/helper-globals": "^7.28.0",
278
+ "@babel/parser": "^7.29.0",
279
+ "@babel/template": "^7.28.6",
280
+ "@babel/types": "^7.29.0",
281
+ "debug": "^4.3.1"
282
+ },
283
+ "engines": {
284
+ "node": ">=6.9.0"
285
+ }
286
+ },
287
+ "node_modules/@babel/types": {
288
+ "version": "7.29.0",
289
+ "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.29.0.tgz",
290
+ "integrity": "sha512-LwdZHpScM4Qz8Xw2iKSzS+cfglZzJGvofQICy7W7v4caru4EaAmyUuO6BGrbyQ2mYV11W0U8j5mBhd14dd3B0A==",
291
+ "dev": true,
292
+ "license": "MIT",
293
+ "dependencies": {
294
+ "@babel/helper-string-parser": "^7.27.1",
295
+ "@babel/helper-validator-identifier": "^7.28.5"
296
+ },
297
+ "engines": {
298
+ "node": ">=6.9.0"
299
+ }
300
+ },
301
+ "node_modules/@esbuild/aix-ppc64": {
302
+ "version": "0.21.5",
303
+ "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz",
304
+ "integrity": "sha512-1SDgH6ZSPTlggy1yI6+Dbkiz8xzpHJEVAlF/AM1tHPLsf5STom9rwtjE4hKAF20FfXXNTFqEYXyJNWh1GiZedQ==",
305
+ "cpu": [
306
+ "ppc64"
307
+ ],
308
+ "dev": true,
309
+ "license": "MIT",
310
+ "optional": true,
311
+ "os": [
312
+ "aix"
313
+ ],
314
+ "engines": {
315
+ "node": ">=12"
316
+ }
317
+ },
318
+ "node_modules/@esbuild/android-arm": {
319
+ "version": "0.21.5",
320
+ "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.21.5.tgz",
321
+ "integrity": "sha512-vCPvzSjpPHEi1siZdlvAlsPxXl7WbOVUBBAowWug4rJHb68Ox8KualB+1ocNvT5fjv6wpkX6o/iEpbDrf68zcg==",
322
+ "cpu": [
323
+ "arm"
324
+ ],
325
+ "dev": true,
326
+ "license": "MIT",
327
+ "optional": true,
328
+ "os": [
329
+ "android"
330
+ ],
331
+ "engines": {
332
+ "node": ">=12"
333
+ }
334
+ },
335
+ "node_modules/@esbuild/android-arm64": {
336
+ "version": "0.21.5",
337
+ "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.21.5.tgz",
338
+ "integrity": "sha512-c0uX9VAUBQ7dTDCjq+wdyGLowMdtR/GoC2U5IYk/7D1H1JYC0qseD7+11iMP2mRLN9RcCMRcjC4YMclCzGwS/A==",
339
+ "cpu": [
340
+ "arm64"
341
+ ],
342
+ "dev": true,
343
+ "license": "MIT",
344
+ "optional": true,
345
+ "os": [
346
+ "android"
347
+ ],
348
+ "engines": {
349
+ "node": ">=12"
350
+ }
351
+ },
352
+ "node_modules/@esbuild/android-x64": {
353
+ "version": "0.21.5",
354
+ "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.21.5.tgz",
355
+ "integrity": "sha512-D7aPRUUNHRBwHxzxRvp856rjUHRFW1SdQATKXH2hqA0kAZb1hKmi02OpYRacl0TxIGz/ZmXWlbZgjwWYaCakTA==",
356
+ "cpu": [
357
+ "x64"
358
+ ],
359
+ "dev": true,
360
+ "license": "MIT",
361
+ "optional": true,
362
+ "os": [
363
+ "android"
364
+ ],
365
+ "engines": {
366
+ "node": ">=12"
367
+ }
368
+ },
369
+ "node_modules/@esbuild/darwin-arm64": {
370
+ "version": "0.21.5",
371
+ "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.21.5.tgz",
372
+ "integrity": "sha512-DwqXqZyuk5AiWWf3UfLiRDJ5EDd49zg6O9wclZ7kUMv2WRFr4HKjXp/5t8JZ11QbQfUS6/cRCKGwYhtNAY88kQ==",
373
+ "cpu": [
374
+ "arm64"
375
+ ],
376
+ "dev": true,
377
+ "license": "MIT",
378
+ "optional": true,
379
+ "os": [
380
+ "darwin"
381
+ ],
382
+ "engines": {
383
+ "node": ">=12"
384
+ }
385
+ },
386
+ "node_modules/@esbuild/darwin-x64": {
387
+ "version": "0.21.5",
388
+ "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.21.5.tgz",
389
+ "integrity": "sha512-se/JjF8NlmKVG4kNIuyWMV/22ZaerB+qaSi5MdrXtd6R08kvs2qCN4C09miupktDitvh8jRFflwGFBQcxZRjbw==",
390
+ "cpu": [
391
+ "x64"
392
+ ],
393
+ "dev": true,
394
+ "license": "MIT",
395
+ "optional": true,
396
+ "os": [
397
+ "darwin"
398
+ ],
399
+ "engines": {
400
+ "node": ">=12"
401
+ }
402
+ },
403
+ "node_modules/@esbuild/freebsd-arm64": {
404
+ "version": "0.21.5",
405
+ "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.21.5.tgz",
406
+ "integrity": "sha512-5JcRxxRDUJLX8JXp/wcBCy3pENnCgBR9bN6JsY4OmhfUtIHe3ZW0mawA7+RDAcMLrMIZaf03NlQiX9DGyB8h4g==",
407
+ "cpu": [
408
+ "arm64"
409
+ ],
410
+ "dev": true,
411
+ "license": "MIT",
412
+ "optional": true,
413
+ "os": [
414
+ "freebsd"
415
+ ],
416
+ "engines": {
417
+ "node": ">=12"
418
+ }
419
+ },
420
+ "node_modules/@esbuild/freebsd-x64": {
421
+ "version": "0.21.5",
422
+ "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.21.5.tgz",
423
+ "integrity": "sha512-J95kNBj1zkbMXtHVH29bBriQygMXqoVQOQYA+ISs0/2l3T9/kj42ow2mpqerRBxDJnmkUDCaQT/dfNXWX/ZZCQ==",
424
+ "cpu": [
425
+ "x64"
426
+ ],
427
+ "dev": true,
428
+ "license": "MIT",
429
+ "optional": true,
430
+ "os": [
431
+ "freebsd"
432
+ ],
433
+ "engines": {
434
+ "node": ">=12"
435
+ }
436
+ },
437
+ "node_modules/@esbuild/linux-arm": {
438
+ "version": "0.21.5",
439
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.21.5.tgz",
440
+ "integrity": "sha512-bPb5AHZtbeNGjCKVZ9UGqGwo8EUu4cLq68E95A53KlxAPRmUyYv2D6F0uUI65XisGOL1hBP5mTronbgo+0bFcA==",
441
+ "cpu": [
442
+ "arm"
443
+ ],
444
+ "dev": true,
445
+ "license": "MIT",
446
+ "optional": true,
447
+ "os": [
448
+ "linux"
449
+ ],
450
+ "engines": {
451
+ "node": ">=12"
452
+ }
453
+ },
454
+ "node_modules/@esbuild/linux-arm64": {
455
+ "version": "0.21.5",
456
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.21.5.tgz",
457
+ "integrity": "sha512-ibKvmyYzKsBeX8d8I7MH/TMfWDXBF3db4qM6sy+7re0YXya+K1cem3on9XgdT2EQGMu4hQyZhan7TeQ8XkGp4Q==",
458
+ "cpu": [
459
+ "arm64"
460
+ ],
461
+ "dev": true,
462
+ "license": "MIT",
463
+ "optional": true,
464
+ "os": [
465
+ "linux"
466
+ ],
467
+ "engines": {
468
+ "node": ">=12"
469
+ }
470
+ },
471
+ "node_modules/@esbuild/linux-ia32": {
472
+ "version": "0.21.5",
473
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.21.5.tgz",
474
+ "integrity": "sha512-YvjXDqLRqPDl2dvRODYmmhz4rPeVKYvppfGYKSNGdyZkA01046pLWyRKKI3ax8fbJoK5QbxblURkwK/MWY18Tg==",
475
+ "cpu": [
476
+ "ia32"
477
+ ],
478
+ "dev": true,
479
+ "license": "MIT",
480
+ "optional": true,
481
+ "os": [
482
+ "linux"
483
+ ],
484
+ "engines": {
485
+ "node": ">=12"
486
+ }
487
+ },
488
+ "node_modules/@esbuild/linux-loong64": {
489
+ "version": "0.21.5",
490
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.21.5.tgz",
491
+ "integrity": "sha512-uHf1BmMG8qEvzdrzAqg2SIG/02+4/DHB6a9Kbya0XDvwDEKCoC8ZRWI5JJvNdUjtciBGFQ5PuBlpEOXQj+JQSg==",
492
+ "cpu": [
493
+ "loong64"
494
+ ],
495
+ "dev": true,
496
+ "license": "MIT",
497
+ "optional": true,
498
+ "os": [
499
+ "linux"
500
+ ],
501
+ "engines": {
502
+ "node": ">=12"
503
+ }
504
+ },
505
+ "node_modules/@esbuild/linux-mips64el": {
506
+ "version": "0.21.5",
507
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.21.5.tgz",
508
+ "integrity": "sha512-IajOmO+KJK23bj52dFSNCMsz1QP1DqM6cwLUv3W1QwyxkyIWecfafnI555fvSGqEKwjMXVLokcV5ygHW5b3Jbg==",
509
+ "cpu": [
510
+ "mips64el"
511
+ ],
512
+ "dev": true,
513
+ "license": "MIT",
514
+ "optional": true,
515
+ "os": [
516
+ "linux"
517
+ ],
518
+ "engines": {
519
+ "node": ">=12"
520
+ }
521
+ },
522
+ "node_modules/@esbuild/linux-ppc64": {
523
+ "version": "0.21.5",
524
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.21.5.tgz",
525
+ "integrity": "sha512-1hHV/Z4OEfMwpLO8rp7CvlhBDnjsC3CttJXIhBi+5Aj5r+MBvy4egg7wCbe//hSsT+RvDAG7s81tAvpL2XAE4w==",
526
+ "cpu": [
527
+ "ppc64"
528
+ ],
529
+ "dev": true,
530
+ "license": "MIT",
531
+ "optional": true,
532
+ "os": [
533
+ "linux"
534
+ ],
535
+ "engines": {
536
+ "node": ">=12"
537
+ }
538
+ },
539
+ "node_modules/@esbuild/linux-riscv64": {
540
+ "version": "0.21.5",
541
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.21.5.tgz",
542
+ "integrity": "sha512-2HdXDMd9GMgTGrPWnJzP2ALSokE/0O5HhTUvWIbD3YdjME8JwvSCnNGBnTThKGEB91OZhzrJ4qIIxk/SBmyDDA==",
543
+ "cpu": [
544
+ "riscv64"
545
+ ],
546
+ "dev": true,
547
+ "license": "MIT",
548
+ "optional": true,
549
+ "os": [
550
+ "linux"
551
+ ],
552
+ "engines": {
553
+ "node": ">=12"
554
+ }
555
+ },
556
+ "node_modules/@esbuild/linux-s390x": {
557
+ "version": "0.21.5",
558
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.21.5.tgz",
559
+ "integrity": "sha512-zus5sxzqBJD3eXxwvjN1yQkRepANgxE9lgOW2qLnmr8ikMTphkjgXu1HR01K4FJg8h1kEEDAqDcZQtbrRnB41A==",
560
+ "cpu": [
561
+ "s390x"
562
+ ],
563
+ "dev": true,
564
+ "license": "MIT",
565
+ "optional": true,
566
+ "os": [
567
+ "linux"
568
+ ],
569
+ "engines": {
570
+ "node": ">=12"
571
+ }
572
+ },
573
+ "node_modules/@esbuild/linux-x64": {
574
+ "version": "0.21.5",
575
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.21.5.tgz",
576
+ "integrity": "sha512-1rYdTpyv03iycF1+BhzrzQJCdOuAOtaqHTWJZCWvijKD2N5Xu0TtVC8/+1faWqcP9iBCWOmjmhoH94dH82BxPQ==",
577
+ "cpu": [
578
+ "x64"
579
+ ],
580
+ "dev": true,
581
+ "license": "MIT",
582
+ "optional": true,
583
+ "os": [
584
+ "linux"
585
+ ],
586
+ "engines": {
587
+ "node": ">=12"
588
+ }
589
+ },
590
+ "node_modules/@esbuild/netbsd-x64": {
591
+ "version": "0.21.5",
592
+ "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.21.5.tgz",
593
+ "integrity": "sha512-Woi2MXzXjMULccIwMnLciyZH4nCIMpWQAs049KEeMvOcNADVxo0UBIQPfSmxB3CWKedngg7sWZdLvLczpe0tLg==",
594
+ "cpu": [
595
+ "x64"
596
+ ],
597
+ "dev": true,
598
+ "license": "MIT",
599
+ "optional": true,
600
+ "os": [
601
+ "netbsd"
602
+ ],
603
+ "engines": {
604
+ "node": ">=12"
605
+ }
606
+ },
607
+ "node_modules/@esbuild/openbsd-x64": {
608
+ "version": "0.21.5",
609
+ "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.21.5.tgz",
610
+ "integrity": "sha512-HLNNw99xsvx12lFBUwoT8EVCsSvRNDVxNpjZ7bPn947b8gJPzeHWyNVhFsaerc0n3TsbOINvRP2byTZ5LKezow==",
611
+ "cpu": [
612
+ "x64"
613
+ ],
614
+ "dev": true,
615
+ "license": "MIT",
616
+ "optional": true,
617
+ "os": [
618
+ "openbsd"
619
+ ],
620
+ "engines": {
621
+ "node": ">=12"
622
+ }
623
+ },
624
+ "node_modules/@esbuild/sunos-x64": {
625
+ "version": "0.21.5",
626
+ "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.21.5.tgz",
627
+ "integrity": "sha512-6+gjmFpfy0BHU5Tpptkuh8+uw3mnrvgs+dSPQXQOv3ekbordwnzTVEb4qnIvQcYXq6gzkyTnoZ9dZG+D4garKg==",
628
+ "cpu": [
629
+ "x64"
630
+ ],
631
+ "dev": true,
632
+ "license": "MIT",
633
+ "optional": true,
634
+ "os": [
635
+ "sunos"
636
+ ],
637
+ "engines": {
638
+ "node": ">=12"
639
+ }
640
+ },
641
+ "node_modules/@esbuild/win32-arm64": {
642
+ "version": "0.21.5",
643
+ "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.21.5.tgz",
644
+ "integrity": "sha512-Z0gOTd75VvXqyq7nsl93zwahcTROgqvuAcYDUr+vOv8uHhNSKROyU961kgtCD1e95IqPKSQKH7tBTslnS3tA8A==",
645
+ "cpu": [
646
+ "arm64"
647
+ ],
648
+ "dev": true,
649
+ "license": "MIT",
650
+ "optional": true,
651
+ "os": [
652
+ "win32"
653
+ ],
654
+ "engines": {
655
+ "node": ">=12"
656
+ }
657
+ },
658
+ "node_modules/@esbuild/win32-ia32": {
659
+ "version": "0.21.5",
660
+ "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.21.5.tgz",
661
+ "integrity": "sha512-SWXFF1CL2RVNMaVs+BBClwtfZSvDgtL//G/smwAc5oVK/UPu2Gu9tIaRgFmYFFKrmg3SyAjSrElf0TiJ1v8fYA==",
662
+ "cpu": [
663
+ "ia32"
664
+ ],
665
+ "dev": true,
666
+ "license": "MIT",
667
+ "optional": true,
668
+ "os": [
669
+ "win32"
670
+ ],
671
+ "engines": {
672
+ "node": ">=12"
673
+ }
674
+ },
675
+ "node_modules/@esbuild/win32-x64": {
676
+ "version": "0.21.5",
677
+ "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.21.5.tgz",
678
+ "integrity": "sha512-tQd/1efJuzPC6rCFwEvLtci/xNFcTZknmXs98FYDfGE4wP9ClFV98nyKrzJKVPMhdDnjzLhdUyMX4PsQAPjwIw==",
679
+ "cpu": [
680
+ "x64"
681
+ ],
682
+ "dev": true,
683
+ "license": "MIT",
684
+ "optional": true,
685
+ "os": [
686
+ "win32"
687
+ ],
688
+ "engines": {
689
+ "node": ">=12"
690
+ }
691
+ },
692
+ "node_modules/@jridgewell/gen-mapping": {
693
+ "version": "0.3.13",
694
+ "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.13.tgz",
695
+ "integrity": "sha512-2kkt/7niJ6MgEPxF0bYdQ6etZaA+fQvDcLKckhy1yIQOzaoKjBBjSj63/aLVjYE3qhRt5dvM+uUyfCg6UKCBbA==",
696
+ "dev": true,
697
+ "license": "MIT",
698
+ "dependencies": {
699
+ "@jridgewell/sourcemap-codec": "^1.5.0",
700
+ "@jridgewell/trace-mapping": "^0.3.24"
701
+ }
702
+ },
703
+ "node_modules/@jridgewell/remapping": {
704
+ "version": "2.3.5",
705
+ "resolved": "https://registry.npmjs.org/@jridgewell/remapping/-/remapping-2.3.5.tgz",
706
+ "integrity": "sha512-LI9u/+laYG4Ds1TDKSJW2YPrIlcVYOwi2fUC6xB43lueCjgxV4lffOCZCtYFiH6TNOX+tQKXx97T4IKHbhyHEQ==",
707
+ "dev": true,
708
+ "license": "MIT",
709
+ "dependencies": {
710
+ "@jridgewell/gen-mapping": "^0.3.5",
711
+ "@jridgewell/trace-mapping": "^0.3.24"
712
+ }
713
+ },
714
+ "node_modules/@jridgewell/resolve-uri": {
715
+ "version": "3.1.2",
716
+ "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz",
717
+ "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==",
718
+ "dev": true,
719
+ "license": "MIT",
720
+ "engines": {
721
+ "node": ">=6.0.0"
722
+ }
723
+ },
724
+ "node_modules/@jridgewell/sourcemap-codec": {
725
+ "version": "1.5.5",
726
+ "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz",
727
+ "integrity": "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==",
728
+ "dev": true,
729
+ "license": "MIT"
730
+ },
731
+ "node_modules/@jridgewell/trace-mapping": {
732
+ "version": "0.3.31",
733
+ "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.31.tgz",
734
+ "integrity": "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==",
735
+ "dev": true,
736
+ "license": "MIT",
737
+ "dependencies": {
738
+ "@jridgewell/resolve-uri": "^3.1.0",
739
+ "@jridgewell/sourcemap-codec": "^1.4.14"
740
+ }
741
+ },
742
+ "node_modules/@rolldown/pluginutils": {
743
+ "version": "1.0.0-beta.27",
744
+ "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.27.tgz",
745
+ "integrity": "sha512-+d0F4MKMCbeVUJwG96uQ4SgAznZNSq93I3V+9NHA4OpvqG8mRCpGdKmK8l/dl02h2CCDHwW2FqilnTyDcAnqjA==",
746
+ "dev": true,
747
+ "license": "MIT"
748
+ },
749
+ "node_modules/@rollup/rollup-android-arm-eabi": {
750
+ "version": "4.57.1",
751
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.57.1.tgz",
752
+ "integrity": "sha512-A6ehUVSiSaaliTxai040ZpZ2zTevHYbvu/lDoeAteHI8QnaosIzm4qwtezfRg1jOYaUmnzLX1AOD6Z+UJjtifg==",
753
+ "cpu": [
754
+ "arm"
755
+ ],
756
+ "dev": true,
757
+ "license": "MIT",
758
+ "optional": true,
759
+ "os": [
760
+ "android"
761
+ ]
762
+ },
763
+ "node_modules/@rollup/rollup-android-arm64": {
764
+ "version": "4.57.1",
765
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.57.1.tgz",
766
+ "integrity": "sha512-dQaAddCY9YgkFHZcFNS/606Exo8vcLHwArFZ7vxXq4rigo2bb494/xKMMwRRQW6ug7Js6yXmBZhSBRuBvCCQ3w==",
767
+ "cpu": [
768
+ "arm64"
769
+ ],
770
+ "dev": true,
771
+ "license": "MIT",
772
+ "optional": true,
773
+ "os": [
774
+ "android"
775
+ ]
776
+ },
777
+ "node_modules/@rollup/rollup-darwin-arm64": {
778
+ "version": "4.57.1",
779
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.57.1.tgz",
780
+ "integrity": "sha512-crNPrwJOrRxagUYeMn/DZwqN88SDmwaJ8Cvi/TN1HnWBU7GwknckyosC2gd0IqYRsHDEnXf328o9/HC6OkPgOg==",
781
+ "cpu": [
782
+ "arm64"
783
+ ],
784
+ "dev": true,
785
+ "license": "MIT",
786
+ "optional": true,
787
+ "os": [
788
+ "darwin"
789
+ ]
790
+ },
791
+ "node_modules/@rollup/rollup-darwin-x64": {
792
+ "version": "4.57.1",
793
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.57.1.tgz",
794
+ "integrity": "sha512-Ji8g8ChVbKrhFtig5QBV7iMaJrGtpHelkB3lsaKzadFBe58gmjfGXAOfI5FV0lYMH8wiqsxKQ1C9B0YTRXVy4w==",
795
+ "cpu": [
796
+ "x64"
797
+ ],
798
+ "dev": true,
799
+ "license": "MIT",
800
+ "optional": true,
801
+ "os": [
802
+ "darwin"
803
+ ]
804
+ },
805
+ "node_modules/@rollup/rollup-freebsd-arm64": {
806
+ "version": "4.57.1",
807
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.57.1.tgz",
808
+ "integrity": "sha512-R+/WwhsjmwodAcz65guCGFRkMb4gKWTcIeLy60JJQbXrJ97BOXHxnkPFrP+YwFlaS0m+uWJTstrUA9o+UchFug==",
809
+ "cpu": [
810
+ "arm64"
811
+ ],
812
+ "dev": true,
813
+ "license": "MIT",
814
+ "optional": true,
815
+ "os": [
816
+ "freebsd"
817
+ ]
818
+ },
819
+ "node_modules/@rollup/rollup-freebsd-x64": {
820
+ "version": "4.57.1",
821
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.57.1.tgz",
822
+ "integrity": "sha512-IEQTCHeiTOnAUC3IDQdzRAGj3jOAYNr9kBguI7MQAAZK3caezRrg0GxAb6Hchg4lxdZEI5Oq3iov/w/hnFWY9Q==",
823
+ "cpu": [
824
+ "x64"
825
+ ],
826
+ "dev": true,
827
+ "license": "MIT",
828
+ "optional": true,
829
+ "os": [
830
+ "freebsd"
831
+ ]
832
+ },
833
+ "node_modules/@rollup/rollup-linux-arm-gnueabihf": {
834
+ "version": "4.57.1",
835
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.57.1.tgz",
836
+ "integrity": "sha512-F8sWbhZ7tyuEfsmOxwc2giKDQzN3+kuBLPwwZGyVkLlKGdV1nvnNwYD0fKQ8+XS6hp9nY7B+ZeK01EBUE7aHaw==",
837
+ "cpu": [
838
+ "arm"
839
+ ],
840
+ "dev": true,
841
+ "license": "MIT",
842
+ "optional": true,
843
+ "os": [
844
+ "linux"
845
+ ]
846
+ },
847
+ "node_modules/@rollup/rollup-linux-arm-musleabihf": {
848
+ "version": "4.57.1",
849
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.57.1.tgz",
850
+ "integrity": "sha512-rGfNUfn0GIeXtBP1wL5MnzSj98+PZe/AXaGBCRmT0ts80lU5CATYGxXukeTX39XBKsxzFpEeK+Mrp9faXOlmrw==",
851
+ "cpu": [
852
+ "arm"
853
+ ],
854
+ "dev": true,
855
+ "license": "MIT",
856
+ "optional": true,
857
+ "os": [
858
+ "linux"
859
+ ]
860
+ },
861
+ "node_modules/@rollup/rollup-linux-arm64-gnu": {
862
+ "version": "4.57.1",
863
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.57.1.tgz",
864
+ "integrity": "sha512-MMtej3YHWeg/0klK2Qodf3yrNzz6CGjo2UntLvk2RSPlhzgLvYEB3frRvbEF2wRKh1Z2fDIg9KRPe1fawv7C+g==",
865
+ "cpu": [
866
+ "arm64"
867
+ ],
868
+ "dev": true,
869
+ "license": "MIT",
870
+ "optional": true,
871
+ "os": [
872
+ "linux"
873
+ ]
874
+ },
875
+ "node_modules/@rollup/rollup-linux-arm64-musl": {
876
+ "version": "4.57.1",
877
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.57.1.tgz",
878
+ "integrity": "sha512-1a/qhaaOXhqXGpMFMET9VqwZakkljWHLmZOX48R0I/YLbhdxr1m4gtG1Hq7++VhVUmf+L3sTAf9op4JlhQ5u1Q==",
879
+ "cpu": [
880
+ "arm64"
881
+ ],
882
+ "dev": true,
883
+ "license": "MIT",
884
+ "optional": true,
885
+ "os": [
886
+ "linux"
887
+ ]
888
+ },
889
+ "node_modules/@rollup/rollup-linux-loong64-gnu": {
890
+ "version": "4.57.1",
891
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.57.1.tgz",
892
+ "integrity": "sha512-QWO6RQTZ/cqYtJMtxhkRkidoNGXc7ERPbZN7dVW5SdURuLeVU7lwKMpo18XdcmpWYd0qsP1bwKPf7DNSUinhvA==",
893
+ "cpu": [
894
+ "loong64"
895
+ ],
896
+ "dev": true,
897
+ "license": "MIT",
898
+ "optional": true,
899
+ "os": [
900
+ "linux"
901
+ ]
902
+ },
903
+ "node_modules/@rollup/rollup-linux-loong64-musl": {
904
+ "version": "4.57.1",
905
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-musl/-/rollup-linux-loong64-musl-4.57.1.tgz",
906
+ "integrity": "sha512-xpObYIf+8gprgWaPP32xiN5RVTi/s5FCR+XMXSKmhfoJjrpRAjCuuqQXyxUa/eJTdAE6eJ+KDKaoEqjZQxh3Gw==",
907
+ "cpu": [
908
+ "loong64"
909
+ ],
910
+ "dev": true,
911
+ "license": "MIT",
912
+ "optional": true,
913
+ "os": [
914
+ "linux"
915
+ ]
916
+ },
917
+ "node_modules/@rollup/rollup-linux-ppc64-gnu": {
918
+ "version": "4.57.1",
919
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.57.1.tgz",
920
+ "integrity": "sha512-4BrCgrpZo4hvzMDKRqEaW1zeecScDCR+2nZ86ATLhAoJ5FQ+lbHVD3ttKe74/c7tNT9c6F2viwB3ufwp01Oh2w==",
921
+ "cpu": [
922
+ "ppc64"
923
+ ],
924
+ "dev": true,
925
+ "license": "MIT",
926
+ "optional": true,
927
+ "os": [
928
+ "linux"
929
+ ]
930
+ },
931
+ "node_modules/@rollup/rollup-linux-ppc64-musl": {
932
+ "version": "4.57.1",
933
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-musl/-/rollup-linux-ppc64-musl-4.57.1.tgz",
934
+ "integrity": "sha512-NOlUuzesGauESAyEYFSe3QTUguL+lvrN1HtwEEsU2rOwdUDeTMJdO5dUYl/2hKf9jWydJrO9OL/XSSf65R5+Xw==",
935
+ "cpu": [
936
+ "ppc64"
937
+ ],
938
+ "dev": true,
939
+ "license": "MIT",
940
+ "optional": true,
941
+ "os": [
942
+ "linux"
943
+ ]
944
+ },
945
+ "node_modules/@rollup/rollup-linux-riscv64-gnu": {
946
+ "version": "4.57.1",
947
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.57.1.tgz",
948
+ "integrity": "sha512-ptA88htVp0AwUUqhVghwDIKlvJMD/fmL/wrQj99PRHFRAG6Z5nbWoWG4o81Nt9FT+IuqUQi+L31ZKAFeJ5Is+A==",
949
+ "cpu": [
950
+ "riscv64"
951
+ ],
952
+ "dev": true,
953
+ "license": "MIT",
954
+ "optional": true,
955
+ "os": [
956
+ "linux"
957
+ ]
958
+ },
959
+ "node_modules/@rollup/rollup-linux-riscv64-musl": {
960
+ "version": "4.57.1",
961
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.57.1.tgz",
962
+ "integrity": "sha512-S51t7aMMTNdmAMPpBg7OOsTdn4tySRQvklmL3RpDRyknk87+Sp3xaumlatU+ppQ+5raY7sSTcC2beGgvhENfuw==",
963
+ "cpu": [
964
+ "riscv64"
965
+ ],
966
+ "dev": true,
967
+ "license": "MIT",
968
+ "optional": true,
969
+ "os": [
970
+ "linux"
971
+ ]
972
+ },
973
+ "node_modules/@rollup/rollup-linux-s390x-gnu": {
974
+ "version": "4.57.1",
975
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.57.1.tgz",
976
+ "integrity": "sha512-Bl00OFnVFkL82FHbEqy3k5CUCKH6OEJL54KCyx2oqsmZnFTR8IoNqBF+mjQVcRCT5sB6yOvK8A37LNm/kPJiZg==",
977
+ "cpu": [
978
+ "s390x"
979
+ ],
980
+ "dev": true,
981
+ "license": "MIT",
982
+ "optional": true,
983
+ "os": [
984
+ "linux"
985
+ ]
986
+ },
987
+ "node_modules/@rollup/rollup-linux-x64-gnu": {
988
+ "version": "4.57.1",
989
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.57.1.tgz",
990
+ "integrity": "sha512-ABca4ceT4N+Tv/GtotnWAeXZUZuM/9AQyCyKYyKnpk4yoA7QIAuBt6Hkgpw8kActYlew2mvckXkvx0FfoInnLg==",
991
+ "cpu": [
992
+ "x64"
993
+ ],
994
+ "dev": true,
995
+ "license": "MIT",
996
+ "optional": true,
997
+ "os": [
998
+ "linux"
999
+ ]
1000
+ },
1001
+ "node_modules/@rollup/rollup-linux-x64-musl": {
1002
+ "version": "4.57.1",
1003
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.57.1.tgz",
1004
+ "integrity": "sha512-HFps0JeGtuOR2convgRRkHCekD7j+gdAuXM+/i6kGzQtFhlCtQkpwtNzkNj6QhCDp7DRJ7+qC/1Vg2jt5iSOFw==",
1005
+ "cpu": [
1006
+ "x64"
1007
+ ],
1008
+ "dev": true,
1009
+ "license": "MIT",
1010
+ "optional": true,
1011
+ "os": [
1012
+ "linux"
1013
+ ]
1014
+ },
1015
+ "node_modules/@rollup/rollup-openbsd-x64": {
1016
+ "version": "4.57.1",
1017
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-openbsd-x64/-/rollup-openbsd-x64-4.57.1.tgz",
1018
+ "integrity": "sha512-H+hXEv9gdVQuDTgnqD+SQffoWoc0Of59AStSzTEj/feWTBAnSfSD3+Dql1ZruJQxmykT/JVY0dE8Ka7z0DH1hw==",
1019
+ "cpu": [
1020
+ "x64"
1021
+ ],
1022
+ "dev": true,
1023
+ "license": "MIT",
1024
+ "optional": true,
1025
+ "os": [
1026
+ "openbsd"
1027
+ ]
1028
+ },
1029
+ "node_modules/@rollup/rollup-openharmony-arm64": {
1030
+ "version": "4.57.1",
1031
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.57.1.tgz",
1032
+ "integrity": "sha512-4wYoDpNg6o/oPximyc/NG+mYUejZrCU2q+2w6YZqrAs2UcNUChIZXjtafAiiZSUc7On8v5NyNj34Kzj/Ltk6dQ==",
1033
+ "cpu": [
1034
+ "arm64"
1035
+ ],
1036
+ "dev": true,
1037
+ "license": "MIT",
1038
+ "optional": true,
1039
+ "os": [
1040
+ "openharmony"
1041
+ ]
1042
+ },
1043
+ "node_modules/@rollup/rollup-win32-arm64-msvc": {
1044
+ "version": "4.57.1",
1045
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.57.1.tgz",
1046
+ "integrity": "sha512-O54mtsV/6LW3P8qdTcamQmuC990HDfR71lo44oZMZlXU4tzLrbvTii87Ni9opq60ds0YzuAlEr/GNwuNluZyMQ==",
1047
+ "cpu": [
1048
+ "arm64"
1049
+ ],
1050
+ "dev": true,
1051
+ "license": "MIT",
1052
+ "optional": true,
1053
+ "os": [
1054
+ "win32"
1055
+ ]
1056
+ },
1057
+ "node_modules/@rollup/rollup-win32-ia32-msvc": {
1058
+ "version": "4.57.1",
1059
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.57.1.tgz",
1060
+ "integrity": "sha512-P3dLS+IerxCT/7D2q2FYcRdWRl22dNbrbBEtxdWhXrfIMPP9lQhb5h4Du04mdl5Woq05jVCDPCMF7Ub0NAjIew==",
1061
+ "cpu": [
1062
+ "ia32"
1063
+ ],
1064
+ "dev": true,
1065
+ "license": "MIT",
1066
+ "optional": true,
1067
+ "os": [
1068
+ "win32"
1069
+ ]
1070
+ },
1071
+ "node_modules/@rollup/rollup-win32-x64-gnu": {
1072
+ "version": "4.57.1",
1073
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.57.1.tgz",
1074
+ "integrity": "sha512-VMBH2eOOaKGtIJYleXsi2B8CPVADrh+TyNxJ4mWPnKfLB/DBUmzW+5m1xUrcwWoMfSLagIRpjUFeW5CO5hyciQ==",
1075
+ "cpu": [
1076
+ "x64"
1077
+ ],
1078
+ "dev": true,
1079
+ "license": "MIT",
1080
+ "optional": true,
1081
+ "os": [
1082
+ "win32"
1083
+ ]
1084
+ },
1085
+ "node_modules/@rollup/rollup-win32-x64-msvc": {
1086
+ "version": "4.57.1",
1087
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.57.1.tgz",
1088
+ "integrity": "sha512-mxRFDdHIWRxg3UfIIAwCm6NzvxG0jDX/wBN6KsQFTvKFqqg9vTrWUE68qEjHt19A5wwx5X5aUi2zuZT7YR0jrA==",
1089
+ "cpu": [
1090
+ "x64"
1091
+ ],
1092
+ "dev": true,
1093
+ "license": "MIT",
1094
+ "optional": true,
1095
+ "os": [
1096
+ "win32"
1097
+ ]
1098
+ },
1099
+ "node_modules/@types/babel__core": {
1100
+ "version": "7.20.5",
1101
+ "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz",
1102
+ "integrity": "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA==",
1103
+ "dev": true,
1104
+ "license": "MIT",
1105
+ "dependencies": {
1106
+ "@babel/parser": "^7.20.7",
1107
+ "@babel/types": "^7.20.7",
1108
+ "@types/babel__generator": "*",
1109
+ "@types/babel__template": "*",
1110
+ "@types/babel__traverse": "*"
1111
+ }
1112
+ },
1113
+ "node_modules/@types/babel__generator": {
1114
+ "version": "7.27.0",
1115
+ "resolved": "https://registry.npmjs.org/@types/babel__generator/-/babel__generator-7.27.0.tgz",
1116
+ "integrity": "sha512-ufFd2Xi92OAVPYsy+P4n7/U7e68fex0+Ee8gSG9KX7eo084CWiQ4sdxktvdl0bOPupXtVJPY19zk6EwWqUQ8lg==",
1117
+ "dev": true,
1118
+ "license": "MIT",
1119
+ "dependencies": {
1120
+ "@babel/types": "^7.0.0"
1121
+ }
1122
+ },
1123
+ "node_modules/@types/babel__template": {
1124
+ "version": "7.4.4",
1125
+ "resolved": "https://registry.npmjs.org/@types/babel__template/-/babel__template-7.4.4.tgz",
1126
+ "integrity": "sha512-h/NUaSyG5EyxBIp8YRxo4RMe2/qQgvyowRwVMzhYhBCONbW8PUsg4lkFMrhgZhUe5z3L3MiLDuvyJ/CaPa2A8A==",
1127
+ "dev": true,
1128
+ "license": "MIT",
1129
+ "dependencies": {
1130
+ "@babel/parser": "^7.1.0",
1131
+ "@babel/types": "^7.0.0"
1132
+ }
1133
+ },
1134
+ "node_modules/@types/babel__traverse": {
1135
+ "version": "7.28.0",
1136
+ "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.28.0.tgz",
1137
+ "integrity": "sha512-8PvcXf70gTDZBgt9ptxJ8elBeBjcLOAcOtoO/mPJjtji1+CdGbHgm77om1GrsPxsiE+uXIpNSK64UYaIwQXd4Q==",
1138
+ "dev": true,
1139
+ "license": "MIT",
1140
+ "dependencies": {
1141
+ "@babel/types": "^7.28.2"
1142
+ }
1143
+ },
1144
+ "node_modules/@types/estree": {
1145
+ "version": "1.0.8",
1146
+ "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz",
1147
+ "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==",
1148
+ "dev": true,
1149
+ "license": "MIT"
1150
+ },
1151
+ "node_modules/@vitejs/plugin-react": {
1152
+ "version": "4.7.0",
1153
+ "resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.7.0.tgz",
1154
+ "integrity": "sha512-gUu9hwfWvvEDBBmgtAowQCojwZmJ5mcLn3aufeCsitijs3+f2NsrPtlAWIR6OPiqljl96GVCUbLe0HyqIpVaoA==",
1155
+ "dev": true,
1156
+ "license": "MIT",
1157
+ "dependencies": {
1158
+ "@babel/core": "^7.28.0",
1159
+ "@babel/plugin-transform-react-jsx-self": "^7.27.1",
1160
+ "@babel/plugin-transform-react-jsx-source": "^7.27.1",
1161
+ "@rolldown/pluginutils": "1.0.0-beta.27",
1162
+ "@types/babel__core": "^7.20.5",
1163
+ "react-refresh": "^0.17.0"
1164
+ },
1165
+ "engines": {
1166
+ "node": "^14.18.0 || >=16.0.0"
1167
+ },
1168
+ "peerDependencies": {
1169
+ "vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0"
1170
+ }
1171
+ },
1172
+ "node_modules/baseline-browser-mapping": {
1173
+ "version": "2.9.19",
1174
+ "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.19.tgz",
1175
+ "integrity": "sha512-ipDqC8FrAl/76p2SSWKSI+H9tFwm7vYqXQrItCuiVPt26Km0jS+NzSsBWAaBusvSbQcfJG+JitdMm+wZAgTYqg==",
1176
+ "dev": true,
1177
+ "license": "Apache-2.0",
1178
+ "bin": {
1179
+ "baseline-browser-mapping": "dist/cli.js"
1180
+ }
1181
+ },
1182
+ "node_modules/browserslist": {
1183
+ "version": "4.28.1",
1184
+ "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.28.1.tgz",
1185
+ "integrity": "sha512-ZC5Bd0LgJXgwGqUknZY/vkUQ04r8NXnJZ3yYi4vDmSiZmC/pdSN0NbNRPxZpbtO4uAfDUAFffO8IZoM3Gj8IkA==",
1186
+ "dev": true,
1187
+ "funding": [
1188
+ {
1189
+ "type": "opencollective",
1190
+ "url": "https://opencollective.com/browserslist"
1191
+ },
1192
+ {
1193
+ "type": "tidelift",
1194
+ "url": "https://tidelift.com/funding/github/npm/browserslist"
1195
+ },
1196
+ {
1197
+ "type": "github",
1198
+ "url": "https://github.com/sponsors/ai"
1199
+ }
1200
+ ],
1201
+ "license": "MIT",
1202
+ "dependencies": {
1203
+ "baseline-browser-mapping": "^2.9.0",
1204
+ "caniuse-lite": "^1.0.30001759",
1205
+ "electron-to-chromium": "^1.5.263",
1206
+ "node-releases": "^2.0.27",
1207
+ "update-browserslist-db": "^1.2.0"
1208
+ },
1209
+ "bin": {
1210
+ "browserslist": "cli.js"
1211
+ },
1212
+ "engines": {
1213
+ "node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7"
1214
+ }
1215
+ },
1216
+ "node_modules/caniuse-lite": {
1217
+ "version": "1.0.30001769",
1218
+ "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001769.tgz",
1219
+ "integrity": "sha512-BCfFL1sHijQlBGWBMuJyhZUhzo7wer5sVj9hqekB/7xn0Ypy+pER/edCYQm4exbXj4WiySGp40P8UuTh6w1srg==",
1220
+ "dev": true,
1221
+ "funding": [
1222
+ {
1223
+ "type": "opencollective",
1224
+ "url": "https://opencollective.com/browserslist"
1225
+ },
1226
+ {
1227
+ "type": "tidelift",
1228
+ "url": "https://tidelift.com/funding/github/npm/caniuse-lite"
1229
+ },
1230
+ {
1231
+ "type": "github",
1232
+ "url": "https://github.com/sponsors/ai"
1233
+ }
1234
+ ],
1235
+ "license": "CC-BY-4.0"
1236
+ },
1237
+ "node_modules/convert-source-map": {
1238
+ "version": "2.0.0",
1239
+ "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz",
1240
+ "integrity": "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==",
1241
+ "dev": true,
1242
+ "license": "MIT"
1243
+ },
1244
+ "node_modules/debug": {
1245
+ "version": "4.4.3",
1246
+ "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
1247
+ "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==",
1248
+ "dev": true,
1249
+ "license": "MIT",
1250
+ "dependencies": {
1251
+ "ms": "^2.1.3"
1252
+ },
1253
+ "engines": {
1254
+ "node": ">=6.0"
1255
+ },
1256
+ "peerDependenciesMeta": {
1257
+ "supports-color": {
1258
+ "optional": true
1259
+ }
1260
+ }
1261
+ },
1262
+ "node_modules/electron-to-chromium": {
1263
+ "version": "1.5.286",
1264
+ "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.286.tgz",
1265
+ "integrity": "sha512-9tfDXhJ4RKFNerfjdCcZfufu49vg620741MNs26a9+bhLThdB+plgMeou98CAaHu/WATj2iHOOHTp1hWtABj2A==",
1266
+ "dev": true,
1267
+ "license": "ISC"
1268
+ },
1269
+ "node_modules/esbuild": {
1270
+ "version": "0.21.5",
1271
+ "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.21.5.tgz",
1272
+ "integrity": "sha512-mg3OPMV4hXywwpoDxu3Qda5xCKQi+vCTZq8S9J/EpkhB2HzKXq4SNFZE3+NK93JYxc8VMSep+lOUSC/RVKaBqw==",
1273
+ "dev": true,
1274
+ "hasInstallScript": true,
1275
+ "license": "MIT",
1276
+ "bin": {
1277
+ "esbuild": "bin/esbuild"
1278
+ },
1279
+ "engines": {
1280
+ "node": ">=12"
1281
+ },
1282
+ "optionalDependencies": {
1283
+ "@esbuild/aix-ppc64": "0.21.5",
1284
+ "@esbuild/android-arm": "0.21.5",
1285
+ "@esbuild/android-arm64": "0.21.5",
1286
+ "@esbuild/android-x64": "0.21.5",
1287
+ "@esbuild/darwin-arm64": "0.21.5",
1288
+ "@esbuild/darwin-x64": "0.21.5",
1289
+ "@esbuild/freebsd-arm64": "0.21.5",
1290
+ "@esbuild/freebsd-x64": "0.21.5",
1291
+ "@esbuild/linux-arm": "0.21.5",
1292
+ "@esbuild/linux-arm64": "0.21.5",
1293
+ "@esbuild/linux-ia32": "0.21.5",
1294
+ "@esbuild/linux-loong64": "0.21.5",
1295
+ "@esbuild/linux-mips64el": "0.21.5",
1296
+ "@esbuild/linux-ppc64": "0.21.5",
1297
+ "@esbuild/linux-riscv64": "0.21.5",
1298
+ "@esbuild/linux-s390x": "0.21.5",
1299
+ "@esbuild/linux-x64": "0.21.5",
1300
+ "@esbuild/netbsd-x64": "0.21.5",
1301
+ "@esbuild/openbsd-x64": "0.21.5",
1302
+ "@esbuild/sunos-x64": "0.21.5",
1303
+ "@esbuild/win32-arm64": "0.21.5",
1304
+ "@esbuild/win32-ia32": "0.21.5",
1305
+ "@esbuild/win32-x64": "0.21.5"
1306
+ }
1307
+ },
1308
+ "node_modules/escalade": {
1309
+ "version": "3.2.0",
1310
+ "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
1311
+ "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==",
1312
+ "dev": true,
1313
+ "license": "MIT",
1314
+ "engines": {
1315
+ "node": ">=6"
1316
+ }
1317
+ },
1318
+ "node_modules/fsevents": {
1319
+ "version": "2.3.3",
1320
+ "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
1321
+ "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==",
1322
+ "dev": true,
1323
+ "hasInstallScript": true,
1324
+ "license": "MIT",
1325
+ "optional": true,
1326
+ "os": [
1327
+ "darwin"
1328
+ ],
1329
+ "engines": {
1330
+ "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
1331
+ }
1332
+ },
1333
+ "node_modules/gensync": {
1334
+ "version": "1.0.0-beta.2",
1335
+ "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz",
1336
+ "integrity": "sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==",
1337
+ "dev": true,
1338
+ "license": "MIT",
1339
+ "engines": {
1340
+ "node": ">=6.9.0"
1341
+ }
1342
+ },
1343
+ "node_modules/js-tokens": {
1344
+ "version": "4.0.0",
1345
+ "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
1346
+ "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==",
1347
+ "license": "MIT"
1348
+ },
1349
+ "node_modules/jsesc": {
1350
+ "version": "3.1.0",
1351
+ "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-3.1.0.tgz",
1352
+ "integrity": "sha512-/sM3dO2FOzXjKQhJuo0Q173wf2KOo8t4I8vHy6lF9poUp7bKT0/NHE8fPX23PwfhnykfqnC2xRxOnVw5XuGIaA==",
1353
+ "dev": true,
1354
+ "license": "MIT",
1355
+ "bin": {
1356
+ "jsesc": "bin/jsesc"
1357
+ },
1358
+ "engines": {
1359
+ "node": ">=6"
1360
+ }
1361
+ },
1362
+ "node_modules/json5": {
1363
+ "version": "2.2.3",
1364
+ "resolved": "https://registry.npmjs.org/json5/-/json5-2.2.3.tgz",
1365
+ "integrity": "sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==",
1366
+ "dev": true,
1367
+ "license": "MIT",
1368
+ "bin": {
1369
+ "json5": "lib/cli.js"
1370
+ },
1371
+ "engines": {
1372
+ "node": ">=6"
1373
+ }
1374
+ },
1375
+ "node_modules/loose-envify": {
1376
+ "version": "1.4.0",
1377
+ "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz",
1378
+ "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==",
1379
+ "license": "MIT",
1380
+ "dependencies": {
1381
+ "js-tokens": "^3.0.0 || ^4.0.0"
1382
+ },
1383
+ "bin": {
1384
+ "loose-envify": "cli.js"
1385
+ }
1386
+ },
1387
+ "node_modules/lru-cache": {
1388
+ "version": "5.1.1",
1389
+ "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
1390
+ "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
1391
+ "dev": true,
1392
+ "license": "ISC",
1393
+ "dependencies": {
1394
+ "yallist": "^3.0.2"
1395
+ }
1396
+ },
1397
+ "node_modules/ms": {
1398
+ "version": "2.1.3",
1399
+ "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
1400
+ "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
1401
+ "dev": true,
1402
+ "license": "MIT"
1403
+ },
1404
+ "node_modules/nanoid": {
1405
+ "version": "3.3.11",
1406
+ "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz",
1407
+ "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==",
1408
+ "dev": true,
1409
+ "funding": [
1410
+ {
1411
+ "type": "github",
1412
+ "url": "https://github.com/sponsors/ai"
1413
+ }
1414
+ ],
1415
+ "license": "MIT",
1416
+ "bin": {
1417
+ "nanoid": "bin/nanoid.cjs"
1418
+ },
1419
+ "engines": {
1420
+ "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1"
1421
+ }
1422
+ },
1423
+ "node_modules/node-releases": {
1424
+ "version": "2.0.27",
1425
+ "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.27.tgz",
1426
+ "integrity": "sha512-nmh3lCkYZ3grZvqcCH+fjmQ7X+H0OeZgP40OierEaAptX4XofMh5kwNbWh7lBduUzCcV/8kZ+NDLCwm2iorIlA==",
1427
+ "dev": true,
1428
+ "license": "MIT"
1429
+ },
1430
+ "node_modules/picocolors": {
1431
+ "version": "1.1.1",
1432
+ "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
1433
+ "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==",
1434
+ "dev": true,
1435
+ "license": "ISC"
1436
+ },
1437
+ "node_modules/postcss": {
1438
+ "version": "8.5.6",
1439
+ "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz",
1440
+ "integrity": "sha512-3Ybi1tAuwAP9s0r1UQ2J4n5Y0G05bJkpUIO0/bI9MhwmD70S5aTWbXGBwxHrelT+XM1k6dM0pk+SwNkpTRN7Pg==",
1441
+ "dev": true,
1442
+ "funding": [
1443
+ {
1444
+ "type": "opencollective",
1445
+ "url": "https://opencollective.com/postcss/"
1446
+ },
1447
+ {
1448
+ "type": "tidelift",
1449
+ "url": "https://tidelift.com/funding/github/npm/postcss"
1450
+ },
1451
+ {
1452
+ "type": "github",
1453
+ "url": "https://github.com/sponsors/ai"
1454
+ }
1455
+ ],
1456
+ "license": "MIT",
1457
+ "dependencies": {
1458
+ "nanoid": "^3.3.11",
1459
+ "picocolors": "^1.1.1",
1460
+ "source-map-js": "^1.2.1"
1461
+ },
1462
+ "engines": {
1463
+ "node": "^10 || ^12 || >=14"
1464
+ }
1465
+ },
1466
+ "node_modules/react": {
1467
+ "version": "18.3.1",
1468
+ "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz",
1469
+ "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==",
1470
+ "license": "MIT",
1471
+ "dependencies": {
1472
+ "loose-envify": "^1.1.0"
1473
+ },
1474
+ "engines": {
1475
+ "node": ">=0.10.0"
1476
+ }
1477
+ },
1478
+ "node_modules/react-dom": {
1479
+ "version": "18.3.1",
1480
+ "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz",
1481
+ "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==",
1482
+ "license": "MIT",
1483
+ "dependencies": {
1484
+ "loose-envify": "^1.1.0",
1485
+ "scheduler": "^0.23.2"
1486
+ },
1487
+ "peerDependencies": {
1488
+ "react": "^18.3.1"
1489
+ }
1490
+ },
1491
+ "node_modules/react-refresh": {
1492
+ "version": "0.17.0",
1493
+ "resolved": "https://registry.npmjs.org/react-refresh/-/react-refresh-0.17.0.tgz",
1494
+ "integrity": "sha512-z6F7K9bV85EfseRCp2bzrpyQ0Gkw1uLoCel9XBVWPg/TjRj94SkJzUTGfOa4bs7iJvBWtQG0Wq7wnI0syw3EBQ==",
1495
+ "dev": true,
1496
+ "license": "MIT",
1497
+ "engines": {
1498
+ "node": ">=0.10.0"
1499
+ }
1500
+ },
1501
+ "node_modules/rollup": {
1502
+ "version": "4.57.1",
1503
+ "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.57.1.tgz",
1504
+ "integrity": "sha512-oQL6lgK3e2QZeQ7gcgIkS2YZPg5slw37hYufJ3edKlfQSGGm8ICoxswK15ntSzF/a8+h7ekRy7k7oWc3BQ7y8A==",
1505
+ "dev": true,
1506
+ "license": "MIT",
1507
+ "dependencies": {
1508
+ "@types/estree": "1.0.8"
1509
+ },
1510
+ "bin": {
1511
+ "rollup": "dist/bin/rollup"
1512
+ },
1513
+ "engines": {
1514
+ "node": ">=18.0.0",
1515
+ "npm": ">=8.0.0"
1516
+ },
1517
+ "optionalDependencies": {
1518
+ "@rollup/rollup-android-arm-eabi": "4.57.1",
1519
+ "@rollup/rollup-android-arm64": "4.57.1",
1520
+ "@rollup/rollup-darwin-arm64": "4.57.1",
1521
+ "@rollup/rollup-darwin-x64": "4.57.1",
1522
+ "@rollup/rollup-freebsd-arm64": "4.57.1",
1523
+ "@rollup/rollup-freebsd-x64": "4.57.1",
1524
+ "@rollup/rollup-linux-arm-gnueabihf": "4.57.1",
1525
+ "@rollup/rollup-linux-arm-musleabihf": "4.57.1",
1526
+ "@rollup/rollup-linux-arm64-gnu": "4.57.1",
1527
+ "@rollup/rollup-linux-arm64-musl": "4.57.1",
1528
+ "@rollup/rollup-linux-loong64-gnu": "4.57.1",
1529
+ "@rollup/rollup-linux-loong64-musl": "4.57.1",
1530
+ "@rollup/rollup-linux-ppc64-gnu": "4.57.1",
1531
+ "@rollup/rollup-linux-ppc64-musl": "4.57.1",
1532
+ "@rollup/rollup-linux-riscv64-gnu": "4.57.1",
1533
+ "@rollup/rollup-linux-riscv64-musl": "4.57.1",
1534
+ "@rollup/rollup-linux-s390x-gnu": "4.57.1",
1535
+ "@rollup/rollup-linux-x64-gnu": "4.57.1",
1536
+ "@rollup/rollup-linux-x64-musl": "4.57.1",
1537
+ "@rollup/rollup-openbsd-x64": "4.57.1",
1538
+ "@rollup/rollup-openharmony-arm64": "4.57.1",
1539
+ "@rollup/rollup-win32-arm64-msvc": "4.57.1",
1540
+ "@rollup/rollup-win32-ia32-msvc": "4.57.1",
1541
+ "@rollup/rollup-win32-x64-gnu": "4.57.1",
1542
+ "@rollup/rollup-win32-x64-msvc": "4.57.1",
1543
+ "fsevents": "~2.3.2"
1544
+ }
1545
+ },
1546
+ "node_modules/scheduler": {
1547
+ "version": "0.23.2",
1548
+ "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.23.2.tgz",
1549
+ "integrity": "sha512-UOShsPwz7NrMUqhR6t0hWjFduvOzbtv7toDH1/hIrfRNIDBnnBWd0CwJTGvTpngVlmwGCdP9/Zl/tVrDqcuYzQ==",
1550
+ "license": "MIT",
1551
+ "dependencies": {
1552
+ "loose-envify": "^1.1.0"
1553
+ }
1554
+ },
1555
+ "node_modules/semver": {
1556
+ "version": "6.3.1",
1557
+ "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
1558
+ "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==",
1559
+ "dev": true,
1560
+ "license": "ISC",
1561
+ "bin": {
1562
+ "semver": "bin/semver.js"
1563
+ }
1564
+ },
1565
+ "node_modules/source-map-js": {
1566
+ "version": "1.2.1",
1567
+ "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz",
1568
+ "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==",
1569
+ "dev": true,
1570
+ "license": "BSD-3-Clause",
1571
+ "engines": {
1572
+ "node": ">=0.10.0"
1573
+ }
1574
+ },
1575
+ "node_modules/update-browserslist-db": {
1576
+ "version": "1.2.3",
1577
+ "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.2.3.tgz",
1578
+ "integrity": "sha512-Js0m9cx+qOgDxo0eMiFGEueWztz+d4+M3rGlmKPT+T4IS/jP4ylw3Nwpu6cpTTP8R1MAC1kF4VbdLt3ARf209w==",
1579
+ "dev": true,
1580
+ "funding": [
1581
+ {
1582
+ "type": "opencollective",
1583
+ "url": "https://opencollective.com/browserslist"
1584
+ },
1585
+ {
1586
+ "type": "tidelift",
1587
+ "url": "https://tidelift.com/funding/github/npm/browserslist"
1588
+ },
1589
+ {
1590
+ "type": "github",
1591
+ "url": "https://github.com/sponsors/ai"
1592
+ }
1593
+ ],
1594
+ "license": "MIT",
1595
+ "dependencies": {
1596
+ "escalade": "^3.2.0",
1597
+ "picocolors": "^1.1.1"
1598
+ },
1599
+ "bin": {
1600
+ "update-browserslist-db": "cli.js"
1601
+ },
1602
+ "peerDependencies": {
1603
+ "browserslist": ">= 4.21.0"
1604
+ }
1605
+ },
1606
+ "node_modules/vite": {
1607
+ "version": "5.4.21",
1608
+ "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.21.tgz",
1609
+ "integrity": "sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==",
1610
+ "dev": true,
1611
+ "license": "MIT",
1612
+ "dependencies": {
1613
+ "esbuild": "^0.21.3",
1614
+ "postcss": "^8.4.43",
1615
+ "rollup": "^4.20.0"
1616
+ },
1617
+ "bin": {
1618
+ "vite": "bin/vite.js"
1619
+ },
1620
+ "engines": {
1621
+ "node": "^18.0.0 || >=20.0.0"
1622
+ },
1623
+ "funding": {
1624
+ "url": "https://github.com/vitejs/vite?sponsor=1"
1625
+ },
1626
+ "optionalDependencies": {
1627
+ "fsevents": "~2.3.3"
1628
+ },
1629
+ "peerDependencies": {
1630
+ "@types/node": "^18.0.0 || >=20.0.0",
1631
+ "less": "*",
1632
+ "lightningcss": "^1.21.0",
1633
+ "sass": "*",
1634
+ "sass-embedded": "*",
1635
+ "stylus": "*",
1636
+ "sugarss": "*",
1637
+ "terser": "^5.4.0"
1638
+ },
1639
+ "peerDependenciesMeta": {
1640
+ "@types/node": {
1641
+ "optional": true
1642
+ },
1643
+ "less": {
1644
+ "optional": true
1645
+ },
1646
+ "lightningcss": {
1647
+ "optional": true
1648
+ },
1649
+ "sass": {
1650
+ "optional": true
1651
+ },
1652
+ "sass-embedded": {
1653
+ "optional": true
1654
+ },
1655
+ "stylus": {
1656
+ "optional": true
1657
+ },
1658
+ "sugarss": {
1659
+ "optional": true
1660
+ },
1661
+ "terser": {
1662
+ "optional": true
1663
+ }
1664
+ }
1665
+ },
1666
+ "node_modules/yallist": {
1667
+ "version": "3.1.1",
1668
+ "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz",
1669
+ "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==",
1670
+ "dev": true,
1671
+ "license": "ISC"
1672
+ }
1673
+ }
1674
+ }
react-ui/package.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "af3-chatgpt-pipeline-ui",
3
+ "version": "0.1.0",
4
+ "private": true,
5
+ "type": "module",
6
+ "scripts": {
7
+ "dev": "vite",
8
+ "build": "vite build",
9
+ "preview": "vite preview"
10
+ },
11
+ "dependencies": {
12
+ "react": "^18.3.1",
13
+ "react-dom": "^18.3.1"
14
+ },
15
+ "devDependencies": {
16
+ "@vitejs/plugin-react": "^4.3.4",
17
+ "vite": "^5.4.11"
18
+ }
19
+ }
react-ui/src/App.jsx ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useEffect, useMemo, useState } from "react";
2
+
3
+ const DEFAULT_PROMPT =
4
+ "Analyze this full song and provide concise, timestamped sections describing vocals, instrumentation, production effects, mix changes, energy flow, and genre cues. End with a short overall summary.";
5
+
6
+ export default function App() {
7
+ const [mode, setMode] = useState("path");
8
+ const [audioPath, setAudioPath] = useState("E:\\Coding\\hf-music-gen\\train-dataset\\Andrew Spacey - Wonder (Prod Beat It AT).mp3");
9
+ const [audioFile, setAudioFile] = useState(null);
10
+ const [backend, setBackend] = useState("hf_endpoint");
11
+ const [endpointUrl, setEndpointUrl] = useState("");
12
+ const [hfToken, setHfToken] = useState("");
13
+ const [modelId, setModelId] = useState("nvidia/audio-flamingo-3-hf");
14
+ const [openAiApiKey, setOpenAiApiKey] = useState("");
15
+ const [openAiModel, setOpenAiModel] = useState("gpt-5-mini");
16
+ const [prompt, setPrompt] = useState(DEFAULT_PROMPT);
17
+ const [userContext, setUserContext] = useState("");
18
+ const [artistName, setArtistName] = useState("");
19
+ const [trackName, setTrackName] = useState("");
20
+ const [enableWebSearch, setEnableWebSearch] = useState(false);
21
+ const [loading, setLoading] = useState(false);
22
+ const [error, setError] = useState("");
23
+ const [result, setResult] = useState(null);
24
+
25
+ useEffect(() => {
26
+ let mounted = true;
27
+ fetch("/api/config")
28
+ .then((r) => r.json())
29
+ .then((data) => {
30
+ if (!mounted) return;
31
+ const d = data?.defaults || {};
32
+ if (d.backend) setBackend(d.backend);
33
+ if (d.endpoint_url) setEndpointUrl(d.endpoint_url);
34
+ if (d.model_id) setModelId(d.model_id);
35
+ if (d.openai_model) setOpenAiModel(d.openai_model);
36
+ if (d.af3_prompt) setPrompt(d.af3_prompt);
37
+ })
38
+ .catch(() => {});
39
+ return () => {
40
+ mounted = false;
41
+ };
42
+ }, []);
43
+
44
+ const requestPreview = useMemo(() => {
45
+ return {
46
+ backend,
47
+ endpoint_url: endpointUrl || "(env default)",
48
+ model_id: modelId,
49
+ openai_model: openAiModel,
50
+ enable_web_search: enableWebSearch,
51
+ artist_name: artistName || "(none)",
52
+ track_name: trackName || "(none)",
53
+ };
54
+ }, [backend, endpointUrl, modelId, openAiModel, enableWebSearch, artistName, trackName]);
55
+
56
+ async function runPipeline() {
57
+ setLoading(true);
58
+ setError("");
59
+ setResult(null);
60
+ try {
61
+ let response;
62
+ if (mode === "path") {
63
+ response = await fetch("/api/pipeline/run-path", {
64
+ method: "POST",
65
+ headers: { "Content-Type": "application/json" },
66
+ body: JSON.stringify({
67
+ audio_path: audioPath,
68
+ backend,
69
+ endpoint_url: endpointUrl,
70
+ hf_token: hfToken,
71
+ model_id: modelId,
72
+ af3_prompt: prompt,
73
+ openai_api_key: openAiApiKey,
74
+ openai_model: openAiModel,
75
+ user_context: userContext,
76
+ artist_name: artistName,
77
+ track_name: trackName,
78
+ enable_web_search: enableWebSearch,
79
+ }),
80
+ });
81
+ } else {
82
+ if (!audioFile) {
83
+ throw new Error("Select an audio file first.");
84
+ }
85
+ const form = new FormData();
86
+ form.append("audio_file", audioFile);
87
+ form.append("backend", backend);
88
+ form.append("endpoint_url", endpointUrl);
89
+ form.append("hf_token", hfToken);
90
+ form.append("model_id", modelId);
91
+ form.append("af3_prompt", prompt);
92
+ form.append("openai_api_key", openAiApiKey);
93
+ form.append("openai_model", openAiModel);
94
+ form.append("user_context", userContext);
95
+ form.append("artist_name", artistName);
96
+ form.append("track_name", trackName);
97
+ form.append("enable_web_search", String(enableWebSearch));
98
+ response = await fetch("/api/pipeline/run-upload", {
99
+ method: "POST",
100
+ body: form,
101
+ });
102
+ }
103
+
104
+ const data = await response.json();
105
+ if (!response.ok) {
106
+ const detail = typeof data?.detail === "string" ? data.detail : JSON.stringify(data);
107
+ throw new Error(detail);
108
+ }
109
+ setResult(data);
110
+ } catch (err) {
111
+ setError(err.message || String(err));
112
+ } finally {
113
+ setLoading(false);
114
+ }
115
+ }
116
+
117
+ return (
118
+ <div className="page">
119
+ <div className="hero">
120
+ <h1>AF3 + ChatGPT Pipeline</h1>
121
+ <p>Run Audio Flamingo 3 analysis, then clean/structure for Ace Step 1.5 LoRA metadata.</p>
122
+ </div>
123
+
124
+ <div className="grid">
125
+ <section className="card">
126
+ <h2>Inputs</h2>
127
+ <div className="row">
128
+ <label>Mode</label>
129
+ <select value={mode} onChange={(e) => setMode(e.target.value)}>
130
+ <option value="path">Local Path</option>
131
+ <option value="upload">Upload</option>
132
+ </select>
133
+ </div>
134
+
135
+ {mode === "path" ? (
136
+ <div className="row">
137
+ <label>Audio Path</label>
138
+ <input value={audioPath} onChange={(e) => setAudioPath(e.target.value)} />
139
+ </div>
140
+ ) : (
141
+ <div className="row">
142
+ <label>Audio File</label>
143
+ <input type="file" accept="audio/*" onChange={(e) => setAudioFile(e.target.files?.[0] || null)} />
144
+ </div>
145
+ )}
146
+
147
+ <div className="row">
148
+ <label>AF3 Backend</label>
149
+ <select value={backend} onChange={(e) => setBackend(e.target.value)}>
150
+ <option value="hf_endpoint">HF Endpoint</option>
151
+ <option value="local">Local Model</option>
152
+ </select>
153
+ </div>
154
+ <div className="row">
155
+ <label>AF3 Endpoint URL</label>
156
+ <input value={endpointUrl} onChange={(e) => setEndpointUrl(e.target.value)} placeholder="https://..." />
157
+ </div>
158
+ <div className="row">
159
+ <label>HF Token (optional)</label>
160
+ <input type="password" value={hfToken} onChange={(e) => setHfToken(e.target.value)} />
161
+ </div>
162
+ <div className="row">
163
+ <label>AF3 Model ID</label>
164
+ <input value={modelId} onChange={(e) => setModelId(e.target.value)} />
165
+ </div>
166
+ <div className="row">
167
+ <label>OpenAI API Key (optional)</label>
168
+ <input type="password" value={openAiApiKey} onChange={(e) => setOpenAiApiKey(e.target.value)} />
169
+ </div>
170
+ <div className="row">
171
+ <label>OpenAI Model</label>
172
+ <input value={openAiModel} onChange={(e) => setOpenAiModel(e.target.value)} />
173
+ </div>
174
+ <div className="row">
175
+ <label>Artist (optional)</label>
176
+ <input value={artistName} onChange={(e) => setArtistName(e.target.value)} />
177
+ </div>
178
+ <div className="row">
179
+ <label>Track (optional)</label>
180
+ <input value={trackName} onChange={(e) => setTrackName(e.target.value)} />
181
+ </div>
182
+ <div className="row">
183
+ <label>Prompt</label>
184
+ <textarea rows={5} value={prompt} onChange={(e) => setPrompt(e.target.value)} />
185
+ </div>
186
+ <div className="row">
187
+ <label>User Context</label>
188
+ <textarea rows={4} value={userContext} onChange={(e) => setUserContext(e.target.value)} />
189
+ </div>
190
+ <div className="row inline">
191
+ <input
192
+ id="websearch"
193
+ type="checkbox"
194
+ checked={enableWebSearch}
195
+ onChange={(e) => setEnableWebSearch(e.target.checked)}
196
+ />
197
+ <label htmlFor="websearch">Enable ChatGPT web search (optional)</label>
198
+ </div>
199
+
200
+ <button className="run" disabled={loading} onClick={runPipeline}>
201
+ {loading ? "Running..." : "Run Pipeline"}
202
+ </button>
203
+ </section>
204
+
205
+ <section className="card">
206
+ <h2>Request Summary</h2>
207
+ <pre>{JSON.stringify(requestPreview, null, 2)}</pre>
208
+ {error ? <p className="error">{error}</p> : null}
209
+ {result ? (
210
+ <>
211
+ <h3>Saved Sidecar</h3>
212
+ <p className="mono">{result.saved_to}</p>
213
+ <h3>AF3 Analysis</h3>
214
+ <pre>{result.af3_analysis}</pre>
215
+ <h3>Final LoRA JSON</h3>
216
+ <pre>{JSON.stringify(result.sidecar, null, 2)}</pre>
217
+ </>
218
+ ) : null}
219
+ </section>
220
+ </div>
221
+ </div>
222
+ );
223
+ }
react-ui/src/main.jsx ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
// Entry point: mount <App /> into the #root element provided by index.html.
import React from "react";
import { createRoot } from "react-dom/client";
import App from "./App";
import "./styles.css";

// StrictMode adds development-only checks (e.g. double-invoked effects);
// it renders nothing extra in production builds.
createRoot(document.getElementById("root")).render(
  <React.StrictMode>
    <App />
  </React.StrictMode>
);
11
+
react-ui/src/styles.css ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Styles for the AF3 + ChatGPT pipeline UI (react-ui/src/App.jsx). */
@import url("https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;600;700&family=IBM+Plex+Mono:wght@400;600&display=swap");

/* Design tokens: palette, card surface, and shared shadow. */
:root {
  --bg: #f4f1e8;
  --bg-2: #e6f2ef;
  --ink: #1f2a22;
  --muted: #4b5a50;
  --brand: #0a8f6a;
  --brand-deep: #0c5b49;
  --warn: #ad1f1f;
  --card: rgba(255, 255, 255, 0.78);
  --line: rgba(31, 42, 34, 0.2);
  --shadow: 0 20px 60px rgba(6, 48, 38, 0.16);
}

* {
  box-sizing: border-box;
}

/* Page background: two soft radial accents over a diagonal gradient. */
body {
  margin: 0;
  color: var(--ink);
  font-family: "Space Grotesk", system-ui, sans-serif;
  background:
    radial-gradient(circle at 15% 10%, rgba(10, 143, 106, 0.16), transparent 45%),
    radial-gradient(circle at 85% 0%, rgba(255, 138, 61, 0.14), transparent 35%),
    linear-gradient(140deg, var(--bg), var(--bg-2));
  min-height: 100vh;
}

/* Centered content column. */
.page {
  max-width: 1200px;
  margin: 0 auto;
  padding: 28px 18px 36px;
}

/* Header block with entry animation. */
.hero {
  margin-bottom: 18px;
  animation: rise 0.55s ease;
}

.hero h1 {
  margin: 0;
  font-size: clamp(1.6rem, 3vw, 2.4rem);
  letter-spacing: -0.02em;
}

.hero p {
  margin: 6px 0 0;
  color: var(--muted);
}

/* Two-column layout: inputs card slightly wider than results card. */
.grid {
  display: grid;
  grid-template-columns: 1.1fr 1fr;
  gap: 16px;
  align-items: start;
}

/* Frosted-glass card surface. */
.card {
  background: var(--card);
  border: 1px solid var(--line);
  border-radius: 14px;
  padding: 14px;
  backdrop-filter: blur(8px);
  box-shadow: var(--shadow);
  animation: rise 0.6s ease;
}

.card h2,
.card h3 {
  margin: 0 0 12px;
}

/* A labeled form field: label stacked above its control. */
.row {
  display: grid;
  gap: 6px;
  margin-bottom: 10px;
}

.row label {
  font-size: 0.84rem;
  font-weight: 600;
  color: var(--muted);
}

/* Inline variant used for the checkbox + label pair. */
.row.inline {
  display: flex;
  align-items: center;
  gap: 8px;
}

input,
select,
textarea,
button {
  width: 100%;
  font: inherit;
}

input,
select,
textarea {
  border: 1px solid var(--line);
  border-radius: 10px;
  padding: 9px 10px;
  background: rgba(255, 255, 255, 0.94);
  color: var(--ink);
}

textarea {
  resize: vertical;
}

input:focus,
select:focus,
textarea:focus {
  outline: 2px solid rgba(10, 143, 106, 0.28);
  border-color: rgba(10, 143, 106, 0.7);
}

/* Checkbox should not stretch to full width like other inputs. */
.row.inline input[type="checkbox"] {
  width: auto;
}

/* Primary "Run Pipeline" button. */
.run {
  margin-top: 4px;
  border: 0;
  border-radius: 10px;
  padding: 11px 12px;
  font-weight: 700;
  color: #fff;
  cursor: pointer;
  background: linear-gradient(120deg, var(--brand), var(--brand-deep));
}

.run:disabled {
  opacity: 0.68;
  cursor: not-allowed;
}

/* Monospace output areas (JSON previews, analysis text, file paths). */
pre,
.mono {
  font-family: "IBM Plex Mono", ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
}

pre {
  white-space: pre-wrap;
  word-break: break-word;
  max-height: 320px;
  overflow: auto;
  background: rgba(20, 35, 29, 0.92);
  color: #f4fffa;
  padding: 10px;
  border-radius: 10px;
  border: 1px solid rgba(255, 255, 255, 0.1);
}

.error {
  margin: 10px 0;
  color: var(--warn);
  font-weight: 600;
}

.mono {
  font-size: 0.9rem;
  overflow-wrap: anywhere;
}

/* Shared entry animation for hero and cards. */
@keyframes rise {
  from {
    opacity: 0;
    transform: translateY(8px);
  }
  to {
    opacity: 1;
    transform: translateY(0);
  }
}

/* Collapse to a single column on narrow screens. */
@media (max-width: 980px) {
  .grid {
    grid-template-columns: 1fr;
  }

  .card {
    padding: 12px;
  }
}
react-ui/vite.config.js ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Vite configuration for the AF3 pipeline React UI.
import { defineConfig } from "vite";
import react from "@vitejs/plugin-react";

export default defineConfig({
  plugins: [react()],
  server: {
    port: 5173,
    // Forward /api/* requests from the dev server to the local backend API
    // (presumably the FastAPI service, whose launch scripts default to port
    // 8008) so the UI and API share an origin during development.
    proxy: {
      "/api": {
        target: "http://localhost:8008",
        changeOrigin: true,
      },
    },
  },
});
requirements.txt CHANGED
@@ -2,7 +2,7 @@ numpy
2
  soundfile
3
  torch
4
  torchaudio
5
- transformers>=4.53.0,<4.58.0
6
  accelerate
7
  huggingface_hub
8
  diffusers
@@ -18,3 +18,8 @@ peft>=0.11.0
18
  gradio>=4.0.0
19
  pandas
20
  bitsandbytes
 
 
 
 
 
 
2
  soundfile
3
  torch
4
  torchaudio
5
+ transformers>=4.57.0,<4.58.0
6
  accelerate
7
  huggingface_hub
8
  diffusers
 
18
  gradio>=4.0.0
19
  pandas
20
  bitsandbytes
21
+ fastapi
22
+ uvicorn
23
+ python-multipart
24
+ openai
25
+ python-dotenv
scripts/annotations/qwen_annotate_file.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Annotate one audio file with Qwen2-Audio and save a sidecar JSON.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import json
10
+ import os
11
+ from pathlib import Path
12
+
13
+ from qwen_audio_captioning import (
14
+ DEFAULT_ANALYSIS_PROMPT,
15
+ DEFAULT_LONG_ANALYSIS_PROMPT,
16
+ DEFAULT_MODEL_ID,
17
+ build_captioner,
18
+ generate_track_annotation,
19
+ read_prompt_file,
20
+ )
21
+
22
+
23
def read_dotenv_value(path: str, key: str) -> str:
    """Return the value for *key* from a dotenv-style file, or "" if absent.

    Blank lines, ``#`` comments, and lines without ``=`` are skipped. The
    matched value is stripped of surrounding whitespace and of enclosing
    double then single quotes. The first match wins.
    """
    env_file = Path(path)
    if not env_file.exists():
        return ""
    for entry in env_file.read_text(encoding="utf-8").splitlines():
        stripped = entry.strip()
        if not stripped or stripped.startswith("#"):
            continue
        name, sep, value = stripped.partition("=")
        if not sep:
            continue
        if name.strip() == key:
            return value.strip().strip('"').strip("'")
    return ""
35
+
36
+
37
def main() -> int:
    """CLI entry point: annotate one audio file with Qwen2-Audio and write a JSON sidecar.

    Builds a captioner for the selected backend (local model or HF endpoint),
    runs segment-wise annotation over the audio, writes the resulting sidecar
    JSON (next to the audio by default), and prints a short summary. Returns 0
    on success; errors (missing audio, backend failures) propagate as exceptions.
    """
    parser = argparse.ArgumentParser(description="Annotate a single audio file with Qwen2-Audio")
    parser.add_argument("--audio", required=True, help="Audio file path")
    parser.add_argument("--backend", default="hf_endpoint", choices=["local", "hf_endpoint"])
    parser.add_argument("--model-id", default=DEFAULT_MODEL_ID)
    parser.add_argument("--endpoint-url", default=os.getenv("HF_QWEN_ENDPOINT_URL", ""))
    parser.add_argument("--token", default="")
    parser.add_argument("--device", default="auto", choices=["auto", "cuda", "cpu", "mps"])
    parser.add_argument("--torch-dtype", default="auto", choices=["auto", "float16", "bfloat16", "float32"])
    parser.add_argument("--prompt", default=DEFAULT_ANALYSIS_PROMPT)
    parser.add_argument("--prompt-file", default="")
    parser.add_argument("--include-long-analysis", action="store_true")
    parser.add_argument("--long-analysis-prompt", default=DEFAULT_LONG_ANALYSIS_PROMPT)
    parser.add_argument("--long-analysis-prompt-file", default="")
    parser.add_argument("--long-analysis-max-new-tokens", type=int, default=1200)
    parser.add_argument("--long-analysis-temperature", type=float, default=0.1)
    parser.add_argument("--segment-seconds", type=float, default=30.0)
    parser.add_argument("--overlap-seconds", type=float, default=2.0)
    parser.add_argument("--max-new-tokens", type=int, default=384)
    parser.add_argument("--temperature", type=float, default=0.1)
    parser.add_argument("--keep-raw-outputs", action="store_true")
    parser.add_argument("--output-json", default="", help="Output JSON path (default: audio sidecar)")
    args = parser.parse_args()

    audio_path = Path(args.audio)
    if not audio_path.is_file():
        raise FileNotFoundError(f"Audio not found: {audio_path}")

    # Prompt files, when provided, take precedence over the inline prompt strings.
    prompt = read_prompt_file(args.prompt_file) if args.prompt_file else args.prompt
    long_prompt = (
        read_prompt_file(args.long_analysis_prompt_file)
        if args.long_analysis_prompt_file
        else args.long_analysis_prompt
    )
    # Token resolution order: CLI flag, HF_TOKEN env var, then .env entries
    # (both upper- and lower-case key spellings).
    token = (
        args.token
        or os.getenv("HF_TOKEN", "")
        or read_dotenv_value(".env", "HF_TOKEN")
        or read_dotenv_value(".env", "hf_token")
    )

    captioner = build_captioner(
        backend=args.backend,
        model_id=args.model_id,
        endpoint_url=args.endpoint_url,
        token=token,
        device=args.device,
        torch_dtype=args.torch_dtype,
    )

    sidecar = generate_track_annotation(
        audio_path=str(audio_path),
        captioner=captioner,
        prompt=prompt,
        segment_seconds=float(args.segment_seconds),
        overlap_seconds=float(args.overlap_seconds),
        max_new_tokens=int(args.max_new_tokens),
        temperature=float(args.temperature),
        keep_raw_outputs=bool(args.keep_raw_outputs),
        include_long_analysis=bool(args.include_long_analysis),
        long_analysis_prompt=long_prompt,
        long_analysis_max_new_tokens=int(args.long_analysis_max_new_tokens),
        long_analysis_temperature=float(args.long_analysis_temperature),
    )

    # Default sidecar location: same basename as the audio, .json extension.
    out_path = Path(args.output_json) if args.output_json else audio_path.with_suffix(".json")
    out_path.write_text(json.dumps(sidecar, indent=2, ensure_ascii=False), encoding="utf-8")
    # Print a compact summary of the key fields for quick inspection.
    print(
        json.dumps(
            {
                "saved_to": str(out_path),
                "caption": sidecar.get("caption", ""),
                "bpm": sidecar.get("bpm"),
                "keyscale": sidecar.get("keyscale", ""),
                "duration": sidecar.get("duration"),
                "segment_count": sidecar.get("music_analysis", {}).get("segment_count"),
            },
            indent=2,
            ensure_ascii=False,
        )
    )
    return 0
119
+
120
+
121
+ if __name__ == "__main__":
122
+ raise SystemExit(main())
scripts/annotations/qwen_caption_dataset.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Batch caption a music dataset with Qwen2-Audio and export LoRA-ready sidecars.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import os
10
+ import tempfile
11
+ from pathlib import Path
12
+ from typing import List
13
+
14
+ from huggingface_hub import HfApi, snapshot_download
15
+ from loguru import logger
16
+ from tqdm import tqdm
17
+
18
+ from qwen_audio_captioning import (
19
+ DEFAULT_ANALYSIS_PROMPT,
20
+ DEFAULT_LONG_ANALYSIS_PROMPT,
21
+ DEFAULT_MODEL_ID,
22
+ build_captioner,
23
+ export_annotation_records,
24
+ generate_track_annotation,
25
+ list_audio_files,
26
+ read_prompt_file,
27
+ )
28
+
29
+
30
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for batch Qwen2-Audio captioning.

    Argument groups: data source (local dir or HF dataset repo), inference
    backend, prompt/generation controls, export options, and an optional
    upload of the exported folder to a HF dataset repo.
    """
    p = argparse.ArgumentParser(description="Qwen2-Audio batch captioning for LoRA datasets")

    # Data source: either --dataset-dir or --dataset-repo must be supplied
    # (enforced later in resolve_dataset_dir, not by argparse).
    p.add_argument("--dataset-dir", type=str, default="", help="Local dataset folder")
    p.add_argument("--dataset-repo", type=str, default="", help="HF dataset repo id")
    p.add_argument("--dataset-revision", type=str, default="main", help="HF dataset revision")
    p.add_argument("--dataset-subdir", type=str, default="", help="Subdirectory inside dataset")

    # Backend
    p.add_argument("--backend", type=str, default="local", choices=["local", "hf_endpoint"])
    p.add_argument("--model-id", type=str, default=DEFAULT_MODEL_ID)
    p.add_argument("--endpoint-url", type=str, default="")
    p.add_argument("--hf-token", type=str, default="", help="HF token (or use HF_TOKEN env var)")
    p.add_argument("--device", type=str, default="auto", choices=["auto", "cuda", "cpu", "mps"])
    p.add_argument("--torch-dtype", type=str, default="auto", choices=["auto", "float16", "bfloat16", "float32"])

    # Prompt + generation controls
    p.add_argument("--prompt", type=str, default=DEFAULT_ANALYSIS_PROMPT)
    p.add_argument("--prompt-file", type=str, default="", help="Text file to override --prompt")
    p.add_argument("--include-long-analysis", action="store_true", help="Also request long prose analysis")
    p.add_argument("--long-analysis-prompt", type=str, default=DEFAULT_LONG_ANALYSIS_PROMPT)
    p.add_argument("--long-analysis-prompt-file", type=str, default="", help="Text file to override --long-analysis-prompt")
    p.add_argument("--long-analysis-max-new-tokens", type=int, default=1200)
    p.add_argument("--long-analysis-temperature", type=float, default=0.1)
    p.add_argument("--segment-seconds", type=float, default=30.0)
    p.add_argument("--overlap-seconds", type=float, default=2.0)
    p.add_argument("--max-new-tokens", type=int, default=384)
    p.add_argument("--temperature", type=float, default=0.1)
    p.add_argument("--keep-raw-outputs", action="store_true", help="Store per-segment raw outputs in sidecar JSON")

    # Export
    p.add_argument("--output-dir", type=str, default="qwen_annotations")
    p.add_argument("--copy-audio", action="store_true", help="Copy audio files into output_dir/dataset")
    # BooleanOptionalAction auto-generates the paired --no-write-inplace-sidecars flag.
    p.add_argument(
        "--write-inplace-sidecars",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Write sidecars next to source audio (default: true). Use --no-write-inplace-sidecars to disable.",
    )

    # Optional upload of exported folder
    p.add_argument("--upload-repo", type=str, default="", help="Optional HF dataset repo to upload exports")
    p.add_argument("--upload-private", action="store_true", help="Create upload repo as private")
    p.add_argument("--upload-path", type=str, default="", help="Optional path inside upload repo")

    return p
77
+
78
+
79
def resolve_dataset_dir(args) -> str:
    """Return a local directory of audio to process.

    If --dataset-dir is set, validate and return it. Otherwise download
    --dataset-repo at --dataset-revision into a fresh temp directory and
    return it (or the --dataset-subdir within it). Raises FileNotFoundError
    for missing paths and ValueError when neither source is given.

    Note: the temp directory is not cleaned up here; it lives for the
    duration of the run.
    """
    if args.dataset_dir:
        if not Path(args.dataset_dir).is_dir():
            raise FileNotFoundError(f"Dataset folder not found: {args.dataset_dir}")
        return args.dataset_dir

    if not args.dataset_repo:
        raise ValueError("Provide --dataset-dir or --dataset-repo")

    token = args.hf_token or os.getenv("HF_TOKEN", "")
    temp_root = tempfile.mkdtemp(prefix="qwen_caption_dataset_")
    local_dir = os.path.join(temp_root, "dataset")
    logger.info(f"Downloading dataset {args.dataset_repo}@{args.dataset_revision} -> {local_dir}")
    snapshot_download(
        repo_id=args.dataset_repo,
        repo_type="dataset",
        revision=args.dataset_revision,
        local_dir=local_dir,
        # NOTE(review): local_dir_use_symlinks is deprecated/ignored in recent
        # huggingface_hub releases — confirm against the pinned version.
        local_dir_use_symlinks=False,
        token=token or None,
    )
    if args.dataset_subdir:
        sub = os.path.join(local_dir, args.dataset_subdir)
        if not Path(sub).is_dir():
            raise FileNotFoundError(f"Dataset subdir not found: {sub}")
        return sub
    return local_dir
106
+
107
+
108
def upload_export_if_requested(args, output_dir: str):
    """Upload *output_dir* to the HF dataset repo named by --upload-repo.

    No-op when --upload-repo is unset. Requires a token via --hf-token or the
    HF_TOKEN env var (RuntimeError otherwise). Creates the repo if needed
    (exist_ok), then uploads the folder under --upload-path (repo root when
    empty).
    """
    if not args.upload_repo:
        return
    token = args.hf_token or os.getenv("HF_TOKEN", "")
    if not token:
        raise RuntimeError("HF token missing. Set --hf-token or HF_TOKEN.")

    api = HfApi(token=token)
    api.create_repo(
        repo_id=args.upload_repo,
        repo_type="dataset",
        private=bool(args.upload_private),
        exist_ok=True,
    )
    # Normalize the in-repo path: strip whitespace and surrounding slashes.
    path_in_repo = args.upload_path.strip().strip("/") if args.upload_path else ""
    logger.info(f"Uploading {output_dir} -> {args.upload_repo}/{path_in_repo}")
    api.upload_folder(
        repo_id=args.upload_repo,
        repo_type="dataset",
        folder_path=output_dir,
        path_in_repo=path_in_repo,
        commit_message="Upload Qwen2-Audio annotations",
    )
    logger.info("Upload complete")
132
+
133
+
134
def main() -> int:
    """CLI entry point: caption every audio file in a dataset and export sidecars.

    Resolves the dataset (local dir or HF repo download), builds the captioner,
    annotates each file (failures are logged and collected, not fatal), exports
    the records, and optionally uploads the export folder. Returns 0 even when
    some files failed; failures are reported via the logger.
    """
    args = build_parser().parse_args()
    # Prompt files, when provided, take precedence over inline prompt strings.
    prompt = read_prompt_file(args.prompt_file) if args.prompt_file else args.prompt
    long_prompt = (
        read_prompt_file(args.long_analysis_prompt_file)
        if args.long_analysis_prompt_file
        else args.long_analysis_prompt
    )
    token = args.hf_token or os.getenv("HF_TOKEN", "")

    dataset_dir = resolve_dataset_dir(args)
    audio_files: List[str] = list_audio_files(dataset_dir)
    if not audio_files:
        raise RuntimeError(f"No audio files found in {dataset_dir}")
    logger.info(f"Found {len(audio_files)} audio files")

    captioner = build_captioner(
        backend=args.backend,
        model_id=args.model_id,
        endpoint_url=args.endpoint_url,
        token=token,
        device=args.device,
        torch_dtype=args.torch_dtype,
    )

    # Annotate each file; a failure on one file is recorded and the batch continues.
    records = []
    failed = []
    for path in tqdm(audio_files, desc="Captioning audio"):
        try:
            sidecar = generate_track_annotation(
                audio_path=path,
                captioner=captioner,
                prompt=prompt,
                segment_seconds=float(args.segment_seconds),
                overlap_seconds=float(args.overlap_seconds),
                max_new_tokens=int(args.max_new_tokens),
                temperature=float(args.temperature),
                keep_raw_outputs=bool(args.keep_raw_outputs),
                include_long_analysis=bool(args.include_long_analysis),
                long_analysis_prompt=long_prompt,
                long_analysis_max_new_tokens=int(args.long_analysis_max_new_tokens),
                long_analysis_temperature=float(args.long_analysis_temperature),
            )
            records.append({"audio_path": path, "sidecar": sidecar})
        except Exception as exc:
            failed.append(f"{Path(path).name}: {exc}")
            logger.exception(f"Failed: {path}")

    export_result = export_annotation_records(
        records=records,
        output_dir=args.output_dir,
        copy_audio=bool(args.copy_audio),
        write_inplace_sidecars=bool(args.write_inplace_sidecars),
    )

    logger.info(
        "Done. analyzed={} failed={} manifest={}",
        len(records),
        len(failed),
        export_result["manifest_path"],
    )
    if failed:
        # Cap the report at 20 entries to keep logs readable on large batches.
        logger.warning("First failures:\n" + "\n".join(failed[:20]))

    upload_export_if_requested(args, args.output_dir)
    return 0
200
+
201
+
202
+ if __name__ == "__main__":
203
+ raise SystemExit(main())
scripts/dev/run_af3_gui.ps1 ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
param(
    [string]$BindHost = "127.0.0.1",
    [int]$Port = 8008,
    [switch]$Reload,
    [switch]$NoBrowser,
    [switch]$SkipNpmInstall,
    [switch]$SkipBuild
)

# Launch the AF3 GUI via the Python entry point, forwarding each switch as
# the matching CLI flag of af3_gui_app.py.
#
# Fix: the original assembled the command into a variable named $args and
# spliced it apart to recover the executable. $args is a PowerShell automatic
# variable and must not be assigned to (PSScriptAnalyzer
# PSAvoidAssignmentToAutomaticVariable); use a dedicated list and splat it.
$launchArgs = @("af3_gui_app.py", "--host", $BindHost, "--port", "$Port")
if ($Reload) { $launchArgs += "--reload" }
if ($NoBrowser) { $launchArgs += "--no-browser" }
if ($SkipNpmInstall) { $launchArgs += "--skip-npm-install" }
if ($SkipBuild) { $launchArgs += "--skip-build" }

& python @launchArgs
scripts/dev/run_af3_gui.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """Build and launch the AF3 + ChatGPT GUI stack (API + React UI)."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import shutil
8
+ import subprocess
9
+ import sys
10
+ import threading
11
+ import webbrowser
12
+ from pathlib import Path
13
+
14
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
15
+ if str(PROJECT_ROOT) not in sys.path:
16
+ sys.path.insert(0, str(PROJECT_ROOT))
17
+
18
+ from utils.env_config import load_project_env
19
+
20
+
21
+ def _run(cmd: list[str], cwd: Path) -> None:
22
+ proc = subprocess.run(cmd, cwd=str(cwd), check=False)
23
+ if proc.returncode != 0:
24
+ raise RuntimeError(f"Command failed ({proc.returncode}): {' '.join(cmd)}")
25
+
26
+
27
+ def _build_frontend(skip_npm_install: bool, skip_build: bool) -> None:
28
+ react_dir = PROJECT_ROOT / "react-ui"
29
+ if not react_dir.exists():
30
+ raise FileNotFoundError(f"React UI folder missing: {react_dir}")
31
+
32
+ npm = shutil.which("npm")
33
+ if not npm:
34
+ raise RuntimeError("`npm` was not found. Install Node.js (includes npm) first.")
35
+
36
+ if not skip_npm_install and not (react_dir / "node_modules").exists():
37
+ _run([npm, "install"], cwd=react_dir)
38
+
39
+ if not skip_build:
40
+ _run([npm, "run", "build"], cwd=react_dir)
41
+
42
+
43
+ def build_parser() -> argparse.ArgumentParser:
44
+ p = argparse.ArgumentParser(description="Launch AF3 GUI (FastAPI + built React frontend)")
45
+ p.add_argument("--host", default="127.0.0.1")
46
+ p.add_argument("--port", type=int, default=8008)
47
+ p.add_argument("--reload", action="store_true", help="Enable uvicorn reload mode")
48
+ p.add_argument("--no-browser", action="store_true", help="Do not open browser automatically")
49
+ p.add_argument("--skip-npm-install", action="store_true", help="Skip npm install")
50
+ p.add_argument("--skip-build", action="store_true", help="Skip frontend build")
51
+ return p
52
+
53
+
54
+ def main() -> int:
55
+ args = build_parser().parse_args()
56
+ load_project_env()
57
+
58
+ _build_frontend(skip_npm_install=bool(args.skip_npm_install), skip_build=bool(args.skip_build))
59
+
60
+ url = f"http://{args.host}:{args.port}"
61
+ if not args.no_browser:
62
+ threading.Timer(1.0, lambda: webbrowser.open(url)).start()
63
+
64
+ import uvicorn
65
+
66
+ uvicorn.run(
67
+ "services.pipeline_api:app",
68
+ host=args.host,
69
+ port=int(args.port),
70
+ reload=bool(args.reload),
71
+ )
72
+ return 0
73
+
74
+
75
+ if __name__ == "__main__":
76
+ raise SystemExit(main())
scripts/endpoint/test_af3_caption_endpoint.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Send one audio file to an Audio Flamingo 3 endpoint and print/save the response.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import base64
10
+ import io
11
+ import json
12
+ import sys
13
+ from pathlib import Path
14
+ from urllib.error import HTTPError, URLError
15
+ from urllib.request import Request, urlopen
16
+
17
+ import soundfile as sf
18
+
19
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
20
+ if str(PROJECT_ROOT) not in sys.path:
21
+ sys.path.insert(0, str(PROJECT_ROOT))
22
+
23
+ from af3_chatgpt_pipeline import DEFAULT_AF3_PROMPT, DEFAULT_AF3_PROMPT_THINK_LONG
24
+ from qwen_audio_captioning import load_audio_mono
25
+ from utils.env_config import get_env, load_project_env
26
+
27
+
28
def load_audio_b64(audio_path: str, target_sr: int = 16000) -> str:
    """Decode *audio_path* to mono at *target_sr* and return it as base64 WAV."""
    samples, rate = load_audio_mono(audio_path, target_sr=target_sr)
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, samples, int(rate), format="WAV")
    return base64.b64encode(wav_buffer.getvalue()).decode("utf-8")
33
+
34
+
35
def send(url: str, token: str, payload: dict) -> dict:
    """POST *payload* as JSON to *url* and return the decoded JSON response.

    On HTTP errors, known AF3 endpoint failure fingerprints in the error body
    get an actionable troubleshooting hint appended before re-raising.
    """
    headers = {"Content-Type": "application/json"}
    if token:
        headers["Authorization"] = f"Bearer {token}"
    request = Request(
        url=url,
        method="POST",
        data=json.dumps(payload).encode("utf-8"),
        headers=headers,
    )
    try:
        with urlopen(request, timeout=600) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except HTTPError as exc:
        body = exc.read().decode("utf-8", errors="replace")
        lower = body.lower()
        # Every needle in a rule must appear in the body for its hint to apply.
        hint_rules = (
            (
                ("endpoint is in error",),
                "\nHint: open the endpoint page and restart/redeploy. "
                "This is a remote runtime failure, not a local script issue.",
            ),
            (
                ("no custom pipeline found",),
                "\nHint: endpoint repo root must contain handler.py; "
                "ensure you deployed templates/hf-af3-caption-endpoint files.",
            ),
            (
                ("audioflamingo3", "does not recognize"),
                "\nHint: runtime transformers is too old. "
                "Use templates/hf-af3-caption-endpoint/handler.py bootstrap runtime "
                "(AF3_TRANSFORMERS_SPEC=transformers==5.1.0) and redeploy.",
            ),
            (
                ("failed to load af3 processor classes after runtime bootstrap",),
                "\nHint: endpoint startup could not install/load AF3 runtime deps. "
                "Check startup logs for pip/network/disk issues and keep task=custom.",
            ),
        )
        for needles, hint in hint_rules:
            if all(needle in lower for needle in needles):
                body += hint
        raise RuntimeError(f"HTTP {exc.code}: {body}") from exc
    except URLError as exc:
        raise RuntimeError(f"Network error: {exc}") from exc
76
+
77
+
78
def main() -> int:
    """CLI entry point: send one audio file to an AF3 endpoint and show the reply."""
    load_project_env()
    parser = argparse.ArgumentParser(description="Test AF3 caption endpoint")
    parser.add_argument(
        "--url",
        default=get_env("HF_AF3_ENDPOINT_URL", "hf_af3_endpoint_url"),
        required=False,
    )
    parser.add_argument(
        "--token",
        default=get_env("HF_TOKEN", "hf_token"),
        required=False,
    )
    parser.add_argument("--audio", required=True, help="Path to local audio file")
    parser.add_argument("--prompt", default=DEFAULT_AF3_PROMPT)
    parser.add_argument(
        "--mode",
        choices=["auto", "think", "single"],
        default="auto",
        help="Optional AF3 mode selector for NVIDIA-stack endpoints.",
    )
    parser.add_argument(
        "--think-long",
        action="store_true",
        help="Use long-form AF3 prompt + higher token budget defaults.",
    )
    parser.add_argument("--max-new-tokens", type=int, default=1400)
    parser.add_argument("--temperature", type=float, default=0.1)
    parser.add_argument("--save-json", default="", help="Optional output JSON path")
    args = parser.parse_args()

    if not args.url:
        raise RuntimeError("Missing endpoint URL. Pass --url or set HF_AF3_ENDPOINT_URL.")
    if not Path(args.audio).is_file():
        raise FileNotFoundError(f"Audio file not found: {args.audio}")

    prompt = args.prompt
    budget = int(args.max_new_tokens)
    temp = float(args.temperature)
    if args.think_long:
        # Only upgrade values the user left at their CLI defaults.
        if prompt == DEFAULT_AF3_PROMPT:
            prompt = DEFAULT_AF3_PROMPT_THINK_LONG
        if budget == 1400:
            budget = 3200
        if abs(temp - 0.1) < 1e-9:
            temp = 0.2

    inputs = {
        "prompt": prompt,
        "audio_base64": load_audio_b64(args.audio, target_sr=16000),
        "sample_rate": 16000,
        "max_new_tokens": budget,
        "temperature": temp,
    }
    if args.mode != "auto":
        inputs["think_mode"] = bool(args.mode == "think")

    result = send(args.url, args.token, {"inputs": inputs})
    try:
        print(json.dumps(result, indent=2, ensure_ascii=False))
    except UnicodeEncodeError:
        # Fallback for Windows cp1252 terminals when model emits non-ASCII punctuation.
        print(json.dumps(result, indent=2, ensure_ascii=True))
    if args.save_json:
        Path(args.save_json).write_text(
            json.dumps(result, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        print(f"Saved: {args.save_json}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
scripts/endpoint/test_qwen_caption_endpoint.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Send one audio file to a Qwen caption endpoint and print/save the response.
4
+
5
+ Request contract expected by templates/hf-qwen-caption-endpoint/handler.py:
6
+ {
7
+ "inputs": {
8
+ "prompt": "...",
9
+ "audio_base64": "...",
10
+ "sample_rate": 16000,
11
+ "max_new_tokens": 384,
12
+ "temperature": 0.1
13
+ }
14
+ }
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import base64
21
+ import json
22
+ import os
23
+ from pathlib import Path
24
+ from urllib.error import HTTPError, URLError
25
+ from urllib.request import Request, urlopen
26
+
27
+ import soundfile as sf
28
+
29
+ from qwen_audio_captioning import DEFAULT_ANALYSIS_PROMPT, load_audio_mono
30
+
31
+
32
def read_dotenv_value(path: str, key: str) -> str:
    """Return *key*'s value from a dotenv-style file, or "" when absent.

    Blank lines, comment lines, and lines without "=" are skipped; surrounding
    single or double quotes are stripped from the returned value.
    """
    env_file = Path(path)
    if not env_file.exists():
        return ""
    for raw_line in env_file.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        name, _, value = entry.partition("=")
        if name.strip() == key:
            return value.strip().strip('"').strip("'")
    return ""
+
45
+
46
def load_audio_b64(audio_path: str, target_sr: int) -> str:
    """Decode *audio_path* to mono at *target_sr* and return base64 WAV bytes."""
    samples, rate = load_audio_mono(audio_path, target_sr=target_sr)

    import io

    wav_buf = io.BytesIO()
    sf.write(wav_buf, samples, int(rate), format="WAV")
    return base64.b64encode(wav_buf.getvalue()).decode("utf-8")
54
+
55
+
56
def send(url: str, token: str, payload: dict) -> dict:
    """POST *payload* as JSON to *url* with a bearer token; return the parsed reply."""
    request = Request(
        url=url,
        method="POST",
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
        },
    )
    try:
        with urlopen(request, timeout=600) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"HTTP {exc.code}: {detail}") from exc
    except URLError as exc:
        raise RuntimeError(f"Network error: {exc}") from exc
75
+
76
+
77
def main() -> int:
    """CLI entry point: send one audio file to a Qwen caption endpoint and show the reply."""
    parser = argparse.ArgumentParser(description="Test Qwen caption endpoint")
    parser.add_argument(
        "--url",
        default=os.getenv("HF_QWEN_ENDPOINT_URL", "") or read_dotenv_value(".env", "HF_QWEN_ENDPOINT_URL"),
        required=False,
    )
    parser.add_argument(
        "--token",
        default=(
            os.getenv("HF_TOKEN", "")
            or os.getenv("hf_token", "")
            or read_dotenv_value(".env", "HF_TOKEN")
            or read_dotenv_value(".env", "hf_token")
        ),
        required=False,
    )
    parser.add_argument("--audio", required=True, help="Path to local audio file")
    parser.add_argument("--prompt", default=DEFAULT_ANALYSIS_PROMPT)
    parser.add_argument("--sample-rate", type=int, default=16000)
    parser.add_argument("--max-new-tokens", type=int, default=384)
    parser.add_argument("--temperature", type=float, default=0.1)
    parser.add_argument("--save-json", default="", help="Optional output JSON path")
    args = parser.parse_args()

    # Fail early with actionable messages before doing any audio work.
    if not args.url:
        raise RuntimeError("Missing endpoint URL. Pass --url or set HF_QWEN_ENDPOINT_URL.")
    if not args.token:
        raise RuntimeError("Missing HF token. Pass --token or set HF_TOKEN.")
    if not Path(args.audio).is_file():
        raise FileNotFoundError(f"Audio file not found: {args.audio}")

    request_inputs = {
        "prompt": args.prompt,
        "audio_base64": load_audio_b64(args.audio, target_sr=args.sample_rate),
        "sample_rate": args.sample_rate,
        "max_new_tokens": args.max_new_tokens,
        "temperature": args.temperature,
    }
    result = send(args.url, args.token, {"inputs": request_inputs})
    rendered = json.dumps(result, indent=2, ensure_ascii=False)
    print(rendered)
    if args.save_json:
        Path(args.save_json).write_text(rendered, encoding="utf-8")
        print(f"Saved: {args.save_json}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
scripts/hf_clone.py CHANGED
@@ -5,6 +5,8 @@ Bootstrap this project into your own Hugging Face Space and/or Endpoint repo.
5
  Examples:
6
  python scripts/hf_clone.py space --repo-id your-name/ace-step-lora-studio
7
  python scripts/hf_clone.py endpoint --repo-id your-name/ace-step-endpoint
 
 
8
  python scripts/hf_clone.py all --space-repo-id your-name/ace-step-lora-studio --endpoint-repo-id your-name/ace-step-endpoint
9
  """
10
 
@@ -172,10 +174,95 @@ def _stage_endpoint_snapshot(staging_dir: Path) -> tuple[int, int]:
172
  return copied, bytes_total
173
 
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  def _resolve_token(arg_token: str) -> str | None:
176
  if arg_token:
177
  return arg_token
178
- return os.getenv("HF_TOKEN")
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
 
181
  def _ensure_repo(
@@ -266,6 +353,72 @@ def clone_endpoint(repo_id: str, private: bool, token: str | None, dry_run: bool
266
  print(f"[endpoint] uploaded to https://huggingface.co/{repo_id}")
267
 
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  def build_parser() -> argparse.ArgumentParser:
270
  parser = argparse.ArgumentParser(description="Clone this project into your own HF Space/Endpoint repos.")
271
  subparsers = parser.add_subparsers(dest="cmd", required=True)
@@ -282,6 +435,31 @@ def build_parser() -> argparse.ArgumentParser:
282
  p_endpoint.add_argument("--token", type=str, default="", help="HF token (default: HF_TOKEN env var).")
283
  p_endpoint.add_argument("--dry-run", action="store_true", help="Stage files only; do not upload.")
284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  p_all = subparsers.add_parser("all", help="Run both Space and Endpoint bootstrap.")
286
  p_all.add_argument("--space-repo-id", required=True, help="Target space repo id.")
287
  p_all.add_argument("--endpoint-repo-id", required=True, help="Target endpoint model repo id.")
@@ -305,6 +483,12 @@ def main() -> int:
305
  clone_space(args.repo_id, private=bool(args.private), token=token, dry_run=bool(args.dry_run))
306
  elif args.cmd == "endpoint":
307
  clone_endpoint(args.repo_id, private=bool(args.private), token=token, dry_run=bool(args.dry_run))
 
 
 
 
 
 
308
  else:
309
  clone_space(args.space_repo_id, private=bool(args.space_private), token=token, dry_run=bool(args.dry_run))
310
  clone_endpoint(
 
5
  Examples:
6
  python scripts/hf_clone.py space --repo-id your-name/ace-step-lora-studio
7
  python scripts/hf_clone.py endpoint --repo-id your-name/ace-step-endpoint
8
+ python scripts/hf_clone.py af3-endpoint --repo-id your-name/af3-caption-endpoint
9
+ python scripts/hf_clone.py af3-nvidia-endpoint --repo-id your-name/af3-nvidia-endpoint
10
  python scripts/hf_clone.py all --space-repo-id your-name/ace-step-lora-studio --endpoint-repo-id your-name/ace-step-endpoint
11
  """
12
 
 
174
  return copied, bytes_total
175
 
176
 
177
+ def _iter_qwen_endpoint_template_paths() -> Iterable[tuple[Path, Path]]:
178
+ template_dir = PROJECT_ROOT / "templates" / "hf-qwen-caption-endpoint"
179
+ mapping = {
180
+ "handler.py": Path("handler.py"),
181
+ "requirements.txt": Path("requirements.txt"),
182
+ "README.md": Path("README.md"),
183
+ }
184
+ for src_name, dst_rel in mapping.items():
185
+ src = template_dir / src_name
186
+ if src.exists():
187
+ yield src, dst_rel
188
+
189
+
190
+ def _stage_qwen_endpoint_snapshot(staging_dir: Path) -> tuple[int, int]:
191
+ copied = 0
192
+ bytes_total = 0
193
+ for src, rel_dst in _iter_qwen_endpoint_template_paths():
194
+ dst = staging_dir / rel_dst
195
+ _copy_file(src, dst)
196
+ copied += 1
197
+ bytes_total += src.stat().st_size
198
+ return copied, bytes_total
199
+
200
+
201
+ def _iter_af3_endpoint_template_paths() -> Iterable[tuple[Path, Path]]:
202
+ template_dir = PROJECT_ROOT / "templates" / "hf-af3-caption-endpoint"
203
+ mapping = {
204
+ "handler.py": Path("handler.py"),
205
+ "requirements.txt": Path("requirements.txt"),
206
+ "README.md": Path("README.md"),
207
+ }
208
+ for src_name, dst_rel in mapping.items():
209
+ src = template_dir / src_name
210
+ if src.exists():
211
+ yield src, dst_rel
212
+
213
+
214
+ def _stage_af3_endpoint_snapshot(staging_dir: Path) -> tuple[int, int]:
215
+ copied = 0
216
+ bytes_total = 0
217
+ for src, rel_dst in _iter_af3_endpoint_template_paths():
218
+ dst = staging_dir / rel_dst
219
+ _copy_file(src, dst)
220
+ copied += 1
221
+ bytes_total += src.stat().st_size
222
+ return copied, bytes_total
223
+
224
+
225
+ def _iter_af3_nvidia_endpoint_template_paths() -> Iterable[tuple[Path, Path]]:
226
+ template_dir = PROJECT_ROOT / "templates" / "hf-af3-nvidia-endpoint"
227
+ mapping = {
228
+ "handler.py": Path("handler.py"),
229
+ "requirements.txt": Path("requirements.txt"),
230
+ "README.md": Path("README.md"),
231
+ }
232
+ for src_name, dst_rel in mapping.items():
233
+ src = template_dir / src_name
234
+ if src.exists():
235
+ yield src, dst_rel
236
+
237
+
238
+ def _stage_af3_nvidia_endpoint_snapshot(staging_dir: Path) -> tuple[int, int]:
239
+ copied = 0
240
+ bytes_total = 0
241
+ for src, rel_dst in _iter_af3_nvidia_endpoint_template_paths():
242
+ dst = staging_dir / rel_dst
243
+ _copy_file(src, dst)
244
+ copied += 1
245
+ bytes_total += src.stat().st_size
246
+ return copied, bytes_total
247
+
248
+
249
  def _resolve_token(arg_token: str) -> str | None:
250
  if arg_token:
251
  return arg_token
252
+ env_token = os.getenv("HF_TOKEN") or os.getenv("hf_token")
253
+ if env_token:
254
+ return env_token
255
+
256
+ dotenv = PROJECT_ROOT / ".env"
257
+ if dotenv.exists():
258
+ for raw in dotenv.read_text(encoding="utf-8").splitlines():
259
+ line = raw.strip()
260
+ if not line or line.startswith("#") or "=" not in line:
261
+ continue
262
+ k, v = line.split("=", 1)
263
+ if k.strip() in {"HF_TOKEN", "hf_token"}:
264
+ return v.strip().strip('"').strip("'")
265
+ return None
266
 
267
 
268
  def _ensure_repo(
 
353
  print(f"[endpoint] uploaded to https://huggingface.co/{repo_id}")
354
 
355
 
356
+ def clone_qwen_endpoint(repo_id: str, private: bool, token: str | None, dry_run: bool) -> None:
357
+ with tempfile.TemporaryDirectory(prefix="hf_qwen_endpoint_clone_") as tmp:
358
+ staging = Path(tmp)
359
+ copied, bytes_total = _stage_qwen_endpoint_snapshot(staging)
360
+ print(f"[qwen-endpoint] staged files: {copied}, size: {_fmt_mb(bytes_total)}")
361
+
362
+ if dry_run:
363
+ print("[qwen-endpoint] dry-run complete (nothing uploaded).")
364
+ return
365
+
366
+ api = HfApi(token=token)
367
+ _ensure_repo(api, repo_id=repo_id, repo_type="model", private=private)
368
+ _upload_snapshot(
369
+ api,
370
+ repo_id=repo_id,
371
+ repo_type="model",
372
+ folder_path=staging,
373
+ commit_message="Bootstrap Qwen2-Audio custom endpoint repo",
374
+ )
375
+ print(f"[qwen-endpoint] uploaded to https://huggingface.co/{repo_id}")
376
+
377
+
378
+ def clone_af3_endpoint(repo_id: str, private: bool, token: str | None, dry_run: bool) -> None:
379
+ with tempfile.TemporaryDirectory(prefix="hf_af3_endpoint_clone_") as tmp:
380
+ staging = Path(tmp)
381
+ copied, bytes_total = _stage_af3_endpoint_snapshot(staging)
382
+ print(f"[af3-endpoint] staged files: {copied}, size: {_fmt_mb(bytes_total)}")
383
+
384
+ if dry_run:
385
+ print("[af3-endpoint] dry-run complete (nothing uploaded).")
386
+ return
387
+
388
+ api = HfApi(token=token)
389
+ _ensure_repo(api, repo_id=repo_id, repo_type="model", private=private)
390
+ _upload_snapshot(
391
+ api,
392
+ repo_id=repo_id,
393
+ repo_type="model",
394
+ folder_path=staging,
395
+ commit_message="Bootstrap Audio Flamingo 3 custom endpoint repo",
396
+ )
397
+ print(f"[af3-endpoint] uploaded to https://huggingface.co/{repo_id}")
398
+
399
+
400
+ def clone_af3_nvidia_endpoint(repo_id: str, private: bool, token: str | None, dry_run: bool) -> None:
401
+ with tempfile.TemporaryDirectory(prefix="hf_af3_nvidia_endpoint_clone_") as tmp:
402
+ staging = Path(tmp)
403
+ copied, bytes_total = _stage_af3_nvidia_endpoint_snapshot(staging)
404
+ print(f"[af3-nvidia-endpoint] staged files: {copied}, size: {_fmt_mb(bytes_total)}")
405
+
406
+ if dry_run:
407
+ print("[af3-nvidia-endpoint] dry-run complete (nothing uploaded).")
408
+ return
409
+
410
+ api = HfApi(token=token)
411
+ _ensure_repo(api, repo_id=repo_id, repo_type="model", private=private)
412
+ _upload_snapshot(
413
+ api,
414
+ repo_id=repo_id,
415
+ repo_type="model",
416
+ folder_path=staging,
417
+ commit_message="Bootstrap Audio Flamingo 3 NVIDIA-stack endpoint repo",
418
+ )
419
+ print(f"[af3-nvidia-endpoint] uploaded to https://huggingface.co/{repo_id}")
420
+
421
+
422
  def build_parser() -> argparse.ArgumentParser:
423
  parser = argparse.ArgumentParser(description="Clone this project into your own HF Space/Endpoint repos.")
424
  subparsers = parser.add_subparsers(dest="cmd", required=True)
 
435
  p_endpoint.add_argument("--token", type=str, default="", help="HF token (default: HF_TOKEN env var).")
436
  p_endpoint.add_argument("--dry-run", action="store_true", help="Stage files only; do not upload.")
437
 
438
+ p_qwen_endpoint = subparsers.add_parser("qwen-endpoint", help="Create/update Qwen2-Audio custom endpoint repo.")
439
+ p_qwen_endpoint.add_argument("--repo-id", required=True, help="Target model repo id, e.g. username/my-qwen-endpoint.")
440
+ p_qwen_endpoint.add_argument("--private", action="store_true", help="Create repo as private.")
441
+ p_qwen_endpoint.add_argument("--token", type=str, default="", help="HF token (default: HF_TOKEN env var).")
442
+ p_qwen_endpoint.add_argument("--dry-run", action="store_true", help="Stage files only; do not upload.")
443
+
444
+ p_af3_endpoint = subparsers.add_parser("af3-endpoint", help="Create/update Audio Flamingo 3 custom endpoint repo.")
445
+ p_af3_endpoint.add_argument("--repo-id", required=True, help="Target model repo id, e.g. username/my-af3-endpoint.")
446
+ p_af3_endpoint.add_argument("--private", action="store_true", help="Create repo as private.")
447
+ p_af3_endpoint.add_argument("--token", type=str, default="", help="HF token (default: HF_TOKEN env var).")
448
+ p_af3_endpoint.add_argument("--dry-run", action="store_true", help="Stage files only; do not upload.")
449
+
450
+ p_af3_nvidia_endpoint = subparsers.add_parser(
451
+ "af3-nvidia-endpoint",
452
+ help="Create/update AF3 NVIDIA-stack (llava+stage35) endpoint repo.",
453
+ )
454
+ p_af3_nvidia_endpoint.add_argument(
455
+ "--repo-id",
456
+ required=True,
457
+ help="Target model repo id, e.g. username/my-af3-nvidia-endpoint.",
458
+ )
459
+ p_af3_nvidia_endpoint.add_argument("--private", action="store_true", help="Create repo as private.")
460
+ p_af3_nvidia_endpoint.add_argument("--token", type=str, default="", help="HF token (default: HF_TOKEN env var).")
461
+ p_af3_nvidia_endpoint.add_argument("--dry-run", action="store_true", help="Stage files only; do not upload.")
462
+
463
  p_all = subparsers.add_parser("all", help="Run both Space and Endpoint bootstrap.")
464
  p_all.add_argument("--space-repo-id", required=True, help="Target space repo id.")
465
  p_all.add_argument("--endpoint-repo-id", required=True, help="Target endpoint model repo id.")
 
483
  clone_space(args.repo_id, private=bool(args.private), token=token, dry_run=bool(args.dry_run))
484
  elif args.cmd == "endpoint":
485
  clone_endpoint(args.repo_id, private=bool(args.private), token=token, dry_run=bool(args.dry_run))
486
+ elif args.cmd == "qwen-endpoint":
487
+ clone_qwen_endpoint(args.repo_id, private=bool(args.private), token=token, dry_run=bool(args.dry_run))
488
+ elif args.cmd == "af3-endpoint":
489
+ clone_af3_endpoint(args.repo_id, private=bool(args.private), token=token, dry_run=bool(args.dry_run))
490
+ elif args.cmd == "af3-nvidia-endpoint":
491
+ clone_af3_nvidia_endpoint(args.repo_id, private=bool(args.private), token=token, dry_run=bool(args.dry_run))
492
  else:
493
  clone_space(args.space_repo_id, private=bool(args.space_private), token=token, dry_run=bool(args.dry_run))
494
  clone_endpoint(
scripts/jobs/submit_hf_qwen_caption_job.ps1 ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Submit a Hugging Face Job that captions an audio dataset with Qwen2-Audio.
# The job clones $CodeRepo inside the container, installs requirements, and
# runs scripts/annotations/qwen_caption_dataset.py with the options below.
param(
    [string]$CodeRepo = "YOUR_USERNAME/ace-step-lora-studio",
    [string]$DatasetRepo = "",
    [string]$DatasetRevision = "main",
    [string]$DatasetSubdir = "",
    [string]$Backend = "local",
    [string]$ModelId = "Qwen/Qwen2-Audio-7B-Instruct",
    [string]$EndpointUrl = "",
    [string]$Device = "auto",
    [string]$TorchDtype = "auto",
    [string]$Prompt = "",
    [double]$SegmentSeconds = 30.0,
    [double]$OverlapSeconds = 2.0,
    [int]$MaxNewTokens = 384,
    [double]$Temperature = 0.1,
    [string]$OutputDir = "/workspace/qwen_annotations",
    [string]$UploadRepo = "",
    [switch]$UploadPrivate,
    [switch]$CopyAudio,
    [switch]$KeepRawOutputs,
    [switch]$WriteInplaceSidecars,
    [string]$Flavor = "a10g-large",
    [string]$Timeout = "8h",
    [switch]$Detach
)

$ErrorActionPreference = "Stop"

# Validate required/conditional parameters before building the job command.
if (-not $DatasetRepo) {
    throw "Provide -DatasetRepo (HF dataset repo containing audio files)."
}

if ($Backend -eq "hf_endpoint" -and -not $EndpointUrl) {
    throw "Backend hf_endpoint requires -EndpointUrl."
}

# HF_TOKEN is injected into the job as a secret, not passed on the CLI.
$secretArgs = @("--secrets", "HF_TOKEN")

# Optional flags are built as strings and interpolated into the bash
# here-string below; empty strings become harmless blank continuations.
$datasetSubdirArgs = ""
if ($DatasetSubdir) {
    $datasetSubdirArgs = "--dataset-subdir `"$DatasetSubdir`""
}

$endpointArgs = ""
if ($EndpointUrl) {
    $endpointArgs = "--endpoint-url `"$EndpointUrl`""
}

$uploadArgs = ""
if ($UploadRepo) {
    $uploadArgs = "--upload-repo `"$UploadRepo`""
    if ($UploadPrivate.IsPresent) {
        $uploadArgs += " --upload-private"
    }
}

$copyAudioArg = ""
if ($CopyAudio.IsPresent) {
    $copyAudioArg = "--copy-audio"
}

$keepRawArg = ""
if ($KeepRawOutputs.IsPresent) {
    $keepRawArg = "--keep-raw-outputs"
}

$writeInplaceArg = ""
if ($WriteInplaceSidecars.IsPresent) {
    $writeInplaceArg = "--write-inplace-sidecars"
}

$promptArg = ""
if ($Prompt) {
    # Escape double quotes so the prompt survives the bash -lc invocation.
    $escapedPrompt = $Prompt.Replace('"', '\"')
    $promptArg = "--prompt `"$escapedPrompt`""
}

$detachArg = ""
if ($Detach.IsPresent) {
    $detachArg = "--detach"
}

# Expandable here-string: PowerShell variables are substituted now, producing
# the bash script the job container will execute.
$jobCommand = @"
set -e
python -m pip install --no-cache-dir --upgrade pip
git clone https://huggingface.co/$CodeRepo /workspace/code
cd /workspace/code
python -m pip install --no-cache-dir -r requirements.txt
python scripts/annotations/qwen_caption_dataset.py \
  --dataset-repo "$DatasetRepo" \
  --dataset-revision "$DatasetRevision" \
  $datasetSubdirArgs \
  --backend "$Backend" \
  --model-id "$ModelId" \
  $endpointArgs \
  --device "$Device" \
  --torch-dtype "$TorchDtype" \
  --segment-seconds $SegmentSeconds \
  --overlap-seconds $OverlapSeconds \
  --max-new-tokens $MaxNewTokens \
  --temperature $Temperature \
  --output-dir "$OutputDir" \
  $promptArg \
  $copyAudioArg \
  $keepRawArg \
  $writeInplaceArg \
  $uploadArgs
"@

# Assemble the `hf jobs run` argument list; options precede the image name.
$argsList = @(
    "jobs", "run",
    "--flavor", $Flavor,
    "--timeout", $Timeout
) + $secretArgs

if ($detachArg) {
    $argsList += $detachArg
}

$argsList += @(
    "pytorch/pytorch:2.5.1-cuda12.1-cudnn9-runtime",
    "bash", "-lc", $jobCommand
)

Write-Host "Submitting Qwen caption HF Job with flavor=$Flavor timeout=$Timeout ..."
Write-Host "Dataset repo: $DatasetRepo"
Write-Host "Code repo: $CodeRepo"
if ($UploadRepo) {
    Write-Host "Will upload exported annotations to: $UploadRepo"
}

& hf @argsList
133
+
scripts/pipeline/refine_dataset_json_with_openai.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Refine existing dataset JSON annotations into Ace-Step 1.5 LoRA-ready sidecars.
4
+
5
+ This script:
6
+ 1. Reads existing JSON files (typically containing AF3 `generated_text`).
7
+ 2. Uses OpenAI cleanup (optionally with web search) to normalize/expand metadata.
8
+ 3. Writes normalized sidecar JSON in-place (or to an output directory).
9
+ 4. Creates backup copies before overwrite by default.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import json
16
+ import shutil
17
+ import sys
18
+ from pathlib import Path
19
+ from typing import Dict, Iterable, List, Optional, Tuple
20
+
21
+ from tqdm import tqdm
22
+
23
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
24
+ if str(PROJECT_ROOT) not in sys.path:
25
+ sys.path.insert(0, str(PROJECT_ROOT))
26
+
27
+ from af3_chatgpt_pipeline import ( # noqa: E402
28
+ DEFAULT_AF3_PROMPT,
29
+ DEFAULT_OPENAI_MODEL,
30
+ build_lora_sidecar,
31
+ cleanup_with_chatgpt,
32
+ )
33
+ from qwen_audio_captioning import AUDIO_EXTENSIONS # noqa: E402
34
+ from utils.env_config import get_env, load_project_env # noqa: E402
35
+
36
+
37
def _parse_args() -> argparse.Namespace:
    """Load project env vars, then parse CLI options for the refinement run."""
    load_project_env()
    parser = argparse.ArgumentParser(
        description="Refine dataset JSONs into Ace-Step 1.5 LoRA-ready metadata using OpenAI."
    )
    parser.add_argument("--dataset-dir", default="train-dataset", help="Directory containing source JSON files")
    parser.add_argument("--recursive", action="store_true", help="Include nested folders")
    parser.add_argument("--pattern", default="*.json", help="Filename glob pattern")
    parser.add_argument("--output-dir", default="", help="Optional output folder. Default: overwrite in place")
    parser.add_argument(
        "--backup-ext",
        default=".backup-before-openai.json",
        help="Backup extension for in-place writes",
    )
    parser.add_argument("--no-backup", action="store_true", help="Disable backup creation for in-place writes")
    parser.add_argument("--limit", type=int, default=0, help="Process only first N files (0 = all)")
    parser.add_argument("--artist-default", default="Andrew Spacey", help="Fallback artist if parsing fails")
    parser.add_argument("--user-context", default="", help="Extra guidance passed to OpenAI cleanup")
    parser.add_argument("--openai-api-key", default="", help="Overrides OPENAI_API_KEY from .env")
    parser.add_argument(
        "--openai-model",
        default=get_env("OPENAI_MODEL", "openai_model", default=DEFAULT_OPENAI_MODEL),
        help="OpenAI model id",
    )
    parser.add_argument(
        "--enable-web-search",
        action="store_true",
        help="Enable web search tool for artist/track context lookup",
    )
    parser.add_argument("--fail-fast", action="store_true", help="Stop on first failure")
    parser.add_argument("--dry-run", action="store_true", help="Do not write files")
    return parser.parse_args()
+ return p.parse_args()
69
+
70
+
71
+ def _iter_json_files(dataset_dir: Path, pattern: str, recursive: bool) -> List[Path]:
72
+ if recursive:
73
+ return sorted(dataset_dir.rglob(pattern))
74
+ return sorted(dataset_dir.glob(pattern))
75
+
76
+
77
+ def _load_json(path: Path) -> Dict:
78
+ # Handle both standard UTF-8 and UTF-8 with BOM.
79
+ text = path.read_text(encoding="utf-8-sig")
80
+ data = json.loads(text)
81
+ if not isinstance(data, dict):
82
+ raise ValueError("Top-level JSON is not an object")
83
+ return data
84
+
85
+
86
def _detect_audio_path(json_path: Path) -> Optional[Path]:
    """Locate the sibling audio file that shares *json_path*'s stem, if any."""
    stem = json_path.stem
    for ext in AUDIO_EXTENSIONS:
        sibling = json_path.with_suffix(ext)
        if sibling.exists():
            return sibling
    # No direct hit: scan the folder, matching extensions case-insensitively.
    for entry in json_path.parent.iterdir():
        if entry.is_file() and entry.stem == stem and entry.suffix.lower() in AUDIO_EXTENSIONS:
            return entry
    return None
98
+
99
+
100
+ def _try_duration_seconds(audio_path: Optional[Path], fallback: float = 0.0) -> float:
101
+ if audio_path is None or not audio_path.exists():
102
+ return float(fallback or 0.0)
103
+ try:
104
+ import soundfile as sf
105
+
106
+ info = sf.info(str(audio_path))
107
+ if info.samplerate and info.frames:
108
+ return float(info.frames) / float(info.samplerate)
109
+ except Exception:
110
+ pass
111
+ return float(fallback or 0.0)
112
+
113
+
114
+ def _parse_artist_track_from_stem(stem: str, artist_default: str) -> Tuple[str, str]:
115
+ parts = stem.split(" - ", 1)
116
+ if len(parts) == 2:
117
+ artist, track = parts[0].strip(), parts[1].strip()
118
+ if artist and track:
119
+ return artist, track
120
+ return artist_default.strip() or "Unknown Artist", stem.strip()
121
+
122
+
123
+ def _extract_raw_analysis(data: Dict) -> str:
124
+ checks: Iterable[object] = (
125
+ data.get("generated_text"),
126
+ data.get("af3_analysis"),
127
+ data.get("analysis_long"),
128
+ data.get("analysis_short"),
129
+ (data.get("music_analysis") or {}).get("summary_long") if isinstance(data.get("music_analysis"), dict) else None,
130
+ data.get("caption"),
131
+ )
132
+ for value in checks:
133
+ if isinstance(value, str) and value.strip():
134
+ return value.strip()
135
+ return ""
136
+
137
+
138
+ def _ensure_output_path(src_json: Path, output_dir: Optional[Path]) -> Path:
139
+ if output_dir:
140
+ output_dir.mkdir(parents=True, exist_ok=True)
141
+ return output_dir / src_json.name
142
+ return src_json
143
+
144
+
145
+ def _create_backup(src: Path, backup_ext: str) -> Optional[Path]:
146
+ backup_path = src.with_name(src.stem + backup_ext)
147
+ if backup_path.exists():
148
+ return backup_path
149
+ shutil.copy2(src, backup_path)
150
+ return backup_path
151
+
152
+
153
def _finalize_sidecar(
    *,
    cleaned: Dict,
    raw_analysis: str,
    duration: float,
    source_audio: Optional[Path],
    source_json: Path,
    artist: str,
    track_name: str,
    openai_model: str,
    web_search_used: bool,
) -> Dict:
    """Assemble the final LoRA sidecar dict for one refined JSON file.

    Wraps the ChatGPT-cleaned fields via ``build_lora_sidecar`` and stamps
    artist/track plus a provenance block on top.
    """
    # Empty string keeps the sidecar JSON-serializable when no audio was found.
    source_audio_str = str(source_audio) if source_audio else ""
    sidecar = build_lora_sidecar(
        cleaned,
        af3_text=raw_analysis,
        af3_prompt=DEFAULT_AF3_PROMPT,
        # Marks that this sidecar was refined from an existing JSON rather
        # than produced by a fresh AF3 inference run.
        af3_backend="existing_json_refine",
        af3_model_id="nvidia/audio-flamingo-3",
        source_audio=source_audio_str,
        duration=duration,
        chatgpt_model=openai_model,
        web_search_used=web_search_used,
    )
    sidecar["artist"] = artist
    sidecar["track_name"] = track_name
    # Provenance so any sidecar can be traced back to its input files.
    sidecar["source"] = {
        "input_json": str(source_json),
        "input_audio": source_audio_str,
        "refined_from_existing_json": True,
    }
    return sidecar
185
+
186
+
187
def main() -> int:
    """CLI entry point: refine every matched sidecar JSON via ChatGPT cleanup.

    Returns 0 when all files succeed, 2 when any file failed; prints a JSON
    summary to stdout either way.
    """
    args = _parse_args()
    dataset_dir = Path(args.dataset_dir)
    if not dataset_dir.exists():
        raise FileNotFoundError(f"Dataset directory not found: {dataset_dir}")

    openai_key = args.openai_api_key or get_env("OPENAI_API_KEY", "openai_api_key")
    if not openai_key:
        raise RuntimeError("Missing OPENAI_API_KEY (set in .env or pass --openai-api-key).")

    files = _iter_json_files(dataset_dir, pattern=args.pattern, recursive=bool(args.recursive))
    if args.limit and args.limit > 0:
        files = files[: int(args.limit)]
    if not files:
        raise RuntimeError(f"No files matched {args.pattern} in {dataset_dir}")

    output_dir = Path(args.output_dir) if args.output_dir else None
    failures: List[str] = []
    saved: List[str] = []
    backups: List[str] = []

    for json_path in tqdm(files, desc="Refine JSON"):
        try:
            data = _load_json(json_path)
            raw_analysis = _extract_raw_analysis(data)
            if not raw_analysis:
                raise ValueError("No analysis text found (generated_text/analysis/caption missing)")

            # Filename stem supplies defaults; explicit JSON fields override.
            artist, track_name = _parse_artist_track_from_stem(json_path.stem, args.artist_default)
            artist = str(data.get("artist") or artist).strip() or artist
            track_name = str(data.get("track_name") or data.get("title") or track_name).strip() or track_name

            source_audio = _detect_audio_path(json_path)
            duration = _try_duration_seconds(source_audio, fallback=float(data.get("duration") or 0.0))

            try:
                cleaned = cleanup_with_chatgpt(
                    raw_analysis,
                    openai_api_key=openai_key,
                    model=args.openai_model,
                    duration=duration,
                    user_context=args.user_context,
                    artist_name=artist,
                    track_name=track_name,
                    enable_web_search=bool(args.enable_web_search),
                )
                web_used = bool(args.enable_web_search)
            except Exception:
                # If web-search tool compatibility fails on this runtime, retry without it.
                if not args.enable_web_search:
                    raise
                cleaned = cleanup_with_chatgpt(
                    raw_analysis,
                    openai_api_key=openai_key,
                    model=args.openai_model,
                    duration=duration,
                    user_context=args.user_context,
                    artist_name=artist,
                    track_name=track_name,
                    enable_web_search=False,
                )
                web_used = False

            sidecar = _finalize_sidecar(
                cleaned=cleaned,
                raw_analysis=raw_analysis,
                duration=duration,
                source_audio=source_audio,
                source_json=json_path,
                artist=artist,
                track_name=track_name,
                openai_model=args.openai_model,
                web_search_used=web_used,
            )

            out_path = _ensure_output_path(json_path, output_dir)
            # Back up only when overwriting in place (no separate output dir).
            if not args.dry_run and output_dir is None and not args.no_backup:
                backup = _create_backup(json_path, args.backup_ext)
                if backup:
                    backups.append(str(backup))

            if not args.dry_run:
                out_path.write_text(json.dumps(sidecar, indent=2, ensure_ascii=False), encoding="utf-8")
                saved.append(str(out_path))
        except Exception as exc:
            # Per-file failures are collected so one bad file does not abort
            # the whole batch (unless --fail-fast is set).
            failures.append(f"{json_path.name}: {exc}")
            if args.fail_fast:
                break

    summary = {
        "processed": len(files),
        "saved": len(saved),
        "failed": len(failures),
        "backup_count": len(backups),
        "output_mode": "separate_dir" if output_dir else ("dry_run" if args.dry_run else "in_place"),
        "sample_saved": saved[:10],
        "sample_failures": failures[:10],
    }
    print(json.dumps(summary, indent=2, ensure_ascii=False))
    return 0 if not failures else 2
287
+
288
+
289
if __name__ == "__main__":
    # Propagate main()'s exit code (0 ok, 2 partial failure) to the shell.
    raise SystemExit(main())
291
+
scripts/pipeline/run_af3_chatgpt_pipeline.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Run AF3 -> ChatGPT cleanup pipeline on one file or a dataset folder.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import json
10
+ import sys
11
+ from pathlib import Path
12
+ from typing import List
13
+
14
+ from tqdm import tqdm
15
+
16
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
17
+ if str(PROJECT_ROOT) not in sys.path:
18
+ sys.path.insert(0, str(PROJECT_ROOT))
19
+
20
+ from af3_chatgpt_pipeline import (
21
+ DEFAULT_AF3_MODEL_ID,
22
+ DEFAULT_AF3_PROMPT,
23
+ DEFAULT_AF3_PROMPT_THINK_LONG,
24
+ DEFAULT_OPENAI_MODEL,
25
+ AF3EndpointClient,
26
+ AF3LocalClient,
27
+ run_af3_chatgpt_pipeline,
28
+ )
29
+ from qwen_audio_captioning import list_audio_files
30
+ from utils.env_config import get_env, load_project_env
31
+
32
+
33
+ def build_parser() -> argparse.ArgumentParser:
34
+ load_project_env()
35
+ p = argparse.ArgumentParser(description="AF3 + ChatGPT LoRA metadata pipeline")
36
+ p.add_argument("--audio", default="", help="Single audio path")
37
+ p.add_argument("--dataset-dir", default="", help="Dataset folder")
38
+ p.add_argument("--backend", default="hf_endpoint", choices=["hf_endpoint", "local"])
39
+ p.add_argument("--endpoint-url", default=get_env("HF_AF3_ENDPOINT_URL", "hf_af3_endpoint_url"))
40
+ p.add_argument("--hf-token", default="")
41
+ p.add_argument("--model-id", default=get_env("AF3_MODEL_ID", "af3_model_id", default=DEFAULT_AF3_MODEL_ID))
42
+ p.add_argument("--device", default="auto", choices=["auto", "cuda", "cpu", "mps"])
43
+ p.add_argument("--torch-dtype", default="auto", choices=["auto", "float16", "bfloat16", "float32"])
44
+ p.add_argument("--prompt", default=DEFAULT_AF3_PROMPT)
45
+ p.add_argument(
46
+ "--think-long",
47
+ action="store_true",
48
+ help="Use long-form AF3 prompt + higher token budget defaults.",
49
+ )
50
+ p.add_argument("--af3-max-new-tokens", type=int, default=1400)
51
+ p.add_argument("--af3-temperature", type=float, default=0.1)
52
+ p.add_argument("--openai-api-key", default="")
53
+ p.add_argument("--openai-model", default=get_env("OPENAI_MODEL", "openai_model", default=DEFAULT_OPENAI_MODEL))
54
+ p.add_argument("--user-context", default="")
55
+ p.add_argument("--artist-name", default="")
56
+ p.add_argument("--track-name", default="")
57
+ p.add_argument("--enable-web-search", action="store_true")
58
+ p.add_argument("--output-dir", default="", help="If set, save sidecars here instead of next to audio")
59
+ return p
60
+
61
+
62
+ def resolve_audio_paths(args) -> List[str]:
63
+ if args.audio:
64
+ p = Path(args.audio)
65
+ if not p.is_file():
66
+ raise FileNotFoundError(f"Audio file not found: {p}")
67
+ return [str(p)]
68
+ if args.dataset_dir:
69
+ files = list_audio_files(args.dataset_dir)
70
+ if not files:
71
+ raise RuntimeError(f"No audio files found in {args.dataset_dir}")
72
+ return files
73
+ raise ValueError("Provide --audio or --dataset-dir")
74
+
75
+
76
+ def main() -> int:
77
+ args = build_parser().parse_args()
78
+ hf_token = args.hf_token or get_env("HF_TOKEN", "hf_token")
79
+ openai_key = (
80
+ args.openai_api_key
81
+ or get_env("OPENAI_API_KEY", "openai_api_key")
82
+ )
83
+ if not openai_key:
84
+ raise RuntimeError("OPENAI_API_KEY is required for cleanup step.")
85
+
86
+ if args.backend == "hf_endpoint":
87
+ if not args.endpoint_url:
88
+ raise RuntimeError("HF endpoint backend requires --endpoint-url")
89
+ af3_client = AF3EndpointClient(
90
+ endpoint_url=args.endpoint_url,
91
+ token=hf_token,
92
+ model_id=args.model_id,
93
+ )
94
+ else:
95
+ af3_client = AF3LocalClient(
96
+ model_id=args.model_id,
97
+ device=args.device,
98
+ torch_dtype=args.torch_dtype,
99
+ )
100
+
101
+ af3_prompt = args.prompt
102
+ af3_max_new_tokens = int(args.af3_max_new_tokens)
103
+ af3_temperature = float(args.af3_temperature)
104
+ if args.think_long:
105
+ if af3_prompt == DEFAULT_AF3_PROMPT:
106
+ af3_prompt = DEFAULT_AF3_PROMPT_THINK_LONG
107
+ if af3_max_new_tokens == 1400:
108
+ af3_max_new_tokens = 3200
109
+ if abs(af3_temperature - 0.1) < 1e-9:
110
+ af3_temperature = 0.2
111
+
112
+ audio_paths = resolve_audio_paths(args)
113
+ failures = []
114
+ saved = []
115
+ for audio_path in tqdm(audio_paths, desc="AF3+ChatGPT"):
116
+ try:
117
+ result = run_af3_chatgpt_pipeline(
118
+ audio_path=audio_path,
119
+ af3_client=af3_client,
120
+ af3_prompt=af3_prompt,
121
+ af3_max_new_tokens=af3_max_new_tokens,
122
+ af3_temperature=af3_temperature,
123
+ openai_api_key=openai_key,
124
+ openai_model=args.openai_model,
125
+ user_context=args.user_context,
126
+ artist_name=args.artist_name,
127
+ track_name=args.track_name,
128
+ enable_web_search=bool(args.enable_web_search),
129
+ )
130
+ sidecar = result["sidecar"]
131
+ if args.output_dir:
132
+ out_path = Path(args.output_dir) / (Path(audio_path).stem + ".json")
133
+ else:
134
+ out_path = Path(audio_path).with_suffix(".json")
135
+ out_path.parent.mkdir(parents=True, exist_ok=True)
136
+ out_path.write_text(json.dumps(sidecar, indent=2, ensure_ascii=False), encoding="utf-8")
137
+ saved.append(str(out_path))
138
+ except Exception as exc:
139
+ failures.append(f"{Path(audio_path).name}: {exc}")
140
+
141
+ print(
142
+ json.dumps(
143
+ {
144
+ "processed": len(audio_paths),
145
+ "saved": len(saved),
146
+ "failed": len(failures),
147
+ "saved_paths": saved[:20],
148
+ "failures": failures[:20],
149
+ },
150
+ indent=2,
151
+ ensure_ascii=False,
152
+ )
153
+ )
154
+ return 0 if not failures else 2
155
+
156
+
157
+ if __name__ == "__main__":
158
+ raise SystemExit(main())
services/pipeline_api.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Local orchestration API for AF3 captioning + ChatGPT cleanup pipeline.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import tempfile
8
+ from pathlib import Path
9
+
10
+ from fastapi import FastAPI, File, Form, HTTPException, UploadFile
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from fastapi.responses import FileResponse, JSONResponse
13
+ from fastapi.staticfiles import StaticFiles
14
+ from pydantic import BaseModel
15
+
16
+ from af3_chatgpt_pipeline import (
17
+ DEFAULT_AF3_MODEL_ID,
18
+ DEFAULT_AF3_PROMPT,
19
+ DEFAULT_OPENAI_MODEL,
20
+ AF3EndpointClient,
21
+ AF3LocalClient,
22
+ run_af3_chatgpt_pipeline,
23
+ save_sidecar,
24
+ )
25
+ from utils.env_config import get_env, load_project_env
26
+
27
+
28
+ load_project_env()
29
+
30
+
31
def _resolve_token(name_upper: str, name_lower: str) -> str:
    """Resolve a config/secret value, accepting either env-var naming style."""
    return get_env(name_upper, name_lower)
33
+
34
+
35
def _build_af3_client(
    backend: str,
    endpoint_url: str,
    hf_token: str,
    model_id: str,
    device: str,
    torch_dtype: str,
):
    """Instantiate the AF3 client for the requested backend.

    "hf_endpoint" talks to a hosted HF Inference Endpoint (URL required,
    otherwise HTTP 400); any other backend value falls back to the local
    in-process model client.
    """
    if backend == "hf_endpoint":
        if not endpoint_url:
            raise HTTPException(status_code=400, detail="AF3 endpoint backend requires endpoint_url")
        return AF3EndpointClient(
            endpoint_url=endpoint_url,
            token=hf_token,
            model_id=model_id or DEFAULT_AF3_MODEL_ID,
        )
    return AF3LocalClient(
        model_id=model_id or DEFAULT_AF3_MODEL_ID,
        device=device,
        torch_dtype=torch_dtype,
    )
56
+
57
+
58
app = FastAPI(title="AF3 + ChatGPT Pipeline API", version="1.0.0")
# Wide-open CORS: this API is intended as a local/dev orchestration service.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Serve the built React UI (Vite dist output) when it exists in the repo.
FRONTEND_DIST = Path(__file__).resolve().parents[1] / "react-ui" / "dist"
FRONTEND_ASSETS = FRONTEND_DIST / "assets"
if FRONTEND_ASSETS.exists():
    app.mount("/assets", StaticFiles(directory=str(FRONTEND_ASSETS)), name="assets")
70
+
71
+
72
class PipelinePathRequest(BaseModel):
    """Request body for /api/pipeline/run-path (audio already on disk)."""

    # Path to the audio file to analyze (must exist on the server).
    audio_path: str
    # AF3 backend: "hf_endpoint" (hosted) or any other value for local loading.
    backend: str = "hf_endpoint"
    endpoint_url: str = ""
    # Secrets may be blank; the handler falls back to .env / environment.
    hf_token: str = ""
    model_id: str = DEFAULT_AF3_MODEL_ID
    # Local-backend options (ignored for the hosted endpoint backend).
    device: str = "auto"
    torch_dtype: str = "auto"
    # AF3 generation settings.
    af3_prompt: str = DEFAULT_AF3_PROMPT
    af3_max_new_tokens: int = 1400
    af3_temperature: float = 0.1
    # ChatGPT cleanup settings.
    openai_api_key: str = ""
    openai_model: str = DEFAULT_OPENAI_MODEL
    user_context: str = ""
    artist_name: str = ""
    track_name: str = ""
    enable_web_search: bool = False
    # Optional explicit output path; defaults to <audio>.json next to the file.
    output_json: str = ""
90
+
91
+
92
+ @app.get("/api/health")
93
+ def health():
94
+ return {"ok": True}
95
+
96
+
97
+ @app.get("/", include_in_schema=False)
98
+ def serve_root():
99
+ if FRONTEND_DIST.exists():
100
+ index = FRONTEND_DIST / "index.html"
101
+ if index.exists():
102
+ return FileResponse(index)
103
+ return JSONResponse(
104
+ {
105
+ "ok": True,
106
+ "message": "Frontend build not found. Run `python af3_gui_app.py` or `npm --prefix react-ui run build`.",
107
+ }
108
+ )
109
+
110
+
111
+ @app.get("/api/config")
112
+ def config():
113
+ return {
114
+ "defaults": {
115
+ "backend": "hf_endpoint",
116
+ "endpoint_url": _resolve_token("HF_AF3_ENDPOINT_URL", "hf_af3_endpoint_url"),
117
+ "model_id": _resolve_token("AF3_MODEL_ID", "af3_model_id") or DEFAULT_AF3_MODEL_ID,
118
+ "openai_model": _resolve_token("OPENAI_MODEL", "openai_model") or DEFAULT_OPENAI_MODEL,
119
+ "af3_prompt": DEFAULT_AF3_PROMPT,
120
+ }
121
+ }
122
+
123
+
124
+ @app.post("/api/pipeline/run-path")
125
+ def run_pipeline_path(req: PipelinePathRequest):
126
+ audio_path = Path(req.audio_path)
127
+ if not audio_path.is_file():
128
+ raise HTTPException(status_code=404, detail=f"Audio not found: {audio_path}")
129
+
130
+ hf_token = req.hf_token or _resolve_token("HF_TOKEN", "hf_token")
131
+ openai_key = req.openai_api_key or _resolve_token("OPENAI_API_KEY", "openai_api_key")
132
+ if not openai_key:
133
+ raise HTTPException(status_code=400, detail="OPENAI_API_KEY is required.")
134
+
135
+ endpoint_url = req.endpoint_url or _resolve_token("HF_AF3_ENDPOINT_URL", "hf_af3_endpoint_url")
136
+ af3_client = _build_af3_client(
137
+ backend=req.backend,
138
+ endpoint_url=endpoint_url,
139
+ hf_token=hf_token,
140
+ model_id=req.model_id,
141
+ device=req.device,
142
+ torch_dtype=req.torch_dtype,
143
+ )
144
+ try:
145
+ result = run_af3_chatgpt_pipeline(
146
+ audio_path=str(audio_path),
147
+ af3_client=af3_client,
148
+ af3_prompt=req.af3_prompt,
149
+ af3_max_new_tokens=req.af3_max_new_tokens,
150
+ af3_temperature=req.af3_temperature,
151
+ openai_api_key=openai_key,
152
+ openai_model=req.openai_model,
153
+ user_context=req.user_context,
154
+ artist_name=req.artist_name,
155
+ track_name=req.track_name,
156
+ enable_web_search=req.enable_web_search,
157
+ )
158
+ except Exception as exc:
159
+ raise HTTPException(status_code=500, detail=str(exc)) from exc
160
+
161
+ output_json = req.output_json or str(audio_path.with_suffix(".json"))
162
+ save_path = save_sidecar(result["sidecar"], output_json)
163
+ return {
164
+ "saved_to": save_path,
165
+ "af3_analysis": result["af3_analysis"],
166
+ "cleaned": result["cleaned"],
167
+ "sidecar": result["sidecar"],
168
+ }
169
+
170
+
171
+ @app.post("/api/pipeline/run-upload")
172
+ async def run_pipeline_upload(
173
+ audio_file: UploadFile = File(...),
174
+ backend: str = Form("hf_endpoint"),
175
+ endpoint_url: str = Form(""),
176
+ hf_token: str = Form(""),
177
+ model_id: str = Form(DEFAULT_AF3_MODEL_ID),
178
+ device: str = Form("auto"),
179
+ torch_dtype: str = Form("auto"),
180
+ af3_prompt: str = Form(DEFAULT_AF3_PROMPT),
181
+ af3_max_new_tokens: int = Form(1400),
182
+ af3_temperature: float = Form(0.1),
183
+ openai_api_key: str = Form(""),
184
+ openai_model: str = Form(DEFAULT_OPENAI_MODEL),
185
+ user_context: str = Form(""),
186
+ artist_name: str = Form(""),
187
+ track_name: str = Form(""),
188
+ enable_web_search: bool = Form(False),
189
+ output_json: str = Form(""),
190
+ ):
191
+ suffix = Path(audio_file.filename or "uploaded.wav").suffix or ".wav"
192
+ with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
193
+ temp_audio = Path(tmp.name)
194
+ try:
195
+ content = await audio_file.read()
196
+ temp_audio.write_bytes(content)
197
+
198
+ hf_token_val = hf_token or _resolve_token("HF_TOKEN", "hf_token")
199
+ openai_key = openai_api_key or _resolve_token("OPENAI_API_KEY", "openai_api_key")
200
+ if not openai_key:
201
+ raise HTTPException(status_code=400, detail="OPENAI_API_KEY is required.")
202
+
203
+ endpoint_url_val = endpoint_url or _resolve_token("HF_AF3_ENDPOINT_URL", "hf_af3_endpoint_url")
204
+ af3_client = _build_af3_client(
205
+ backend=backend,
206
+ endpoint_url=endpoint_url_val,
207
+ hf_token=hf_token_val,
208
+ model_id=model_id,
209
+ device=device,
210
+ torch_dtype=torch_dtype,
211
+ )
212
+
213
+ result = run_af3_chatgpt_pipeline(
214
+ audio_path=str(temp_audio),
215
+ af3_client=af3_client,
216
+ af3_prompt=af3_prompt,
217
+ af3_max_new_tokens=af3_max_new_tokens,
218
+ af3_temperature=af3_temperature,
219
+ openai_api_key=openai_key,
220
+ openai_model=openai_model,
221
+ user_context=user_context,
222
+ artist_name=artist_name,
223
+ track_name=track_name,
224
+ enable_web_search=enable_web_search,
225
+ )
226
+ default_out = Path("outputs") / "af3_chatgpt" / (Path(audio_file.filename or "track").stem + ".json")
227
+ save_path = save_sidecar(result["sidecar"], output_json or str(default_out))
228
+ return {
229
+ "saved_to": save_path,
230
+ "af3_analysis": result["af3_analysis"],
231
+ "cleaned": result["cleaned"],
232
+ "sidecar": result["sidecar"],
233
+ }
234
+ except HTTPException:
235
+ raise
236
+ except Exception as exc:
237
+ raise HTTPException(status_code=500, detail=str(exc)) from exc
238
+ finally:
239
+ try:
240
+ temp_audio.unlink(missing_ok=True)
241
+ except Exception:
242
+ pass
summaries/findings.md CHANGED
@@ -1,124 +1,204 @@
1
- # Improving ACE-Step LoRA with Time-Event-Based Annotation
2
 
3
  [Back to project README](../README.md)
4
 
5
- ## Baseline context in this repo
6
 
7
- This project already provides a solid end-to-end workflow:
8
 
9
- - Train LoRA adapters with `lora_train.py` and the Gradio UI (`app.py`, `lora_ui.py`).
10
- - Deploy generation through a custom endpoint runtime (`handler.py`, `acestep/`).
11
- - Test prompts and lyrics quickly with endpoint client scripts in `scripts/endpoint/`.
 
12
 
13
- Today, most conditioning in this pipeline is still global (caption, lyrics, BPM, key, tags). That is a strong baseline, but it does not explicitly teach *when* events happen inside a track.
14
 
15
- ## Core limitation
16
 
17
- Current annotations usually describe *what* a song is, not *when* events occur. The model can learn style and texture, but temporal structure is weaker:
 
 
 
 
 
 
18
 
19
- - Verse/chorus transitions are often less deliberate than human-produced songs.
20
- - Build-ups, drops, or effect changes can feel averaged or blurred.
21
- - Subgenre-specific arrangement timing is harder to reproduce consistently.
22
 
23
- ## Observed baseline behavior (working assumption)
24
 
25
- From current prompt and endpoint testing workflows in this repo, the baseline appears to do best on:
26
 
27
- - overall timbre/style conditioning from caption-like prompts,
28
- - short-form motif continuity,
29
- - broad genre direction.
30
 
31
- The baseline appears weaker on:
 
 
 
 
 
 
 
 
32
 
33
- - section-level planning across longer durations,
34
- - predictable timing of transitions (intro/verse/chorus/bridge),
35
- - reliable callback motifs that should reappear at known timestamps.
36
 
37
- These are expected gaps for globally conditioned generation and provide a clear target for time-event experiments.
 
 
 
 
 
 
 
 
 
 
38
 
39
- ## Why time-event labels are promising
40
 
41
- 1. Better musical structure: teach the model where sections start/end and where key transitions occur.
42
- 2. Better genre fidelity: encode timing differences between styles that share similar instruments.
43
- 3. Better control at inference: allow prompting for both content and structure (what + when).
44
 
45
- ## Practical direction for this codebase
46
 
47
- A useful next step is to extend the current sidecar metadata approach with optional timed events.
48
 
49
- Example direction:
50
 
51
- - Keep existing fields (`caption`, `lyrics`, `bpm`, etc.).
52
- - Add an `events` list with event type + start/end times.
53
- - Start with a small, high-quality subset before scaling.
54
 
55
- Illustrative shape:
56
 
57
- ```json
58
- {
59
- "caption": "emotional rnb pop with warm pads",
60
- "bpm": 92,
61
- "events": [
62
- {"type": "intro", "start": 0.0, "end": 8.0},
63
- {"type": "verse", "start": 8.0, "end": 32.0},
64
- {"type": "chorus", "start": 32.0, "end": 48.0}
65
- ]
66
- }
67
- ```
68
 
69
- Optional extension fields that may help later:
70
 
71
- - `intensity` (0-1) per event,
72
- - `instrument_focus` tags per section,
73
- - `transition_type` (hard cut, riser, filtered handoff, etc.).
74
 
75
- ## Early experiments worth running
76
 
77
- - Compare baseline LoRA vs time-event LoRA on the same curated mini-dataset.
78
- - Score structural accuracy (section order, transition timing tolerance).
79
- - Run blind listening tests for perceived musical arc and arrangement coherence.
80
- - Track whether time labels improve consistency without reducing creativity.
81
 
82
- ## Suggested evaluation rubric (v1)
83
 
84
- Use a simple shared scorecard to keep comparisons objective:
85
 
86
- 1. Structure match (0-5): generated section order vs target plan.
87
- 2. Timing adherence (0-5): transition timestamps within tolerance window.
88
- 3. Musical coherence (0-5): transitions feel intentional, not abrupt/noisy.
89
- 4. Genre fit (0-5): arrangement behavior matches requested subgenre.
90
- 5. Prompt fidelity (0-5): requested mood/style/lyrics alignment.
91
 
92
- This makes iteration easier than relying only on subjective listening notes.
93
 
94
- ## Incremental execution plan
 
95
 
96
- Phase 1: Data and schema
97
 
98
- - Define the minimal `events` schema and annotation guidelines.
99
- - Build a small seed set (for example 50-200 clips) with high label quality.
100
 
101
- Phase 2: Training and ablation
102
 
103
- - Train a baseline LoRA and an event-aware LoRA with matched settings.
104
- - Run ablations (with/without `events`, coarse vs fine event types).
105
 
106
- Phase 3: Inference controls
107
 
108
- - Add optional event-aware controls in the UI and endpoint payload.
109
- - Keep backward compatibility so existing prompts still work.
110
 
111
- Phase 4: Evaluation and docs
112
 
113
- - Publish scorecard results + examples.
114
- - Document tradeoffs (quality, speed, annotation effort).
115
 
116
- ## Expected outcomes
117
 
118
- If this works, this repo can evolve from "style-conditioned generation" toward "structure-aware generation":
 
 
 
119
 
120
- - More intentional song progression.
121
- - Stronger subgenre identity.
122
- - Better controllability for creators.
123
 
124
- This is still a baseline research note, but it gives a clear technical direction that fits the current project architecture.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ACE-Step 1.5 Annotation and LoRA Findings (My Notes)
2
 
3
  [Back to project README](../README.md)
4
 
5
+ ## What I was trying to build
6
 
7
+ I wanted a reliable pipeline to:
8
 
9
+ 1. Analyze my songs with AF3/Qwen-style timestamped musical detail.
10
+ 2. Clean and structure the results with ChatGPT.
11
+ 3. Save sidecar JSON files that ACE-Step 1.5 LoRA training can consume directly.
12
+ 4. Keep enough detail for future iteration (human edits, richer annotations, timeline/event work).
13
 
14
+ ## What ACE-Step 1.5 actually reads during LoRA training
15
 
16
+ Based on this repo's loader (`lora_train.py`), the training loop directly reads these JSON keys:
17
 
18
+ - `caption`
19
+ - `lyrics`
20
+ - `bpm`
21
+ - `keyscale`
22
+ - `timesignature`
23
+ - `vocal_language`
24
+ - `duration`
25
 
26
+ Anything else is effectively extra metadata for my own workflow. This is why I moved rich analysis detail into `caption` so it is not ignored by the model.
 
 
27
 
28
+ ## Endpoint stack comparison I observed
29
 
30
+ I tested two serving stacks on the same tracks/prompts.
31
 
32
+ ### Stack A (lower quality)
 
 
33
 
34
+ - Model path: `nvidia/audio-flamingo-3-hf`
35
+ - Runtime style: generic Transformers path with custom endpoint handler
36
+ - Behavior I observed:
37
+ - Often short outputs
38
+ - Sometimes repetitive segment text
39
+ - Less convincing section-by-section progression
40
+ - Latency I observed:
41
+ - Fast short runs
42
+ - Medium-length think runs
43
 
44
+ ### Stack B (higher quality)
 
 
45
 
46
+ - Model path:
47
+ - base: `nvidia/audio-flamingo-3`
48
+ - think adapter: `stage35`
49
+ - Runtime style: NVIDIA-style `llava`/`generate_content` stack
50
+ - Behavior I observed:
51
+ - Longer, richer timestamped prose
52
+ - Better flow across sections
53
+ - Better musical interaction detail (vocals + instruments + arrangement)
54
+ - Latency I observed:
55
+ - Slower than Stack A
56
+ - Roughly 1 minute per track in think/long-style runs
57
 
58
+ ### My conclusion
59
 
60
+ If I care about annotation quality, Stack B is clearly better even if it is slower.
 
 
61
 
62
+ ## Main issues I hit and how I resolved them
63
 
64
+ ### 1) Endpoint failed with `Unknown task custom`
65
 
66
+ Observed error:
67
 
68
+ - `KeyError: "Unknown task custom ..."`
 
 
69
 
70
+ What caused it:
71
 
72
+ - Endpoint fell back to default pipeline path instead of loading my custom `handler.py`.
73
+ - Log showed: `No custom pipeline found at /repository/handler.py`.
 
 
 
 
 
 
 
 
 
74
 
75
+ Fix:
76
 
77
+ - Ensure endpoint repo has top-level `handler.py`.
78
+ - Deploy using the custom endpoint template files exactly.
 
79
 
80
+ ### 2) AF3 architecture not recognized
81
 
82
+ Observed error:
 
 
 
83
 
84
+ - `model type audioflamingo3 not recognized`
85
 
86
+ What caused it:
87
 
88
+ - Endpoint base runtime had older Transformers stack that could not load AF3 model classes.
 
 
 
 
89
 
90
+ Fix:
91
 
92
+ - Bootstrap runtime dependencies compatible with AF3 in custom handler/template.
93
+ - Avoid relying on plain default endpoint image assumptions.
94
 
95
+ ### 3) Processor load failures for HF-converted AF3 repo
96
 
97
+ Observed error:
 
98
 
99
+ - `Unrecognized processing class in nvidia/audio-flamingo-3-hf`
100
 
101
+ What caused it:
 
102
 
103
+ - Mismatch between model repo packaging and runtime loader expectations.
104
 
105
+ Fix:
 
106
 
107
+ - Move to NVIDIA stack template path and serving format that matches expected classes/runtime behavior.
108
 
109
+ ### 4) Dependency conflicts after forced upgrades
 
110
 
111
+ Observed logs showed conflicts around:
112
 
113
+ - `transformers`
114
+ - `huggingface_hub`
115
+ - `torch`/`torchaudio`/`torchvision`
116
+ - `huggingface-inference-toolkit` pinned versions
117
 
118
+ What caused it:
 
 
119
 
120
+ - Upgrading one package in place inside endpoint image caused incompatibility with toolkit pins.
121
+
122
+ Fix:
123
+
124
+ - Use curated endpoint template/runtime setup instead of ad-hoc package upgrades.
125
+
126
+ ### 5) Token/auth confusion
127
+
128
+ Observed warning:
129
+
130
+ - Unauthenticated requests to HF Hub even though I had a token in `.env`.
131
+
132
+ What caused it:
133
+
134
+ - Variable name mismatch (`hf_token` vs expected runtime env var names like `HF_TOKEN`) in some contexts.
135
+
136
+ Fix:
137
+
138
+ - Normalize env variable names and pass token consistently in endpoint/runtime settings.
139
+
140
+ ### 6) Very short or repetitive analysis output
141
+
142
+ What caused it:
143
+
144
+ - Wrong stack path (HF-converted flow) and/or non-think-compatible runtime behavior.
145
+
146
+ Fix:
147
+
148
+ - Migrate to NVIDIA think-capable stack.
149
+ - Use longer token budgets and think-oriented prompt settings.
150
+
151
+ ## Dataset run results and quality checks
152
+
153
+ ### Batch throughput I observed
154
+
155
+ - 22 tracks processed in about 22 minutes.
156
+ - Roughly 60 seconds per track average.
157
+
158
+ ### Repetition audit outcome
159
+
160
+ - No exact duplicate full outputs across tracks.
161
+ - But strong template reuse in phrasing and sentence structures.
162
+
163
+ Interpretation:
164
+
165
+ - The model output varied by track, but stylistically collapsed into repeated wording patterns.
166
+
167
+ ## JSON shaping decisions I made
168
+
169
+ ### Flattening for LoRA compatibility
170
+
171
+ I flattened each sidecar to core fields used by `lora_train.py`:
172
+
173
+ - `artist`, `caption`, `lyrics`, `bpm`, `keyscale`, `timesignature`, `vocal_language`, `duration`, `source`
174
+
175
+ ### Keeping rich detail without losing trainability
176
+
177
+ I preserved detail under `source.rich_details` and then pushed high-value content into `caption` so training sees it.
178
+
179
+ ### Global normalization applied
180
+
181
+ - `timesignature`: `"4"`
182
+ - `vocal_language`: `"en"`
183
+ - Captions prefixed with `Andrew Spacey:`
184
+
185
+ ## Important remaining data limitations
186
+
187
+ Even after cleanup, these are still weak points in current sidecars:
188
+
189
+ - `bpm` is mostly null
190
+ - `keyscale` is mostly unknown/blank
191
+
192
+ These are optional for training, but adding reliable BPM/key would likely improve control and consistency.
193
+
194
+ ## My current recommendation
195
+
196
+ 1. Keep NVIDIA stack as default for annotation generation quality.
197
+ 2. Keep core LoRA fields simple and valid.
198
+ 3. Keep rich details in `source.rich_details` for traceability.
199
+ 4. Keep detail-rich caption text for actual conditioning.
200
+ 5. Add a BPM/key estimation pass next if I want stronger metadata conditioning.
201
+
202
+ ## Next technical step I want
203
+
204
+ I should run a structured event pass (`events` list with start/end/type/intensity) on a subset first, then test whether event-aware captions improve generated song structure over the current caption-only approach.
templates/hf-af3-caption-endpoint/README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audio Flamingo 3 Caption Endpoint Template
2
+
3
+ Use this as a custom `handler.py` runtime for a Hugging Face Dedicated Endpoint.
4
+
5
+ ## Request contract
6
+
7
+ ```json
8
+ {
9
+ "inputs": {
10
+ "prompt": "Analyze this full song and summarize arrangement changes.",
11
+ "audio_base64": "<base64-encoded WAV bytes>",
12
+ "max_new_tokens": 1200,
13
+ "temperature": 0.1
14
+ }
15
+ }
16
+ ```
17
+
18
+ ## Response contract
19
+
20
+ ```json
21
+ {
22
+ "generated_text": "..."
23
+ }
24
+ ```
25
+
26
+ ## Setup
27
+
28
+ Fastest path from this repo:
29
+
30
+ ```bash
31
+ python scripts/hf_clone.py af3-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_ENDPOINT_REPO
32
+ ```
33
+
34
+ Then deploy a Dedicated Endpoint from that model repo.
35
+
36
+ Important: make sure your endpoint repo contains top-level:
37
+ - `handler.py`
38
+ - `requirements.txt`
39
+ - `README.md`
40
+
41
+ Use endpoint task `custom` so the runtime loads `handler.py` instead of a default Transformers pipeline.
42
+
43
+ ## Endpoint env vars
44
+
45
+ Required:
46
+ - `AF3_MODEL_ID=nvidia/audio-flamingo-3-hf`
47
+
48
+ Optional runtime bootstrap (defaults shown):
49
+ - `AF3_BOOTSTRAP_RUNTIME=1`
50
+ - `AF3_TRANSFORMERS_SPEC=transformers==5.1.0`
51
+ - `AF3_RUNTIME_DIR=/tmp/af3_runtime`
52
+ - `AF3_STUB_TORCHVISION=1`
53
+
54
+ ## Notes
55
+
56
+ - Audio Flamingo 3 is large; use a GPU endpoint.
57
+ - First boot can take longer because the handler installs AF3-compatible runtime dependencies.
58
+ - This handler returns raw prose analysis. Use the local AF3+ChatGPT pipeline to normalize to LoRA sidecar JSON.
templates/hf-af3-caption-endpoint/handler.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import importlib
3
+ import importlib.machinery
4
+ import importlib.util
5
+ import io
6
+ import os
7
+ import subprocess
8
+ import sys
9
+ import types
10
+ from typing import Any, Dict, List, Tuple
11
+
12
+ import numpy as np
13
+ import soundfile as sf
14
+ import torch
15
+
16
+
17
+ def _resolve_model_id(model_dir: str) -> str:
18
+ default_id = os.getenv("AF3_MODEL_ID", "nvidia/audio-flamingo-3-hf")
19
+ if model_dir and os.path.isdir(model_dir):
20
+ has_local = os.path.exists(os.path.join(model_dir, "config.json"))
21
+ if has_local:
22
+ return model_dir
23
+ return default_id
24
+
25
+
26
+ def _log(msg: str) -> None:
27
+ print(f"[AF3 handler] {msg}", flush=True)
28
+
29
+
30
+ def _env_true(name: str, default: bool = False) -> bool:
31
+ raw = os.getenv(name)
32
+ if raw is None:
33
+ return default
34
+ return str(raw).strip().lower() in {"1", "true", "yes", "on"}
35
+
36
+
37
def _install_torchvision_stub() -> None:
    """Register a minimal fake ``torchvision`` in sys.modules.

    The AF3 runtime only touches ``torchvision.transforms.InterpolationMode``
    constants, so a tiny namespace is enough to satisfy the import without
    installing the real (heavy) package. Disabled by setting
    AF3_STUB_TORCHVISION to a falsy value (default: enabled).
    """
    if not _env_true("AF3_STUB_TORCHVISION", True):
        return
    modes = types.SimpleNamespace(
        NEAREST=0,
        LANCZOS=1,
        BILINEAR=2,
        BICUBIC=3,
        BOX=4,
        HAMMING=5,
    )
    transforms_mod = types.ModuleType("torchvision.transforms")
    transforms_mod.InterpolationMode = modes
    transforms_mod.__spec__ = importlib.machinery.ModuleSpec(
        name="torchvision.transforms", loader=None
    )
    root_mod = types.ModuleType("torchvision")
    root_mod.transforms = transforms_mod
    root_mod.__spec__ = importlib.machinery.ModuleSpec(name="torchvision", loader=None)
    sys.modules["torchvision"] = root_mod
    sys.modules["torchvision.transforms"] = transforms_mod
64
+
65
+
66
+ _FIND_SPEC_PATCHED = False
67
+
68
+
69
+ def _patch_optional_backend_discovery() -> None:
70
+ global _FIND_SPEC_PATCHED
71
+ if _FIND_SPEC_PATCHED:
72
+ return
73
+ blocked = {"torchvision", "librosa"}
74
+ original_find_spec = importlib.util.find_spec
75
+
76
+ def wrapped_find_spec(name: str, package: str | None = None):
77
+ root = name.split(".", 1)[0]
78
+ if root in blocked:
79
+ return None
80
+ return original_find_spec(name, package)
81
+
82
+ importlib.util.find_spec = wrapped_find_spec # type: ignore[assignment]
83
+ _FIND_SPEC_PATCHED = True
84
+
85
+
86
+ def _clear_python_modules(prefixes: Tuple[str, ...]) -> None:
87
+ for name in list(sys.modules.keys()):
88
+ if any(name == p or name.startswith(f"{p}.") for p in prefixes):
89
+ sys.modules.pop(name, None)
90
+
91
+
92
+ def _patch_torch_compat() -> None:
93
+ try:
94
+ import torch._dynamo._trace_wrapped_higher_order_op as dyn_wrap
95
+ except Exception:
96
+ return
97
+ if hasattr(dyn_wrap, "TransformGetItemToIndex"):
98
+ return
99
+
100
+ class TransformGetItemToIndex: # pragma: no cover - runtime compatibility shim
101
+ pass
102
+
103
+ setattr(dyn_wrap, "TransformGetItemToIndex", TransformGetItemToIndex)
104
+
105
+
106
+ def _af3_classes_available() -> tuple[bool, str]:
107
+ try:
108
+ from transformers import AudioFlamingo3ForConditionalGeneration # noqa: F401
109
+ from transformers import AudioFlamingo3Processor # noqa: F401
110
+
111
+ return True, ""
112
+ except Exception as exc:
113
+ return False, f"{type(exc).__name__}: {exc}"
114
+
115
+
116
def _bootstrap_runtime_transformers(target_dir: str) -> None:
    """pip-install an AF3-compatible transformers stack into *target_dir*.

    The endpoint image may bundle a transformers too old for AF3; this installs
    a pinned stack side-by-side so it can be preferred via sys.path. The pin is
    overridable through AF3_TRANSFORMERS_SPEC.
    """
    deps = [
        os.getenv("AF3_TRANSFORMERS_SPEC", "transformers==5.1.0"),
        "numpy<2",
        "accelerate>=1.1.0",
        "sentencepiece",
        "safetensors",
        "soxr",
    ]
    _log("Installing runtime deps for AF3 (first boot can take a few minutes).")
    subprocess.check_call(
        [
            sys.executable,
            "-m",
            "pip",
            "install",
            "--upgrade",
            "--no-cache-dir",
            "--target",
            target_dir,
            *deps,
        ]
    )
128
+
129
+
130
def _ensure_af3_transformers():
    """Return a transformers module that exposes the AF3 model/processor classes.

    Strategy: install the import-machinery patches, try the bundled
    transformers first, and if the AF3 classes are missing, pip-bootstrap a
    pinned runtime into a side directory, purge cached HF modules, and
    re-import from there. Raises RuntimeError when bootstrap is disabled or
    fails to produce the classes.
    """
    # Patches must be in place before transformers is (re)imported.
    _patch_optional_backend_discovery()
    _install_torchvision_stub()
    _patch_torch_compat()

    import transformers

    ok, err = _af3_classes_available()
    if ok:
        _log(f"Using bundled transformers={transformers.__version__}")
        return transformers

    # Bundled stack lacks AF3 classes; only proceed if bootstrap is allowed.
    if not _env_true("AF3_BOOTSTRAP_RUNTIME", True):
        raise RuntimeError(
            "AF3 classes are unavailable in bundled transformers "
            f"({transformers.__version__}) and AF3_BOOTSTRAP_RUNTIME is disabled. "
            f"Last import error: {err}"
        )

    target_dir = os.getenv("AF3_RUNTIME_DIR", "/tmp/af3_runtime")
    os.makedirs(target_dir, exist_ok=True)
    _bootstrap_runtime_transformers(target_dir)
    # Prefer the freshly installed stack over the bundled one.
    if target_dir not in sys.path:
        sys.path.insert(0, target_dir)

    # Drop cached HF modules so the re-import resolves against target_dir,
    # then re-apply the patches (the purge removed their effects on re-import).
    _clear_python_modules(("transformers", "tokenizers", "huggingface_hub", "safetensors"))
    _patch_optional_backend_discovery()
    _install_torchvision_stub()
    _patch_torch_compat()
    importlib.invalidate_caches()
    transformers = importlib.import_module("transformers")

    ok, err = _af3_classes_available()
    if not ok:
        raise RuntimeError(
            "Failed to load AF3 processor classes after runtime bootstrap. "
            f"transformers={getattr(transformers, '__version__', 'unknown')} "
            f"error={err}"
        )
    _log(f"Bootstrapped transformers={transformers.__version__}")
    return transformers
171
+
172
+
173
+ def _resample_audio_mono(audio: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray:
174
+ if src_sr == dst_sr:
175
+ return audio.astype(np.float32, copy=False)
176
+ if audio.size == 0:
177
+ return np.zeros((0,), dtype=np.float32)
178
+ src_idx = np.arange(audio.shape[0], dtype=np.float64)
179
+ dst_len = int(round(audio.shape[0] * float(dst_sr) / float(src_sr)))
180
+ dst_len = max(dst_len, 1)
181
+ dst_idx = np.linspace(0.0, float(max(audio.shape[0] - 1, 0)), dst_len, dtype=np.float64)
182
+ out = np.interp(dst_idx, src_idx, audio.astype(np.float64, copy=False))
183
+ return out.astype(np.float32, copy=False)
184
+
185
+
186
def _decode_audio_from_b64(audio_b64: str) -> tuple[np.ndarray, int]:
    """Decode base64 audio bytes into a mono float32 signal at 16 kHz.

    Stereo input is downmixed by channel averaging; anything higher-rank is
    flattened. Returns (samples, sample_rate) with sample_rate always 16000.
    """
    target_sr = 16000
    samples, rate = sf.read(
        io.BytesIO(base64.b64decode(audio_b64)), dtype="float32", always_2d=False
    )
    if samples.ndim == 2:
        samples = np.mean(samples, axis=1)
    if samples.ndim != 1:
        samples = np.asarray(samples).reshape(-1)
    rate = int(rate)
    if rate != target_sr:
        samples = _resample_audio_mono(samples, rate, target_sr)
        rate = target_sr
    return samples.astype(np.float32, copy=False), rate
198
+
199
+
200
class EndpointHandler:
    """
    Hugging Face Dedicated Endpoint custom handler for Audio Flamingo 3.

    Request:
        {
          "inputs": {
            "prompt": "...",
            "audio_base64": "...",
            "max_new_tokens": 1200,
            "temperature": 0.1
          }
        }

    Response:
        {"generated_text": "..."}
    """

    def __init__(self, model_dir: str = ""):
        # Prefer a checkpoint packaged in model_dir, else the AF3_MODEL_ID default.
        self.model_id = _resolve_model_id(model_dir)
        # May pip-bootstrap a pinned transformers stack on first boot.
        self.transformers = _ensure_af3_transformers()
        from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

        _log(
            f"torch={torch.__version__} cuda={torch.cuda.is_available()} "
            f"transformers={self.transformers.__version__} model_id={self.model_id}"
        )

        # fp16 on GPU, fp32 on CPU.
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        self.processor = AutoProcessor.from_pretrained(self.model_id, trust_remote_code=True)
        self.model = AudioFlamingo3ForConditionalGeneration.from_pretrained(
            self.model_id,
            torch_dtype=dtype,
            trust_remote_code=True,
        )
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def _build_inputs(self, audio: np.ndarray, sample_rate: int, prompt: str) -> Dict[str, Any]:
        """Tokenize one user turn (audio + text) via the processor's chat template."""
        conversation: List[Dict[str, Any]] = [
            {
                "role": "user",
                "content": [
                    {"type": "audio", "audio": audio},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        try:
            # Newer processors accept audio_kwargs; pass the true sampling rate.
            return self.processor.apply_chat_template(
                conversation,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
                audio_kwargs={"sampling_rate": int(sample_rate)},
            )
        except Exception:
            # Older processors reject audio_kwargs; retry without it.
            return self.processor.apply_chat_template(
                conversation,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
            )

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Run one generation request; failures come back as {"error": ...}."""
        # Accept both {"inputs": {...}} and a bare payload dict.
        payload = data.get("inputs", data) if isinstance(data, dict) else {}
        prompt = str(payload.get("prompt", "Analyze this full song and summarize arrangement changes.")).strip()
        audio_b64 = payload.get("audio_base64")
        if not audio_b64:
            return {"error": "audio_base64 is required"}

        max_new_tokens = int(payload.get("max_new_tokens", 1200))
        temperature = float(payload.get("temperature", 0.1))

        try:
            audio, sample_rate = _decode_audio_from_b64(audio_b64)
            inputs = self._build_inputs(audio, sample_rate, prompt)
            device = next(self.model.parameters()).device
            model_dtype = next(self.model.parameters()).dtype
            # Move tensors to the model device; cast float tensors to model dtype.
            for key, value in list(inputs.items()):
                if hasattr(value, "to"):
                    if hasattr(value, "dtype") and torch.is_floating_point(value):
                        inputs[key] = value.to(device=device, dtype=model_dtype)
                    else:
                        inputs[key] = value.to(device)

            # temperature <= 0 means greedy decoding.
            do_sample = bool(temperature > 0)
            gen_kwargs = {
                "max_new_tokens": max_new_tokens,
                "do_sample": do_sample,
            }
            if do_sample:
                gen_kwargs["temperature"] = max(temperature, 1e-5)

            with torch.no_grad():
                outputs = self.model.generate(**inputs, **gen_kwargs)

            # Decode only the newly generated tokens (skip the prompt prefix).
            start = int(inputs["input_ids"].shape[1])
            text = self.processor.batch_decode(outputs[:, start:], skip_special_tokens=True)[0].strip()
            if not text:
                # Fallback: some backends return only the new tokens already.
                text = self.processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
            return {"generated_text": text}
        except Exception as exc:
            return {"error": str(exc)}
templates/hf-af3-caption-endpoint/requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ numpy<2
2
+ soundfile
templates/hf-af3-nvidia-endpoint/README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audio Flamingo 3 NVIDIA-Stack Endpoint Template
2
+
3
+ This template uses the same core runtime pattern as NVIDIA's Space:
4
+ - `llava` code from `nvidia/audio-flamingo-3` (space repo)
5
+ - base checkpoint from `nvidia/audio-flamingo-3` (model repo)
6
+ - optional `stage35` think/long adapter
7
+
8
+ ## Request contract
9
+
10
+ ```json
11
+ {
12
+ "inputs": {
13
+ "prompt": "Please describe the audio in detail.",
14
+ "audio_base64": "<base64 WAV bytes>",
15
+ "think_mode": true,
16
+ "max_new_tokens": 2048,
17
+ "temperature": 0.2
18
+ }
19
+ }
20
+ ```
21
+
22
+ ## Response contract
23
+
24
+ ```json
25
+ {
26
+ "generated_text": "...",
27
+ "mode": "think"
28
+ }
29
+ ```
30
+
31
+ ## Bootstrap command
32
+
33
+ ```bash
34
+ python scripts/hf_clone.py af3-nvidia-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_NVIDIA_ENDPOINT_REPO
35
+ ```
36
+
37
+ ## Endpoint settings
38
+
39
+ - Task: `custom`
40
+ - GPU instance required
41
+ - Secrets:
42
+ - `HF_TOKEN=<your_token>`
43
+
44
+ ## Optional env vars
45
+
46
+ - `AF3_NV_CODE_REPO_ID=nvidia/audio-flamingo-3`
47
+ - `AF3_NV_MODEL_REPO_ID=nvidia/audio-flamingo-3`
48
+ - `AF3_NV_CODE_REPO_TYPE=space`
49
+ - `AF3_NV_MODEL_REPO_TYPE=model`
50
+ - `AF3_NV_DEFAULT_MODE=think`
51
+ - `AF3_NV_LOAD_THINK=1`
52
+ - `AF3_NV_LOAD_SINGLE=0`
53
+
54
+ Default behavior loads think/long mode for higher-quality long-form reasoning.
templates/hf-af3-nvidia-endpoint/handler.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import copy
3
+ import os
4
+ import sys
5
+ import tempfile
6
+ from typing import Any, Dict
7
+
8
+ import torch
9
+ from huggingface_hub import snapshot_download
10
+ from peft import PeftModel
11
+
12
+
13
+ DEFAULT_PROMPT = "Please describe the audio in detail."
14
+
15
+
16
+ def _log(msg: str) -> None:
17
+ print(f"[AF3 NVIDIA handler] {msg}", flush=True)
18
+
19
+
20
+ def _env_true(name: str, default: bool = False) -> bool:
21
+ raw = os.getenv(name)
22
+ if raw is None:
23
+ return default
24
+ return str(raw).strip().lower() in {"1", "true", "yes", "on"}
25
+
26
+
27
+ def _strip_state_dict_prefixes(state_dict: Dict[str, Any]) -> Dict[str, Any]:
28
+ out: Dict[str, Any] = {}
29
+ for key, value in state_dict.items():
30
+ key2 = key[6:] if key.startswith("model.") else key
31
+ out[key2] = value
32
+ return out
33
+
34
+
35
class EndpointHandler:
    """
    NVIDIA AF3 stack endpoint handler (matches Space architecture closely).

    Request:
        {
          "inputs": {
            "prompt": "...",
            "audio_base64": "...",
            "think_mode": true,
            "max_new_tokens": 2048,
            "temperature": 0.2
          }
        }

    Response:
        {"generated_text": "...", "mode": "think|single"}
    """

    def __init__(self, model_dir: str = ""):
        # model_dir is ignored: code and weights are pulled from the Hub repos
        # configured via AF3_NV_* env vars instead.
        del model_dir
        self.hf_token = os.getenv("HF_TOKEN", "")
        self.code_repo_id = os.getenv("AF3_NV_CODE_REPO_ID", "nvidia/audio-flamingo-3")
        self.model_repo_id = os.getenv("AF3_NV_MODEL_REPO_ID", "nvidia/audio-flamingo-3")
        self.code_repo_type = os.getenv("AF3_NV_CODE_REPO_TYPE", "space")
        self.model_repo_type = os.getenv("AF3_NV_MODEL_REPO_TYPE", "model")
        self.default_mode = os.getenv("AF3_NV_DEFAULT_MODE", "think").strip().lower()
        if self.default_mode not in {"think", "single"}:
            self.default_mode = "think"

        # Which variants to keep resident; single defaults on only when it is
        # also the default mode.
        self.load_think = _env_true("AF3_NV_LOAD_THINK", True)
        self.load_single = _env_true("AF3_NV_LOAD_SINGLE", self.default_mode == "single")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        _log(f"torch={torch.__version__} cuda={torch.cuda.is_available()} device={self.device}")
        _log(
            f"code_repo={self.code_repo_type}:{self.code_repo_id} "
            f"model_repo={self.model_repo_type}:{self.model_repo_id} default_mode={self.default_mode}"
        )

        self.llava = self._load_llava_runtime()
        self.model_root = self._download_model_root()

        self.model_single = None
        self.model_think = None

        if self.load_single:
            self.model_single = self._load_single_model()
        if self.load_think:
            self.model_think = self._load_think_model()

        if self.model_single is None and self.model_think is None:
            raise RuntimeError("No model loaded. Enable AF3_NV_LOAD_THINK or AF3_NV_LOAD_SINGLE.")

    def _load_llava_runtime(self):
        """Download the `llava/` package from the code repo and import it."""
        code_root = snapshot_download(
            repo_id=self.code_repo_id,
            repo_type=self.code_repo_type,
            allow_patterns=["llava/**"],
            token=self.hf_token or None,
        )
        # The snapshot root must be importable for `import llava` to resolve.
        if code_root not in sys.path:
            sys.path.insert(0, code_root)
        import llava  # type: ignore

        _log(f"Loaded llava runtime from {code_root}")
        return llava

    def _download_model_root(self) -> str:
        """Snapshot the full model repo and return its local path."""
        model_root = snapshot_download(
            repo_id=self.model_repo_id,
            repo_type=self.model_repo_type,
            token=self.hf_token or None,
        )
        _log(f"Model root: {model_root}")
        return model_root

    def _load_single_model(self):
        """Load the base (single-turn) AF3 checkpoint onto self.device."""
        _log("Loading single-turn model...")
        model = self.llava.load(self.model_root, model_base=None)
        model = model.to(self.device)
        model.eval()
        return model

    def _load_think_model(self):
        """Load the base model plus the stage35 think/long LoRA adapter.

        Raises RuntimeError when the adapter's non-LoRA weights are absent.
        """
        _log("Loading think/long model (stage35 adapter)...")
        stage35_dir = os.path.join(self.model_root, "stage35")
        non_lora_path = os.path.join(stage35_dir, "non_lora_trainables.bin")
        if not os.path.exists(non_lora_path):
            raise RuntimeError(f"stage35 non_lora_trainables missing: {non_lora_path}")

        model = self.llava.load(self.model_root, model_base=None)
        model = model.to(self.device)

        # Apply the adapter's non-LoRA weights first ("model." prefix stripped
        # to match the loaded module names), then wrap with the LoRA adapter.
        non_lora_trainables = torch.load(non_lora_path, map_location="cpu")
        non_lora_trainables = _strip_state_dict_prefixes(non_lora_trainables)
        model.load_state_dict(non_lora_trainables, strict=False)

        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        model = PeftModel.from_pretrained(
            model,
            stage35_dir,
            device_map="auto" if torch.cuda.is_available() else None,
            torch_dtype=dtype,
        )
        model.eval()
        return model

    def _select_model(self, think_mode: bool):
        """Pick (model, mode) for the request, falling back to whatever loaded."""
        if think_mode and self.model_think is not None:
            return self.model_think, "think"
        if (not think_mode) and self.model_single is not None:
            return self.model_single, "single"
        if self.model_think is not None:
            return self.model_think, "think"
        return self.model_single, "single"

    def _build_generation_config(self, model, max_new_tokens: int, temperature: float):
        """Clone the model's default generation config with request overrides.

        Returns None when the model exposes no default config, in which case
        generation proceeds with the library defaults.
        """
        base_cfg = getattr(model, "default_generation_config", None)
        if base_cfg is None:
            return None
        cfg = copy.deepcopy(base_cfg)
        if max_new_tokens > 0:
            setattr(cfg, "max_new_tokens", int(max_new_tokens))
        # temperature <= 0 means greedy decoding.
        if temperature > 0:
            setattr(cfg, "temperature", float(temperature))
            setattr(cfg, "do_sample", True)
        else:
            setattr(cfg, "do_sample", False)
        return cfg

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Run one generation request; failures come back as {"error": ...}."""
        # Accept both {"inputs": {...}} and a bare payload dict.
        payload = data.get("inputs", data) if isinstance(data, dict) else {}
        audio_b64 = payload.get("audio_base64")
        if not audio_b64:
            return {"error": "audio_base64 is required"}

        prompt = str(payload.get("prompt", DEFAULT_PROMPT)).strip() or DEFAULT_PROMPT
        # Absent think_mode falls back to the configured default mode.
        think_mode_val = payload.get("think_mode")
        if think_mode_val is None:
            think_mode = self.default_mode == "think"
        else:
            think_mode = bool(think_mode_val)

        max_new_tokens = int(payload.get("max_new_tokens", 2048))
        temperature = float(payload.get("temperature", 0.2))
        model, mode = self._select_model(think_mode)

        # The llava runtime reads audio from a file path, so the decoded bytes
        # are written to a temp .wav that is always removed in `finally`.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(base64.b64decode(audio_b64))

        try:
            sound = self.llava.Sound(tmp_path)
            # <sound> is the llava placeholder token for the audio input.
            full_prompt = f"<sound>\n{prompt}"
            gen_cfg = self._build_generation_config(model, max_new_tokens=max_new_tokens, temperature=temperature)

            with torch.inference_mode():
                if gen_cfg is not None:
                    response = model.generate_content([sound, full_prompt], generation_config=gen_cfg)
                else:
                    response = model.generate_content([sound, full_prompt])
            return {"generated_text": str(response).strip(), "mode": mode}
        except Exception as exc:
            return {"error": str(exc), "mode": mode}
        finally:
            try:
                os.unlink(tmp_path)
            except Exception:
                pass
+ pass
templates/hf-af3-nvidia-endpoint/requirements.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers==4.46.0
2
+ accelerate==0.34.2
3
+ peft==0.14.0
4
+ numpy==1.26.4
5
+ Pillow
6
+ pydub
7
+ soundfile
8
+ librosa
9
+ openai-whisper
10
+ ftfy
11
+ jiwer
12
+ einops
13
+ hydra-core
14
+ loguru
15
+ matplotlib
16
+ pytorchvideo==0.1.5
17
+ deepspeed==0.15.4
18
+ kaldiio
19
+ wandb
20
+ opencv-python-headless==4.8.0.76
21
+ protobuf==3.20.*
22
+ termcolor
23
+ sentencepiece
templates/hf-qwen-caption-endpoint/README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Qwen2-Audio Caption Endpoint Template
2
+
3
+ Use this as a custom `handler.py` runtime for a Hugging Face Dedicated Endpoint.
4
+
5
+ ## Request contract
6
+
7
+ ```json
8
+ {
9
+ "inputs": {
10
+ "prompt": "Analyze and describe this music segment.",
11
+ "audio_base64": "<base64-encoded WAV bytes>",
12
+ "sample_rate": 16000,
13
+ "max_new_tokens": 384,
14
+ "temperature": 0.1
15
+ }
16
+ }
17
+ ```
18
+
19
+ ## Response contract
20
+
21
+ ```json
22
+ {
23
+ "generated_text": "..."
24
+ }
25
+ ```
26
+
27
+ ## Setup
28
+
29
+ Fastest way from this repo:
30
+
31
+ ```bash
32
+ python scripts/hf_clone.py qwen-endpoint --repo-id YOUR_USERNAME/YOUR_QWEN_ENDPOINT_REPO
33
+ ```
34
+
35
+ Then deploy a Dedicated Endpoint from that repo with task `custom`.
36
+
37
+ Manual path:
38
+
39
+ 1. Create a new model repo for your endpoint runtime.
40
+ 2. Copy `handler.py` from this folder into that repo as top-level `handler.py`.
41
+ 3. Add a `requirements.txt` containing at least:
42
+ - `torch`
43
+ - `torchaudio`
44
+ - `transformers>=4.53.0,<4.58.0`
45
+ - `soundfile`
46
+ - `numpy`
47
+ 4. Deploy a Dedicated Endpoint from that repo.
48
+ 5. Optional endpoint env var:
49
+ - `QWEN_MODEL_ID=Qwen/Qwen2-Audio-7B-Instruct`
50
+
51
+ Then point `qwen_caption_app.py` backend `hf_endpoint` at that endpoint URL.
52
+
53
+ ## Quick local test script
54
+
55
+ From this repo:
56
+
57
+ ```bash
58
+ python scripts/endpoint/test_qwen_caption_endpoint.py \
59
+ --url https://YOUR_ENDPOINT.endpoints.huggingface.cloud \
60
+ --token hf_xxx \
61
+ --audio path/to/song.wav
62
+ ```
templates/hf-qwen-caption-endpoint/handler.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import os
4
+ from typing import Any, Dict
5
+
6
+ import numpy as np
7
+ import soundfile as sf
8
+ import torch
9
+ from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
10
+
11
+
12
def _decode_audio_b64(audio_b64: str):
    """Decode base64 audio bytes to (mono float32 samples, sample rate).

    Multichannel input is downmixed by averaging across channels.
    """
    buffer = io.BytesIO(base64.b64decode(audio_b64))
    samples, rate = sf.read(buffer, dtype="float32", always_2d=True)
    return samples.mean(axis=1).astype(np.float32), int(rate)
17
+
18
+
19
class EndpointHandler:
    """
    HF Dedicated Endpoint custom handler for Qwen2-Audio captioning.

    request:
        {
          "inputs": {
            "prompt": "...",
            "audio_base64": "...",
            "sample_rate": 16000,
            "max_new_tokens": 384,
            "temperature": 0.1
          }
        }
    response:
        {"generated_text": "..."}
    """

    def __init__(self, model_dir: str = ""):
        # Hub model by default; QWEN_MODEL_ID env var overrides it.
        model_id = os.getenv("QWEN_MODEL_ID", "Qwen/Qwen2-Audio-7B-Instruct")
        # Only load from model_dir when actual weights/config are packaged there.
        if model_dir and os.path.isdir(model_dir):
            has_local_model = (
                os.path.exists(os.path.join(model_dir, "config.json"))
                and (
                    os.path.exists(os.path.join(model_dir, "model.safetensors"))
                    or any(name.endswith(".safetensors") for name in os.listdir(model_dir))
                )
            )
            if has_local_model:
                model_id = model_dir

        # fp16 on GPU, fp32 on CPU.
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
        self.model = Qwen2AudioForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=dtype,
            trust_remote_code=True,
        )
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Run one captioning request; missing audio yields {"error": ...}."""
        # Accept both {"inputs": {...}} and a bare payload dict.
        payload = data.get("inputs", data) if isinstance(data, dict) else {}
        prompt = str(payload.get("prompt", "Analyze this music audio.")).strip()
        audio_b64 = payload.get("audio_base64")
        if not audio_b64:
            return {"error": "audio_base64 is required"}

        max_new_tokens = int(payload.get("max_new_tokens", 384))
        temperature = float(payload.get("temperature", 0.1))

        audio, sr = _decode_audio_b64(audio_b64)
        # A caller-supplied sample_rate overrides the decoded rate.
        # NOTE(review): no resampling happens here — assumes the audio already
        # matches the rate handed to the processor; confirm for non-16k input.
        sampling_rate = int(payload.get("sample_rate", sr))

        # Use direct audio token format to force audio conditioning.
        chat_text = f"<|audio_bos|><|AUDIO|><|audio_eos|>\n{prompt}\n"
        inputs = self.processor(
            text=chat_text,
            audio=[audio],
            sampling_rate=sampling_rate,
            return_tensors="pt",
            padding=True,
        )

        # Move every tensor in the processed batch onto the model's device.
        device = next(self.model.parameters()).device
        for key, value in list(inputs.items()):
            if hasattr(value, "to"):
                inputs[key] = value.to(device)

        # temperature <= 0 means greedy decoding.
        do_sample = bool(temperature and temperature > 0)
        gen_kwargs = {
            "max_new_tokens": int(max_new_tokens),
            "do_sample": do_sample,
        }
        if do_sample:
            gen_kwargs["temperature"] = max(float(temperature), 1e-5)

        with torch.no_grad():
            generated_ids = self.model.generate(**inputs, **gen_kwargs)
        # Decode only the newly generated tokens (skip the prompt prefix).
        prompt_tokens = inputs["input_ids"].shape[1]
        generated_new = generated_ids[:, prompt_tokens:]
        text = self.processor.batch_decode(
            generated_new,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
        if not text.strip():
            # Some backends may return generated-only ids without prefix tokens.
            text = self.processor.batch_decode(
                generated_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )[0]
        return {"generated_text": text.strip()}
templates/hf-qwen-caption-endpoint/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ soundfile
4
+ numpy
5
+ transformers>=4.53.0,<4.58.0
6
+ accelerate
utils/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ 
utils/env_config.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Environment helpers for project-wide .env loading."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+
8
+ from dotenv import load_dotenv
9
+
10
+ _PROJECT_ROOT = Path(__file__).resolve().parents[1]
11
+ _DOTENV_PATH = _PROJECT_ROOT / ".env"
12
+ _DOTENV_LOADED = False
13
+
14
+
15
def load_project_env() -> None:
    """Load the repo-root .env at most once per process.

    Existing os.environ entries are never overridden (override=False).
    """
    global _DOTENV_LOADED
    if not _DOTENV_LOADED:
        load_dotenv(dotenv_path=_DOTENV_PATH, override=False)
        _DOTENV_LOADED = True
21
+
22
+
23
def get_env(*keys: str, default: str = "") -> str:
    """Return the first non-empty env value among *keys*, else *default*.

    Loads the project .env first so file-defined values are visible.
    """
    load_project_env()
    return next((os.getenv(key) for key in keys if os.getenv(key)), default)
30
+
31
+
32
def set_default_env_file_value(key: str, value: str) -> bool:
    """Append ``key=value`` to .env unless the key already exists.

    Blank keys are rejected; comment lines and non-assignment lines are
    ignored when scanning for an existing key. Returns True only when the
    file was actually modified.
    """
    normalized_key = (key or "").strip()
    if not normalized_key:
        return False

    existing_lines = (
        _DOTENV_PATH.read_text(encoding="utf-8").splitlines()
        if _DOTENV_PATH.exists()
        else []
    )

    for raw_line in existing_lines:
        entry = raw_line.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        existing_key, _ = entry.split("=", 1)
        if existing_key.strip() == normalized_key:
            # Key already present — leave the file untouched.
            return False

    existing_lines.append(f"{normalized_key}={value}")
    _DOTENV_PATH.write_text("\n".join(existing_lines).strip() + "\n", encoding="utf-8")
    return True
+ return True