Andrew committed on
Commit
8bdd018
·
1 Parent(s): 72f8b14

Consolidate AF3/Qwen pipelines, endpoint templates, and setup docs

Browse files
Files changed (45) hide show
  1. .env.example +8 -0
  2. .gitignore +7 -0
  3. README.md +262 -2
  4. af3_chatgpt_pipeline.py +584 -0
  5. af3_gui_app.py +17 -0
  6. docs/deploy/AF3_ENDPOINT.md +79 -0
  7. docs/deploy/AF3_NVIDIA_ENDPOINT.md +64 -0
  8. docs/deploy/QWEN_SPACE.md +26 -0
  9. docs/deploy/SPACE.md +1 -1
  10. docs/guides/README.md +1 -0
  11. docs/guides/af3-chatgpt-pipeline.md +155 -0
  12. docs/guides/qwen2-audio-train.md +171 -0
  13. qwen_audio_captioning.py +996 -0
  14. qwen_caption_app.py +506 -0
  15. react-ui/index.html +12 -0
  16. react-ui/package-lock.json +1674 -0
  17. react-ui/package.json +19 -0
  18. react-ui/src/App.jsx +223 -0
  19. react-ui/src/main.jsx +11 -0
  20. react-ui/src/styles.css +189 -0
  21. react-ui/vite.config.js +15 -0
  22. requirements.txt +6 -1
  23. scripts/annotations/qwen_annotate_file.py +122 -0
  24. scripts/annotations/qwen_caption_dataset.py +203 -0
  25. scripts/dev/run_af3_gui.ps1 +21 -0
  26. scripts/dev/run_af3_gui.py +76 -0
  27. scripts/endpoint/test_af3_caption_endpoint.py +155 -0
  28. scripts/endpoint/test_qwen_caption_endpoint.py +132 -0
  29. scripts/hf_clone.py +185 -1
  30. scripts/jobs/submit_hf_qwen_caption_job.ps1 +133 -0
  31. scripts/pipeline/refine_dataset_json_with_openai.py +291 -0
  32. scripts/pipeline/run_af3_chatgpt_pipeline.py +158 -0
  33. services/pipeline_api.py +242 -0
  34. summaries/findings.md +160 -80
  35. templates/hf-af3-caption-endpoint/README.md +58 -0
  36. templates/hf-af3-caption-endpoint/handler.py +305 -0
  37. templates/hf-af3-caption-endpoint/requirements.txt +2 -0
  38. templates/hf-af3-nvidia-endpoint/README.md +54 -0
  39. templates/hf-af3-nvidia-endpoint/handler.py +204 -0
  40. templates/hf-af3-nvidia-endpoint/requirements.txt +23 -0
  41. templates/hf-qwen-caption-endpoint/README.md +62 -0
  42. templates/hf-qwen-caption-endpoint/handler.py +112 -0
  43. templates/hf-qwen-caption-endpoint/requirements.txt +6 -0
  44. utils/__init__.py +1 -0
  45. utils/env_config.py +52 -0
.env.example CHANGED
@@ -1,5 +1,13 @@
 
1
  HF_TOKEN=hf_xxx_your_token_here
2
  HF_ENDPOINT_URL=https://your-endpoint-url.endpoints.huggingface.cloud
 
 
 
 
 
 
 
3
 
4
  # Optional defaults used by scripts/hf_clone.py
5
  HF_USERNAME=your-hf-username
 
1
+ # Copy this file to `.env` locally. Do not commit real secrets.
2
  HF_TOKEN=hf_xxx_your_token_here
3
  HF_ENDPOINT_URL=https://your-endpoint-url.endpoints.huggingface.cloud
4
+ HF_QWEN_ENDPOINT_URL=https://your-qwen-endpoint-url.endpoints.huggingface.cloud
5
+ HF_AF3_ENDPOINT_URL=https://your-af3-endpoint-url.endpoints.huggingface.cloud
6
+ QWEN_MODEL_ID=Qwen/Qwen2-Audio-7B-Instruct
7
+ AF3_MODEL_ID=nvidia/audio-flamingo-3-hf
8
+ AF3_NV_DEFAULT_MODE=think
9
+ OPENAI_API_KEY=sk-proj-xxx
10
+ OPENAI_MODEL=gpt-5-mini
11
 
12
  # Optional defaults used by scripts/hf_clone.py
13
  HF_USERNAME=your-hf-username
.gitignore CHANGED
@@ -17,6 +17,9 @@ htmlcov/
17
  build/
18
  dist/
19
  *.egg-info/
 
 
 
20
 
21
  # Virtual environments
22
  .venv/
@@ -27,6 +30,7 @@ env/
27
  .cache/
28
  .huggingface/
29
  .gradio/
 
30
 
31
  # Logs/temp
32
  *.log
@@ -37,6 +41,7 @@ env/
37
  # Model/data/runtime artifacts
38
  checkpoints/
39
  lora_output/
 
40
  outputs/
41
  artifacts/
42
  models/
@@ -64,3 +69,5 @@ Thumbs.db
64
  # Optional local working copies
65
  Lora-ace-step/
66
  song_summaries_llm*.md
 
 
 
17
  build/
18
  dist/
19
  *.egg-info/
20
+ node_modules/
21
+ react-ui/node_modules/
22
+ react-ui/dist/
23
 
24
  # Virtual environments
25
  .venv/
 
30
  .cache/
31
  .huggingface/
32
  .gradio/
33
+ .tmp_tf*/
34
 
35
  # Logs/temp
36
  *.log
 
41
  # Model/data/runtime artifacts
42
  checkpoints/
43
  lora_output/
44
+ qwen_annotations/
45
  outputs/
46
  artifacts/
47
  models/
 
69
  # Optional local working copies
70
  Lora-ace-step/
71
  song_summaries_llm*.md
72
+
73
+ train-dataset/*
README.md CHANGED
@@ -21,6 +21,9 @@ Train ACE-Step 1.5 LoRA adapters, deploy your own Hugging Face Space, and run pr
21
 
22
  - LoRA training UI and workflow: `app.py`, `lora_ui.py`
23
  - CLI LoRA trainer for local/HF datasets: `lora_train.py`
 
 
 
24
  - Custom endpoint runtime: `handler.py`, `acestep/`
25
  - Bootstrap automation for cloning into your HF account: `scripts/hf_clone.py`
26
  - Endpoint test clients and HF job launcher: `scripts/endpoint/`, `scripts/jobs/`
@@ -35,6 +38,103 @@ python app.py
35
 
36
  Open `http://localhost:7860`.
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  ## Clone to your HF account
39
 
40
  Use the two buttons near the top of this README to create target repos in your HF account, then run:
@@ -61,6 +161,28 @@ Clone your own Endpoint repo:
61
  python scripts/hf_clone.py endpoint --repo-id YOUR_USERNAME/YOUR_ENDPOINT_REPO
62
  ```
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  Clone both in one run:
65
 
66
  ```bash
@@ -76,10 +198,21 @@ python scripts/hf_clone.py all \
76
  |- app.py
77
  |- lora_ui.py
78
  |- lora_train.py
 
 
 
 
79
  |- handler.py
80
  |- acestep/
81
  |- scripts/
82
  | |- hf_clone.py
 
 
 
 
 
 
 
83
  | |- endpoint/
84
  | | |- generate_interactive.py
85
  | | |- test.ps1
@@ -88,6 +221,12 @@ python scripts/hf_clone.py all \
88
  | | `- test_rnb_2min.bat
89
  | `- jobs/
90
  | `- submit_hf_lora_job.ps1
 
 
 
 
 
 
91
  |- docs/
92
  | |- deploy/
93
  | `- guides/
@@ -119,6 +258,102 @@ Optional sidecar metadata per track:
119
  }
120
  ```
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  ## Endpoint testing
123
 
124
  ```bash
@@ -139,12 +374,37 @@ Current baseline analysis and improvement ideas are tracked in:
139
  ## Docs
140
 
141
  - Space deployment: `docs/deploy/SPACE.md`
 
142
  - Endpoint deployment: `docs/deploy/ENDPOINT.md`
143
- - Additional guides: `docs/guides/qwen2-audio-train.md`
 
 
144
 
145
  ## Open-source readiness checklist
146
 
147
- - Secrets are env-driven (`HF_TOKEN`, `HF_ENDPOINT_URL`, `.env`).
148
  - Local artifacts are ignored via `.gitignore`.
149
  - MIT license included.
150
  - Reproducible clone/deploy paths documented.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  - LoRA training UI and workflow: `app.py`, `lora_ui.py`
23
  - CLI LoRA trainer for local/HF datasets: `lora_train.py`
24
+ - Qwen2-Audio captioning/annotation pipeline: `qwen_caption_app.py`, `qwen_audio_captioning.py`, `scripts/annotations/`
25
+ - Audio Flamingo 3 + ChatGPT cleanup pipeline: `af3_chatgpt_pipeline.py`, `scripts/pipeline/`, `services/pipeline_api.py`
26
+ - React orchestration UI for AF3+ChatGPT: `react-ui/`
27
  - Custom endpoint runtime: `handler.py`, `acestep/`
28
  - Bootstrap automation for cloning into your HF account: `scripts/hf_clone.py`
29
  - Endpoint test clients and HF job launcher: `scripts/endpoint/`, `scripts/jobs/`
 
38
 
39
  Open `http://localhost:7860`.
40
 
41
+ ## End-to-end setup (recommended)
42
+
43
+ Use this sequence when setting up from scratch.
44
+
45
+ 1. Install dependencies
46
+
47
+ ```bash
48
+ python -m pip install --upgrade pip
49
+ python -m pip install -r requirements.txt
50
+ ```
51
+
52
+ 2. Create local `.env` from `.env.example` and fill secrets
53
+
54
+ ```env
55
+ HF_TOKEN=hf_xxx
56
+ HF_AF3_ENDPOINT_URL=https://YOUR_AF3_ENDPOINT.endpoints.huggingface.cloud
57
+ OPENAI_API_KEY=sk-...
58
+ OPENAI_MODEL=gpt-5-mini
59
+ AF3_MODEL_ID=nvidia/audio-flamingo-3-hf
60
+ ```
61
+
62
+ 3. Bootstrap your Hugging Face repos (Space + endpoint templates)
63
+
64
+ ```bash
65
+ python scripts/hf_clone.py space --repo-id YOUR_USERNAME/YOUR_SPACE_NAME
66
+ python scripts/hf_clone.py af3-nvidia-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_NVIDIA_ENDPOINT_REPO
67
+ ```
68
+
69
+ 4. Deploy endpoint from the cloned AF3 NVIDIA endpoint repo
70
+
71
+ - Set endpoint task to `custom`.
72
+ - Confirm top-level `handler.py` exists in the endpoint repo.
73
+ - Set endpoint env vars if needed (`HF_TOKEN`, `AF3_NV_DEFAULT_MODE=think`).
74
+
75
+ 5. Generate analysis sidecars from audio
76
+
77
+ ```bash
78
+ python scripts/pipeline/run_af3_chatgpt_pipeline.py \
79
+ --dataset-dir ./train-dataset \
80
+ --backend hf_endpoint \
81
+ --endpoint-url "$HF_AF3_ENDPOINT_URL" \
82
+ --openai-api-key "$OPENAI_API_KEY"
83
+ ```
84
+
85
+ 6. Normalize existing JSONs into LoRA-ready shape (optional but recommended)
86
+
87
+ ```bash
88
+ python scripts/pipeline/refine_dataset_json_with_openai.py \
89
+ --dataset-dir ./train-dataset \
90
+ --enable-web-search
91
+ ```
92
+
93
+ This script keeps core fields needed by ACE-Step LoRA training and preserves rich analysis context in `source.rich_details`.
94
+
95
+ 7. Train LoRA
96
+
97
+ ```bash
98
+ python app.py
99
+ ```
100
+
101
+ Then in UI:
102
+ - Load model.
103
+ - Scan/upload dataset.
104
+ - Start LoRA training.
105
+
106
+ 8. Test generation with your new adapter
107
+
108
+ - Use the endpoint scripts in `scripts/endpoint/`.
109
+ - Or test through the Gradio UI flow.
110
+
111
+ ## AF3 GUI one-command startup
112
+
113
+ 1. Configure `.env` (never commit this file):
114
+
115
+ ```env
116
+ HF_TOKEN=hf_xxx
117
+ HF_AF3_ENDPOINT_URL=https://YOUR_AF3_ENDPOINT.endpoints.huggingface.cloud
118
+ OPENAI_API_KEY=sk-...
119
+ OPENAI_MODEL=gpt-5-mini
120
+ AF3_MODEL_ID=nvidia/audio-flamingo-3-hf
121
+ ```
122
+
123
+ 2. Launch API + GUI together:
124
+
125
+ ```bash
126
+ python af3_gui_app.py
127
+ ```
128
+
129
+ PowerShell alternative:
130
+
131
+ ```powershell
132
+ .\scripts\dev\run_af3_gui.ps1
133
+ ```
134
+
135
+ This command builds the React UI and serves it from the FastAPI backend.
136
+ Open `http://127.0.0.1:8008`.
137
+
138
  ## Clone to your HF account
139
 
140
  Use the two buttons near the top of this README to create target repos in your HF account, then run:
 
161
  python scripts/hf_clone.py endpoint --repo-id YOUR_USERNAME/YOUR_ENDPOINT_REPO
162
  ```
163
 
164
+ Clone a Qwen2-Audio caption endpoint repo:
165
+
166
+ ```bash
167
+ python scripts/hf_clone.py qwen-endpoint --repo-id YOUR_USERNAME/YOUR_QWEN_ENDPOINT_REPO
168
+ ```
169
+
170
+ Clone an Audio Flamingo 3 caption endpoint repo:
171
+
172
+ ```bash
173
+ python scripts/hf_clone.py af3-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_ENDPOINT_REPO
174
+ ```
175
+
176
+ When creating that endpoint, set task to `custom` so it loads the custom `handler.py`.
177
+
178
+ Clone an AF3 NVIDIA-stack endpoint repo (matches NVIDIA Space stack better):
179
+
180
+ ```bash
181
+ python scripts/hf_clone.py af3-nvidia-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_NVIDIA_ENDPOINT_REPO
182
+ ```
183
+
184
+ Use this path when you want think/long quality behavior closer to NVIDIA's public demo.
185
+
186
  Clone both in one run:
187
 
188
  ```bash
 
198
  |- app.py
199
  |- lora_ui.py
200
  |- lora_train.py
201
+ |- qwen_caption_app.py
202
+ |- qwen_audio_captioning.py
203
+ |- af3_chatgpt_pipeline.py
204
+ |- af3_gui_app.py
205
  |- handler.py
206
  |- acestep/
207
  |- scripts/
208
  | |- hf_clone.py
209
+ | |- dev/
210
+ | | |- run_af3_gui.py
211
+ | | `- run_af3_gui.ps1
212
+ | |- annotations/
213
+ | | `- qwen_caption_dataset.py
214
+ | |- pipeline/
215
+ | | `- run_af3_chatgpt_pipeline.py
216
  | |- endpoint/
217
  | | |- generate_interactive.py
218
  | | |- test.ps1
 
221
  | | `- test_rnb_2min.bat
222
  | `- jobs/
223
  | `- submit_hf_lora_job.ps1
224
+ | `- submit_hf_qwen_caption_job.ps1
225
+ |- services/
226
+ | `- pipeline_api.py
227
+ |- react-ui/
228
+ |- utils/
229
+ | `- env_config.py
230
  |- docs/
231
  | |- deploy/
232
  | `- guides/
 
258
  }
259
  ```
260
 
261
+ ## Qwen2-Audio annotation pipeline (music captioning)
262
+
263
+ Run the dedicated annotation UI:
264
+
265
+ ```bash
266
+ python qwen_caption_app.py
267
+ ```
268
+
269
+ Batch caption from CLI:
270
+
271
+ ```bash
272
+ python scripts/annotations/qwen_caption_dataset.py \
273
+ --dataset-dir ./dataset_inbox \
274
+ --backend local \
275
+ --model-id Qwen/Qwen2-Audio-7B-Instruct \
276
+ --output-dir ./qwen_annotations \
277
+ --copy-audio
278
+ ```
279
+
280
+ This also writes `.json` sidecars next to source audio by default for direct ACE-Step LoRA training.
281
+
282
+ Then train LoRA on the exported dataset:
283
+
284
+ ```bash
285
+ python lora_train.py --dataset-dir ./qwen_annotations/dataset --model-config acestep-v15-base
286
+ ```
287
+
288
+ ## Audio Flamingo 3 + ChatGPT pipeline (analysis -> normalized sidecar JSON)
289
+
290
+ This stack runs:
291
+
292
+ 1. Audio Flamingo 3 for raw music analysis prose.
293
+ 2. ChatGPT for cleanup/normalization into LoRA-ready fields.
294
+ 3. Sidecar JSON export next to each audio file (or in a custom output folder).
295
+
296
+ CLI single track:
297
+
298
+ ```bash
299
+ python scripts/pipeline/run_af3_chatgpt_pipeline.py \
300
+ --audio "./train-dataset/Andrew Spacey - Wonder (Prod Beat It AT).mp3" \
301
+ --backend hf_endpoint \
302
+ --endpoint-url "$HF_AF3_ENDPOINT_URL" \
303
+ --hf-token "$HF_TOKEN" \
304
+ --openai-api-key "$OPENAI_API_KEY" \
305
+ --artist-name "Andrew Spacey" \
306
+ --track-name "Wonder"
307
+ ```
308
+
309
+ CLI dataset batch:
310
+
311
+ ```bash
312
+ python scripts/pipeline/run_af3_chatgpt_pipeline.py \
313
+ --dataset-dir ./train-dataset \
314
+ --backend hf_endpoint \
315
+ --endpoint-url "$HF_AF3_ENDPOINT_URL" \
316
+ --openai-api-key "$OPENAI_API_KEY"
317
+ ```
318
+
319
+ Refine already-generated JSON files in place:
320
+
321
+ ```bash
322
+ python scripts/pipeline/refine_dataset_json_with_openai.py \
323
+ --dataset-dir ./train-dataset \
324
+ --enable-web-search
325
+ ```
326
+
327
+ Write refined files to a separate folder:
328
+
329
+ ```bash
330
+ python scripts/pipeline/refine_dataset_json_with_openai.py \
331
+ --dataset-dir ./train-dataset \
332
+ --recursive \
333
+ --enable-web-search \
334
+ --output-dir ./train-dataset-refined
335
+ ```
336
+
337
+ Single-command GUI (recommended):
338
+
339
+ ```bash
340
+ python af3_gui_app.py
341
+ ```
342
+
343
+ Manual API + React UI:
344
+
345
+ ```bash
346
+ uvicorn services.pipeline_api:app --host 0.0.0.0 --port 8008 --reload
347
+ ```
348
+
349
+ ```bash
350
+ cd react-ui
351
+ npm install
352
+ npm run dev
353
+ ```
354
+
355
+ Open `http://localhost:5173` (manual) or `http://127.0.0.1:8008` (single-command).
356
+
357
  ## Endpoint testing
358
 
359
  ```bash
 
374
  ## Docs
375
 
376
  - Space deployment: `docs/deploy/SPACE.md`
377
+ - Qwen caption Space deployment: `docs/deploy/QWEN_SPACE.md`
378
  - Endpoint deployment: `docs/deploy/ENDPOINT.md`
379
+ - AF3 endpoint deployment: `docs/deploy/AF3_ENDPOINT.md`
380
+ - AF3 NVIDIA-stack endpoint deployment: `docs/deploy/AF3_NVIDIA_ENDPOINT.md`
381
+ - Additional guides: `docs/guides/qwen2-audio-train.md`, `docs/guides/af3-chatgpt-pipeline.md`
382
 
383
  ## Open-source readiness checklist
384
 
385
+ - Secrets are env-driven (`HF_TOKEN`, `HF_AF3_ENDPOINT_URL`, `OPENAI_API_KEY`, `.env`).
386
  - Local artifacts are ignored via `.gitignore`.
387
  - MIT license included.
388
  - Reproducible clone/deploy paths documented.
389
+ - `.env` is git-ignored; keep real credentials only in local `.env`.
390
+
391
+ ## GitHub publish flow
392
+
393
+ 1. Check status
394
+
395
+ ```bash
396
+ git status
397
+ ```
398
+
399
+ 2. Stage and commit
400
+
401
+ ```bash
402
+ git add .
403
+ git commit -m "Consolidate AF3/Qwen pipelines, endpoint templates, and docs"
404
+ ```
405
+
406
+ 3. Push to GitHub remote
407
+
408
+ ```bash
409
+ git push github main
410
+ ```
af3_chatgpt_pipeline.py ADDED
@@ -0,0 +1,584 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio Flamingo 3 -> ChatGPT cleanup pipeline for Ace Step 1.5 LoRA metadata.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import base64
8
+ import io
9
+ import json
10
+ import os
11
+ import urllib.request
12
+ from dataclasses import dataclass
13
+ from datetime import datetime, timezone
14
+ from pathlib import Path
15
+ from typing import Any, ClassVar, Dict, List, Optional
16
+
17
+ import soundfile as sf
18
+
19
+ from qwen_audio_captioning import AUDIO_EXTENSIONS, load_audio_mono
20
+
21
+
22
# Default HF model id for the converted Audio Flamingo 3 checkpoint.
DEFAULT_AF3_MODEL_ID = "nvidia/audio-flamingo-3-hf"
# Standard (concise) analysis prompt sent to AF3.
DEFAULT_AF3_PROMPT = (
    "Analyze this full song and provide concise, timestamped sections describing vocals, "
    "instrumentation, production effects, mix changes, energy flow, and genre cues. End with "
    "a short overall summary."
)
# Denser "think/long" prompt variant requesting full-duration, many-section coverage.
DEFAULT_AF3_PROMPT_THINK_LONG = (
    "Analyze the entire song from start to finish and produce a detailed, timestamped breakdown. "
    "Cover the full duration with many sections, describing vocals, instrumentation, effects, mix, "
    "arrangement, and energy transitions. Include notable moments and end with a concise overall summary."
)
# Default OpenAI model used for the cleanup/normalization step.
DEFAULT_OPENAI_MODEL = "gpt-5-mini"


# JSON Schema enforced on the ChatGPT cleanup output (used with strict
# structured output). All top-level keys are required and unknown keys are
# rejected via additionalProperties, so the cleanup response always has the
# exact shape expected by build_lora_sidecar.
LUNA_OUTPUT_SCHEMA: Dict[str, Any] = {
    "type": "object",
    "properties": {
        "caption": {"type": "string"},
        "lyrics": {"type": "string"},
        # bpm may legitimately be unknown, hence nullable.
        "bpm": {"type": ["integer", "null"]},
        "keyscale": {"type": "string"},
        "timesignature": {"type": "string"},
        "vocal_language": {"type": "string"},
        "duration": {"type": "number"},
        "analysis_short": {"type": "string"},
        "analysis_long": {"type": "string"},
        # Timestamped per-section breakdown of the track.
        "sections": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "start_sec": {"type": "number"},
                    "end_sec": {"type": "number"},
                    "summary": {"type": "string"},
                    "vocal_notes": {"type": "string"},
                    "instrument_notes": {"type": "string"},
                    "effects": {"type": "array", "items": {"type": "string"}},
                    "mix_notes": {"type": "array", "items": {"type": "string"}},
                },
                "required": [
                    "start_sec",
                    "end_sec",
                    "summary",
                    "vocal_notes",
                    "instrument_notes",
                    "effects",
                    "mix_notes",
                ],
                "additionalProperties": False,
            },
        },
        "tags": {"type": "array", "items": {"type": "string"}},
    },
    "required": [
        "caption",
        "lyrics",
        "bpm",
        "keyscale",
        "timesignature",
        "vocal_language",
        "duration",
        "analysis_short",
        "analysis_long",
        "sections",
        "tags",
    ],
    "additionalProperties": False,
}
90
+
91
+
92
+ def _extract_json_object(text: str) -> Dict[str, Any]:
93
+ text = (text or "").strip()
94
+ if not text:
95
+ raise ValueError("Empty model output")
96
+ try:
97
+ data = json.loads(text)
98
+ if isinstance(data, dict):
99
+ return data
100
+ except Exception:
101
+ pass
102
+
103
+ start = text.find("{")
104
+ if start < 0:
105
+ raise ValueError("No JSON object found in model output")
106
+ depth = 0
107
+ for i in range(start, len(text)):
108
+ ch = text[i]
109
+ if ch == "{":
110
+ depth += 1
111
+ elif ch == "}":
112
+ depth -= 1
113
+ if depth == 0:
114
+ candidate = text[start : i + 1]
115
+ data = json.loads(candidate)
116
+ if isinstance(data, dict):
117
+ return data
118
+ break
119
+ raise ValueError("Failed to parse JSON object from model output")
120
+
121
+
122
+ def _ensure_str(value: Any, default: str = "") -> str:
123
+ if value is None:
124
+ return default
125
+ return str(value).strip()
126
+
127
+
128
+ def _ensure_float(value: Any, default: float = 0.0) -> float:
129
+ try:
130
+ return float(value)
131
+ except Exception:
132
+ return default
133
+
134
+
135
+ def _ensure_int_or_none(value: Any) -> Optional[int]:
136
+ if value is None:
137
+ return None
138
+ try:
139
+ iv = int(float(value))
140
+ except Exception:
141
+ return None
142
+ if iv <= 0:
143
+ return None
144
+ return iv
145
+
146
+
147
+ def _ensure_str_list(value: Any) -> List[str]:
148
+ if value is None:
149
+ return []
150
+ if isinstance(value, str):
151
+ s = value.strip()
152
+ return [s] if s else []
153
+ if not isinstance(value, list):
154
+ return []
155
+ out: List[str] = []
156
+ seen = set()
157
+ for item in value:
158
+ s = _ensure_str(item)
159
+ if not s:
160
+ continue
161
+ k = s.lower()
162
+ if k in seen:
163
+ continue
164
+ seen.add(k)
165
+ out.append(s)
166
+ return out
167
+
168
+
169
def _normalize_sections(sections: Any, duration: float) -> List[Dict[str, Any]]:
    """Sanitize model-produced section entries.

    Skips non-dict entries, coerces all fields, ensures end >= start, and —
    when a positive *duration* is known — clamps both timestamps into
    [0, duration]. Each surviving section gains an "index" field.
    """
    if not isinstance(sections, list):
        return []
    normalized: List[Dict[str, Any]] = []
    for index, raw in enumerate(sections):
        if not isinstance(raw, dict):
            continue
        begin = _ensure_float(raw.get("start_sec"), default=0.0)
        finish = _ensure_float(raw.get("end_sec"), default=begin)
        if finish < begin:
            finish = begin
        if duration > 0:
            begin = max(0.0, min(begin, duration))
            finish = max(begin, min(finish, duration))
        entry = {
            "index": index,
            "start_sec": round(begin, 3),
            "end_sec": round(finish, 3),
            "summary": _ensure_str(raw.get("summary")),
            "vocal_notes": _ensure_str(raw.get("vocal_notes")),
            "instrument_notes": _ensure_str(raw.get("instrument_notes")),
            "effects": _ensure_str_list(raw.get("effects")),
            "mix_notes": _ensure_str_list(raw.get("mix_notes")),
        }
        normalized.append(entry)
    return normalized
196
+
197
+
198
def _audio_to_wav_base64(audio_path: str, sample_rate: int = 16000) -> str:
    """Load *audio_path* as mono at *sample_rate* and return base64-encoded WAV bytes."""
    samples, rate = load_audio_mono(audio_path, target_sr=sample_rate)
    buffer = io.BytesIO()
    sf.write(buffer, samples, rate, format="WAV")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
203
+
204
+
205
@dataclass
class AF3EndpointClient:
    """AF3 analysis backend that POSTs base64 WAV audio to a custom HF Inference Endpoint."""

    # Backend label recorded in sidecar pipeline metadata.
    backend_name: ClassVar[str] = "hf_endpoint"
    # Full endpoint URL, e.g. https://xxx.endpoints.huggingface.cloud
    endpoint_url: str
    # HF bearer token; the Authorization header is omitted when falsy.
    token: str
    model_id: str = DEFAULT_AF3_MODEL_ID
    timeout_seconds: int = 300

    def analyze(
        self,
        audio_path: str,
        prompt: str,
        max_new_tokens: int = 1024,
        temperature: float = 0.1,
    ) -> str:
        """Send the audio plus *prompt* to the endpoint and return the generated text.

        The audio is resampled to 16 kHz mono WAV and inlined as base64 in the
        JSON payload (matches the custom handler.py templates in this repo).
        Raises urllib errors on network/HTTP failure and json.JSONDecodeError
        if the endpoint returns non-JSON.
        """
        audio_b64 = _audio_to_wav_base64(audio_path, sample_rate=16000)
        payload = {
            "inputs": {
                "prompt": prompt,
                "audio_base64": audio_b64,
                "sample_rate": 16000,
                "max_new_tokens": int(max_new_tokens),
                "temperature": float(temperature),
                "model_id": self.model_id,
            }
        }
        req = urllib.request.Request(
            self.endpoint_url,
            data=json.dumps(payload).encode("utf-8"),
            method="POST",
            headers={
                "Content-Type": "application/json",
                # Only attach auth when a token was provided.
                **({"Authorization": f"Bearer {self.token}"} if self.token else {}),
            },
        )
        with urllib.request.urlopen(req, timeout=self.timeout_seconds) as resp:
            raw = resp.read().decode("utf-8")
        data = json.loads(raw)
        # Endpoint handlers vary: accept {"generated_text": ...}, {"text": ...},
        # or a list whose first element uses either key.
        if isinstance(data, dict) and isinstance(data.get("generated_text"), str):
            return data["generated_text"].strip()
        if isinstance(data, dict) and isinstance(data.get("text"), str):
            return data["text"].strip()
        if isinstance(data, list) and data:
            first = data[0]
            if isinstance(first, dict) and isinstance(first.get("generated_text"), str):
                return first["generated_text"].strip()
            if isinstance(first, dict) and isinstance(first.get("text"), str):
                return first["text"].strip()
        # Last resort: stringify whatever came back.
        return str(data).strip()
254
+
255
+
256
@dataclass
class AF3LocalClient:
    """AF3 analysis backend that runs the model locally via transformers.

    The model/processor are loaded lazily on first analyze() call so that
    constructing the client is cheap.
    """

    # Backend label recorded in sidecar pipeline metadata.
    backend_name: ClassVar[str] = "local"
    model_id: str = DEFAULT_AF3_MODEL_ID
    # "auto" -> device_map="auto"; any other value -> explicit .to(device).
    device: str = "auto"
    # One of "auto" | "bfloat16" | "float16" | anything-else (-> float32).
    torch_dtype: str = "auto"
    trust_remote_code: bool = True

    def __post_init__(self):
        # Lazy-loaded in _load(); None until first use.
        self._processor = None
        self._model = None

    def _load(self):
        """Load processor + model once; no-op on subsequent calls."""
        if self._model is not None and self._processor is not None:
            return
        import torch

        # Prefer the dedicated AF3 class (newer transformers); fall back to the
        # generic image-text-to-text auto class for older releases.
        try:
            from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
            model_cls = AudioFlamingo3ForConditionalGeneration
        except Exception as exc:
            try:
                from transformers import AutoModelForImageTextToText, AutoProcessor
                model_cls = AutoModelForImageTextToText
            except Exception:
                raise RuntimeError(
                    "Audio Flamingo 3 classes are unavailable. Install transformers>=4.57.0."
                ) from exc

        # "auto" picks fp16 on CUDA, fp32 on CPU (fp16 is unreliable on CPU).
        if self.torch_dtype == "auto":
            dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        elif self.torch_dtype == "bfloat16":
            dtype = torch.bfloat16
        elif self.torch_dtype == "float16":
            dtype = torch.float16
        else:
            dtype = torch.float32

        self._processor = AutoProcessor.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
        self._model = model_cls.from_pretrained(
            self.model_id,
            torch_dtype=dtype,
            device_map="auto" if self.device == "auto" else None,
            trust_remote_code=self.trust_remote_code,
        )
        if self.device != "auto":
            self._model.to(self.device)

    def analyze(
        self,
        audio_path: str,
        prompt: str,
        max_new_tokens: int = 1024,
        temperature: float = 0.1,
    ) -> str:
        """Run AF3 on *audio_path* with *prompt* and return the decoded completion.

        temperature <= 0 selects greedy decoding (do_sample=False).
        """
        self._load()
        import torch

        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "audio", "path": audio_path},
                ],
            }
        ]
        inputs = self._processor.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
        )
        # Move every tensor-like input onto the model's device.
        device = next(self._model.parameters()).device
        for k, v in list(inputs.items()):
            if hasattr(v, "to"):
                inputs[k] = v.to(device)

        gen_kwargs = {
            "max_new_tokens": int(max_new_tokens),
            "do_sample": bool(temperature > 0),
        }
        if temperature > 0:
            # Guard against temperature=0 slipping through as a sampling temp.
            gen_kwargs["temperature"] = max(temperature, 1e-5)
        with torch.no_grad():
            outputs = self._model.generate(**inputs, **gen_kwargs)
        # Strip the prompt tokens so only newly generated text is decoded.
        start = int(inputs["input_ids"].shape[1])
        text = self._processor.batch_decode(outputs[:, start:], skip_special_tokens=True)[0].strip()
        if not text:
            # Fallback: decode the full sequence if slicing produced nothing.
            text = self._processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        return text
347
+
348
+
349
def cleanup_with_chatgpt(
    af3_text: str,
    *,
    openai_api_key: str,
    model: str = DEFAULT_OPENAI_MODEL,
    duration: float = 0.0,
    user_context: str = "",
    artist_name: str = "",
    track_name: str = "",
    enable_web_search: bool = False,
) -> Dict[str, Any]:
    """Normalize raw AF3 analysis prose into LUNA_OUTPUT_SCHEMA-shaped metadata.

    Prefers the OpenAI Responses API with strict structured output; falls back
    to chat.completions (json_schema, then json_object) on older SDKs.
    enable_web_search only works on the Responses path.

    Returns the parsed JSON dict.
    Raises ValueError (missing key / unparseable output) or RuntimeError
    (openai package missing, or web search requested without Responses API).
    """
    if not openai_api_key:
        raise ValueError("openai_api_key is required for ChatGPT cleanup.")
    try:
        from openai import OpenAI
    except Exception as exc:
        raise RuntimeError("openai package is not installed. Add `openai` to dependencies.") from exc

    client = OpenAI(api_key=openai_api_key)

    system = (
        "You transform raw audio-analysis prose into high-quality LoRA training metadata for Ace Step 1.5. "
        "Always return compact, truthful JSON. Never invent precise music facts not supported by input."
    )
    user = (
        f"Raw AF3 analysis:\n{af3_text}\n\n"
        f"Track duration seconds: {duration}\n"
        f"Artist (optional): {artist_name or 'unknown'}\n"
        f"Track name (optional): {track_name or 'unknown'}\n"
        f"User context (optional): {user_context or 'none'}\n\n"
        "Return output matching the JSON schema exactly. "
        "Keep caption concise and useful for LoRA conditioning."
    )
    # Preferred path: Responses API with strict structured output.
    if hasattr(client, "responses"):
        req_kwargs: Dict[str, Any] = {
            "model": model,
            "input": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "text": {
                "format": {
                    "type": "json_schema",
                    "name": "ace_step_luna_metadata",
                    "schema": LUNA_OUTPUT_SCHEMA,
                    "strict": True,
                }
            },
        }
        if enable_web_search:
            req_kwargs["tools"] = [{"type": "web_search"}]

        try:
            response = client.responses.create(**req_kwargs)
        except Exception:
            if enable_web_search:
                # Fallback for SDK/runtime variants that still use the preview tool id.
                req_kwargs["tools"] = [{"type": "web_search_preview"}]
                response = client.responses.create(**req_kwargs)
            else:
                raise

        output_text = getattr(response, "output_text", "") or ""
        # Some SDK versions don't populate output_text; walk the raw output items.
        if not output_text and hasattr(response, "output"):
            chunks: List[str] = []
            for item in getattr(response, "output", []):
                for content in getattr(item, "content", []):
                    text_val = getattr(content, "text", None)
                    if text_val:
                        chunks.append(str(text_val))
            output_text = "\n".join(chunks).strip()
    else:
        # Legacy path: chat.completions only (no web-search tool support).
        if enable_web_search:
            raise RuntimeError(
                "enable_web_search requires an OpenAI SDK/runtime with Responses API support. "
                "Upgrade openai package to a recent version."
            )
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                response_format={
                    "type": "json_schema",
                    "json_schema": {
                        "name": "ace_step_luna_metadata",
                        "schema": LUNA_OUTPUT_SCHEMA,
                        "strict": True,
                    },
                },
            )
        except Exception:
            # Final fallback: plain json_object mode with the key list spelled
            # out in the prompt instead of a schema.
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system},
                    {
                        "role": "user",
                        "content": (
                            user
                            + "\n\nReturn valid JSON with keys exactly matching this set: "
                            "caption, lyrics, bpm, keyscale, timesignature, vocal_language, "
                            "duration, analysis_short, analysis_long, sections, tags."
                        ),
                    },
                ],
                response_format={"type": "json_object"},
            )
        output_text = ""
        if getattr(response, "choices", None):
            message = response.choices[0].message
            output_text = getattr(message, "content", "") or ""

    cleaned = _extract_json_object(output_text)
    return cleaned
466
+
467
+
468
def build_lora_sidecar(
    cleaned: Dict[str, Any],
    *,
    af3_text: str,
    af3_prompt: str,
    af3_backend: str,
    af3_model_id: str,
    source_audio: str,
    duration: float,
    chatgpt_model: str,
    web_search_used: bool,
) -> Dict[str, Any]:
    """Assemble the final LoRA sidecar dict.

    Coerces every cleaned-output field to a safe value (with sensible
    defaults), clamps section timestamps, and attaches full pipeline
    provenance (prompt, backend, raw AF3 text, models, UTC timestamp).
    """
    caption_text = _ensure_str(cleaned.get("caption"), "music track with evolving arrangement")
    lyric_text = _ensure_str(cleaned.get("lyrics"), "")
    tempo = _ensure_int_or_none(cleaned.get("bpm"))
    key_text = _ensure_str(cleaned.get("keyscale"), "")
    meter = _ensure_str(cleaned.get("timesignature"), "4/4") or "4/4"
    language = _ensure_str(cleaned.get("vocal_language"), "unknown") or "unknown"
    # Prefer the model-reported duration; fall back to the measured one.
    length_sec = _ensure_float(cleaned.get("duration"), duration)
    short_summary = _ensure_str(cleaned.get("analysis_short"), caption_text)
    long_summary = _ensure_str(cleaned.get("analysis_long"), af3_text)
    timeline = _normalize_sections(cleaned.get("sections"), duration=length_sec)
    tag_list = _ensure_str_list(cleaned.get("tags"))

    return {
        "caption": caption_text,
        "lyrics": lyric_text,
        "bpm": tempo,
        "keyscale": key_text,
        "timesignature": meter,
        "vocal_language": language,
        "duration": round(length_sec, 3),
        "analysis_short": short_summary,
        "analysis_long": long_summary,
        "source_audio": source_audio,
        "annotation_version": "af3_chatgpt_luna_v1",
        "music_analysis": {
            "timeline": timeline,
            "tags": tag_list,
            "summary_long": long_summary,
            "segment_count": len(timeline),
        },
        "pipeline": {
            "af3_prompt": af3_prompt,
            "af3_backend": af3_backend,
            "af3_model_id": af3_model_id,
            "af3_raw_analysis": af3_text,
            "chatgpt_model": chatgpt_model,
            "chatgpt_web_search_used": bool(web_search_used),
            "generated_at": datetime.now(timezone.utc).isoformat(),
        },
    }
521
+
522
+
523
def run_af3_chatgpt_pipeline(
    *,
    audio_path: str,
    af3_client: Any,
    af3_prompt: str = DEFAULT_AF3_PROMPT,
    af3_max_new_tokens: int = 1400,
    af3_temperature: float = 0.1,
    openai_api_key: str = "",
    openai_model: str = DEFAULT_OPENAI_MODEL,
    user_context: str = "",
    artist_name: str = "",
    track_name: str = "",
    enable_web_search: bool = False,
) -> Dict[str, Any]:
    """Run the full AF3 -> ChatGPT pipeline for one audio file.

    Steps: validate the file, measure duration (16 kHz mono load), get raw AF3
    analysis from *af3_client* (endpoint or local backend — anything with an
    analyze() method), clean it with ChatGPT, then build the sidecar dict.

    Returns a dict with keys "af3_analysis", "cleaned", "sidecar".
    Raises FileNotFoundError / ValueError on bad input paths, plus whatever
    the backend or cleanup step raises.
    """
    audio_path = str(Path(audio_path))
    if not Path(audio_path).is_file():
        raise FileNotFoundError(f"Audio file not found: {audio_path}")
    if Path(audio_path).suffix.lower() not in AUDIO_EXTENSIONS:
        raise ValueError(f"Unsupported audio extension: {Path(audio_path).suffix}")

    # Duration measured from the decoded samples (used to clamp sections).
    audio, sr = load_audio_mono(audio_path, target_sr=16000)
    duration = (float(audio.shape[0]) / float(sr)) if sr > 0 else 0.0

    af3_text = af3_client.analyze(
        audio_path=audio_path,
        prompt=af3_prompt,
        max_new_tokens=af3_max_new_tokens,
        temperature=af3_temperature,
    )
    cleaned = cleanup_with_chatgpt(
        af3_text,
        openai_api_key=openai_api_key,
        model=openai_model,
        duration=duration,
        user_context=user_context,
        artist_name=artist_name,
        track_name=track_name,
        enable_web_search=enable_web_search,
    )
    sidecar = build_lora_sidecar(
        cleaned,
        af3_text=af3_text,
        af3_prompt=af3_prompt,
        # Fall back to the class name if the client doesn't declare a backend.
        af3_backend=getattr(af3_client, "backend_name", type(af3_client).__name__),
        af3_model_id=getattr(af3_client, "model_id", DEFAULT_AF3_MODEL_ID),
        source_audio=audio_path,
        duration=duration,
        chatgpt_model=openai_model,
        web_search_used=enable_web_search,
    )
    return {
        "af3_analysis": af3_text,
        "cleaned": cleaned,
        "sidecar": sidecar,
    }
578
+
579
+
580
def save_sidecar(sidecar: Dict[str, Any], output_json: str) -> str:
    """Write *sidecar* as pretty UTF-8 JSON to *output_json*, creating parent dirs.

    Returns the written path as a string.
    """
    target = Path(output_json)
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(sidecar, indent=2, ensure_ascii=False)
    target.write_text(payload, encoding="utf-8")
    return str(target)
af3_gui_app.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python
"""Convenience entrypoint for AF3 GUI stack.

Delegates to the real launcher in ``scripts/dev/run_af3_gui.py`` after
making that directory importable.
"""

from __future__ import annotations

import sys
from pathlib import Path

# Repo root, derived from this file's on-disk location.
ROOT = Path(__file__).resolve().parent
# Directory holding the actual launcher module (run_af3_gui.py).
DEV_SCRIPTS = ROOT / "scripts" / "dev"
if str(DEV_SCRIPTS) not in sys.path:
    # Prepend so the launcher wins over any same-named module elsewhere.
    sys.path.insert(0, str(DEV_SCRIPTS))

# NOTE: must stay below the sys.path mutation above or the import fails.
from run_af3_gui import main

if __name__ == "__main__":
    # Propagate the launcher's exit code to the shell.
    raise SystemExit(main())
docs/deploy/AF3_ENDPOINT.md ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Deploy Audio Flamingo 3 Caption Endpoint (Dedicated Endpoint)
2
+
3
+ Note: this guide is for the HF-converted `audio-flamingo-3-hf` runtime path.
4
+ For NVIDIA Space stack parity (`llava` + `stage35` think adapter), use:
5
+ `docs/deploy/AF3_NVIDIA_ENDPOINT.md`.
6
+
7
+ ## 1) Create endpoint runtime repo
8
+
9
+ ```bash
10
+ python scripts/hf_clone.py af3-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_ENDPOINT_REPO
11
+ ```
12
+
13
+ This pushes:
14
+
15
+ - `handler.py`
16
+ - `requirements.txt`
17
+ - `README.md`
18
+
19
+ from `templates/hf-af3-caption-endpoint/`.
20
+
21
+ ## 2) Create endpoint from that model repo
22
+
23
+ In Hugging Face Endpoints:
24
+
25
+ 1. Create endpoint from `YOUR_USERNAME/YOUR_AF3_ENDPOINT_REPO`.
26
+ 2. Choose a GPU instance.
27
+ 3. Set task to `custom`.
28
+ 4. Set env vars:
29
+ - `AF3_MODEL_ID=nvidia/audio-flamingo-3-hf`
30
+ - `AF3_BOOTSTRAP_RUNTIME=1`
31
+ - `AF3_TRANSFORMERS_SPEC=transformers==5.1.0`
32
+
33
+ ## 3) Validate startup
34
+
35
+ If logs contain:
36
+
37
+ - `No custom pipeline found at /repository/handler.py`
38
+
39
+ then `handler.py` is not in repo root. Re-upload the runtime template files.
40
+
41
+ If logs contain:
42
+
43
+ - `Failed to load AF3 processor classes after runtime bootstrap`
44
+
45
+ keep the endpoint task set to `custom`, then verify that startup was able to install the runtime dependencies (network access and disk space are required). The first cold start can take several minutes.
46
+
47
+ ## 4) Connect from local pipeline
48
+
49
+ Set:
50
+
51
+ - `HF_AF3_ENDPOINT_URL`
52
+ - `HF_TOKEN`
53
+ - `OPENAI_API_KEY`
54
+
55
+ Recommended local `.env`:
56
+
57
+ ```env
58
+ HF_AF3_ENDPOINT_URL=https://YOUR_ENDPOINT_ID.us-east-1.aws.endpoints.huggingface.cloud
59
+ HF_TOKEN=hf_xxx
60
+ OPENAI_API_KEY=sk-...
61
+ ```
62
+
63
+ `.env` is git-ignored in this repo. Do not commit real credentials.
64
+
65
+ Then run:
66
+
67
+ ```bash
68
+ python scripts/pipeline/run_af3_chatgpt_pipeline.py \
69
+ --audio ./train-dataset/track.mp3 \
70
+ --backend hf_endpoint \
71
+ --endpoint-url "$HF_AF3_ENDPOINT_URL" \
72
+ --openai-api-key "$OPENAI_API_KEY"
73
+ ```
74
+
75
+ Or launch full GUI stack:
76
+
77
+ ```bash
78
+ python af3_gui_app.py
79
+ ```
docs/deploy/AF3_NVIDIA_ENDPOINT.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Deploy AF3 NVIDIA-Stack Endpoint (Space-Parity Runtime)
2
+
3
+ This path uses NVIDIA's `llava` stack + `stage35` think adapter, which matches the quality profile of:
4
+ - `https://huggingface.co/spaces/nvidia/audio-flamingo-3`
5
+
6
+ ## 1) Create endpoint runtime repo
7
+
8
+ ```bash
9
+ python scripts/hf_clone.py af3-nvidia-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_NVIDIA_ENDPOINT_REPO
10
+ ```
11
+
12
+ This pushes:
13
+ - `handler.py`
14
+ - `requirements.txt`
15
+ - `README.md`
16
+
17
+ from `templates/hf-af3-nvidia-endpoint/`.
18
+
19
+ ## 2) Create Dedicated Endpoint
20
+
21
+ 1. Create endpoint from `YOUR_USERNAME/YOUR_AF3_NVIDIA_ENDPOINT_REPO`.
22
+ 2. Set task to `custom`.
23
+ 3. Use a GPU instance.
24
+ 4. Add secret:
25
+ - `HF_TOKEN=hf_xxx`
26
+
27
+ ## 3) Recommended endpoint env vars
28
+
29
+ - `AF3_NV_DEFAULT_MODE=think`
30
+ - `AF3_NV_LOAD_THINK=1`
31
+ - `AF3_NV_LOAD_SINGLE=0`
32
+ - `AF3_NV_CODE_REPO_ID=nvidia/audio-flamingo-3`
33
+ - `AF3_NV_MODEL_REPO_ID=nvidia/audio-flamingo-3`
34
+
35
+ ## 4) Request shape from local scripts
36
+
37
+ Current scripts send:
38
+
39
+ ```json
40
+ {
41
+ "inputs": {
42
+ "prompt": "...",
43
+ "audio_base64": "...",
44
+ "max_new_tokens": 3200,
45
+ "temperature": 0.2
46
+ }
47
+ }
48
+ ```
49
+
50
+ Optional extra flag for this endpoint:
51
+
52
+ ```json
53
+ {
54
+ "inputs": {
55
+ "think_mode": true
56
+ }
57
+ }
58
+ ```
59
+
60
+ ## 5) Notes
61
+
62
+ - First boot is slow because runtime deps + model artifacts must load.
63
+ - Keep at least one warm replica if you want consistent latency.
64
+ - This runtime is heavier than the HF-converted `audio-flamingo-3-hf` endpoint path.
docs/deploy/QWEN_SPACE.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Deploy Qwen Captioning UI To HF Space
2
+
3
+ This deploys the music-captioning app (`qwen_caption_app.py`) as its own Space.
4
+
5
+ ## Prerequisites
6
+
7
+ - Hugging Face account
8
+ - `HF_TOKEN` with write access
9
+
10
+ ## Steps
11
+
12
+ 1. Create a new Hugging Face Space (SDK: `Gradio`).
13
+ 2. Push this repo content to that Space.
14
+ 3. In Space `README.md` front matter, set:
15
+ - `sdk: gradio`
16
+ - `app_file: qwen_caption_app.py`
17
+ 4. Pick GPU hardware (A10G or better recommended for local backend).
18
+ 5. Optional secrets/env:
19
+ - `HF_TOKEN` (if accessing private datasets or endpoint backend)
20
+
21
+ ## Runtime notes
22
+
23
+ - `local` backend loads `Qwen/Qwen2-Audio-7B-Instruct` in the Space runtime.
24
+ - `hf_endpoint` backend can call a dedicated endpoint URL instead.
25
+ - Export defaults to `/data/qwen_annotations` on Spaces when persistent storage is enabled.
26
+
docs/deploy/SPACE.md CHANGED
@@ -1,6 +1,7 @@
1
  # Deploy LoRA Studio To Your Own HF Space
2
 
3
  This guide deploys the full LoRA Studio UI to your own Hugging Face Space.
 
4
 
5
  ## Prerequisites
6
 
@@ -37,4 +38,3 @@ python scripts/hf_clone.py space --repo-id YOUR_USERNAME/YOUR_SPACE_NAME --priva
37
  - Space output defaults to `/data/lora_output` on Hugging Face Spaces.
38
  - Enable persistent storage if you need checkpoint retention across restarts.
39
  - For long-running non-interactive training, HF Jobs may be more cost-efficient than keeping a Space running.
40
-
 
1
  # Deploy LoRA Studio To Your Own HF Space
2
 
3
  This guide deploys the full LoRA Studio UI to your own Hugging Face Space.
4
+ For the dedicated Qwen captioning UI, see `docs/deploy/QWEN_SPACE.md`.
5
 
6
  ## Prerequisites
7
 
 
38
  - Space output defaults to `/data/lora_output` on Hugging Face Spaces.
39
  - Enable persistent storage if you need checkpoint retention across restarts.
40
  - For long-running non-interactive training, HF Jobs may be more cost-efficient than keeping a Space running.
 
docs/guides/README.md CHANGED
@@ -3,3 +3,4 @@
3
  Additional step-by-step guides that are useful but not required for the core LoRA Studio flow.
4
 
5
  - `qwen2-audio-train.md`
 
 
3
  Additional step-by-step guides that are useful but not required for the core LoRA Studio flow.
4
 
5
  - `qwen2-audio-train.md`
6
+ - `af3-chatgpt-pipeline.md`
docs/guides/af3-chatgpt-pipeline.md ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audio Flamingo 3 + ChatGPT Pipeline (Local Orchestration)
2
+
3
+ This guide sets up a cloud-first annotation workflow:
4
+
5
+ 1. **Audio Flamingo 3** generates raw audio analysis text.
6
+ 2. **ChatGPT** cleans and structures that output into Ace Step 1.5 LoRA sidecar JSON.
7
+ 3. Optional human edits are applied before LoRA training.
8
+
9
+ ## Endpoint vs Space
10
+
11
+ For 100+ tracks, use an **HF Dedicated Endpoint** for AF3 inference.
12
+
13
+ - Endpoint: production API, autoscaling options, stable URL, easier local integration.
14
+ - Space: better for interactive demos/tools, less ideal for bulk API workloads.
15
+
16
+ Use a Space only if you want a hosted UI. Keep heavy batch inference on Endpoint.
17
+
18
+ ## Files in this repo
19
+
20
+ - Pipeline core: `af3_chatgpt_pipeline.py`
21
+ - Batch CLI: `scripts/pipeline/run_af3_chatgpt_pipeline.py`
22
+ - Local API: `services/pipeline_api.py`
23
+ - React UI: `react-ui/`
24
+ - AF3 endpoint template: `templates/hf-af3-caption-endpoint/`
25
+
26
+ ## 1) Deploy AF3 endpoint
27
+
28
+ Create/push endpoint runtime repo:
29
+
30
+ ```bash
31
+ python scripts/hf_clone.py af3-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_ENDPOINT_REPO
32
+ ```
33
+
34
+ If you want NVIDIA Space parity (llava + stage35 think adapter), use:
35
+
36
+ ```bash
37
+ python scripts/hf_clone.py af3-nvidia-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_NVIDIA_ENDPOINT_REPO
38
+ ```
39
+
40
+ Then create a Hugging Face Dedicated Endpoint from that model repo.
41
+
42
+ If startup logs show:
43
+
44
+ - `No custom pipeline found at /repository/handler.py`
45
+
46
+ your repo root is missing `handler.py` (copy from `templates/hf-af3-caption-endpoint/handler.py`).
47
+
48
+ ## 2) Configure env
49
+
50
+ Set values in `.env` (or shell env vars):
51
+
52
+ ```env
53
+ HF_TOKEN=hf_xxx
54
+ HF_AF3_ENDPOINT_URL=https://bc3r76slij67lskb.us-east-1.aws.endpoints.huggingface.cloud
55
+ AF3_MODEL_ID=nvidia/audio-flamingo-3-hf
56
+ OPENAI_API_KEY=sk-...
57
+ OPENAI_MODEL=gpt-5-mini
58
+ ```
59
+
60
+ `.env` is git-ignored by default. Keep all real secrets in local `.env` only.
61
+
62
+ ## 3) Run one track from CLI
63
+
64
+ ```bash
65
+ python scripts/pipeline/run_af3_chatgpt_pipeline.py \
66
+ --audio "E:/Coding/hf-music-gen/train-dataset/Andrew Spacey - Wonder (Prod Beat It AT).mp3" \
67
+ --backend hf_endpoint \
68
+ --endpoint-url "$HF_AF3_ENDPOINT_URL" \
69
+ --hf-token "$HF_TOKEN" \
70
+ --openai-api-key "$OPENAI_API_KEY" \
71
+ --artist-name "Andrew Spacey" \
72
+ --track-name "Wonder"
73
+ ```
74
+
75
+ Default behavior writes JSON next to the audio file (`same_stem.json`).
76
+
77
+ ## 4) Batch all tracks
78
+
79
+ ```bash
80
+ python scripts/pipeline/run_af3_chatgpt_pipeline.py \
81
+ --dataset-dir ./train-dataset \
82
+ --backend hf_endpoint \
83
+ --endpoint-url "$HF_AF3_ENDPOINT_URL" \
84
+ --openai-api-key "$OPENAI_API_KEY" \
85
+ --enable-web-search
86
+ ```
87
+
88
+ Use `--output-dir` if you want sidecars in a separate folder.
89
+
90
+ ## 5) Run GUI stack
91
+
92
+ One command (recommended):
93
+
94
+ ```bash
95
+ python af3_gui_app.py
96
+ ```
97
+
98
+ This builds React and serves it from FastAPI. Open `http://127.0.0.1:8008`.
99
+
100
+ PowerShell:
101
+
102
+ ```powershell
103
+ .\scripts\dev\run_af3_gui.ps1
104
+ ```
105
+
106
+ Manual mode:
107
+
108
+ ```bash
109
+ uvicorn services.pipeline_api:app --host 0.0.0.0 --port 8008 --reload
110
+
111
+ cd react-ui
112
+ npm install
113
+ npm run dev
114
+ ```
115
+
116
+ Open `http://localhost:5173`.
117
+
118
+ UI supports:
119
+
120
+ - Local file path mode or upload mode
121
+ - AF3 backend toggle (`hf_endpoint` or `local`)
122
+ - Optional user context
123
+ - Optional web-search-enhanced ChatGPT cleanup
124
+ - Artist/track hints for better metadata normalization
125
+
126
+ ## 6) Human-in-the-loop refinement
127
+
128
+ Recommended loop:
129
+
130
+ 1. Generate sidecars with AF3+ChatGPT.
131
+ 2. Review/edit core fields (`caption`, `bpm`, `keyscale`, `timesignature`, `duration`).
132
+ 3. Keep rich analysis fields for traceability.
133
+ 4. Train LoRA with `lora_train.py` on the folder containing audio + JSON sidecars.
134
+
135
+ ## Output compatibility
136
+
137
+ The pipeline keeps Ace Step core sidecar fields:
138
+
139
+ - `caption`
140
+ - `lyrics`
141
+ - `bpm`
142
+ - `keyscale`
143
+ - `timesignature`
144
+ - `vocal_language`
145
+ - `duration`
146
+
147
+ And adds richer analysis fields in `music_analysis` + `pipeline` for auditability.
148
+
149
+ ## Note on "guarantee"
150
+
151
+ No model can guarantee perfect music metadata. This pipeline improves reliability by:
152
+
153
+ - Schema-constrained ChatGPT output
154
+ - Normalization/defaulting in `build_lora_sidecar(...)`
155
+ - Optional human review pass before training
docs/guides/qwen2-audio-train.md CHANGED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Qwen2-Audio Captioning -> Human Refinement -> ACE-Step LoRA Dataset
2
+
3
+ This guide adds a full annotation pipeline around `Qwen/Qwen2-Audio-7B-Instruct` so you can:
4
+
5
+ 1. Caption full songs with timestamped segment analysis.
6
+ 2. Refine/expand annotations manually.
7
+ 3. Export LoRA-ready sidecar JSON for ACE-Step 1.5 training.
8
+
9
+ ## What was added
10
+
11
+ - Reusable captioning module: `qwen_audio_captioning.py`
12
+ - Gradio UI for upload/analyze/edit/export: `qwen_caption_app.py`
13
+ - Batch CLI for local/HF jobs: `scripts/annotations/qwen_caption_dataset.py`
14
+ - HF Job launcher for batch captioning: `scripts/jobs/submit_hf_qwen_caption_job.ps1`
15
+ - Optional endpoint handler template: `templates/hf-qwen-caption-endpoint/handler.py`
16
+
17
+ ## Why use `Qwen2-Audio-7B-Instruct`
18
+
19
+ Use `Qwen/Qwen2-Audio-7B-Instruct` for this task because your prompt is instruction-heavy and structured (musical elements, mix/effects, vocals, and timestamped interactions).
20
+
21
+ ## Default analysis prompt
22
+
23
+ The pipeline defaults to:
24
+
25
+ > Analyze and detail the musical elements, tones, instruments, genre and effects. Describe the effects and mix of instruments and vocals. Vocals may use modern production techniques such as pitch correction and tuning effects. Explain how musical elements interact throughout the song with timestamps. Go in depth on vocal performance and musical writing. Be concise but detail-rich.
26
+
27
+ You can override this in the UI or CLI.
28
+
29
+ ## Run locally (recommended first)
30
+
31
+ Install dependencies:
32
+
33
+ ```bash
34
+ python -m pip install --upgrade pip
35
+ python -m pip install -r requirements.txt
36
+ ```
37
+
38
+ Start the captioning UI:
39
+
40
+ ```bash
41
+ python qwen_caption_app.py
42
+ ```
43
+
44
+ Open `http://localhost:7860`.
45
+
46
+ ### UI flow
47
+
48
+ 1. **Load Audio** tab:
49
+ - Scan a folder and/or upload files.
50
+ 2. **Run Qwen Captioning** tab:
51
+ - Backend:
52
+ - `local` (model runs in same app process), or
53
+ - `hf_endpoint` (calls a remote endpoint URL).
54
+ - Tune segmentation (`segment_seconds`, `overlap_seconds`) for timestamp granularity.
55
+ 3. **Human Annotation + Export** tab:
56
+ - Load JSON per track.
57
+ - Manually refine timelines, instrument/mix notes, caption text.
58
+ - Export sidecars + manifest.
59
+
60
+ ## Run batch from CLI
61
+
62
+ Example local batch:
63
+
64
+ ```bash
65
+ python scripts/annotations/qwen_caption_dataset.py \
66
+ --dataset-dir ./dataset_inbox \
67
+ --backend local \
68
+ --model-id Qwen/Qwen2-Audio-7B-Instruct \
69
+ --segment-seconds 30 \
70
+ --overlap-seconds 2 \
71
+ --max-new-tokens 384 \
72
+ --temperature 0.1 \
73
+ --output-dir ./qwen_annotations \
74
+ --copy-audio
75
+ ```
76
+
77
+ Sidecars are written next to each source audio file by default.
78
+ Disable with `--no-write-inplace-sidecars`.
79
+
80
+ Outputs:
81
+
82
+ - `qwen_annotations/dataset/*.audio` (if `--copy-audio`)
83
+ - `qwen_annotations/dataset/*.json` (LoRA sidecars)
84
+ - `qwen_annotations/annotations_manifest.jsonl`
85
+ - `qwen_annotations/annotations_index.json`
86
+
87
+ ## Run batch on Hugging Face Jobs
88
+
89
+ PowerShell:
90
+
91
+ ```powershell
92
+ .\scripts\jobs\submit_hf_qwen_caption_job.ps1 `
93
+ -CodeRepo "YOUR_USERNAME/ace-step-lora-studio" `
94
+ -DatasetRepo "YOUR_USERNAME/YOUR_AUDIO_DATASET" `
95
+ -ModelId "Qwen/Qwen2-Audio-7B-Instruct" `
96
+ -Flavor "a10g-large" `
97
+ -Timeout "8h" `
98
+ -CopyAudio `
99
+ -UploadRepo "YOUR_USERNAME/YOUR_ANNOTATED_DATASET"
100
+ ```
101
+
102
+ ## Use on Hugging Face Space
103
+
104
+ To run this UI as a dedicated Space app, set Space `README.md` front matter:
105
+
106
+ - `sdk: gradio`
107
+ - `app_file: qwen_caption_app.py`
108
+
109
+ Then push this repo content to that Space.
110
+
111
+ ## Optional: remote endpoint backend
112
+
113
+ If you want local UI to call a remote endpoint:
114
+
115
+ 1. Deploy dedicated endpoint runtime from this template:
116
+ - `python scripts/hf_clone.py qwen-endpoint --repo-id YOUR_USERNAME/YOUR_QWEN_ENDPOINT_REPO`
117
+ 2. In UI select `backend=hf_endpoint`.
118
+ 3. Set endpoint URL + token.
119
+
120
+ ## Sidecar schema and ACE-Step compatibility
121
+
122
+ The exported JSON keeps ACE-Step core fields:
123
+
124
+ - `caption`
125
+ - `lyrics`
126
+ - `bpm`
127
+ - `keyscale`
128
+ - `timesignature`
129
+ - `vocal_language`
130
+ - `duration`
131
+
132
+ And adds rich fields:
133
+
134
+ - `music_analysis.timeline` (timestamped segment notes)
135
+ - `music_analysis.instruments`, `effects`, `vocal_characteristics`, `mix_notes`
136
+ - `analysis_prompt`, `analysis_model`, `analysis_generated_at`
137
+
138
+ ACE-Step loader ignores unknown keys, so rich fields stay available for later refinement while training still works with core fields.
139
+
140
+ ## Train ACE-Step LoRA from exported dataset
141
+
142
+ Local:
143
+
144
+ ```bash
145
+ python lora_train.py \
146
+ --dataset-dir ./qwen_annotations/dataset \
147
+ --model-config acestep-v15-base \
148
+ --device auto \
149
+ --num-epochs 20 \
150
+ --batch-size 1 \
151
+ --grad-accum 1 \
152
+ --output-dir ./lora_output
153
+ ```
154
+
155
+ HF Job (existing script):
156
+
157
+ ```powershell
158
+ .\scripts\jobs\submit_hf_lora_job.ps1 `
159
+ -CodeRepo "YOUR_USERNAME/ace-step-lora-studio" `
160
+ -DatasetRepo "YOUR_USERNAME/YOUR_ANNOTATED_DATASET" `
161
+ -ModelConfig "acestep-v15-base"
162
+ ```
163
+
164
+ ## Recommended iterative loop
165
+
166
+ 1. Auto-caption with segment timestamps.
167
+ 2. Human refine 10-20% highest-impact tracks first.
168
+ 3. Export updated sidecars.
169
+ 4. Train LoRA.
170
+ 5. Evaluate structural/timing control.
171
+ 6. Feed findings back into prompt + schema refinements.
qwen_audio_captioning.py ADDED
@@ -0,0 +1,996 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Qwen2-Audio captioning utilities for music annotation workflows.
3
+
4
+ This module supports:
5
+ 1) Local inference with Qwen2-Audio models via transformers.
6
+ 2) Remote inference via a Hugging Face Endpoint with a simple JSON contract.
7
+ 3) Segment-based analysis with timestamped aggregation.
8
+ 4) Export helpers for ACE-Step LoRA sidecars and manifest files.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import base64
14
+ import io
15
+ import json
16
+ import os
17
+ import re
18
+ import shutil
19
+ import subprocess
20
+ import tempfile
21
+ import urllib.request
22
+ from dataclasses import dataclass
23
+ from datetime import datetime, timezone
24
+ from pathlib import Path
25
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
26
+
27
+ import numpy as np
28
+ import soundfile as sf
29
+ import torchaudio
30
+
31
+
32
+ AUDIO_EXTENSIONS = {".wav", ".flac", ".mp3", ".ogg", ".opus", ".m4a", ".aac"}
33
+
34
+
35
+ DEFAULT_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
36
+
37
+
38
+ DEFAULT_ANALYSIS_PROMPT = (
39
+ "Analyze and detail the musical elements, tones, instruments, genre and effects. "
40
+ "Describe the effects and mix of instruments and vocals. Vocals may use modern production "
41
+ "techniques such as pitch correction and tuning effects. Explain how musical elements interact "
42
+ "throughout the song with timestamps. Go in depth on vocal performance and musical writing. "
43
+ "Be concise but detail-rich."
44
+ )
45
+
46
+ DEFAULT_LONG_ANALYSIS_PROMPT = (
47
+ "Analyze the full song and return a concise but detailed timestamped prose breakdown. "
48
+ "Use sections every 10 to 20 seconds (or major arrangement changes). For each section, "
49
+ "describe vocals, instrumentation, genre cues, effects, mix/energy changes, and how elements "
50
+ "interact. End with a short overall summary paragraph."
51
+ )
52
+
53
+
54
+ SEGMENT_JSON_SCHEMA_HINT = (
55
+ 'Return JSON only with keys: "segment_summary" (string), "section_label" (string), '
56
+ '"genre" (array of strings), "instruments" (array of strings), "effects" (array of strings), '
57
+ '"vocal_characteristics" (array of strings), "mix_notes" (array of strings), '
58
+ '"interaction_notes" (string), "bpm_guess" (number or null), "key_guess" (string or ""), '
59
+ '"notable_moments" (array of objects with "timestamp_sec" and "note").'
60
+ )
61
+
62
+
63
+ @dataclass
64
+ class SegmentResult:
65
+ index: int
66
+ start_sec: float
67
+ end_sec: float
68
+ prompt: str
69
+ raw_response: str
70
+ parsed: Dict[str, Any]
71
+
72
+
73
def list_audio_files(folder: str) -> List[str]:
    """Recursively collect supported audio file paths under *folder*.

    Paths are returned sorted; an empty list is returned when *folder*
    is not an existing directory.
    """
    base = Path(folder)
    if not base.is_dir():
        return []
    return [
        str(candidate)
        for candidate in sorted(base.rglob("*"))
        if candidate.suffix.lower() in AUDIO_EXTENSIONS
    ]
82
+
83
+
84
def _load_audio_with_fallback(path: str) -> Tuple[np.ndarray, int]:
    """Load audio to mono float32 numpy array with fallback decode path.

    Decode order: torchaudio, then soundfile (libsndfile), then an ffmpeg
    subprocess as a last resort. Returns ``(mono_waveform, sample_rate)``.
    Raises RuntimeError (with all three decoder errors) if every path fails.
    """
    # Preferred decoder: torchaudio.
    try:
        wav, sr = torchaudio.load(path)
        wav = wav.float().numpy()
        if wav.ndim == 1:
            mono = wav
        else:
            # Downmix multi-channel audio to mono by averaging channels.
            mono = wav.mean(axis=0)
        return mono.astype(np.float32), int(sr)
    except Exception as torchaudio_exc:
        # Second choice: libsndfile via soundfile; always_2d keeps a
        # uniform (frames, channels) shape for the channel average below.
        try:
            audio_np, sr = sf.read(path, dtype="float32", always_2d=True)
            mono = audio_np.mean(axis=1)
            return mono.astype(np.float32), int(sr)
        except Exception as sf_exc:
            # Last fallback: ffmpeg decode (works when local libsndfile lacks mp3 codec).
            # NOTE(review): this path forces mono 16 kHz output (-ac 1 -ar 16000),
            # so the returned sample rate can differ from the source file's rate.
            try:
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                    tmp_wav = tmp.name
                cmd = [
                    "ffmpeg",
                    "-y",
                    "-i",
                    str(path),
                    "-vn",
                    "-ac",
                    "1",
                    "-ar",
                    "16000",
                    tmp_wav,
                ]
                proc = subprocess.run(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    text=True,
                )
                if proc.returncode != 0:
                    # Keep only the tail of stderr so the error stays readable.
                    tail = (proc.stderr or "")[-800:]
                    raise RuntimeError(f"ffmpeg decode failed: {tail}")
                audio_np, sr = sf.read(tmp_wav, dtype="float32", always_2d=True)
                mono = audio_np.mean(axis=1)
                return mono.astype(np.float32), int(sr)
            except Exception as ffmpeg_exc:
                # Surface every decoder's failure reason for debuggability.
                raise RuntimeError(
                    f"Audio decode failed for '{path}'. "
                    f"torchaudio_error={torchaudio_exc}; "
                    f"soundfile_error={sf_exc}; "
                    f"ffmpeg_error={ffmpeg_exc}"
                ) from ffmpeg_exc
            finally:
                # Best-effort cleanup of the temp wav; tmp_wav may be unbound
                # if NamedTemporaryFile itself raised, hence the locals() check.
                try:
                    if "tmp_wav" in locals():
                        Path(tmp_wav).unlink(missing_ok=True)
                except Exception:
                    pass
141
+
142
+
143
def load_audio_mono(path: str, target_sr: int = 16000) -> Tuple[np.ndarray, int]:
    """Decode *path* to a mono float32 waveform at *target_sr* Hz.

    Resamples with torchaudio only when the native rate differs.
    """
    waveform, native_sr = _load_audio_with_fallback(path)
    if native_sr != target_sr:
        tensor = torch_audio_from_numpy(waveform)
        tensor = torchaudio.functional.resample(tensor, native_sr, target_sr)
        waveform = tensor.squeeze(0).cpu().numpy().astype(np.float32)
    return waveform, target_sr
151
+
152
+
153
def torch_audio_from_numpy(audio: np.ndarray):
    """Wrap a 1-D numpy waveform as a torch tensor shaped [1, T].

    Raises ValueError for anything other than a mono [T] array.
    """
    import torch

    if audio.ndim == 1:
        return torch.from_numpy(audio).unsqueeze(0)
    raise ValueError(f"Expected mono waveform [T], got shape={audio.shape}")
159
+
160
+
161
def split_audio_segments(
    audio: np.ndarray,
    sample_rate: int,
    segment_seconds: float,
    overlap_seconds: float,
) -> List[Tuple[float, float, np.ndarray]]:
    """Cut a mono waveform into (start_sec, end_sec, samples) windows.

    Consecutive windows advance by ``segment_seconds - overlap_seconds``;
    the final window is truncated at the end of the audio. Raises
    ValueError for non-positive segment length, negative overlap, or
    overlap >= segment length.
    """
    if segment_seconds <= 0:
        raise ValueError("segment_seconds must be > 0")
    if overlap_seconds < 0:
        raise ValueError("overlap_seconds must be >= 0")
    if overlap_seconds >= segment_seconds:
        raise ValueError("overlap_seconds must be smaller than segment_seconds")

    total_samples = int(audio.shape[0])
    window = max(1, int(round(segment_seconds * sample_rate)))
    hop = max(1, int(round((segment_seconds - overlap_seconds) * sample_rate)))

    chunks: List[Tuple[float, float, np.ndarray]] = []
    index = 0
    offset = 0
    while offset < total_samples:
        stop = min(total_samples, offset + window)
        chunks.append((offset / sample_rate, stop / sample_rate, audio[offset:stop]))
        index += 1
        if stop >= total_samples:
            break
        # Next window start is derived from the window index so rounding
        # error does not accumulate across many hops.
        offset = index * hop
    return chunks
192
+
193
+
194
+ def _extract_json_from_text(text: str) -> Optional[Dict[str, Any]]:
195
+ text = (text or "").strip()
196
+ if not text:
197
+ return None
198
+
199
+ # Direct parse first.
200
+ try:
201
+ obj = json.loads(text)
202
+ if isinstance(obj, dict):
203
+ return obj
204
+ except Exception:
205
+ pass
206
+
207
+ # Parse markdown code fence if present.
208
+ fence_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, flags=re.S | re.I)
209
+ if fence_match:
210
+ block = fence_match.group(1)
211
+ try:
212
+ obj = json.loads(block)
213
+ if isinstance(obj, dict):
214
+ return obj
215
+ except Exception:
216
+ pass
217
+
218
+ # Fallback: first brace-balanced object.
219
+ start = text.find("{")
220
+ if start < 0:
221
+ return None
222
+ depth = 0
223
+ for i in range(start, len(text)):
224
+ ch = text[i]
225
+ if ch == "{":
226
+ depth += 1
227
+ elif ch == "}":
228
+ depth -= 1
229
+ if depth == 0:
230
+ candidate = text[start : i + 1]
231
+ try:
232
+ obj = json.loads(candidate)
233
+ if isinstance(obj, dict):
234
+ return obj
235
+ except Exception:
236
+ return None
237
+ return None
238
+
239
+
240
+ def _ensure_string_list(value: Any) -> List[str]:
241
+ if value is None:
242
+ return []
243
+ if isinstance(value, str):
244
+ v = value.strip()
245
+ return [v] if v else []
246
+ out: List[str] = []
247
+ if isinstance(value, Sequence):
248
+ for item in value:
249
+ if item is None:
250
+ continue
251
+ s = str(item).strip()
252
+ if s:
253
+ out.append(s)
254
+ deduped: List[str] = []
255
+ seen = set()
256
+ for item in out:
257
+ key = item.lower()
258
+ if key in seen:
259
+ continue
260
+ seen.add(key)
261
+ deduped.append(item)
262
+ return deduped
263
+
264
+
265
+ def _float_or_none(value: Any) -> Optional[float]:
266
+ if value is None:
267
+ return None
268
+ try:
269
+ return float(value)
270
+ except Exception:
271
+ return None
272
+
273
+
274
# Lowercase keyword vocabularies consumed by _extract_keyword_hits for cheap
# substring tagging of model prose. Matching is case-insensitive and
# substring-based, so spelling variants (e.g. "hip hop" / "hip-hop",
# "autotune" / "auto tune") are listed as separate entries.
_GENRE_KEYWORDS = [
    "pop",
    "rock",
    "hip-hop",
    "hip hop",
    "rap",
    "r&b",
    "rnb",
    "electronic",
    "edm",
    "trap",
    "house",
    "techno",
    "ambient",
    "indie",
    "soul",
    "jazz",
    "metal",
    "punk",
    "country",
    "lo-fi",
    "lofi",
    "drill",
]

# Instrument / sound-source names to surface from segment descriptions.
_INSTRUMENT_KEYWORDS = [
    "drums",
    "kick",
    "snare",
    "hihat",
    "hi-hat",
    "808",
    "bass",
    "synth",
    "piano",
    "guitar",
    "electric guitar",
    "acoustic guitar",
    "strings",
    "pad",
    "lead",
    "pluck",
    "vocal",
    "choir",
]

# Production / mixing effect terms.
_EFFECT_KEYWORDS = [
    "reverb",
    "delay",
    "distortion",
    "saturation",
    "autotune",
    "auto tune",
    "pitch correction",
    "compression",
    "eq",
    "sidechain",
    "chorus",
    "flanger",
    "phaser",
    "stereo widening",
]

# Vocal-delivery and vocal-processing descriptors.
_VOCAL_KEYWORDS = [
    "autotune",
    "auto tune",
    "pitch correction",
    "harmonies",
    "ad-libs",
    "ad libs",
    "falsetto",
    "breathy",
    "raspy",
    "processed vocals",
]
349
+
350
+
351
+ def _clean_model_text(text: str) -> str:
352
+ s = (text or "").strip()
353
+ if not s:
354
+ return ""
355
+ # Remove repetitive leading boilerplate often produced when JSON is requested.
356
+ s = re.sub(r"^\s*The output should be a JSON object with these fields\.?\s*", "", s, flags=re.I)
357
+ s = re.sub(r"^\s*This is the requested information for the given song segment:?\s*", "", s, flags=re.I)
358
+ s = re.sub(r"^\s*From\s+\d+(\.\d+)?s\s+to\s+\d+(\.\d+)?s\s*", "", s, flags=re.I)
359
+ return s.strip()
360
+
361
+
362
+ def _extract_bpm_guess(text: str) -> Optional[float]:
363
+ for pat in [r"\b(\d{2,3}(?:\.\d+)?)\s*bpm\b", r"\btempo\s*(?:of|is|:)?\s*(\d{2,3}(?:\.\d+)?)\b"]:
364
+ m = re.search(pat, text, flags=re.I)
365
+ if m:
366
+ try:
367
+ val = float(m.group(1))
368
+ if 30 <= val <= 300:
369
+ return val
370
+ except Exception:
371
+ continue
372
+ return None
373
+
374
+
375
+ def _extract_key_guess(text: str) -> str:
376
+ patterns = [
377
+ r"\b([A-G](?:#|b)?\s*(?:major|minor))\b",
378
+ r"\b([A-G](?:#|b)?m)\b",
379
+ ]
380
+ for pat in patterns:
381
+ m = re.search(pat, text, flags=re.I)
382
+ if m:
383
+ key = m.group(1).strip()
384
+ return key[0].upper() + key[1:]
385
+ return ""
386
+
387
+
388
+ def _extract_keyword_hits(text: str, keywords: List[str]) -> List[str]:
389
+ lower = text.lower()
390
+ found: List[str] = []
391
+ for kw in keywords:
392
+ if kw.lower() in lower:
393
+ label = kw.replace("rnb", "R&B").replace("hip-hop", "hip-hop")
394
+ if label.lower() not in {x.lower() for x in found}:
395
+ found.append(label)
396
+ return found
397
+
398
+
399
class BaseCaptioner:
    """Common interface for captioning backends.

    Subclasses implement generate() to turn one audio segment plus a
    prompt into model text. backend_name/model_id identify the backend
    in exported metadata.
    """

    backend_name = "base"
    model_id = DEFAULT_MODEL_ID

    def generate(
        self,
        audio: np.ndarray,
        sample_rate: int,
        prompt: str,
        max_new_tokens: int,
        temperature: float,
    ) -> str:
        """Produce a text response for one audio segment; must be overridden."""
        raise NotImplementedError
412
+
413
+
414
class LocalQwen2AudioCaptioner(BaseCaptioner):
    """Runs Qwen2-Audio in-process via transformers.

    The processor/model are loaded lazily on first use so constructing the
    captioner (e.g. for UI wiring) stays cheap.
    """

    backend_name = "local"

    def __init__(
        self,
        model_id: str = DEFAULT_MODEL_ID,
        device: str = "auto",
        torch_dtype: str = "auto",
        trust_remote_code: bool = True,
    ):
        self.model_id = model_id
        self.device = device
        self.torch_dtype = torch_dtype
        self.trust_remote_code = trust_remote_code
        # Populated by _load() on first generate() call.
        self._processor = None
        self._model = None

    def _load(self):
        """Load processor and model once; no-op on subsequent calls."""
        if self._processor is not None and self._model is not None:
            return

        import torch
        try:
            from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
        except Exception as exc:
            raise RuntimeError(
                "Qwen2-Audio classes are unavailable. Install a recent transformers build "
                "(for example transformers>=4.53.0) and retry."
            ) from exc

        # "auto" picks fp16 on GPU (memory savings), fp32 on CPU (fp16 is
        # typically unsupported/slow on CPU).
        if self.torch_dtype == "auto":
            dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        elif self.torch_dtype == "bfloat16":
            dtype = torch.bfloat16
        elif self.torch_dtype == "float16":
            dtype = torch.float16
        else:
            dtype = torch.float32

        # device_map="auto" lets accelerate shard/place the model; otherwise
        # we move the whole model to an explicit device below.
        device_map = "auto" if self.device == "auto" else None
        self._processor = AutoProcessor.from_pretrained(
            self.model_id,
            trust_remote_code=self.trust_remote_code,
        )
        self._model = Qwen2AudioForConditionalGeneration.from_pretrained(
            self.model_id,
            torch_dtype=dtype,
            device_map=device_map,
            trust_remote_code=self.trust_remote_code,
        )
        if device_map is None:
            if self.device == "auto":
                target_device = "cuda" if torch.cuda.is_available() else "cpu"
            else:
                target_device = self.device
            self._model.to(target_device)

    def _model_device(self):
        """Return the device of the loaded model ('cpu' when not yet loaded)."""
        import torch

        if self._model is None:
            return torch.device("cpu")
        # Assumes all parameters live on one device; with device_map="auto"
        # sharding this reports only the first parameter's device.
        return next(self._model.parameters()).device

    def generate(
        self,
        audio: np.ndarray,
        sample_rate: int,
        prompt: str,
        max_new_tokens: int,
        temperature: float,
    ) -> str:
        """Run one chat-style generation over an audio segment and return text."""
        self._load()
        import torch

        # The audio_url here is a placeholder token for the chat template; the
        # actual waveform is passed to the processor below.
        conversation = [
            {"role": "system", "content": "You are a precise music analysis assistant."},
            {
                "role": "user",
                "content": [
                    {"type": "audio", "audio_url": "local://segment.wav"},
                    {"type": "text", "text": prompt},
                ],
            },
        ]
        text = self._processor.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=False,
        )
        # NOTE(review): some transformers releases expect `audios=` instead of
        # `audio=` for Qwen2-Audio processors — confirm against the pinned version.
        inputs = self._processor(
            text=text,
            audio=[audio],
            sampling_rate=sample_rate,
            return_tensors="pt",
            padding=True,
        )
        device = self._model_device()
        for key, value in list(inputs.items()):
            if hasattr(value, "to"):
                inputs[key] = value.to(device)

        # Zero/falsy temperature means greedy decoding; otherwise sample with a
        # floor to avoid passing temperature=0 to generate().
        do_sample = bool(temperature and temperature > 0)
        gen_kwargs = {
            "max_new_tokens": int(max_new_tokens),
            "do_sample": do_sample,
        }
        if do_sample:
            gen_kwargs["temperature"] = max(float(temperature), 1e-5)

        with torch.no_grad():
            generated = self._model.generate(**inputs, **gen_kwargs)
        # Strip the echoed prompt tokens, keeping only newly generated ones.
        prompt_tokens = inputs["input_ids"].size(1)
        generated_new = generated[:, prompt_tokens:]
        text_out = self._processor.batch_decode(
            generated_new,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
        # Fallback: if slicing produced nothing, decode the full sequence.
        if not text_out.strip():
            text_out = self._processor.batch_decode(
                generated,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )[0]
        return text_out.strip()
540
+
541
+
542
class HFEndpointCaptioner(BaseCaptioner):
    """Sends audio segments to a remote Hugging Face Inference Endpoint.

    The segment is serialized as base64 WAV inside a JSON payload; several
    common response shapes are accepted (see generate()).
    """

    backend_name = "hf_endpoint"

    def __init__(
        self,
        endpoint_url: str,
        token: Optional[str] = None,
        model_id: str = DEFAULT_MODEL_ID,
        timeout_seconds: int = 180,
    ):
        if not endpoint_url:
            raise ValueError("endpoint_url is required for HFEndpointCaptioner")
        self.endpoint_url = endpoint_url.strip()
        # Fall back to the HF_TOKEN env var when no token is passed explicitly.
        self.token = token or os.getenv("HF_TOKEN", "")
        self.model_id = model_id
        self.timeout_seconds = timeout_seconds

    def generate(
        self,
        audio: np.ndarray,
        sample_rate: int,
        prompt: str,
        max_new_tokens: int,
        temperature: float,
    ) -> str:
        """POST one segment to the endpoint and return the generated text.

        Network/HTTP failures propagate as urllib exceptions (no retry or
        error-body translation is done here).
        """
        # Serialize to wav bytes for endpoint transport.
        buffer = io.BytesIO()
        sf.write(buffer, audio, sample_rate, format="WAV")
        wav_bytes = buffer.getvalue()
        audio_b64 = base64.b64encode(wav_bytes).decode("utf-8")

        # Payload shape mirrors the handler templates shipped with this repo
        # (templates/hf-qwen-caption-endpoint/handler.py).
        payload = {
            "inputs": {
                "prompt": prompt,
                "audio_base64": audio_b64,
                "sample_rate": sample_rate,
                "max_new_tokens": int(max_new_tokens),
                "temperature": float(temperature),
                "model_id": self.model_id,
            }
        }

        req = urllib.request.Request(
            self.endpoint_url,
            data=json.dumps(payload).encode("utf-8"),
            headers={
                "Content-Type": "application/json",
                # Authorization header only when a token is available.
                **({"Authorization": f"Bearer {self.token}"} if self.token else {}),
            },
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=self.timeout_seconds) as resp:
            body = resp.read().decode("utf-8")
        data = json.loads(body)

        # Accept common endpoint output shapes.
        if isinstance(data, dict):
            if isinstance(data.get("generated_text"), str):
                return data["generated_text"].strip()
            if isinstance(data.get("text"), str):
                return data["text"].strip()
            if isinstance(data.get("output_text"), str):
                return data["output_text"].strip()
        if isinstance(data, list) and data:
            first = data[0]
            if isinstance(first, dict) and isinstance(first.get("generated_text"), str):
                return first["generated_text"].strip()
        # Last resort: stringify whatever came back so callers still get text.
        return str(data).strip()
610
+
611
+
612
def build_segment_prompt(
    base_prompt: str,
    start_sec: float,
    end_sec: float,
) -> str:
    """Append segment-window instructions and the JSON schema hint to a prompt."""
    window_instructions = (
        f"Analyze only the song segment from {start_sec:.2f}s to {end_sec:.2f}s.\n"
        "Use timestamp references in absolute song seconds.\n"
    )
    return f"{base_prompt}\n\n{window_instructions}{SEGMENT_JSON_SCHEMA_HINT}"
623
+
624
+
625
def _make_fallback_segment_dict(raw_text: str) -> Dict[str, Any]:
    """Build a segment record via keyword heuristics when JSON parsing failed.

    The cleaned raw text doubles as summary and interaction notes; tag lists
    are recovered by case-insensitive keyword scanning.
    """
    summary = _clean_model_text(raw_text) or "No analysis generated."
    return {
        "segment_summary": summary,
        "section_label": "",
        "genre": _extract_keyword_hits(summary, _GENRE_KEYWORDS),
        "instruments": _extract_keyword_hits(summary, _INSTRUMENT_KEYWORDS),
        "effects": _extract_keyword_hits(summary, _EFFECT_KEYWORDS),
        "vocal_characteristics": _extract_keyword_hits(summary, _VOCAL_KEYWORDS),
        "mix_notes": [],
        "interaction_notes": summary,
        "bpm_guess": _extract_bpm_guess(summary),
        "key_guess": _extract_key_guess(summary),
        "notable_moments": [],
    }
648
+
649
+
650
def _parse_segment_output(raw_text: str) -> Dict[str, Any]:
    """Normalize a model segment response into the expected schema.

    Falls back to keyword extraction when no JSON object can be recovered
    from the raw text.
    """
    parsed = _extract_json_from_text(raw_text)
    if not parsed:
        return _make_fallback_segment_dict(raw_text)

    result = dict(parsed)
    # Field assignment order matches the canonical sidecar layout so that
    # missing keys are appended in a stable order.
    for field in ("segment_summary", "section_label"):
        result[field] = str(result.get(field, "")).strip()
    for field in ("genre", "instruments", "effects", "vocal_characteristics", "mix_notes"):
        result[field] = _ensure_string_list(result.get(field))
    result["interaction_notes"] = str(result.get("interaction_notes", "")).strip()
    result["bpm_guess"] = _float_or_none(result.get("bpm_guess"))
    result["key_guess"] = str(result.get("key_guess", "")).strip()

    # Keep only well-formed notable moments: dicts carrying a timestamp or note.
    moments: List[Dict[str, Any]] = []
    raw_moments = result.get("notable_moments")
    if isinstance(raw_moments, Sequence):
        for entry in raw_moments:
            if not isinstance(entry, dict):
                continue
            timestamp = _float_or_none(entry.get("timestamp_sec"))
            note = str(entry.get("note", "")).strip()
            if timestamp is not None or note:
                moments.append({"timestamp_sec": timestamp, "note": note})
    result["notable_moments"] = moments
    return result
680
+
681
+
682
+ def _pick_common_key(values: List[str]) -> str:
683
+ counts: Dict[str, int] = {}
684
+ first_original: Dict[str, str] = {}
685
+ for v in values:
686
+ s = (v or "").strip()
687
+ if not s:
688
+ continue
689
+ k = s.lower()
690
+ counts[k] = counts.get(k, 0) + 1
691
+ if k not in first_original:
692
+ first_original[k] = s
693
+ if not counts:
694
+ return ""
695
+ best = sorted(counts.items(), key=lambda x: (-x[1], x[0]))[0][0]
696
+ return first_original[best]
697
+
698
+
699
+ def _collect_unique(items: List[List[str]], limit: int = 12) -> List[str]:
700
+ out: List[str] = []
701
+ seen = set()
702
+ for group in items:
703
+ for item in group:
704
+ key = item.strip().lower()
705
+ if not key or key in seen:
706
+ continue
707
+ seen.add(key)
708
+ out.append(item.strip())
709
+ if len(out) >= limit:
710
+ return out
711
+ return out
712
+
713
+
714
+ def _derive_caption(genres: List[str], instruments: List[str], vocals: List[str]) -> str:
715
+ parts: List[str] = []
716
+ if genres:
717
+ parts.append(", ".join(genres[:2]))
718
+ if instruments:
719
+ parts.append("with " + ", ".join(instruments[:3]))
720
+ if vocals:
721
+ parts.append("and modern processed vocals")
722
+ if not parts:
723
+ return "music track with detailed arrangement and production dynamics"
724
+ return " ".join(parts)
725
+
726
+
727
def generate_track_annotation(
    audio_path: str,
    captioner: BaseCaptioner,
    prompt: str = DEFAULT_ANALYSIS_PROMPT,
    segment_seconds: float = 30.0,
    overlap_seconds: float = 2.0,
    max_new_tokens: int = 384,
    temperature: float = 0.1,
    keep_raw_outputs: bool = True,
    include_long_analysis: bool = False,
    long_analysis_prompt: str = DEFAULT_LONG_ANALYSIS_PROMPT,
    long_analysis_max_new_tokens: int = 1200,
    long_analysis_temperature: float = 0.1,
) -> Dict[str, Any]:
    """Analyze a track segment-by-segment and build a sidecar annotation dict.

    Loads the audio (resampled to 16 kHz mono), splits it into overlapping
    windows, runs the captioner per window, then aggregates per-segment tags
    into track-level genres/instruments/effects/vocals, an averaged BPM, the
    most common key, a derived caption, and a per-segment timeline.

    Args:
        audio_path: path to the source audio file.
        captioner: backend implementing BaseCaptioner.generate.
        prompt: base analysis prompt extended per segment.
        segment_seconds / overlap_seconds: windowing parameters.
        max_new_tokens / temperature: generation settings per segment.
        keep_raw_outputs: include each segment's raw model text in the timeline.
        include_long_analysis: additionally run one whole-track generation.
        long_analysis_*: prompt/settings for that whole-track pass; failures
            are recorded in the sidecar instead of raising.

    Returns:
        A sidecar dict ready to be written as JSON next to the audio.
    """
    audio, sr = load_audio_mono(audio_path, target_sr=16000)
    duration_sec = float(audio.shape[0]) / float(sr) if sr > 0 else 0.0
    segments = split_audio_segments(
        audio=audio,
        sample_rate=sr,
        segment_seconds=segment_seconds,
        overlap_seconds=overlap_seconds,
    )

    # Per-segment pass: prompt the model for each window and parse its output.
    results: List[SegmentResult] = []
    for idx, (start_sec, end_sec, seg_audio) in enumerate(segments):
        seg_prompt = build_segment_prompt(prompt, start_sec=start_sec, end_sec=end_sec)
        raw = captioner.generate(
            audio=seg_audio,
            sample_rate=sr,
            prompt=seg_prompt,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
        )
        parsed = _parse_segment_output(raw)
        results.append(
            SegmentResult(
                index=idx,
                start_sec=start_sec,
                end_sec=end_sec,
                prompt=seg_prompt,
                raw_response=raw,
                parsed=parsed,
            )
        )

    # Aggregation pass: collect per-segment tags for track-level rollups.
    timeline: List[Dict[str, Any]] = []
    all_genres: List[List[str]] = []
    all_instruments: List[List[str]] = []
    all_effects: List[List[str]] = []
    all_vocals: List[List[str]] = []
    all_mix_notes: List[List[str]] = []
    bpm_values: List[float] = []
    keys: List[str] = []
    interaction_summary: List[str] = []

    for seg in results:
        p = seg.parsed
        all_genres.append(_ensure_string_list(p.get("genre")))
        all_instruments.append(_ensure_string_list(p.get("instruments")))
        all_effects.append(_ensure_string_list(p.get("effects")))
        all_vocals.append(_ensure_string_list(p.get("vocal_characteristics")))
        all_mix_notes.append(_ensure_string_list(p.get("mix_notes")))

        bpm = _float_or_none(p.get("bpm_guess"))
        if bpm is not None and bpm > 0:
            bpm_values.append(bpm)
        key_guess = str(p.get("key_guess", "")).strip()
        if key_guess:
            keys.append(key_guess)
        if p.get("interaction_notes"):
            interaction_summary.append(str(p["interaction_notes"]).strip())

        timeline_entry = {
            "segment_index": seg.index,
            "start_sec": round(seg.start_sec, 3),
            "end_sec": round(seg.end_sec, 3),
            "section_label": str(p.get("section_label", "")).strip(),
            "segment_summary": str(p.get("segment_summary", "")).strip(),
            "instruments": _ensure_string_list(p.get("instruments")),
            "effects": _ensure_string_list(p.get("effects")),
            "vocal_characteristics": _ensure_string_list(p.get("vocal_characteristics")),
            "interaction_notes": str(p.get("interaction_notes", "")).strip(),
            "mix_notes": _ensure_string_list(p.get("mix_notes")),
            "notable_moments": p.get("notable_moments", []),
        }
        if keep_raw_outputs:
            timeline_entry["raw_response"] = seg.raw_response
        timeline.append(timeline_entry)

    genres = _collect_unique(all_genres, limit=10)
    instruments = _collect_unique(all_instruments, limit=16)
    effects = _collect_unique(all_effects, limit=16)
    vocal_traits = _collect_unique(all_vocals, limit=12)
    mix_notes = _collect_unique(all_mix_notes, limit=24)
    keyscale = _pick_common_key(keys)
    # Simple mean of per-segment BPM guesses, rounded to an int.
    bpm = int(round(sum(bpm_values) / len(bpm_values))) if bpm_values else None
    caption = _derive_caption(genres=genres, instruments=instruments, vocals=vocal_traits)

    # "4/4" and "unknown" are fixed placeholders — nothing in the analysis
    # currently estimates time signature or vocal language.
    sidecar: Dict[str, Any] = {
        "caption": caption,
        "lyrics": "",
        "bpm": bpm,
        "keyscale": keyscale,
        "timesignature": "4/4",
        "vocal_language": "unknown",
        "duration": round(duration_sec, 3),
        "annotation_version": "qwen2_audio_music_v1",
        "source_audio": str(audio_path),
        "analysis_prompt": prompt,
        "analysis_backend": captioner.backend_name,
        "analysis_model": captioner.model_id,
        "analysis_generated_at": datetime.now(timezone.utc).isoformat(),
        "music_analysis": {
            "genres": genres,
            "instruments": instruments,
            "effects": effects,
            "vocal_characteristics": vocal_traits,
            "mix_notes": mix_notes,
            "interaction_summary": interaction_summary,
            "timeline": timeline,
            "segment_seconds": segment_seconds,
            "overlap_seconds": overlap_seconds,
            "segment_count": len(timeline),
        },
    }

    # Optional whole-track pass; best-effort, errors are recorded not raised.
    if include_long_analysis:
        long_prompt = (long_analysis_prompt or "").strip() or DEFAULT_LONG_ANALYSIS_PROMPT
        try:
            long_raw = captioner.generate(
                audio=audio,
                sample_rate=sr,
                prompt=long_prompt,
                max_new_tokens=int(long_analysis_max_new_tokens),
                temperature=float(long_analysis_temperature),
            )
            long_text = _clean_model_text(long_raw)
            sidecar["analysis_long_prompt"] = long_prompt
            sidecar["analysis_long"] = long_text
            sidecar["music_analysis"]["summary_long"] = long_text
        except Exception as exc:
            sidecar["analysis_long_prompt"] = long_prompt
            sidecar["analysis_long"] = ""
            sidecar["analysis_long_error"] = str(exc)

    return sidecar
873
+
874
+
875
def build_captioner(
    backend: str,
    model_id: str = DEFAULT_MODEL_ID,
    endpoint_url: str = "",
    token: str = "",
    device: str = "auto",
    torch_dtype: str = "auto",
) -> BaseCaptioner:
    """Instantiate the captioner matching *backend*.

    Accepted aliases: "local"/"hf_space_local" for the in-process model and
    "endpoint"/"hf_endpoint" for a remote Inference Endpoint.

    Raises:
        ValueError: when *backend* is not a recognized alias.
    """
    backend = (backend or "").strip().lower()
    resolved_model = model_id or DEFAULT_MODEL_ID
    if backend in ("local", "hf_space_local"):
        return LocalQwen2AudioCaptioner(
            model_id=resolved_model,
            device=device,
            torch_dtype=torch_dtype,
        )
    if backend in ("endpoint", "hf_endpoint"):
        # device/torch_dtype are local-only knobs and intentionally unused here.
        return HFEndpointCaptioner(
            endpoint_url=endpoint_url,
            token=token,
            model_id=resolved_model,
        )
    raise ValueError(f"Unsupported backend: {backend}")
897
+
898
+
899
def export_annotation_records(
    records: List[Dict[str, Any]],
    output_dir: str,
    copy_audio: bool = True,
    write_inplace_sidecars: bool = True,
) -> Dict[str, Any]:
    """
    Export analyzed tracks as LoRA-ready sidecars + manifest.

    records item schema:
        {
            "audio_path": "...",
            "sidecar": {...annotation json...}
        }

    Args:
        output_dir: export root; created if missing. A "dataset/" subfolder
            receives audio copies when copy_audio is True.
        copy_audio: copy each source audio next to its exported sidecar.
        write_inplace_sidecars: also write the sidecar next to the source file.

    Returns:
        Dict with written_count, manifest_path, index_path and dataset_root
        ('' when audio copying is disabled).
    """
    out_root = Path(output_dir)
    out_root.mkdir(parents=True, exist_ok=True)
    dataset_root = out_root / "dataset"
    if copy_audio:
        dataset_root.mkdir(parents=True, exist_ok=True)

    manifest_path = out_root / "annotations_manifest.jsonl"
    index_path = out_root / "annotations_index.json"

    manifest_lines: List[str] = []
    index_items: List[Dict[str, Any]] = []
    written_count = 0

    for rec in records:
        src_audio = Path(rec["audio_path"])
        sidecar = dict(rec["sidecar"])
        # Missing source files are silently skipped (they simply don't appear
        # in the manifest/index).
        if not src_audio.exists():
            continue

        if copy_audio:
            dst_audio = dataset_root / src_audio.name
            # Guard against copying a file onto itself when the source already
            # lives inside the dataset folder.
            if src_audio.resolve() != dst_audio.resolve():
                shutil.copy2(src_audio, dst_audio)
            dst_sidecar = dst_audio.with_suffix(".json")
        else:
            dst_sidecar = (out_root / src_audio.name).with_suffix(".json")

        dst_sidecar.write_text(json.dumps(sidecar, indent=2, ensure_ascii=False), encoding="utf-8")
        written_count += 1

        if write_inplace_sidecars:
            inplace_sidecar = src_audio.with_suffix(".json")
            inplace_sidecar.write_text(
                json.dumps(sidecar, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )

        # When audio was copied, reconstruct the copied audio's path from the
        # sidecar path by restoring the original suffix; otherwise point at
        # the source file.
        manifest_row = {
            "audio_path": str(dst_sidecar.with_suffix(src_audio.suffix).as_posix()) if copy_audio else str(src_audio),
            "sidecar_path": str(dst_sidecar),
            "caption": sidecar.get("caption", ""),
            "duration": sidecar.get("duration"),
            "bpm": sidecar.get("bpm"),
            "keyscale": sidecar.get("keyscale", ""),
        }
        manifest_lines.append(json.dumps(manifest_row, ensure_ascii=False))
        index_items.append(
            {
                "source_audio": str(src_audio),
                "exported_sidecar": str(dst_sidecar),
                "caption": sidecar.get("caption", ""),
            }
        )

    # NOTE(review): the manifest is written without a trailing newline — some
    # JSONL consumers expect one; confirm downstream readers tolerate this.
    manifest_path.write_text("\n".join(manifest_lines), encoding="utf-8")
    index_path.write_text(
        json.dumps(
            {
                "generated_at": datetime.now(timezone.utc).isoformat(),
                "records": index_items,
            },
            indent=2,
            ensure_ascii=False,
        ),
        encoding="utf-8",
    )

    return {
        "written_count": written_count,
        "manifest_path": str(manifest_path),
        "index_path": str(index_path),
        "dataset_root": str(dataset_root) if copy_audio else "",
    }
987
+
988
+
989
def read_prompt_file(prompt_file: str) -> str:
    """Read and return a non-empty, stripped prompt from *prompt_file*.

    Raises:
        FileNotFoundError: when the path does not point at an existing file.
        ValueError: when the file contains only whitespace.
    """
    prompt_path = Path(prompt_file)
    if not prompt_path.is_file():
        raise FileNotFoundError(f"Prompt file not found: {prompt_file}")
    content = prompt_path.read_text(encoding="utf-8").strip()
    if not content:
        raise ValueError(f"Prompt file is empty: {prompt_file}")
    return content
qwen_caption_app.py ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ import gradio as gr
7
+ import torchaudio
8
+
9
+ # On Hugging Face Spaces Zero, `spaces` must be imported before CUDA-related modules.
10
+ if os.getenv("SPACE_ID"):
11
+ try:
12
+ import spaces # noqa: F401
13
+ except Exception:
14
+ pass
15
+
16
+ from qwen_audio_captioning import (
17
+ DEFAULT_ANALYSIS_PROMPT,
18
+ DEFAULT_MODEL_ID,
19
+ build_captioner,
20
+ export_annotation_records,
21
+ generate_track_annotation,
22
+ list_audio_files,
23
+ )
24
+
25
+
26
# True when running inside a Hugging Face Space (the platform sets SPACE_ID).
IS_SPACE = bool(os.getenv("SPACE_ID"))
# Spaces expose a writable /data volume; fall back to a relative dir locally.
DEFAULT_EXPORT_DIR = "/data/qwen_annotations" if IS_SPACE else "qwen_annotations"

# Single-slot cache holding the most recently built captioner and its config
# key (see _get_captioner) so changing any setting rebuilds the backend.
_captioner_cache: Dict[str, Any] = {"key": None, "obj": None}
30
+
31
+
32
+ def _audio_duration_sec(path: str) -> Optional[float]:
33
+ try:
34
+ info = torchaudio.info(path)
35
+ if info.sample_rate <= 0:
36
+ return None
37
+ return float(info.num_frames) / float(info.sample_rate)
38
+ except Exception:
39
+ return None
40
+
41
+
42
+ def _dedupe_paths(paths: List[str]) -> List[str]:
43
+ seen = set()
44
+ out: List[str] = []
45
+ for p in paths:
46
+ if not isinstance(p, str):
47
+ continue
48
+ pp = p.strip()
49
+ if not pp:
50
+ continue
51
+ key = str(Path(pp).resolve()) if Path(pp).exists() else pp
52
+ if key in seen:
53
+ continue
54
+ seen.add(key)
55
+ out.append(pp)
56
+ return out
57
+
58
+
59
+ def _files_table(paths: List[str]) -> List[List[str]]:
60
+ rows: List[List[str]] = []
61
+ for p in paths:
62
+ duration = _audio_duration_sec(p)
63
+ rows.append([
64
+ Path(p).name,
65
+ f"{duration:.2f}" if duration is not None else "?",
66
+ p,
67
+ ])
68
+ return rows
69
+
70
+
71
+ def _records_table(records: List[Dict[str, Any]]) -> List[List[str]]:
72
+ rows: List[List[str]] = []
73
+ for rec in records:
74
+ sidecar = rec.get("sidecar", {})
75
+ analysis = sidecar.get("music_analysis", {})
76
+ rows.append([
77
+ Path(rec.get("audio_path", "")).name,
78
+ f"{sidecar.get('duration', '?')}",
79
+ str(analysis.get("segment_count", "?")),
80
+ str(sidecar.get("bpm", "")),
81
+ str(sidecar.get("keyscale", "")),
82
+ str(sidecar.get("caption", ""))[:160],
83
+ str(rec.get("status", "ok")),
84
+ ])
85
+ return rows
86
+
87
+
88
def _get_captioner(
    backend: str,
    model_id: str,
    endpoint_url: str,
    token: str,
    device: str,
    dtype: str,
):
    """Return a cached captioner, rebuilding when any relevant setting changed."""
    # The token only matters for remote endpoints, so it is excluded from the
    # cache key for local backends (avoids needless model reloads).
    token_key = token if backend == "hf_endpoint" else ""
    cache_key = (backend, model_id, endpoint_url, device, dtype, token_key)
    if _captioner_cache["obj"] is not None and _captioner_cache["key"] == cache_key:
        return _captioner_cache["obj"]

    captioner = build_captioner(
        backend=backend,
        model_id=model_id,
        endpoint_url=endpoint_url,
        token=token,
        device=device,
        torch_dtype=dtype,
    )
    _captioner_cache["obj"] = captioner
    _captioner_cache["key"] = cache_key
    return captioner
111
+
112
+
113
def scan_folder(folder_path: str, current_paths: List[str]):
    """Merge audio files found under *folder_path* into the loaded list."""
    current_paths = current_paths or []
    if not folder_path or not Path(folder_path).is_dir():
        return "Provide a valid folder path.", current_paths, _files_table(current_paths)
    merged = _dedupe_paths(current_paths + list_audio_files(folder_path))
    return f"Loaded {len(merged)} audio files.", merged, _files_table(merged)
119
+
120
+
121
def add_uploaded(uploaded_paths: List[str], current_paths: List[str]):
    """Merge uploaded file paths into the loaded list, de-duplicated."""
    merged = _dedupe_paths((current_paths or []) + (uploaded_paths or []))
    if not merged:
        return "Upload one or more audio files first.", merged, _files_table(merged)
    return f"Loaded {len(merged)} audio files.", merged, _files_table(merged)
128
+
129
+
130
def clear_files():
    """Reset the loaded-file UI: status message, empty path state, empty table."""
    message = "Cleared file list."
    return message, [], []
132
+
133
+
134
def load_existing_sidecars(audio_paths: List[str], records: List[Dict[str, Any]]):
    """Load sidecar JSON files sitting next to each audio path into records."""
    audio_paths = audio_paths or []
    by_path = {r.get("audio_path"): r for r in (records or [])}
    loaded = 0
    for audio_path in audio_paths:
        sidecar_path = Path(audio_path).with_suffix(".json")
        if not sidecar_path.exists():
            continue
        try:
            sidecar = json.loads(sidecar_path.read_text(encoding="utf-8"))
        except Exception:
            # Unparseable sidecars are skipped rather than aborting the load.
            continue
        by_path[audio_path] = {
            "audio_path": audio_path,
            "sidecar": sidecar,
            "status": "loaded-existing",
        }
        loaded += 1

    merged_records = list(by_path.values())
    choices = [r.get("audio_path", "") for r in merged_records]
    return (
        f"Loaded {loaded} existing sidecar(s). Total editable records: {len(merged_records)}.",
        merged_records,
        _records_table(merged_records),
        gr.update(choices=choices, value=choices[0] if choices else None),
    )
162
+
163
+
164
def run_analysis(
    audio_paths: List[str],
    backend: str,
    model_id: str,
    endpoint_url: str,
    token: str,
    device: str,
    dtype: str,
    prompt: str,
    segment_seconds: float,
    overlap_seconds: float,
    max_new_tokens: int,
    temperature: float,
    keep_raw_outputs: bool,
    existing_records: List[Dict[str, Any]],
):
    """Gradio handler: annotate every loaded file and persist sidecars.

    Runs generate_track_annotation per file, writes the sidecar JSON next to
    the source audio immediately, and merges results into the record state.
    Per-file failures are collected (not raised) and reported in the status
    message.

    Returns:
        (status message, merged records, table rows, dropdown update) — the
        outputs wired to this handler in build_ui.
    """
    audio_paths = audio_paths or []
    existing_records = existing_records or []
    if not audio_paths:
        return (
            "No audio files loaded.",
            existing_records,
            _records_table(existing_records),
            gr.update(choices=[], value=None),
        )
    prompt = (prompt or "").strip() or DEFAULT_ANALYSIS_PROMPT

    # Cached per config tuple, so repeated runs reuse the loaded model.
    captioner = _get_captioner(
        backend=backend,
        model_id=model_id or DEFAULT_MODEL_ID,
        endpoint_url=endpoint_url,
        token=token,
        device=device,
        dtype=dtype,
    )

    existing_by_path = {r.get("audio_path"): r for r in existing_records}
    failures: List[str] = []

    for audio_path in audio_paths:
        try:
            sidecar = generate_track_annotation(
                audio_path=audio_path,
                captioner=captioner,
                prompt=prompt,
                segment_seconds=float(segment_seconds),
                overlap_seconds=float(overlap_seconds),
                max_new_tokens=int(max_new_tokens),
                temperature=float(temperature),
                keep_raw_outputs=bool(keep_raw_outputs),
            )
            # Persist immediately so dataset folder stays LoRA-ready.
            Path(audio_path).with_suffix(".json").write_text(
                json.dumps(sidecar, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )
            existing_by_path[audio_path] = {
                "audio_path": audio_path,
                "sidecar": sidecar,
                "status": "analyzed+saved",
            }
        except Exception as exc:
            # Keep any previously loaded record but flag it as failed.
            failures.append(f"{Path(audio_path).name}: {exc}")
            fallback = existing_by_path.get(audio_path, {"audio_path": audio_path, "sidecar": {}})
            fallback["status"] = f"failed: {exc}"
            existing_by_path[audio_path] = fallback

    merged_records = list(existing_by_path.values())
    choices = [r.get("audio_path", "") for r in merged_records]
    message = (
        f"Analyzed {len(audio_paths)} file(s). "
        f"Failures: {len(failures)}."
    )
    # Show at most the first 12 failure details to keep the status box short.
    if failures:
        message += "\n" + "\n".join(failures[:12])
    return (
        message,
        merged_records,
        _records_table(merged_records),
        gr.update(choices=choices, value=choices[0] if choices else None),
    )
245
+
246
+
247
def load_record_json(selected_audio_path: str, records: List[Dict[str, Any]]):
    """Return the selected record's sidecar JSON plus key fields for editing.

    Yields an empty 7-tuple when nothing is selected or no record matches.
    """
    empty = ("{}", "", "", "", "", "", "")
    if not selected_audio_path:
        return empty
    for record in records or []:
        if record.get("audio_path") != selected_audio_path:
            continue
        sidecar = record.get("sidecar", {})
        return (
            json.dumps(sidecar, indent=2, ensure_ascii=False),
            str(sidecar.get("caption", "")),
            str(sidecar.get("lyrics", "")),
            str(sidecar.get("bpm", "")),
            str(sidecar.get("keyscale", "")),
            str(sidecar.get("vocal_language", "")),
            str(sidecar.get("duration", "")),
        )
    return empty
264
+
265
+
266
def save_record_json(
    selected_audio_path: str,
    edited_json: str,
    records: List[Dict[str, Any]],
):
    """Validate edited JSON, update the matching record, and persist the sidecar."""
    records = records or []
    if not selected_audio_path:
        return "Select a track first.", records, _records_table(records)

    try:
        payload = json.loads(edited_json)
    except Exception as exc:
        return f"Invalid JSON: {exc}", records, _records_table(records)
    if not isinstance(payload, dict):
        return "Edited payload must be a JSON object.", records, _records_table(records)

    # Update the existing record in place, or append a new one.
    target = None
    for record in records:
        if record.get("audio_path") == selected_audio_path:
            target = record
            break
    if target is None:
        target = {"audio_path": selected_audio_path}
        records.append(target)
    target["sidecar"] = payload
    target["status"] = "edited+saved"

    # Persist edits next to source audio for LoRA-ready folder layout.
    Path(selected_audio_path).with_suffix(".json").write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    return "Saved edits and wrote sidecar next to source audio.", records, _records_table(records)
297
+
298
+
299
def export_records(
    records: List[Dict[str, Any]],
    output_dir: str,
    copy_audio: bool,
    write_inplace_sidecars: bool,
):
    """Export every valid record via export_annotation_records and report paths."""
    valid = [
        {"audio_path": rec["audio_path"], "sidecar": rec["sidecar"]}
        for rec in (records or [])
        if rec.get("audio_path") and isinstance(rec.get("sidecar"), dict)
    ]
    if not valid:
        return "No valid analyzed/edited records to export."

    destination = (output_dir or "").strip() or DEFAULT_EXPORT_DIR
    result = export_annotation_records(
        records=valid,
        output_dir=destination,
        copy_audio=bool(copy_audio),
        write_inplace_sidecars=bool(write_inplace_sidecars),
    )
    return (
        f"Exported {result['written_count']} sidecar(s).\n"
        f"Manifest: {result['manifest_path']}\n"
        f"Index: {result['index_path']}\n"
        f"Dataset root: {result['dataset_root'] or '(audio copy disabled)'}"
    )
327
+
328
+
329
def build_ui():
    """Build and return the Gradio Blocks app for Qwen2-Audio music captioning.

    The UI is organized as three tabs that mirror the workflow:
      1) Load Audio      — scan a folder and/or upload files into shared state.
      2) Run Captioning  — run Qwen analysis (local or HF endpoint) per track.
      3) Annotate/Export — edit per-track JSON sidecars and export a
                           LoRA-ready dataset.

    Cross-tab data flows through two `gr.State` holders:
      * audio_paths_state — list of loaded audio file paths.
      * records_state     — list of annotation records (audio_path + sidecar).

    Returns:
        The configured `gr.Blocks` application (queued with concurrency 1).
    """
    with gr.Blocks(title="Qwen2-Audio Music Captioning", theme=gr.themes.Soft()) as app:
        gr.Markdown(
            "# Qwen2-Audio Music Captioning + Annotation Export\n"
            "Upload songs, run structured timestamped music analysis, optionally edit annotations, "
            "then export ACE-Step LoRA sidecars."
        )

        # Shared state across tabs: loaded audio paths and analysis records.
        audio_paths_state = gr.State([])
        records_state = gr.State([])

        with gr.Tab("1) Load Audio"):
            with gr.Row():
                folder_input = gr.Textbox(label="Dataset Folder", placeholder="e.g. ./dataset_inbox")
                scan_btn = gr.Button("Scan Folder")
            with gr.Row():
                upload_files = gr.Files(
                    label="Upload Audio Files",
                    file_count="multiple",
                    file_types=["audio"],
                    type="filepath",
                )
                add_upload_btn = gr.Button("Add Uploaded Files")
                clear_btn = gr.Button("Clear")
            files_status = gr.Textbox(label="Load Status", interactive=False)
            files_table = gr.Dataframe(
                headers=["File", "Duration(s)", "Path"],
                datatype=["str", "str", "str"],
                label="Loaded Audio",
                interactive=False,
            )

            # Folder scan and uploads both merge into audio_paths_state and
            # refresh the same status box + table.
            scan_btn.click(
                scan_folder,
                [folder_input, audio_paths_state],
                [files_status, audio_paths_state, files_table],
            )
            add_upload_btn.click(
                add_uploaded,
                [upload_files, audio_paths_state],
                [files_status, audio_paths_state, files_table],
            )
            clear_btn.click(
                clear_files,
                outputs=[files_status, audio_paths_state, files_table],
            )

        with gr.Tab("2) Run Qwen Captioning"):
            # Backend selection: run the model locally or call a deployed
            # HF Inference Endpoint (endpoint_url + hf_token used then).
            with gr.Row():
                backend_dd = gr.Dropdown(
                    choices=["local", "hf_endpoint"],
                    value="local",
                    label="Backend",
                )
                model_id = gr.Textbox(label="Model ID", value=DEFAULT_MODEL_ID)
                endpoint_url = gr.Textbox(label="HF Endpoint URL (for hf_endpoint backend)", value="")
            with gr.Row():
                hf_token = gr.Textbox(label="HF Token (optional)", type="password", value="")
                device_dd = gr.Dropdown(
                    choices=["auto", "cuda", "cpu", "mps"],
                    value="auto",
                    label="Local Device",
                )
                dtype_dd = gr.Dropdown(
                    choices=["auto", "float16", "bfloat16", "float32"],
                    value="auto",
                    label="Torch DType",
                )
            prompt_box = gr.Textbox(
                label="Analysis Prompt",
                lines=6,
                value=DEFAULT_ANALYSIS_PROMPT,
            )
            # Chunking + generation controls for the segment-wise analysis.
            with gr.Row():
                segment_seconds = gr.Slider(10, 120, value=30, step=1, label="Segment Seconds")
                overlap_seconds = gr.Slider(0, 20, value=2, step=1, label="Overlap Seconds")
                max_new_tokens = gr.Slider(64, 2048, value=384, step=32, label="Max New Tokens")
            with gr.Row():
                temperature = gr.Slider(0.0, 1.2, value=0.1, step=0.05, label="Temperature")
                keep_raw = gr.Checkbox(value=True, label="Keep Raw Segment Responses In JSON")
            analyze_btn = gr.Button("Run Captioning", variant="primary")
            with gr.Row():
                load_existing_btn = gr.Button("Load Existing Sidecars")
            analysis_status = gr.Textbox(label="Analysis Status", lines=5, interactive=False)
            gr.Markdown("Sidecars are auto-saved next to each source audio file during analysis.")
            records_table = gr.Dataframe(
                headers=["File", "Duration", "Segments", "BPM", "Key", "Caption", "Status"],
                datatype=["str", "str", "str", "str", "str", "str", "str"],
                interactive=False,
                label="Annotation Records",
            )
            # Populated by run_analysis / load_existing_sidecars; drives the
            # per-track JSON editor in tab 3.
            track_selector = gr.Dropdown(choices=[], label="Select Track For Editing")

            analyze_btn.click(
                run_analysis,
                [
                    audio_paths_state,
                    backend_dd,
                    model_id,
                    endpoint_url,
                    hf_token,
                    device_dd,
                    dtype_dd,
                    prompt_box,
                    segment_seconds,
                    overlap_seconds,
                    max_new_tokens,
                    temperature,
                    keep_raw,
                    records_state,
                ],
                [analysis_status, records_state, records_table, track_selector],
            )
            # Re-hydrate records from sidecar files written in earlier runs.
            load_existing_btn.click(
                load_existing_sidecars,
                [audio_paths_state, records_state],
                [analysis_status, records_state, records_table, track_selector],
            )

        with gr.Tab("3) Human Annotation + Export"):
            with gr.Row():
                load_record_btn = gr.Button("Load Selected JSON")
                save_record_btn = gr.Button("Save JSON Edits")
            json_editor = gr.Textbox(label="Editable Annotation JSON", lines=24)
            # Read-only previews of the key sidecar fields for the loaded track.
            with gr.Row():
                caption_preview = gr.Textbox(label="Caption", interactive=False)
                bpm_preview = gr.Textbox(label="BPM", interactive=False)
                key_preview = gr.Textbox(label="Key/Scale", interactive=False)
            with gr.Row():
                lang_preview = gr.Textbox(label="Vocal Language", interactive=False)
                duration_preview = gr.Textbox(label="Duration", interactive=False)
                lyrics_preview = gr.Textbox(label="Lyrics", interactive=False)
            edit_status = gr.Textbox(label="Edit Status", interactive=False)
            gr.Markdown("Saving JSON edits also writes the sidecar next to the source audio file.")

            load_record_btn.click(
                load_record_json,
                [track_selector, records_state],
                [
                    json_editor,
                    caption_preview,
                    lyrics_preview,
                    bpm_preview,
                    key_preview,
                    lang_preview,
                    duration_preview,
                ],
            )
            save_record_btn.click(
                save_record_json,
                [track_selector, json_editor, records_state],
                [edit_status, records_state, records_table],
            )

            gr.Markdown("### Export LoRA-Ready Dataset")
            with gr.Row():
                export_dir = gr.Textbox(label="Export Directory", value=DEFAULT_EXPORT_DIR)
                copy_audio_cb = gr.Checkbox(value=True, label="Copy Audio Into Export Dataset")
                inplace_cb = gr.Checkbox(value=True, label="Also Write Sidecars Next To Source Audio")
            export_btn = gr.Button("Export", variant="primary")
            export_status = gr.Textbox(label="Export Status", lines=5, interactive=False)

            export_btn.click(
                export_records,
                [records_state, export_dir, copy_audio_cb, inplace_cb],
                export_status,
            )

    # Serialize jobs: model inference is heavyweight, so only one request at a time.
    app.queue(default_concurrency_limit=1)
    return app
499
+
500
+
501
# Module-level app instance so hosting runtimes (e.g. HF Spaces) can import it.
app = build_ui()


def _launch() -> None:
    """Start the Gradio server, honoring the PORT environment variable."""
    server_port = int(os.environ.get("PORT", "7860"))
    app.launch(server_name="0.0.0.0", server_port=server_port, share=False)


if __name__ == "__main__":
    _launch()
react-ui/index.html ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!doctype html>
<!-- Vite entry page for the AF3 + ChatGPT pipeline React UI.
     The React app is mounted into #root by src/main.jsx. -->
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>AF3 + ChatGPT Pipeline</title>
  </head>
  <body>
    <!-- React mount point -->
    <div id="root"></div>
    <!-- Vite resolves this module and injects the bundled script in production builds. -->
    <script type="module" src="/src/main.jsx"></script>
  </body>
</html>
react-ui/package-lock.json ADDED
@@ -0,0 +1,1674 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "af3-chatgpt-pipeline-ui",
3
+ "version": "0.1.0",
4
+ "lockfileVersion": 3,
5
+ "requires": true,
6
+ "packages": {
7
+ "": {
8
+ "name": "af3-chatgpt-pipeline-ui",
9
+ "version": "0.1.0",
10
+ "dependencies": {
11
+ "react": "^18.3.1",
12
+ "react-dom": "^18.3.1"
13
+ },
14
+ "devDependencies": {
15
+ "@vitejs/plugin-react": "^4.3.4",
16
+ "vite": "^5.4.11"
17
+ }
18
+ },
19
+ "node_modules/@babel/code-frame": {
20
+ "version": "7.29.0",
21
+ "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.29.0.tgz",
22
+ "integrity": "sha512-9NhCeYjq9+3uxgdtp20LSiJXJvN0FeCtNGpJxuMFZ1Kv3cWUNb6DOhJwUvcVCzKGR66cw4njwM6hrJLqgOwbcw==",
23
+ "dev": true,
24
+ "license": "MIT",
25
+ "dependencies": {
26
+ "@babel/helper-validator-identifier": "^7.28.5",
27
+ "js-tokens": "^4.0.0",
28
+ "picocolors": "^1.1.1"
29
+ },
30
+ "engines": {
31
+ "node": ">=6.9.0"
32
+ }
33
+ },
34
+ "node_modules/@babel/compat-data": {
35
+ "version": "7.29.0",
36
+ "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.29.0.tgz",
37
+ "integrity": "sha512-T1NCJqT/j9+cn8fvkt7jtwbLBfLC/1y1c7NtCeXFRgzGTsafi68MRv8yzkYSapBnFA6L3U2VSc02ciDzoAJhJg==",
38
+ "dev": true,
39
+ "license": "MIT",
40
+ "engines": {
41
+ "node": ">=6.9.0"
42
+ }
43
+ },
44
+ "node_modules/@babel/core": {
45
+ "version": "7.29.0",
46
+ "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.29.0.tgz",
47
+ "integrity": "sha512-CGOfOJqWjg2qW/Mb6zNsDm+u5vFQ8DxXfbM09z69p5Z6+mE1ikP2jUXw+j42Pf1XTYED2Rni5f95npYeuwMDQA==",
48
+ "dev": true,
49
+ "license": "MIT",
50
+ "dependencies": {
51
+ "@babel/code-frame": "^7.29.0",
52
+ "@babel/generator": "^7.29.0",
53
+ "@babel/helper-compilation-targets": "^7.28.6",
54
+ "@babel/helper-module-transforms": "^7.28.6",
55
+ "@babel/helpers": "^7.28.6",
56
+ "@babel/parser": "^7.29.0",
57
+ "@babel/template": "^7.28.6",
58
+ "@babel/traverse": "^7.29.0",
59
+ "@babel/types": "^7.29.0",
60
+ "@jridgewell/remapping": "^2.3.5",
61
+ "convert-source-map": "^2.0.0",
62
+ "debug": "^4.1.0",
63
+ "gensync": "^1.0.0-beta.2",
64
+ "json5": "^2.2.3",
65
+ "semver": "^6.3.1"
66
+ },
67
+ "engines": {
68
+ "node": ">=6.9.0"
69
+ },
70
+ "funding": {
71
+ "type": "opencollective",
72
+ "url": "https://opencollective.com/babel"
73
+ }
74
+ },
75
+ "node_modules/@babel/generator": {
76
+ "version": "7.29.1",
77
+ "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.29.1.tgz",
78
+ "integrity": "sha512-qsaF+9Qcm2Qv8SRIMMscAvG4O3lJ0F1GuMo5HR/Bp02LopNgnZBC/EkbevHFeGs4ls/oPz9v+Bsmzbkbe+0dUw==",
79
+ "dev": true,
80
+ "license": "MIT",
81
+ "dependencies": {
82
+ "@babel/parser": "^7.29.0",
83
+ "@babel/types": "^7.29.0",
84
+ "@jridgewell/gen-mapping": "^0.3.12",
85
+ "@jridgewell/trace-mapping": "^0.3.28",
86
+ "jsesc": "^3.0.2"
87
+ },
88
+ "engines": {
89
+ "node": ">=6.9.0"
90
+ }
91
+ },
92
+ "node_modules/@babel/helper-compilation-targets": {
93
+ "version": "7.28.6",
94
+ "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.28.6.tgz",
95
+ "integrity": "sha512-JYtls3hqi15fcx5GaSNL7SCTJ2MNmjrkHXg4FSpOA/grxK8KwyZ5bubHsCq8FXCkua6xhuaaBit+3b7+VZRfcA==",
96
+ "dev": true,
97
+ "license": "MIT",
98
+ "dependencies": {
99
+ "@babel/compat-data": "^7.28.6",
100
+ "@babel/helper-validator-option": "^7.27.1",
101
+ "browserslist": "^4.24.0",
102
+ "lru-cache": "^5.1.1",
103
+ "semver": "^6.3.1"
104
+ },
105
+ "engines": {
106
+ "node": ">=6.9.0"
107
+ }
108
+ },
109
+ "node_modules/@babel/helper-globals": {
110
+ "version": "7.28.0",
111
+ "resolved": "https://registry.npmjs.org/@babel/helper-globals/-/helper-globals-7.28.0.tgz",
112
+ "integrity": "sha512-+W6cISkXFa1jXsDEdYA8HeevQT/FULhxzR99pxphltZcVaugps53THCeiWA8SguxxpSp3gKPiuYfSWopkLQ4hw==",
113
+ "dev": true,
114
+ "license": "MIT",
115
+ "engines": {
116
+ "node": ">=6.9.0"
117
+ }
118
+ },
119
+ "node_modules/@babel/helper-module-imports": {
120
+ "version": "7.28.6",
121
+ "resolved": "https://registry.npmjs.org/@babel/helper-module-imports/-/helper-module-imports-7.28.6.tgz",
122
+ "integrity": "sha512-l5XkZK7r7wa9LucGw9LwZyyCUscb4x37JWTPz7swwFE/0FMQAGpiWUZn8u9DzkSBWEcK25jmvubfpw2dnAMdbw==",
123
+ "dev": true,
124
+ "license": "MIT",
125
+ "dependencies": {
126
+ "@babel/traverse": "^7.28.6",
127
+ "@babel/types": "^7.28.6"
128
+ },
129
+ "engines": {
130
+ "node": ">=6.9.0"
131
+ }
132
+ },
133
+ "node_modules/@babel/helper-module-transforms": {
134
+ "version": "7.28.6",
135
+ "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.28.6.tgz",
136
+ "integrity": "sha512-67oXFAYr2cDLDVGLXTEABjdBJZ6drElUSI7WKp70NrpyISso3plG9SAGEF6y7zbha/wOzUByWWTJvEDVNIUGcA==",
137
+ "dev": true,
138
+ "license": "MIT",
139
+ "dependencies": {
140
+ "@babel/helper-module-imports": "^7.28.6",
141
+ "@babel/helper-validator-identifier": "^7.28.5",
142
+ "@babel/traverse": "^7.28.6"
143
+ },
144
+ "engines": {
145
+ "node": ">=6.9.0"
146
+ },
147
+ "peerDependencies": {
148
+ "@babel/core": "^7.0.0"
149
+ }
150
+ },
151
+ "node_modules/@babel/helper-plugin-utils": {
152
+ "version": "7.28.6",
153
+ "resolved": "https://registry.npmjs.org/@babel/helper-plugin-utils/-/helper-plugin-utils-7.28.6.tgz",
154
+ "integrity": "sha512-S9gzZ/bz83GRysI7gAD4wPT/AI3uCnY+9xn+Mx/KPs2JwHJIz1W8PZkg2cqyt3RNOBM8ejcXhV6y8Og7ly/Dug==",
155
+ "dev": true,
156
+ "license": "MIT",
157
+ "engines": {
158
+ "node": ">=6.9.0"
159
+ }
160
+ },
161
+ "node_modules/@babel/helper-string-parser": {
162
+ "version": "7.27.1",
163
+ "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.27.1.tgz",
164
+ "integrity": "sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==",
165
+ "dev": true,
166
+ "license": "MIT",
167
+ "engines": {
168
+ "node": ">=6.9.0"
169
+ }
170
+ },
171
+ "node_modules/@babel/helper-validator-identifier": {
172
+ "version": "7.28.5",
173
+ "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.28.5.tgz",
174
+ "integrity": "sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==",
175
+ "dev": true,
176
+ "license": "MIT",
177
+ "engines": {
178
+ "node": ">=6.9.0"
179
+ }
180
+ },
181
+ "node_modules/@babel/helper-validator-option": {
182
+ "version": "7.27.1",
183
+ "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.27.1.tgz",
184
+ "integrity": "sha512-YvjJow9FxbhFFKDSuFnVCe2WxXk1zWc22fFePVNEaWJEu8IrZVlda6N0uHwzZrUM1il7NC9Mlp4MaJYbYd9JSg==",
185
+ "dev": true,
186
+ "license": "MIT",
187
+ "engines": {
188
+ "node": ">=6.9.0"
189
+ }
190
+ },
191
+ "node_modules/@babel/helpers": {
192
+ "version": "7.28.6",
193
+ "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.28.6.tgz",
194
+ "integrity": "sha512-xOBvwq86HHdB7WUDTfKfT/Vuxh7gElQ+Sfti2Cy6yIWNW05P8iUslOVcZ4/sKbE+/jQaukQAdz/gf3724kYdqw==",
195
+ "dev": true,
196
+ "license": "MIT",
197
+ "dependencies": {
198
+ "@babel/template": "^7.28.6",
199
+ "@babel/types": "^7.28.6"
200
+ },
201
+ "engines": {
202
+ "node": ">=6.9.0"
203
+ }
204
+ },
205
+ "node_modules/@babel/parser": {
206
+ "version": "7.29.0",
207
+ "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.29.0.tgz",
208
+ "integrity": "sha512-IyDgFV5GeDUVX4YdF/3CPULtVGSXXMLh1xVIgdCgxApktqnQV0r7/8Nqthg+8YLGaAtdyIlo2qIdZrbCv4+7ww==",
209
+ "dev": true,
210
+ "license": "MIT",
211
+ "dependencies": {
212
+ "@babel/types": "^7.29.0"
213
+ },
214
+ "bin": {
215
+ "parser": "bin/babel-parser.js"
216
+ },
217
+ "engines": {
218
+ "node": ">=6.0.0"
219
+ }
220
+ },
221
+ "node_modules/@babel/plugin-transform-react-jsx-self": {
222
+ "version": "7.27.1",
223
+ "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-self/-/plugin-transform-react-jsx-self-7.27.1.tgz",
224
+ "integrity": "sha512-6UzkCs+ejGdZ5mFFC/OCUrv028ab2fp1znZmCZjAOBKiBK2jXD1O+BPSfX8X2qjJ75fZBMSnQn3Rq2mrBJK2mw==",
225
+ "dev": true,
226
+ "license": "MIT",
227
+ "dependencies": {
228
+ "@babel/helper-plugin-utils": "^7.27.1"
229
+ },
230
+ "engines": {
231
+ "node": ">=6.9.0"
232
+ },
233
+ "peerDependencies": {
234
+ "@babel/core": "^7.0.0-0"
235
+ }
236
+ },
237
+ "node_modules/@babel/plugin-transform-react-jsx-source": {
238
+ "version": "7.27.1",
239
+ "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-source/-/plugin-transform-react-jsx-source-7.27.1.tgz",
240
+ "integrity": "sha512-zbwoTsBruTeKB9hSq73ha66iFeJHuaFkUbwvqElnygoNbj/jHRsSeokowZFN3CZ64IvEqcmmkVe89OPXc7ldAw==",
241
+ "dev": true,
242
+ "license": "MIT",
243
+ "dependencies": {
244
+ "@babel/helper-plugin-utils": "^7.27.1"
245
+ },
246
+ "engines": {
247
+ "node": ">=6.9.0"
248
+ },
249
+ "peerDependencies": {
250
+ "@babel/core": "^7.0.0-0"
251
+ }
252
+ },
253
+ "node_modules/@babel/template": {
254
+ "version": "7.28.6",
255
+ "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.28.6.tgz",
256
+ "integrity": "sha512-YA6Ma2KsCdGb+WC6UpBVFJGXL58MDA6oyONbjyF/+5sBgxY/dwkhLogbMT2GXXyU84/IhRw/2D1Os1B/giz+BQ==",
257
+ "dev": true,
258
+ "license": "MIT",
259
+ "dependencies": {
260
+ "@babel/code-frame": "^7.28.6",
261
+ "@babel/parser": "^7.28.6",
262
+ "@babel/types": "^7.28.6"
263
+ },
264
+ "engines": {
265
+ "node": ">=6.9.0"
266
+ }
267
+ },
268
+ "node_modules/@babel/traverse": {
269
+ "version": "7.29.0",
270
+ "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.29.0.tgz",
271
+ "integrity": "sha512-4HPiQr0X7+waHfyXPZpWPfWL/J7dcN1mx9gL6WdQVMbPnF3+ZhSMs8tCxN7oHddJE9fhNE7+lxdnlyemKfJRuA==",
272
+ "dev": true,
273
+ "license": "MIT",
274
+ "dependencies": {
275
+ "@babel/code-frame": "^7.29.0",
276
+ "@babel/generator": "^7.29.0",
277
+ "@babel/helper-globals": "^7.28.0",
278
+ "@babel/parser": "^7.29.0",
279
+ "@babel/template": "^7.28.6",
280
+ "@babel/types": "^7.29.0",
281
+ "debug": "^4.3.1"
282
+ },
283
+ "engines": {
284
+ "node": ">=6.9.0"
285
+ }
286
+ },
287
+ "node_modules/@babel/types": {
288
+ "version": "7.29.0",
289
+ "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.29.0.tgz",
290
+ "integrity": "sha512-LwdZHpScM4Qz8Xw2iKSzS+cfglZzJGvofQICy7W7v4caru4EaAmyUuO6BGrbyQ2mYV11W0U8j5mBhd14dd3B0A==",
291
+ "dev": true,
292
+ "license": "MIT",
293
+ "dependencies": {
294
+ "@babel/helper-string-parser": "^7.27.1",
295
+ "@babel/helper-validator-identifier": "^7.28.5"
296
+ },
297
+ "engines": {
298
+ "node": ">=6.9.0"
299
+ }
300
+ },
301
+ "node_modules/@esbuild/aix-ppc64": {
302
+ "version": "0.21.5",
303
+ "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz",
304
+ "integrity": "sha512-1SDgH6ZSPTlggy1yI6+Dbkiz8xzpHJEVAlF/AM1tHPLsf5STom9rwtjE4hKAF20FfXXNTFqEYXyJNWh1GiZedQ==",
305
+ "cpu": [
306
+ "ppc64"
307
+ ],
308
+ "dev": true,
309
+ "license": "MIT",
310
+ "optional": true,
311
+ "os": [
312
+ "aix"
313
+ ],
314
+ "engines": {
315
+ "node": ">=12"
316
+ }
317
+ },
318
+ "node_modules/@esbuild/android-arm": {
319
+ "version": "0.21.5",
320
+ "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.21.5.tgz",
321
+ "integrity": "sha512-vCPvzSjpPHEi1siZdlvAlsPxXl7WbOVUBBAowWug4rJHb68Ox8KualB+1ocNvT5fjv6wpkX6o/iEpbDrf68zcg==",
322
+ "cpu": [
323
+ "arm"
324
+ ],
325
+ "dev": true,
326
+ "license": "MIT",
327
+ "optional": true,
328
+ "os": [
329
+ "android"
330
+ ],
331
+ "engines": {
332
+ "node": ">=12"
333
+ }
334
+ },
335
+ "node_modules/@esbuild/android-arm64": {
336
+ "version": "0.21.5",
337
+ "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.21.5.tgz",
338
+ "integrity": "sha512-c0uX9VAUBQ7dTDCjq+wdyGLowMdtR/GoC2U5IYk/7D1H1JYC0qseD7+11iMP2mRLN9RcCMRcjC4YMclCzGwS/A==",
339
+ "cpu": [
340
+ "arm64"
341
+ ],
342
+ "dev": true,
343
+ "license": "MIT",
344
+ "optional": true,
345
+ "os": [
346
+ "android"
347
+ ],
348
+ "engines": {
349
+ "node": ">=12"
350
+ }
351
+ },
352
+ "node_modules/@esbuild/android-x64": {
353
+ "version": "0.21.5",
354
+ "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.21.5.tgz",
355
+ "integrity": "sha512-D7aPRUUNHRBwHxzxRvp856rjUHRFW1SdQATKXH2hqA0kAZb1hKmi02OpYRacl0TxIGz/ZmXWlbZgjwWYaCakTA==",
356
+ "cpu": [
357
+ "x64"
358
+ ],
359
+ "dev": true,
360
+ "license": "MIT",
361
+ "optional": true,
362
+ "os": [
363
+ "android"
364
+ ],
365
+ "engines": {
366
+ "node": ">=12"
367
+ }
368
+ },
369
+ "node_modules/@esbuild/darwin-arm64": {
370
+ "version": "0.21.5",
371
+ "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.21.5.tgz",
372
+ "integrity": "sha512-DwqXqZyuk5AiWWf3UfLiRDJ5EDd49zg6O9wclZ7kUMv2WRFr4HKjXp/5t8JZ11QbQfUS6/cRCKGwYhtNAY88kQ==",
373
+ "cpu": [
374
+ "arm64"
375
+ ],
376
+ "dev": true,
377
+ "license": "MIT",
378
+ "optional": true,
379
+ "os": [
380
+ "darwin"
381
+ ],
382
+ "engines": {
383
+ "node": ">=12"
384
+ }
385
+ },
386
+ "node_modules/@esbuild/darwin-x64": {
387
+ "version": "0.21.5",
388
+ "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.21.5.tgz",
389
+ "integrity": "sha512-se/JjF8NlmKVG4kNIuyWMV/22ZaerB+qaSi5MdrXtd6R08kvs2qCN4C09miupktDitvh8jRFflwGFBQcxZRjbw==",
390
+ "cpu": [
391
+ "x64"
392
+ ],
393
+ "dev": true,
394
+ "license": "MIT",
395
+ "optional": true,
396
+ "os": [
397
+ "darwin"
398
+ ],
399
+ "engines": {
400
+ "node": ">=12"
401
+ }
402
+ },
403
+ "node_modules/@esbuild/freebsd-arm64": {
404
+ "version": "0.21.5",
405
+ "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.21.5.tgz",
406
+ "integrity": "sha512-5JcRxxRDUJLX8JXp/wcBCy3pENnCgBR9bN6JsY4OmhfUtIHe3ZW0mawA7+RDAcMLrMIZaf03NlQiX9DGyB8h4g==",
407
+ "cpu": [
408
+ "arm64"
409
+ ],
410
+ "dev": true,
411
+ "license": "MIT",
412
+ "optional": true,
413
+ "os": [
414
+ "freebsd"
415
+ ],
416
+ "engines": {
417
+ "node": ">=12"
418
+ }
419
+ },
420
+ "node_modules/@esbuild/freebsd-x64": {
421
+ "version": "0.21.5",
422
+ "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.21.5.tgz",
423
+ "integrity": "sha512-J95kNBj1zkbMXtHVH29bBriQygMXqoVQOQYA+ISs0/2l3T9/kj42ow2mpqerRBxDJnmkUDCaQT/dfNXWX/ZZCQ==",
424
+ "cpu": [
425
+ "x64"
426
+ ],
427
+ "dev": true,
428
+ "license": "MIT",
429
+ "optional": true,
430
+ "os": [
431
+ "freebsd"
432
+ ],
433
+ "engines": {
434
+ "node": ">=12"
435
+ }
436
+ },
437
+ "node_modules/@esbuild/linux-arm": {
438
+ "version": "0.21.5",
439
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.21.5.tgz",
440
+ "integrity": "sha512-bPb5AHZtbeNGjCKVZ9UGqGwo8EUu4cLq68E95A53KlxAPRmUyYv2D6F0uUI65XisGOL1hBP5mTronbgo+0bFcA==",
441
+ "cpu": [
442
+ "arm"
443
+ ],
444
+ "dev": true,
445
+ "license": "MIT",
446
+ "optional": true,
447
+ "os": [
448
+ "linux"
449
+ ],
450
+ "engines": {
451
+ "node": ">=12"
452
+ }
453
+ },
454
+ "node_modules/@esbuild/linux-arm64": {
455
+ "version": "0.21.5",
456
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.21.5.tgz",
457
+ "integrity": "sha512-ibKvmyYzKsBeX8d8I7MH/TMfWDXBF3db4qM6sy+7re0YXya+K1cem3on9XgdT2EQGMu4hQyZhan7TeQ8XkGp4Q==",
458
+ "cpu": [
459
+ "arm64"
460
+ ],
461
+ "dev": true,
462
+ "license": "MIT",
463
+ "optional": true,
464
+ "os": [
465
+ "linux"
466
+ ],
467
+ "engines": {
468
+ "node": ">=12"
469
+ }
470
+ },
471
+ "node_modules/@esbuild/linux-ia32": {
472
+ "version": "0.21.5",
473
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.21.5.tgz",
474
+ "integrity": "sha512-YvjXDqLRqPDl2dvRODYmmhz4rPeVKYvppfGYKSNGdyZkA01046pLWyRKKI3ax8fbJoK5QbxblURkwK/MWY18Tg==",
475
+ "cpu": [
476
+ "ia32"
477
+ ],
478
+ "dev": true,
479
+ "license": "MIT",
480
+ "optional": true,
481
+ "os": [
482
+ "linux"
483
+ ],
484
+ "engines": {
485
+ "node": ">=12"
486
+ }
487
+ },
488
+ "node_modules/@esbuild/linux-loong64": {
489
+ "version": "0.21.5",
490
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.21.5.tgz",
491
+ "integrity": "sha512-uHf1BmMG8qEvzdrzAqg2SIG/02+4/DHB6a9Kbya0XDvwDEKCoC8ZRWI5JJvNdUjtciBGFQ5PuBlpEOXQj+JQSg==",
492
+ "cpu": [
493
+ "loong64"
494
+ ],
495
+ "dev": true,
496
+ "license": "MIT",
497
+ "optional": true,
498
+ "os": [
499
+ "linux"
500
+ ],
501
+ "engines": {
502
+ "node": ">=12"
503
+ }
504
+ },
505
+ "node_modules/@esbuild/linux-mips64el": {
506
+ "version": "0.21.5",
507
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.21.5.tgz",
508
+ "integrity": "sha512-IajOmO+KJK23bj52dFSNCMsz1QP1DqM6cwLUv3W1QwyxkyIWecfafnI555fvSGqEKwjMXVLokcV5ygHW5b3Jbg==",
509
+ "cpu": [
510
+ "mips64el"
511
+ ],
512
+ "dev": true,
513
+ "license": "MIT",
514
+ "optional": true,
515
+ "os": [
516
+ "linux"
517
+ ],
518
+ "engines": {
519
+ "node": ">=12"
520
+ }
521
+ },
522
+ "node_modules/@esbuild/linux-ppc64": {
523
+ "version": "0.21.5",
524
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.21.5.tgz",
525
+ "integrity": "sha512-1hHV/Z4OEfMwpLO8rp7CvlhBDnjsC3CttJXIhBi+5Aj5r+MBvy4egg7wCbe//hSsT+RvDAG7s81tAvpL2XAE4w==",
526
+ "cpu": [
527
+ "ppc64"
528
+ ],
529
+ "dev": true,
530
+ "license": "MIT",
531
+ "optional": true,
532
+ "os": [
533
+ "linux"
534
+ ],
535
+ "engines": {
536
+ "node": ">=12"
537
+ }
538
+ },
539
+ "node_modules/@esbuild/linux-riscv64": {
540
+ "version": "0.21.5",
541
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.21.5.tgz",
542
+ "integrity": "sha512-2HdXDMd9GMgTGrPWnJzP2ALSokE/0O5HhTUvWIbD3YdjME8JwvSCnNGBnTThKGEB91OZhzrJ4qIIxk/SBmyDDA==",
543
+ "cpu": [
544
+ "riscv64"
545
+ ],
546
+ "dev": true,
547
+ "license": "MIT",
548
+ "optional": true,
549
+ "os": [
550
+ "linux"
551
+ ],
552
+ "engines": {
553
+ "node": ">=12"
554
+ }
555
+ },
556
+ "node_modules/@esbuild/linux-s390x": {
557
+ "version": "0.21.5",
558
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.21.5.tgz",
559
+ "integrity": "sha512-zus5sxzqBJD3eXxwvjN1yQkRepANgxE9lgOW2qLnmr8ikMTphkjgXu1HR01K4FJg8h1kEEDAqDcZQtbrRnB41A==",
560
+ "cpu": [
561
+ "s390x"
562
+ ],
563
+ "dev": true,
564
+ "license": "MIT",
565
+ "optional": true,
566
+ "os": [
567
+ "linux"
568
+ ],
569
+ "engines": {
570
+ "node": ">=12"
571
+ }
572
+ },
573
+ "node_modules/@esbuild/linux-x64": {
574
+ "version": "0.21.5",
575
+ "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.21.5.tgz",
576
+ "integrity": "sha512-1rYdTpyv03iycF1+BhzrzQJCdOuAOtaqHTWJZCWvijKD2N5Xu0TtVC8/+1faWqcP9iBCWOmjmhoH94dH82BxPQ==",
577
+ "cpu": [
578
+ "x64"
579
+ ],
580
+ "dev": true,
581
+ "license": "MIT",
582
+ "optional": true,
583
+ "os": [
584
+ "linux"
585
+ ],
586
+ "engines": {
587
+ "node": ">=12"
588
+ }
589
+ },
590
+ "node_modules/@esbuild/netbsd-x64": {
591
+ "version": "0.21.5",
592
+ "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.21.5.tgz",
593
+ "integrity": "sha512-Woi2MXzXjMULccIwMnLciyZH4nCIMpWQAs049KEeMvOcNADVxo0UBIQPfSmxB3CWKedngg7sWZdLvLczpe0tLg==",
594
+ "cpu": [
595
+ "x64"
596
+ ],
597
+ "dev": true,
598
+ "license": "MIT",
599
+ "optional": true,
600
+ "os": [
601
+ "netbsd"
602
+ ],
603
+ "engines": {
604
+ "node": ">=12"
605
+ }
606
+ },
607
+ "node_modules/@esbuild/openbsd-x64": {
608
+ "version": "0.21.5",
609
+ "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.21.5.tgz",
610
+ "integrity": "sha512-HLNNw99xsvx12lFBUwoT8EVCsSvRNDVxNpjZ7bPn947b8gJPzeHWyNVhFsaerc0n3TsbOINvRP2byTZ5LKezow==",
611
+ "cpu": [
612
+ "x64"
613
+ ],
614
+ "dev": true,
615
+ "license": "MIT",
616
+ "optional": true,
617
+ "os": [
618
+ "openbsd"
619
+ ],
620
+ "engines": {
621
+ "node": ">=12"
622
+ }
623
+ },
624
+ "node_modules/@esbuild/sunos-x64": {
625
+ "version": "0.21.5",
626
+ "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.21.5.tgz",
627
+ "integrity": "sha512-6+gjmFpfy0BHU5Tpptkuh8+uw3mnrvgs+dSPQXQOv3ekbordwnzTVEb4qnIvQcYXq6gzkyTnoZ9dZG+D4garKg==",
628
+ "cpu": [
629
+ "x64"
630
+ ],
631
+ "dev": true,
632
+ "license": "MIT",
633
+ "optional": true,
634
+ "os": [
635
+ "sunos"
636
+ ],
637
+ "engines": {
638
+ "node": ">=12"
639
+ }
640
+ },
641
+ "node_modules/@esbuild/win32-arm64": {
642
+ "version": "0.21.5",
643
+ "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.21.5.tgz",
644
+ "integrity": "sha512-Z0gOTd75VvXqyq7nsl93zwahcTROgqvuAcYDUr+vOv8uHhNSKROyU961kgtCD1e95IqPKSQKH7tBTslnS3tA8A==",
645
+ "cpu": [
646
+ "arm64"
647
+ ],
648
+ "dev": true,
649
+ "license": "MIT",
650
+ "optional": true,
651
+ "os": [
652
+ "win32"
653
+ ],
654
+ "engines": {
655
+ "node": ">=12"
656
+ }
657
+ },
658
+ "node_modules/@esbuild/win32-ia32": {
659
+ "version": "0.21.5",
660
+ "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.21.5.tgz",
661
+ "integrity": "sha512-SWXFF1CL2RVNMaVs+BBClwtfZSvDgtL//G/smwAc5oVK/UPu2Gu9tIaRgFmYFFKrmg3SyAjSrElf0TiJ1v8fYA==",
662
+ "cpu": [
663
+ "ia32"
664
+ ],
665
+ "dev": true,
666
+ "license": "MIT",
667
+ "optional": true,
668
+ "os": [
669
+ "win32"
670
+ ],
671
+ "engines": {
672
+ "node": ">=12"
673
+ }
674
+ },
675
+ "node_modules/@esbuild/win32-x64": {
676
+ "version": "0.21.5",
677
+ "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.21.5.tgz",
678
+ "integrity": "sha512-tQd/1efJuzPC6rCFwEvLtci/xNFcTZknmXs98FYDfGE4wP9ClFV98nyKrzJKVPMhdDnjzLhdUyMX4PsQAPjwIw==",
679
+ "cpu": [
680
+ "x64"
681
+ ],
682
+ "dev": true,
683
+ "license": "MIT",
684
+ "optional": true,
685
+ "os": [
686
+ "win32"
687
+ ],
688
+ "engines": {
689
+ "node": ">=12"
690
+ }
691
+ },
692
+ "node_modules/@jridgewell/gen-mapping": {
693
+ "version": "0.3.13",
694
+ "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.13.tgz",
695
+ "integrity": "sha512-2kkt/7niJ6MgEPxF0bYdQ6etZaA+fQvDcLKckhy1yIQOzaoKjBBjSj63/aLVjYE3qhRt5dvM+uUyfCg6UKCBbA==",
696
+ "dev": true,
697
+ "license": "MIT",
698
+ "dependencies": {
699
+ "@jridgewell/sourcemap-codec": "^1.5.0",
700
+ "@jridgewell/trace-mapping": "^0.3.24"
701
+ }
702
+ },
703
+ "node_modules/@jridgewell/remapping": {
704
+ "version": "2.3.5",
705
+ "resolved": "https://registry.npmjs.org/@jridgewell/remapping/-/remapping-2.3.5.tgz",
706
+ "integrity": "sha512-LI9u/+laYG4Ds1TDKSJW2YPrIlcVYOwi2fUC6xB43lueCjgxV4lffOCZCtYFiH6TNOX+tQKXx97T4IKHbhyHEQ==",
707
+ "dev": true,
708
+ "license": "MIT",
709
+ "dependencies": {
710
+ "@jridgewell/gen-mapping": "^0.3.5",
711
+ "@jridgewell/trace-mapping": "^0.3.24"
712
+ }
713
+ },
714
+ "node_modules/@jridgewell/resolve-uri": {
715
+ "version": "3.1.2",
716
+ "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz",
717
+ "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==",
718
+ "dev": true,
719
+ "license": "MIT",
720
+ "engines": {
721
+ "node": ">=6.0.0"
722
+ }
723
+ },
724
+ "node_modules/@jridgewell/sourcemap-codec": {
725
+ "version": "1.5.5",
726
+ "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz",
727
+ "integrity": "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==",
728
+ "dev": true,
729
+ "license": "MIT"
730
+ },
731
+ "node_modules/@jridgewell/trace-mapping": {
732
+ "version": "0.3.31",
733
+ "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.31.tgz",
734
+ "integrity": "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==",
735
+ "dev": true,
736
+ "license": "MIT",
737
+ "dependencies": {
738
+ "@jridgewell/resolve-uri": "^3.1.0",
739
+ "@jridgewell/sourcemap-codec": "^1.4.14"
740
+ }
741
+ },
742
+ "node_modules/@rolldown/pluginutils": {
743
+ "version": "1.0.0-beta.27",
744
+ "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.27.tgz",
745
+ "integrity": "sha512-+d0F4MKMCbeVUJwG96uQ4SgAznZNSq93I3V+9NHA4OpvqG8mRCpGdKmK8l/dl02h2CCDHwW2FqilnTyDcAnqjA==",
746
+ "dev": true,
747
+ "license": "MIT"
748
+ },
749
+ "node_modules/@rollup/rollup-android-arm-eabi": {
750
+ "version": "4.57.1",
751
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.57.1.tgz",
752
+ "integrity": "sha512-A6ehUVSiSaaliTxai040ZpZ2zTevHYbvu/lDoeAteHI8QnaosIzm4qwtezfRg1jOYaUmnzLX1AOD6Z+UJjtifg==",
753
+ "cpu": [
754
+ "arm"
755
+ ],
756
+ "dev": true,
757
+ "license": "MIT",
758
+ "optional": true,
759
+ "os": [
760
+ "android"
761
+ ]
762
+ },
763
+ "node_modules/@rollup/rollup-android-arm64": {
764
+ "version": "4.57.1",
765
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.57.1.tgz",
766
+ "integrity": "sha512-dQaAddCY9YgkFHZcFNS/606Exo8vcLHwArFZ7vxXq4rigo2bb494/xKMMwRRQW6ug7Js6yXmBZhSBRuBvCCQ3w==",
767
+ "cpu": [
768
+ "arm64"
769
+ ],
770
+ "dev": true,
771
+ "license": "MIT",
772
+ "optional": true,
773
+ "os": [
774
+ "android"
775
+ ]
776
+ },
777
+ "node_modules/@rollup/rollup-darwin-arm64": {
778
+ "version": "4.57.1",
779
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.57.1.tgz",
780
+ "integrity": "sha512-crNPrwJOrRxagUYeMn/DZwqN88SDmwaJ8Cvi/TN1HnWBU7GwknckyosC2gd0IqYRsHDEnXf328o9/HC6OkPgOg==",
781
+ "cpu": [
782
+ "arm64"
783
+ ],
784
+ "dev": true,
785
+ "license": "MIT",
786
+ "optional": true,
787
+ "os": [
788
+ "darwin"
789
+ ]
790
+ },
791
+ "node_modules/@rollup/rollup-darwin-x64": {
792
+ "version": "4.57.1",
793
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.57.1.tgz",
794
+ "integrity": "sha512-Ji8g8ChVbKrhFtig5QBV7iMaJrGtpHelkB3lsaKzadFBe58gmjfGXAOfI5FV0lYMH8wiqsxKQ1C9B0YTRXVy4w==",
795
+ "cpu": [
796
+ "x64"
797
+ ],
798
+ "dev": true,
799
+ "license": "MIT",
800
+ "optional": true,
801
+ "os": [
802
+ "darwin"
803
+ ]
804
+ },
805
+ "node_modules/@rollup/rollup-freebsd-arm64": {
806
+ "version": "4.57.1",
807
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.57.1.tgz",
808
+ "integrity": "sha512-R+/WwhsjmwodAcz65guCGFRkMb4gKWTcIeLy60JJQbXrJ97BOXHxnkPFrP+YwFlaS0m+uWJTstrUA9o+UchFug==",
809
+ "cpu": [
810
+ "arm64"
811
+ ],
812
+ "dev": true,
813
+ "license": "MIT",
814
+ "optional": true,
815
+ "os": [
816
+ "freebsd"
817
+ ]
818
+ },
819
+ "node_modules/@rollup/rollup-freebsd-x64": {
820
+ "version": "4.57.1",
821
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.57.1.tgz",
822
+ "integrity": "sha512-IEQTCHeiTOnAUC3IDQdzRAGj3jOAYNr9kBguI7MQAAZK3caezRrg0GxAb6Hchg4lxdZEI5Oq3iov/w/hnFWY9Q==",
823
+ "cpu": [
824
+ "x64"
825
+ ],
826
+ "dev": true,
827
+ "license": "MIT",
828
+ "optional": true,
829
+ "os": [
830
+ "freebsd"
831
+ ]
832
+ },
833
+ "node_modules/@rollup/rollup-linux-arm-gnueabihf": {
834
+ "version": "4.57.1",
835
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.57.1.tgz",
836
+ "integrity": "sha512-F8sWbhZ7tyuEfsmOxwc2giKDQzN3+kuBLPwwZGyVkLlKGdV1nvnNwYD0fKQ8+XS6hp9nY7B+ZeK01EBUE7aHaw==",
837
+ "cpu": [
838
+ "arm"
839
+ ],
840
+ "dev": true,
841
+ "license": "MIT",
842
+ "optional": true,
843
+ "os": [
844
+ "linux"
845
+ ]
846
+ },
847
+ "node_modules/@rollup/rollup-linux-arm-musleabihf": {
848
+ "version": "4.57.1",
849
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.57.1.tgz",
850
+ "integrity": "sha512-rGfNUfn0GIeXtBP1wL5MnzSj98+PZe/AXaGBCRmT0ts80lU5CATYGxXukeTX39XBKsxzFpEeK+Mrp9faXOlmrw==",
851
+ "cpu": [
852
+ "arm"
853
+ ],
854
+ "dev": true,
855
+ "license": "MIT",
856
+ "optional": true,
857
+ "os": [
858
+ "linux"
859
+ ]
860
+ },
861
+ "node_modules/@rollup/rollup-linux-arm64-gnu": {
862
+ "version": "4.57.1",
863
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.57.1.tgz",
864
+ "integrity": "sha512-MMtej3YHWeg/0klK2Qodf3yrNzz6CGjo2UntLvk2RSPlhzgLvYEB3frRvbEF2wRKh1Z2fDIg9KRPe1fawv7C+g==",
865
+ "cpu": [
866
+ "arm64"
867
+ ],
868
+ "dev": true,
869
+ "license": "MIT",
870
+ "optional": true,
871
+ "os": [
872
+ "linux"
873
+ ]
874
+ },
875
+ "node_modules/@rollup/rollup-linux-arm64-musl": {
876
+ "version": "4.57.1",
877
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.57.1.tgz",
878
+ "integrity": "sha512-1a/qhaaOXhqXGpMFMET9VqwZakkljWHLmZOX48R0I/YLbhdxr1m4gtG1Hq7++VhVUmf+L3sTAf9op4JlhQ5u1Q==",
879
+ "cpu": [
880
+ "arm64"
881
+ ],
882
+ "dev": true,
883
+ "license": "MIT",
884
+ "optional": true,
885
+ "os": [
886
+ "linux"
887
+ ]
888
+ },
889
+ "node_modules/@rollup/rollup-linux-loong64-gnu": {
890
+ "version": "4.57.1",
891
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.57.1.tgz",
892
+ "integrity": "sha512-QWO6RQTZ/cqYtJMtxhkRkidoNGXc7ERPbZN7dVW5SdURuLeVU7lwKMpo18XdcmpWYd0qsP1bwKPf7DNSUinhvA==",
893
+ "cpu": [
894
+ "loong64"
895
+ ],
896
+ "dev": true,
897
+ "license": "MIT",
898
+ "optional": true,
899
+ "os": [
900
+ "linux"
901
+ ]
902
+ },
903
+ "node_modules/@rollup/rollup-linux-loong64-musl": {
904
+ "version": "4.57.1",
905
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-musl/-/rollup-linux-loong64-musl-4.57.1.tgz",
906
+ "integrity": "sha512-xpObYIf+8gprgWaPP32xiN5RVTi/s5FCR+XMXSKmhfoJjrpRAjCuuqQXyxUa/eJTdAE6eJ+KDKaoEqjZQxh3Gw==",
907
+ "cpu": [
908
+ "loong64"
909
+ ],
910
+ "dev": true,
911
+ "license": "MIT",
912
+ "optional": true,
913
+ "os": [
914
+ "linux"
915
+ ]
916
+ },
917
+ "node_modules/@rollup/rollup-linux-ppc64-gnu": {
918
+ "version": "4.57.1",
919
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.57.1.tgz",
920
+ "integrity": "sha512-4BrCgrpZo4hvzMDKRqEaW1zeecScDCR+2nZ86ATLhAoJ5FQ+lbHVD3ttKe74/c7tNT9c6F2viwB3ufwp01Oh2w==",
921
+ "cpu": [
922
+ "ppc64"
923
+ ],
924
+ "dev": true,
925
+ "license": "MIT",
926
+ "optional": true,
927
+ "os": [
928
+ "linux"
929
+ ]
930
+ },
931
+ "node_modules/@rollup/rollup-linux-ppc64-musl": {
932
+ "version": "4.57.1",
933
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-musl/-/rollup-linux-ppc64-musl-4.57.1.tgz",
934
+ "integrity": "sha512-NOlUuzesGauESAyEYFSe3QTUguL+lvrN1HtwEEsU2rOwdUDeTMJdO5dUYl/2hKf9jWydJrO9OL/XSSf65R5+Xw==",
935
+ "cpu": [
936
+ "ppc64"
937
+ ],
938
+ "dev": true,
939
+ "license": "MIT",
940
+ "optional": true,
941
+ "os": [
942
+ "linux"
943
+ ]
944
+ },
945
+ "node_modules/@rollup/rollup-linux-riscv64-gnu": {
946
+ "version": "4.57.1",
947
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.57.1.tgz",
948
+ "integrity": "sha512-ptA88htVp0AwUUqhVghwDIKlvJMD/fmL/wrQj99PRHFRAG6Z5nbWoWG4o81Nt9FT+IuqUQi+L31ZKAFeJ5Is+A==",
949
+ "cpu": [
950
+ "riscv64"
951
+ ],
952
+ "dev": true,
953
+ "license": "MIT",
954
+ "optional": true,
955
+ "os": [
956
+ "linux"
957
+ ]
958
+ },
959
+ "node_modules/@rollup/rollup-linux-riscv64-musl": {
960
+ "version": "4.57.1",
961
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.57.1.tgz",
962
+ "integrity": "sha512-S51t7aMMTNdmAMPpBg7OOsTdn4tySRQvklmL3RpDRyknk87+Sp3xaumlatU+ppQ+5raY7sSTcC2beGgvhENfuw==",
963
+ "cpu": [
964
+ "riscv64"
965
+ ],
966
+ "dev": true,
967
+ "license": "MIT",
968
+ "optional": true,
969
+ "os": [
970
+ "linux"
971
+ ]
972
+ },
973
+ "node_modules/@rollup/rollup-linux-s390x-gnu": {
974
+ "version": "4.57.1",
975
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.57.1.tgz",
976
+ "integrity": "sha512-Bl00OFnVFkL82FHbEqy3k5CUCKH6OEJL54KCyx2oqsmZnFTR8IoNqBF+mjQVcRCT5sB6yOvK8A37LNm/kPJiZg==",
977
+ "cpu": [
978
+ "s390x"
979
+ ],
980
+ "dev": true,
981
+ "license": "MIT",
982
+ "optional": true,
983
+ "os": [
984
+ "linux"
985
+ ]
986
+ },
987
+ "node_modules/@rollup/rollup-linux-x64-gnu": {
988
+ "version": "4.57.1",
989
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.57.1.tgz",
990
+ "integrity": "sha512-ABca4ceT4N+Tv/GtotnWAeXZUZuM/9AQyCyKYyKnpk4yoA7QIAuBt6Hkgpw8kActYlew2mvckXkvx0FfoInnLg==",
991
+ "cpu": [
992
+ "x64"
993
+ ],
994
+ "dev": true,
995
+ "license": "MIT",
996
+ "optional": true,
997
+ "os": [
998
+ "linux"
999
+ ]
1000
+ },
1001
+ "node_modules/@rollup/rollup-linux-x64-musl": {
1002
+ "version": "4.57.1",
1003
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.57.1.tgz",
1004
+ "integrity": "sha512-HFps0JeGtuOR2convgRRkHCekD7j+gdAuXM+/i6kGzQtFhlCtQkpwtNzkNj6QhCDp7DRJ7+qC/1Vg2jt5iSOFw==",
1005
+ "cpu": [
1006
+ "x64"
1007
+ ],
1008
+ "dev": true,
1009
+ "license": "MIT",
1010
+ "optional": true,
1011
+ "os": [
1012
+ "linux"
1013
+ ]
1014
+ },
1015
+ "node_modules/@rollup/rollup-openbsd-x64": {
1016
+ "version": "4.57.1",
1017
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-openbsd-x64/-/rollup-openbsd-x64-4.57.1.tgz",
1018
+ "integrity": "sha512-H+hXEv9gdVQuDTgnqD+SQffoWoc0Of59AStSzTEj/feWTBAnSfSD3+Dql1ZruJQxmykT/JVY0dE8Ka7z0DH1hw==",
1019
+ "cpu": [
1020
+ "x64"
1021
+ ],
1022
+ "dev": true,
1023
+ "license": "MIT",
1024
+ "optional": true,
1025
+ "os": [
1026
+ "openbsd"
1027
+ ]
1028
+ },
1029
+ "node_modules/@rollup/rollup-openharmony-arm64": {
1030
+ "version": "4.57.1",
1031
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.57.1.tgz",
1032
+ "integrity": "sha512-4wYoDpNg6o/oPximyc/NG+mYUejZrCU2q+2w6YZqrAs2UcNUChIZXjtafAiiZSUc7On8v5NyNj34Kzj/Ltk6dQ==",
1033
+ "cpu": [
1034
+ "arm64"
1035
+ ],
1036
+ "dev": true,
1037
+ "license": "MIT",
1038
+ "optional": true,
1039
+ "os": [
1040
+ "openharmony"
1041
+ ]
1042
+ },
1043
+ "node_modules/@rollup/rollup-win32-arm64-msvc": {
1044
+ "version": "4.57.1",
1045
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.57.1.tgz",
1046
+ "integrity": "sha512-O54mtsV/6LW3P8qdTcamQmuC990HDfR71lo44oZMZlXU4tzLrbvTii87Ni9opq60ds0YzuAlEr/GNwuNluZyMQ==",
1047
+ "cpu": [
1048
+ "arm64"
1049
+ ],
1050
+ "dev": true,
1051
+ "license": "MIT",
1052
+ "optional": true,
1053
+ "os": [
1054
+ "win32"
1055
+ ]
1056
+ },
1057
+ "node_modules/@rollup/rollup-win32-ia32-msvc": {
1058
+ "version": "4.57.1",
1059
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.57.1.tgz",
1060
+ "integrity": "sha512-P3dLS+IerxCT/7D2q2FYcRdWRl22dNbrbBEtxdWhXrfIMPP9lQhb5h4Du04mdl5Woq05jVCDPCMF7Ub0NAjIew==",
1061
+ "cpu": [
1062
+ "ia32"
1063
+ ],
1064
+ "dev": true,
1065
+ "license": "MIT",
1066
+ "optional": true,
1067
+ "os": [
1068
+ "win32"
1069
+ ]
1070
+ },
1071
+ "node_modules/@rollup/rollup-win32-x64-gnu": {
1072
+ "version": "4.57.1",
1073
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.57.1.tgz",
1074
+ "integrity": "sha512-VMBH2eOOaKGtIJYleXsi2B8CPVADrh+TyNxJ4mWPnKfLB/DBUmzW+5m1xUrcwWoMfSLagIRpjUFeW5CO5hyciQ==",
1075
+ "cpu": [
1076
+ "x64"
1077
+ ],
1078
+ "dev": true,
1079
+ "license": "MIT",
1080
+ "optional": true,
1081
+ "os": [
1082
+ "win32"
1083
+ ]
1084
+ },
1085
+ "node_modules/@rollup/rollup-win32-x64-msvc": {
1086
+ "version": "4.57.1",
1087
+ "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.57.1.tgz",
1088
+ "integrity": "sha512-mxRFDdHIWRxg3UfIIAwCm6NzvxG0jDX/wBN6KsQFTvKFqqg9vTrWUE68qEjHt19A5wwx5X5aUi2zuZT7YR0jrA==",
1089
+ "cpu": [
1090
+ "x64"
1091
+ ],
1092
+ "dev": true,
1093
+ "license": "MIT",
1094
+ "optional": true,
1095
+ "os": [
1096
+ "win32"
1097
+ ]
1098
+ },
1099
+ "node_modules/@types/babel__core": {
1100
+ "version": "7.20.5",
1101
+ "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz",
1102
+ "integrity": "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA==",
1103
+ "dev": true,
1104
+ "license": "MIT",
1105
+ "dependencies": {
1106
+ "@babel/parser": "^7.20.7",
1107
+ "@babel/types": "^7.20.7",
1108
+ "@types/babel__generator": "*",
1109
+ "@types/babel__template": "*",
1110
+ "@types/babel__traverse": "*"
1111
+ }
1112
+ },
1113
+ "node_modules/@types/babel__generator": {
1114
+ "version": "7.27.0",
1115
+ "resolved": "https://registry.npmjs.org/@types/babel__generator/-/babel__generator-7.27.0.tgz",
1116
+ "integrity": "sha512-ufFd2Xi92OAVPYsy+P4n7/U7e68fex0+Ee8gSG9KX7eo084CWiQ4sdxktvdl0bOPupXtVJPY19zk6EwWqUQ8lg==",
1117
+ "dev": true,
1118
+ "license": "MIT",
1119
+ "dependencies": {
1120
+ "@babel/types": "^7.0.0"
1121
+ }
1122
+ },
1123
+ "node_modules/@types/babel__template": {
1124
+ "version": "7.4.4",
1125
+ "resolved": "https://registry.npmjs.org/@types/babel__template/-/babel__template-7.4.4.tgz",
1126
+ "integrity": "sha512-h/NUaSyG5EyxBIp8YRxo4RMe2/qQgvyowRwVMzhYhBCONbW8PUsg4lkFMrhgZhUe5z3L3MiLDuvyJ/CaPa2A8A==",
1127
+ "dev": true,
1128
+ "license": "MIT",
1129
+ "dependencies": {
1130
+ "@babel/parser": "^7.1.0",
1131
+ "@babel/types": "^7.0.0"
1132
+ }
1133
+ },
1134
+ "node_modules/@types/babel__traverse": {
1135
+ "version": "7.28.0",
1136
+ "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.28.0.tgz",
1137
+ "integrity": "sha512-8PvcXf70gTDZBgt9ptxJ8elBeBjcLOAcOtoO/mPJjtji1+CdGbHgm77om1GrsPxsiE+uXIpNSK64UYaIwQXd4Q==",
1138
+ "dev": true,
1139
+ "license": "MIT",
1140
+ "dependencies": {
1141
+ "@babel/types": "^7.28.2"
1142
+ }
1143
+ },
1144
+ "node_modules/@types/estree": {
1145
+ "version": "1.0.8",
1146
+ "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz",
1147
+ "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==",
1148
+ "dev": true,
1149
+ "license": "MIT"
1150
+ },
1151
+ "node_modules/@vitejs/plugin-react": {
1152
+ "version": "4.7.0",
1153
+ "resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.7.0.tgz",
1154
+ "integrity": "sha512-gUu9hwfWvvEDBBmgtAowQCojwZmJ5mcLn3aufeCsitijs3+f2NsrPtlAWIR6OPiqljl96GVCUbLe0HyqIpVaoA==",
1155
+ "dev": true,
1156
+ "license": "MIT",
1157
+ "dependencies": {
1158
+ "@babel/core": "^7.28.0",
1159
+ "@babel/plugin-transform-react-jsx-self": "^7.27.1",
1160
+ "@babel/plugin-transform-react-jsx-source": "^7.27.1",
1161
+ "@rolldown/pluginutils": "1.0.0-beta.27",
1162
+ "@types/babel__core": "^7.20.5",
1163
+ "react-refresh": "^0.17.0"
1164
+ },
1165
+ "engines": {
1166
+ "node": "^14.18.0 || >=16.0.0"
1167
+ },
1168
+ "peerDependencies": {
1169
+ "vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0"
1170
+ }
1171
+ },
1172
+ "node_modules/baseline-browser-mapping": {
1173
+ "version": "2.9.19",
1174
+ "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.19.tgz",
1175
+ "integrity": "sha512-ipDqC8FrAl/76p2SSWKSI+H9tFwm7vYqXQrItCuiVPt26Km0jS+NzSsBWAaBusvSbQcfJG+JitdMm+wZAgTYqg==",
1176
+ "dev": true,
1177
+ "license": "Apache-2.0",
1178
+ "bin": {
1179
+ "baseline-browser-mapping": "dist/cli.js"
1180
+ }
1181
+ },
1182
+ "node_modules/browserslist": {
1183
+ "version": "4.28.1",
1184
+ "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.28.1.tgz",
1185
+ "integrity": "sha512-ZC5Bd0LgJXgwGqUknZY/vkUQ04r8NXnJZ3yYi4vDmSiZmC/pdSN0NbNRPxZpbtO4uAfDUAFffO8IZoM3Gj8IkA==",
1186
+ "dev": true,
1187
+ "funding": [
1188
+ {
1189
+ "type": "opencollective",
1190
+ "url": "https://opencollective.com/browserslist"
1191
+ },
1192
+ {
1193
+ "type": "tidelift",
1194
+ "url": "https://tidelift.com/funding/github/npm/browserslist"
1195
+ },
1196
+ {
1197
+ "type": "github",
1198
+ "url": "https://github.com/sponsors/ai"
1199
+ }
1200
+ ],
1201
+ "license": "MIT",
1202
+ "dependencies": {
1203
+ "baseline-browser-mapping": "^2.9.0",
1204
+ "caniuse-lite": "^1.0.30001759",
1205
+ "electron-to-chromium": "^1.5.263",
1206
+ "node-releases": "^2.0.27",
1207
+ "update-browserslist-db": "^1.2.0"
1208
+ },
1209
+ "bin": {
1210
+ "browserslist": "cli.js"
1211
+ },
1212
+ "engines": {
1213
+ "node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7"
1214
+ }
1215
+ },
1216
+ "node_modules/caniuse-lite": {
1217
+ "version": "1.0.30001769",
1218
+ "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001769.tgz",
1219
+ "integrity": "sha512-BCfFL1sHijQlBGWBMuJyhZUhzo7wer5sVj9hqekB/7xn0Ypy+pER/edCYQm4exbXj4WiySGp40P8UuTh6w1srg==",
1220
+ "dev": true,
1221
+ "funding": [
1222
+ {
1223
+ "type": "opencollective",
1224
+ "url": "https://opencollective.com/browserslist"
1225
+ },
1226
+ {
1227
+ "type": "tidelift",
1228
+ "url": "https://tidelift.com/funding/github/npm/caniuse-lite"
1229
+ },
1230
+ {
1231
+ "type": "github",
1232
+ "url": "https://github.com/sponsors/ai"
1233
+ }
1234
+ ],
1235
+ "license": "CC-BY-4.0"
1236
+ },
1237
+ "node_modules/convert-source-map": {
1238
+ "version": "2.0.0",
1239
+ "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz",
1240
+ "integrity": "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==",
1241
+ "dev": true,
1242
+ "license": "MIT"
1243
+ },
1244
+ "node_modules/debug": {
1245
+ "version": "4.4.3",
1246
+ "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
1247
+ "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==",
1248
+ "dev": true,
1249
+ "license": "MIT",
1250
+ "dependencies": {
1251
+ "ms": "^2.1.3"
1252
+ },
1253
+ "engines": {
1254
+ "node": ">=6.0"
1255
+ },
1256
+ "peerDependenciesMeta": {
1257
+ "supports-color": {
1258
+ "optional": true
1259
+ }
1260
+ }
1261
+ },
1262
+ "node_modules/electron-to-chromium": {
1263
+ "version": "1.5.286",
1264
+ "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.286.tgz",
1265
+ "integrity": "sha512-9tfDXhJ4RKFNerfjdCcZfufu49vg620741MNs26a9+bhLThdB+plgMeou98CAaHu/WATj2iHOOHTp1hWtABj2A==",
1266
+ "dev": true,
1267
+ "license": "ISC"
1268
+ },
1269
+ "node_modules/esbuild": {
1270
+ "version": "0.21.5",
1271
+ "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.21.5.tgz",
1272
+ "integrity": "sha512-mg3OPMV4hXywwpoDxu3Qda5xCKQi+vCTZq8S9J/EpkhB2HzKXq4SNFZE3+NK93JYxc8VMSep+lOUSC/RVKaBqw==",
1273
+ "dev": true,
1274
+ "hasInstallScript": true,
1275
+ "license": "MIT",
1276
+ "bin": {
1277
+ "esbuild": "bin/esbuild"
1278
+ },
1279
+ "engines": {
1280
+ "node": ">=12"
1281
+ },
1282
+ "optionalDependencies": {
1283
+ "@esbuild/aix-ppc64": "0.21.5",
1284
+ "@esbuild/android-arm": "0.21.5",
1285
+ "@esbuild/android-arm64": "0.21.5",
1286
+ "@esbuild/android-x64": "0.21.5",
1287
+ "@esbuild/darwin-arm64": "0.21.5",
1288
+ "@esbuild/darwin-x64": "0.21.5",
1289
+ "@esbuild/freebsd-arm64": "0.21.5",
1290
+ "@esbuild/freebsd-x64": "0.21.5",
1291
+ "@esbuild/linux-arm": "0.21.5",
1292
+ "@esbuild/linux-arm64": "0.21.5",
1293
+ "@esbuild/linux-ia32": "0.21.5",
1294
+ "@esbuild/linux-loong64": "0.21.5",
1295
+ "@esbuild/linux-mips64el": "0.21.5",
1296
+ "@esbuild/linux-ppc64": "0.21.5",
1297
+ "@esbuild/linux-riscv64": "0.21.5",
1298
+ "@esbuild/linux-s390x": "0.21.5",
1299
+ "@esbuild/linux-x64": "0.21.5",
1300
+ "@esbuild/netbsd-x64": "0.21.5",
1301
+ "@esbuild/openbsd-x64": "0.21.5",
1302
+ "@esbuild/sunos-x64": "0.21.5",
1303
+ "@esbuild/win32-arm64": "0.21.5",
1304
+ "@esbuild/win32-ia32": "0.21.5",
1305
+ "@esbuild/win32-x64": "0.21.5"
1306
+ }
1307
+ },
1308
+ "node_modules/escalade": {
1309
+ "version": "3.2.0",
1310
+ "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
1311
+ "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==",
1312
+ "dev": true,
1313
+ "license": "MIT",
1314
+ "engines": {
1315
+ "node": ">=6"
1316
+ }
1317
+ },
1318
+ "node_modules/fsevents": {
1319
+ "version": "2.3.3",
1320
+ "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
1321
+ "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==",
1322
+ "dev": true,
1323
+ "hasInstallScript": true,
1324
+ "license": "MIT",
1325
+ "optional": true,
1326
+ "os": [
1327
+ "darwin"
1328
+ ],
1329
+ "engines": {
1330
+ "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
1331
+ }
1332
+ },
1333
+ "node_modules/gensync": {
1334
+ "version": "1.0.0-beta.2",
1335
+ "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz",
1336
+ "integrity": "sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==",
1337
+ "dev": true,
1338
+ "license": "MIT",
1339
+ "engines": {
1340
+ "node": ">=6.9.0"
1341
+ }
1342
+ },
1343
+ "node_modules/js-tokens": {
1344
+ "version": "4.0.0",
1345
+ "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
1346
+ "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==",
1347
+ "license": "MIT"
1348
+ },
1349
+ "node_modules/jsesc": {
1350
+ "version": "3.1.0",
1351
+ "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-3.1.0.tgz",
1352
+ "integrity": "sha512-/sM3dO2FOzXjKQhJuo0Q173wf2KOo8t4I8vHy6lF9poUp7bKT0/NHE8fPX23PwfhnykfqnC2xRxOnVw5XuGIaA==",
1353
+ "dev": true,
1354
+ "license": "MIT",
1355
+ "bin": {
1356
+ "jsesc": "bin/jsesc"
1357
+ },
1358
+ "engines": {
1359
+ "node": ">=6"
1360
+ }
1361
+ },
1362
+ "node_modules/json5": {
1363
+ "version": "2.2.3",
1364
+ "resolved": "https://registry.npmjs.org/json5/-/json5-2.2.3.tgz",
1365
+ "integrity": "sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==",
1366
+ "dev": true,
1367
+ "license": "MIT",
1368
+ "bin": {
1369
+ "json5": "lib/cli.js"
1370
+ },
1371
+ "engines": {
1372
+ "node": ">=6"
1373
+ }
1374
+ },
1375
+ "node_modules/loose-envify": {
1376
+ "version": "1.4.0",
1377
+ "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz",
1378
+ "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==",
1379
+ "license": "MIT",
1380
+ "dependencies": {
1381
+ "js-tokens": "^3.0.0 || ^4.0.0"
1382
+ },
1383
+ "bin": {
1384
+ "loose-envify": "cli.js"
1385
+ }
1386
+ },
1387
+ "node_modules/lru-cache": {
1388
+ "version": "5.1.1",
1389
+ "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
1390
+ "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
1391
+ "dev": true,
1392
+ "license": "ISC",
1393
+ "dependencies": {
1394
+ "yallist": "^3.0.2"
1395
+ }
1396
+ },
1397
+ "node_modules/ms": {
1398
+ "version": "2.1.3",
1399
+ "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
1400
+ "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
1401
+ "dev": true,
1402
+ "license": "MIT"
1403
+ },
1404
+ "node_modules/nanoid": {
1405
+ "version": "3.3.11",
1406
+ "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz",
1407
+ "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==",
1408
+ "dev": true,
1409
+ "funding": [
1410
+ {
1411
+ "type": "github",
1412
+ "url": "https://github.com/sponsors/ai"
1413
+ }
1414
+ ],
1415
+ "license": "MIT",
1416
+ "bin": {
1417
+ "nanoid": "bin/nanoid.cjs"
1418
+ },
1419
+ "engines": {
1420
+ "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1"
1421
+ }
1422
+ },
1423
+ "node_modules/node-releases": {
1424
+ "version": "2.0.27",
1425
+ "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.27.tgz",
1426
+ "integrity": "sha512-nmh3lCkYZ3grZvqcCH+fjmQ7X+H0OeZgP40OierEaAptX4XofMh5kwNbWh7lBduUzCcV/8kZ+NDLCwm2iorIlA==",
1427
+ "dev": true,
1428
+ "license": "MIT"
1429
+ },
1430
+ "node_modules/picocolors": {
1431
+ "version": "1.1.1",
1432
+ "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
1433
+ "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==",
1434
+ "dev": true,
1435
+ "license": "ISC"
1436
+ },
1437
+ "node_modules/postcss": {
1438
+ "version": "8.5.6",
1439
+ "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz",
1440
+ "integrity": "sha512-3Ybi1tAuwAP9s0r1UQ2J4n5Y0G05bJkpUIO0/bI9MhwmD70S5aTWbXGBwxHrelT+XM1k6dM0pk+SwNkpTRN7Pg==",
1441
+ "dev": true,
1442
+ "funding": [
1443
+ {
1444
+ "type": "opencollective",
1445
+ "url": "https://opencollective.com/postcss/"
1446
+ },
1447
+ {
1448
+ "type": "tidelift",
1449
+ "url": "https://tidelift.com/funding/github/npm/postcss"
1450
+ },
1451
+ {
1452
+ "type": "github",
1453
+ "url": "https://github.com/sponsors/ai"
1454
+ }
1455
+ ],
1456
+ "license": "MIT",
1457
+ "dependencies": {
1458
+ "nanoid": "^3.3.11",
1459
+ "picocolors": "^1.1.1",
1460
+ "source-map-js": "^1.2.1"
1461
+ },
1462
+ "engines": {
1463
+ "node": "^10 || ^12 || >=14"
1464
+ }
1465
+ },
1466
+ "node_modules/react": {
1467
+ "version": "18.3.1",
1468
+ "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz",
1469
+ "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==",
1470
+ "license": "MIT",
1471
+ "dependencies": {
1472
+ "loose-envify": "^1.1.0"
1473
+ },
1474
+ "engines": {
1475
+ "node": ">=0.10.0"
1476
+ }
1477
+ },
1478
+ "node_modules/react-dom": {
1479
+ "version": "18.3.1",
1480
+ "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz",
1481
+ "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==",
1482
+ "license": "MIT",
1483
+ "dependencies": {
1484
+ "loose-envify": "^1.1.0",
1485
+ "scheduler": "^0.23.2"
1486
+ },
1487
+ "peerDependencies": {
1488
+ "react": "^18.3.1"
1489
+ }
1490
+ },
1491
+ "node_modules/react-refresh": {
1492
+ "version": "0.17.0",
1493
+ "resolved": "https://registry.npmjs.org/react-refresh/-/react-refresh-0.17.0.tgz",
1494
+ "integrity": "sha512-z6F7K9bV85EfseRCp2bzrpyQ0Gkw1uLoCel9XBVWPg/TjRj94SkJzUTGfOa4bs7iJvBWtQG0Wq7wnI0syw3EBQ==",
1495
+ "dev": true,
1496
+ "license": "MIT",
1497
+ "engines": {
1498
+ "node": ">=0.10.0"
1499
+ }
1500
+ },
1501
+ "node_modules/rollup": {
1502
+ "version": "4.57.1",
1503
+ "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.57.1.tgz",
1504
+ "integrity": "sha512-oQL6lgK3e2QZeQ7gcgIkS2YZPg5slw37hYufJ3edKlfQSGGm8ICoxswK15ntSzF/a8+h7ekRy7k7oWc3BQ7y8A==",
1505
+ "dev": true,
1506
+ "license": "MIT",
1507
+ "dependencies": {
1508
+ "@types/estree": "1.0.8"
1509
+ },
1510
+ "bin": {
1511
+ "rollup": "dist/bin/rollup"
1512
+ },
1513
+ "engines": {
1514
+ "node": ">=18.0.0",
1515
+ "npm": ">=8.0.0"
1516
+ },
1517
+ "optionalDependencies": {
1518
+ "@rollup/rollup-android-arm-eabi": "4.57.1",
1519
+ "@rollup/rollup-android-arm64": "4.57.1",
1520
+ "@rollup/rollup-darwin-arm64": "4.57.1",
1521
+ "@rollup/rollup-darwin-x64": "4.57.1",
1522
+ "@rollup/rollup-freebsd-arm64": "4.57.1",
1523
+ "@rollup/rollup-freebsd-x64": "4.57.1",
1524
+ "@rollup/rollup-linux-arm-gnueabihf": "4.57.1",
1525
+ "@rollup/rollup-linux-arm-musleabihf": "4.57.1",
1526
+ "@rollup/rollup-linux-arm64-gnu": "4.57.1",
1527
+ "@rollup/rollup-linux-arm64-musl": "4.57.1",
1528
+ "@rollup/rollup-linux-loong64-gnu": "4.57.1",
1529
+ "@rollup/rollup-linux-loong64-musl": "4.57.1",
1530
+ "@rollup/rollup-linux-ppc64-gnu": "4.57.1",
1531
+ "@rollup/rollup-linux-ppc64-musl": "4.57.1",
1532
+ "@rollup/rollup-linux-riscv64-gnu": "4.57.1",
1533
+ "@rollup/rollup-linux-riscv64-musl": "4.57.1",
1534
+ "@rollup/rollup-linux-s390x-gnu": "4.57.1",
1535
+ "@rollup/rollup-linux-x64-gnu": "4.57.1",
1536
+ "@rollup/rollup-linux-x64-musl": "4.57.1",
1537
+ "@rollup/rollup-openbsd-x64": "4.57.1",
1538
+ "@rollup/rollup-openharmony-arm64": "4.57.1",
1539
+ "@rollup/rollup-win32-arm64-msvc": "4.57.1",
1540
+ "@rollup/rollup-win32-ia32-msvc": "4.57.1",
1541
+ "@rollup/rollup-win32-x64-gnu": "4.57.1",
1542
+ "@rollup/rollup-win32-x64-msvc": "4.57.1",
1543
+ "fsevents": "~2.3.2"
1544
+ }
1545
+ },
1546
+ "node_modules/scheduler": {
1547
+ "version": "0.23.2",
1548
+ "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.23.2.tgz",
1549
+ "integrity": "sha512-UOShsPwz7NrMUqhR6t0hWjFduvOzbtv7toDH1/hIrfRNIDBnnBWd0CwJTGvTpngVlmwGCdP9/Zl/tVrDqcuYzQ==",
1550
+ "license": "MIT",
1551
+ "dependencies": {
1552
+ "loose-envify": "^1.1.0"
1553
+ }
1554
+ },
1555
+ "node_modules/semver": {
1556
+ "version": "6.3.1",
1557
+ "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
1558
+ "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==",
1559
+ "dev": true,
1560
+ "license": "ISC",
1561
+ "bin": {
1562
+ "semver": "bin/semver.js"
1563
+ }
1564
+ },
1565
+ "node_modules/source-map-js": {
1566
+ "version": "1.2.1",
1567
+ "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz",
1568
+ "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==",
1569
+ "dev": true,
1570
+ "license": "BSD-3-Clause",
1571
+ "engines": {
1572
+ "node": ">=0.10.0"
1573
+ }
1574
+ },
1575
+ "node_modules/update-browserslist-db": {
1576
+ "version": "1.2.3",
1577
+ "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.2.3.tgz",
1578
+ "integrity": "sha512-Js0m9cx+qOgDxo0eMiFGEueWztz+d4+M3rGlmKPT+T4IS/jP4ylw3Nwpu6cpTTP8R1MAC1kF4VbdLt3ARf209w==",
1579
+ "dev": true,
1580
+ "funding": [
1581
+ {
1582
+ "type": "opencollective",
1583
+ "url": "https://opencollective.com/browserslist"
1584
+ },
1585
+ {
1586
+ "type": "tidelift",
1587
+ "url": "https://tidelift.com/funding/github/npm/browserslist"
1588
+ },
1589
+ {
1590
+ "type": "github",
1591
+ "url": "https://github.com/sponsors/ai"
1592
+ }
1593
+ ],
1594
+ "license": "MIT",
1595
+ "dependencies": {
1596
+ "escalade": "^3.2.0",
1597
+ "picocolors": "^1.1.1"
1598
+ },
1599
+ "bin": {
1600
+ "update-browserslist-db": "cli.js"
1601
+ },
1602
+ "peerDependencies": {
1603
+ "browserslist": ">= 4.21.0"
1604
+ }
1605
+ },
1606
+ "node_modules/vite": {
1607
+ "version": "5.4.21",
1608
+ "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.21.tgz",
1609
+ "integrity": "sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==",
1610
+ "dev": true,
1611
+ "license": "MIT",
1612
+ "dependencies": {
1613
+ "esbuild": "^0.21.3",
1614
+ "postcss": "^8.4.43",
1615
+ "rollup": "^4.20.0"
1616
+ },
1617
+ "bin": {
1618
+ "vite": "bin/vite.js"
1619
+ },
1620
+ "engines": {
1621
+ "node": "^18.0.0 || >=20.0.0"
1622
+ },
1623
+ "funding": {
1624
+ "url": "https://github.com/vitejs/vite?sponsor=1"
1625
+ },
1626
+ "optionalDependencies": {
1627
+ "fsevents": "~2.3.3"
1628
+ },
1629
+ "peerDependencies": {
1630
+ "@types/node": "^18.0.0 || >=20.0.0",
1631
+ "less": "*",
1632
+ "lightningcss": "^1.21.0",
1633
+ "sass": "*",
1634
+ "sass-embedded": "*",
1635
+ "stylus": "*",
1636
+ "sugarss": "*",
1637
+ "terser": "^5.4.0"
1638
+ },
1639
+ "peerDependenciesMeta": {
1640
+ "@types/node": {
1641
+ "optional": true
1642
+ },
1643
+ "less": {
1644
+ "optional": true
1645
+ },
1646
+ "lightningcss": {
1647
+ "optional": true
1648
+ },
1649
+ "sass": {
1650
+ "optional": true
1651
+ },
1652
+ "sass-embedded": {
1653
+ "optional": true
1654
+ },
1655
+ "stylus": {
1656
+ "optional": true
1657
+ },
1658
+ "sugarss": {
1659
+ "optional": true
1660
+ },
1661
+ "terser": {
1662
+ "optional": true
1663
+ }
1664
+ }
1665
+ },
1666
+ "node_modules/yallist": {
1667
+ "version": "3.1.1",
1668
+ "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz",
1669
+ "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==",
1670
+ "dev": true,
1671
+ "license": "ISC"
1672
+ }
1673
+ }
1674
+ }
react-ui/package.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "af3-chatgpt-pipeline-ui",
3
+ "version": "0.1.0",
4
+ "private": true,
5
+ "type": "module",
6
+ "scripts": {
7
+ "dev": "vite",
8
+ "build": "vite build",
9
+ "preview": "vite preview"
10
+ },
11
+ "dependencies": {
12
+ "react": "^18.3.1",
13
+ "react-dom": "^18.3.1"
14
+ },
15
+ "devDependencies": {
16
+ "@vitejs/plugin-react": "^4.3.4",
17
+ "vite": "^5.4.11"
18
+ }
19
+ }
react-ui/src/App.jsx ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useEffect, useMemo, useState } from "react";
2
+
3
+ const DEFAULT_PROMPT =
4
+ "Analyze this full song and provide concise, timestamped sections describing vocals, instrumentation, production effects, mix changes, energy flow, and genre cues. End with a short overall summary.";
5
+
6
+ export default function App() {
7
+ const [mode, setMode] = useState("path");
8
+ const [audioPath, setAudioPath] = useState("E:\\Coding\\hf-music-gen\\train-dataset\\Andrew Spacey - Wonder (Prod Beat It AT).mp3");
9
+ const [audioFile, setAudioFile] = useState(null);
10
+ const [backend, setBackend] = useState("hf_endpoint");
11
+ const [endpointUrl, setEndpointUrl] = useState("");
12
+ const [hfToken, setHfToken] = useState("");
13
+ const [modelId, setModelId] = useState("nvidia/audio-flamingo-3-hf");
14
+ const [openAiApiKey, setOpenAiApiKey] = useState("");
15
+ const [openAiModel, setOpenAiModel] = useState("gpt-5-mini");
16
+ const [prompt, setPrompt] = useState(DEFAULT_PROMPT);
17
+ const [userContext, setUserContext] = useState("");
18
+ const [artistName, setArtistName] = useState("");
19
+ const [trackName, setTrackName] = useState("");
20
+ const [enableWebSearch, setEnableWebSearch] = useState(false);
21
+ const [loading, setLoading] = useState(false);
22
+ const [error, setError] = useState("");
23
+ const [result, setResult] = useState(null);
24
+
25
+ useEffect(() => {
26
+ let mounted = true;
27
+ fetch("/api/config")
28
+ .then((r) => r.json())
29
+ .then((data) => {
30
+ if (!mounted) return;
31
+ const d = data?.defaults || {};
32
+ if (d.backend) setBackend(d.backend);
33
+ if (d.endpoint_url) setEndpointUrl(d.endpoint_url);
34
+ if (d.model_id) setModelId(d.model_id);
35
+ if (d.openai_model) setOpenAiModel(d.openai_model);
36
+ if (d.af3_prompt) setPrompt(d.af3_prompt);
37
+ })
38
+ .catch(() => {});
39
+ return () => {
40
+ mounted = false;
41
+ };
42
+ }, []);
43
+
44
+ const requestPreview = useMemo(() => {
45
+ return {
46
+ backend,
47
+ endpoint_url: endpointUrl || "(env default)",
48
+ model_id: modelId,
49
+ openai_model: openAiModel,
50
+ enable_web_search: enableWebSearch,
51
+ artist_name: artistName || "(none)",
52
+ track_name: trackName || "(none)",
53
+ };
54
+ }, [backend, endpointUrl, modelId, openAiModel, enableWebSearch, artistName, trackName]);
55
+
56
+ async function runPipeline() {
57
+ setLoading(true);
58
+ setError("");
59
+ setResult(null);
60
+ try {
61
+ let response;
62
+ if (mode === "path") {
63
+ response = await fetch("/api/pipeline/run-path", {
64
+ method: "POST",
65
+ headers: { "Content-Type": "application/json" },
66
+ body: JSON.stringify({
67
+ audio_path: audioPath,
68
+ backend,
69
+ endpoint_url: endpointUrl,
70
+ hf_token: hfToken,
71
+ model_id: modelId,
72
+ af3_prompt: prompt,
73
+ openai_api_key: openAiApiKey,
74
+ openai_model: openAiModel,
75
+ user_context: userContext,
76
+ artist_name: artistName,
77
+ track_name: trackName,
78
+ enable_web_search: enableWebSearch,
79
+ }),
80
+ });
81
+ } else {
82
+ if (!audioFile) {
83
+ throw new Error("Select an audio file first.");
84
+ }
85
+ const form = new FormData();
86
+ form.append("audio_file", audioFile);
87
+ form.append("backend", backend);
88
+ form.append("endpoint_url", endpointUrl);
89
+ form.append("hf_token", hfToken);
90
+ form.append("model_id", modelId);
91
+ form.append("af3_prompt", prompt);
92
+ form.append("openai_api_key", openAiApiKey);
93
+ form.append("openai_model", openAiModel);
94
+ form.append("user_context", userContext);
95
+ form.append("artist_name", artistName);
96
+ form.append("track_name", trackName);
97
+ form.append("enable_web_search", String(enableWebSearch));
98
+ response = await fetch("/api/pipeline/run-upload", {
99
+ method: "POST",
100
+ body: form,
101
+ });
102
+ }
103
+
104
+ const data = await response.json();
105
+ if (!response.ok) {
106
+ const detail = typeof data?.detail === "string" ? data.detail : JSON.stringify(data);
107
+ throw new Error(detail);
108
+ }
109
+ setResult(data);
110
+ } catch (err) {
111
+ setError(err.message || String(err));
112
+ } finally {
113
+ setLoading(false);
114
+ }
115
+ }
116
+
117
+ return (
118
+ <div className="page">
119
+ <div className="hero">
120
+ <h1>AF3 + ChatGPT Pipeline</h1>
121
+ <p>Run Audio Flamingo 3 analysis, then clean/structure for Ace Step 1.5 LoRA metadata.</p>
122
+ </div>
123
+
124
+ <div className="grid">
125
+ <section className="card">
126
+ <h2>Inputs</h2>
127
+ <div className="row">
128
+ <label>Mode</label>
129
+ <select value={mode} onChange={(e) => setMode(e.target.value)}>
130
+ <option value="path">Local Path</option>
131
+ <option value="upload">Upload</option>
132
+ </select>
133
+ </div>
134
+
135
+ {mode === "path" ? (
136
+ <div className="row">
137
+ <label>Audio Path</label>
138
+ <input value={audioPath} onChange={(e) => setAudioPath(e.target.value)} />
139
+ </div>
140
+ ) : (
141
+ <div className="row">
142
+ <label>Audio File</label>
143
+ <input type="file" accept="audio/*" onChange={(e) => setAudioFile(e.target.files?.[0] || null)} />
144
+ </div>
145
+ )}
146
+
147
+ <div className="row">
148
+ <label>AF3 Backend</label>
149
+ <select value={backend} onChange={(e) => setBackend(e.target.value)}>
150
+ <option value="hf_endpoint">HF Endpoint</option>
151
+ <option value="local">Local Model</option>
152
+ </select>
153
+ </div>
154
+ <div className="row">
155
+ <label>AF3 Endpoint URL</label>
156
+ <input value={endpointUrl} onChange={(e) => setEndpointUrl(e.target.value)} placeholder="https://..." />
157
+ </div>
158
+ <div className="row">
159
+ <label>HF Token (optional)</label>
160
+ <input type="password" value={hfToken} onChange={(e) => setHfToken(e.target.value)} />
161
+ </div>
162
+ <div className="row">
163
+ <label>AF3 Model ID</label>
164
+ <input value={modelId} onChange={(e) => setModelId(e.target.value)} />
165
+ </div>
166
+ <div className="row">
167
+ <label>OpenAI API Key (optional)</label>
168
+ <input type="password" value={openAiApiKey} onChange={(e) => setOpenAiApiKey(e.target.value)} />
169
+ </div>
170
+ <div className="row">
171
+ <label>OpenAI Model</label>
172
+ <input value={openAiModel} onChange={(e) => setOpenAiModel(e.target.value)} />
173
+ </div>
174
+ <div className="row">
175
+ <label>Artist (optional)</label>
176
+ <input value={artistName} onChange={(e) => setArtistName(e.target.value)} />
177
+ </div>
178
+ <div className="row">
179
+ <label>Track (optional)</label>
180
+ <input value={trackName} onChange={(e) => setTrackName(e.target.value)} />
181
+ </div>
182
+ <div className="row">
183
+ <label>Prompt</label>
184
+ <textarea rows={5} value={prompt} onChange={(e) => setPrompt(e.target.value)} />
185
+ </div>
186
+ <div className="row">
187
+ <label>User Context</label>
188
+ <textarea rows={4} value={userContext} onChange={(e) => setUserContext(e.target.value)} />
189
+ </div>
190
+ <div className="row inline">
191
+ <input
192
+ id="websearch"
193
+ type="checkbox"
194
+ checked={enableWebSearch}
195
+ onChange={(e) => setEnableWebSearch(e.target.checked)}
196
+ />
197
+ <label htmlFor="websearch">Enable ChatGPT web search (optional)</label>
198
+ </div>
199
+
200
+ <button className="run" disabled={loading} onClick={runPipeline}>
201
+ {loading ? "Running..." : "Run Pipeline"}
202
+ </button>
203
+ </section>
204
+
205
+ <section className="card">
206
+ <h2>Request Summary</h2>
207
+ <pre>{JSON.stringify(requestPreview, null, 2)}</pre>
208
+ {error ? <p className="error">{error}</p> : null}
209
+ {result ? (
210
+ <>
211
+ <h3>Saved Sidecar</h3>
212
+ <p className="mono">{result.saved_to}</p>
213
+ <h3>AF3 Analysis</h3>
214
+ <pre>{result.af3_analysis}</pre>
215
+ <h3>Final LoRA JSON</h3>
216
+ <pre>{JSON.stringify(result.sidecar, null, 2)}</pre>
217
+ </>
218
+ ) : null}
219
+ </section>
220
+ </div>
221
+ </div>
222
+ );
223
+ }
react-ui/src/main.jsx ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
// Entry point: mount <App /> into the #root element provided by index.html.
import React from "react";
import { createRoot } from "react-dom/client";
import App from "./App";
import "./styles.css";

// StrictMode adds development-only checks (e.g. double-invoked effects);
// it renders nothing extra in production builds.
createRoot(document.getElementById("root")).render(
  <React.StrictMode>
    <App />
  </React.StrictMode>
);
11
+
react-ui/src/styles.css ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Styles for the AF3 + ChatGPT pipeline UI (react-ui/src/App.jsx). */
@import url("https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;600;700&family=IBM+Plex+Mono:wght@400;600&display=swap");

/* Design tokens: palette, card surface, and shared shadow. */
:root {
  --bg: #f4f1e8;
  --bg-2: #e6f2ef;
  --ink: #1f2a22;
  --muted: #4b5a50;
  --brand: #0a8f6a;
  --brand-deep: #0c5b49;
  --warn: #ad1f1f;
  --card: rgba(255, 255, 255, 0.78);
  --line: rgba(31, 42, 34, 0.2);
  --shadow: 0 20px 60px rgba(6, 48, 38, 0.16);
}

* {
  box-sizing: border-box;
}

/* Page background: two soft radial accents over a diagonal gradient. */
body {
  margin: 0;
  color: var(--ink);
  font-family: "Space Grotesk", system-ui, sans-serif;
  background:
    radial-gradient(circle at 15% 10%, rgba(10, 143, 106, 0.16), transparent 45%),
    radial-gradient(circle at 85% 0%, rgba(255, 138, 61, 0.14), transparent 35%),
    linear-gradient(140deg, var(--bg), var(--bg-2));
  min-height: 100vh;
}

/* Centered content column. */
.page {
  max-width: 1200px;
  margin: 0 auto;
  padding: 28px 18px 36px;
}

/* Header block with entry animation. */
.hero {
  margin-bottom: 18px;
  animation: rise 0.55s ease;
}

.hero h1 {
  margin: 0;
  font-size: clamp(1.6rem, 3vw, 2.4rem);
  letter-spacing: -0.02em;
}

.hero p {
  margin: 6px 0 0;
  color: var(--muted);
}

/* Two-column layout: inputs card slightly wider than results card. */
.grid {
  display: grid;
  grid-template-columns: 1.1fr 1fr;
  gap: 16px;
  align-items: start;
}

/* Frosted-glass card surface. */
.card {
  background: var(--card);
  border: 1px solid var(--line);
  border-radius: 14px;
  padding: 14px;
  backdrop-filter: blur(8px);
  box-shadow: var(--shadow);
  animation: rise 0.6s ease;
}

.card h2,
.card h3 {
  margin: 0 0 12px;
}

/* A labeled form field: label stacked above its control. */
.row {
  display: grid;
  gap: 6px;
  margin-bottom: 10px;
}

.row label {
  font-size: 0.84rem;
  font-weight: 600;
  color: var(--muted);
}

/* Inline variant used for the checkbox + label pair. */
.row.inline {
  display: flex;
  align-items: center;
  gap: 8px;
}

input,
select,
textarea,
button {
  width: 100%;
  font: inherit;
}

input,
select,
textarea {
  border: 1px solid var(--line);
  border-radius: 10px;
  padding: 9px 10px;
  background: rgba(255, 255, 255, 0.94);
  color: var(--ink);
}

textarea {
  resize: vertical;
}

input:focus,
select:focus,
textarea:focus {
  outline: 2px solid rgba(10, 143, 106, 0.28);
  border-color: rgba(10, 143, 106, 0.7);
}

/* Checkbox should not stretch to full width like other inputs. */
.row.inline input[type="checkbox"] {
  width: auto;
}

/* Primary "Run Pipeline" button. */
.run {
  margin-top: 4px;
  border: 0;
  border-radius: 10px;
  padding: 11px 12px;
  font-weight: 700;
  color: #fff;
  cursor: pointer;
  background: linear-gradient(120deg, var(--brand), var(--brand-deep));
}

.run:disabled {
  opacity: 0.68;
  cursor: not-allowed;
}

/* Monospace output areas (JSON previews, analysis text, file paths). */
pre,
.mono {
  font-family: "IBM Plex Mono", ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
}

pre {
  white-space: pre-wrap;
  word-break: break-word;
  max-height: 320px;
  overflow: auto;
  background: rgba(20, 35, 29, 0.92);
  color: #f4fffa;
  padding: 10px;
  border-radius: 10px;
  border: 1px solid rgba(255, 255, 255, 0.1);
}

.error {
  margin: 10px 0;
  color: var(--warn);
  font-weight: 600;
}

.mono {
  font-size: 0.9rem;
  overflow-wrap: anywhere;
}

/* Shared entry animation for hero and cards. */
@keyframes rise {
  from {
    opacity: 0;
    transform: translateY(8px);
  }
  to {
    opacity: 1;
    transform: translateY(0);
  }
}

/* Collapse to a single column on narrow screens. */
@media (max-width: 980px) {
  .grid {
    grid-template-columns: 1fr;
  }

  .card {
    padding: 12px;
  }
}
react-ui/vite.config.js ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Vite configuration for the AF3 pipeline React UI.
import { defineConfig } from "vite";
import react from "@vitejs/plugin-react";

export default defineConfig({
  plugins: [react()],
  server: {
    port: 5173,
    // Forward /api/* requests from the dev server to the local backend API
    // (presumably the FastAPI service, whose launch scripts default to port
    // 8008) so the UI and API share an origin during development.
    proxy: {
      "/api": {
        target: "http://localhost:8008",
        changeOrigin: true,
      },
    },
  },
});
requirements.txt CHANGED
@@ -2,7 +2,7 @@ numpy
2
  soundfile
3
  torch
4
  torchaudio
5
- transformers>=4.53.0,<4.58.0
6
  accelerate
7
  huggingface_hub
8
  diffusers
@@ -18,3 +18,8 @@ peft>=0.11.0
18
  gradio>=4.0.0
19
  pandas
20
  bitsandbytes
 
 
 
 
 
 
2
  soundfile
3
  torch
4
  torchaudio
5
+ transformers>=4.57.0,<4.58.0
6
  accelerate
7
  huggingface_hub
8
  diffusers
 
18
  gradio>=4.0.0
19
  pandas
20
  bitsandbytes
21
+ fastapi
22
+ uvicorn
23
+ python-multipart
24
+ openai
25
+ python-dotenv
scripts/annotations/qwen_annotate_file.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Annotate one audio file with Qwen2-Audio and save a sidecar JSON.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import json
10
+ import os
11
+ from pathlib import Path
12
+
13
+ from qwen_audio_captioning import (
14
+ DEFAULT_ANALYSIS_PROMPT,
15
+ DEFAULT_LONG_ANALYSIS_PROMPT,
16
+ DEFAULT_MODEL_ID,
17
+ build_captioner,
18
+ generate_track_annotation,
19
+ read_prompt_file,
20
+ )
21
+
22
+
23
def read_dotenv_value(path: str, key: str) -> str:
    """Return the value for *key* from a dotenv-style file, or "" if absent.

    Blank lines, ``#`` comments, and lines without ``=`` are skipped. The
    matched value is stripped of surrounding whitespace and of enclosing
    double then single quotes. The first match wins.
    """
    env_file = Path(path)
    if not env_file.exists():
        return ""
    for entry in env_file.read_text(encoding="utf-8").splitlines():
        stripped = entry.strip()
        if not stripped or stripped.startswith("#"):
            continue
        name, sep, value = stripped.partition("=")
        if not sep:
            continue
        if name.strip() == key:
            return value.strip().strip('"').strip("'")
    return ""
35
+
36
+
37
def main() -> int:
    """CLI entry point: annotate one audio file with Qwen2-Audio and write a JSON sidecar.

    Builds a captioner for the selected backend (local model or HF endpoint),
    runs segment-wise annotation over the audio, writes the resulting sidecar
    JSON (next to the audio by default), and prints a short summary. Returns 0
    on success; errors (missing audio, backend failures) propagate as exceptions.
    """
    parser = argparse.ArgumentParser(description="Annotate a single audio file with Qwen2-Audio")
    parser.add_argument("--audio", required=True, help="Audio file path")
    parser.add_argument("--backend", default="hf_endpoint", choices=["local", "hf_endpoint"])
    parser.add_argument("--model-id", default=DEFAULT_MODEL_ID)
    parser.add_argument("--endpoint-url", default=os.getenv("HF_QWEN_ENDPOINT_URL", ""))
    parser.add_argument("--token", default="")
    parser.add_argument("--device", default="auto", choices=["auto", "cuda", "cpu", "mps"])
    parser.add_argument("--torch-dtype", default="auto", choices=["auto", "float16", "bfloat16", "float32"])
    parser.add_argument("--prompt", default=DEFAULT_ANALYSIS_PROMPT)
    parser.add_argument("--prompt-file", default="")
    parser.add_argument("--include-long-analysis", action="store_true")
    parser.add_argument("--long-analysis-prompt", default=DEFAULT_LONG_ANALYSIS_PROMPT)
    parser.add_argument("--long-analysis-prompt-file", default="")
    parser.add_argument("--long-analysis-max-new-tokens", type=int, default=1200)
    parser.add_argument("--long-analysis-temperature", type=float, default=0.1)
    parser.add_argument("--segment-seconds", type=float, default=30.0)
    parser.add_argument("--overlap-seconds", type=float, default=2.0)
    parser.add_argument("--max-new-tokens", type=int, default=384)
    parser.add_argument("--temperature", type=float, default=0.1)
    parser.add_argument("--keep-raw-outputs", action="store_true")
    parser.add_argument("--output-json", default="", help="Output JSON path (default: audio sidecar)")
    args = parser.parse_args()

    audio_path = Path(args.audio)
    if not audio_path.is_file():
        raise FileNotFoundError(f"Audio not found: {audio_path}")

    # Prompt files, when provided, take precedence over the inline prompt strings.
    prompt = read_prompt_file(args.prompt_file) if args.prompt_file else args.prompt
    long_prompt = (
        read_prompt_file(args.long_analysis_prompt_file)
        if args.long_analysis_prompt_file
        else args.long_analysis_prompt
    )
    # Token resolution order: CLI flag, HF_TOKEN env var, then .env entries
    # (both upper- and lower-case key spellings).
    token = (
        args.token
        or os.getenv("HF_TOKEN", "")
        or read_dotenv_value(".env", "HF_TOKEN")
        or read_dotenv_value(".env", "hf_token")
    )

    captioner = build_captioner(
        backend=args.backend,
        model_id=args.model_id,
        endpoint_url=args.endpoint_url,
        token=token,
        device=args.device,
        torch_dtype=args.torch_dtype,
    )

    sidecar = generate_track_annotation(
        audio_path=str(audio_path),
        captioner=captioner,
        prompt=prompt,
        segment_seconds=float(args.segment_seconds),
        overlap_seconds=float(args.overlap_seconds),
        max_new_tokens=int(args.max_new_tokens),
        temperature=float(args.temperature),
        keep_raw_outputs=bool(args.keep_raw_outputs),
        include_long_analysis=bool(args.include_long_analysis),
        long_analysis_prompt=long_prompt,
        long_analysis_max_new_tokens=int(args.long_analysis_max_new_tokens),
        long_analysis_temperature=float(args.long_analysis_temperature),
    )

    # Default sidecar location: same basename as the audio, .json extension.
    out_path = Path(args.output_json) if args.output_json else audio_path.with_suffix(".json")
    out_path.write_text(json.dumps(sidecar, indent=2, ensure_ascii=False), encoding="utf-8")
    # Print a compact summary of the key fields for quick inspection.
    print(
        json.dumps(
            {
                "saved_to": str(out_path),
                "caption": sidecar.get("caption", ""),
                "bpm": sidecar.get("bpm"),
                "keyscale": sidecar.get("keyscale", ""),
                "duration": sidecar.get("duration"),
                "segment_count": sidecar.get("music_analysis", {}).get("segment_count"),
            },
            indent=2,
            ensure_ascii=False,
        )
    )
    return 0
119
+
120
+
121
+ if __name__ == "__main__":
122
+ raise SystemExit(main())
scripts/annotations/qwen_caption_dataset.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Batch caption a music dataset with Qwen2-Audio and export LoRA-ready sidecars.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import os
10
+ import tempfile
11
+ from pathlib import Path
12
+ from typing import List
13
+
14
+ from huggingface_hub import HfApi, snapshot_download
15
+ from loguru import logger
16
+ from tqdm import tqdm
17
+
18
+ from qwen_audio_captioning import (
19
+ DEFAULT_ANALYSIS_PROMPT,
20
+ DEFAULT_LONG_ANALYSIS_PROMPT,
21
+ DEFAULT_MODEL_ID,
22
+ build_captioner,
23
+ export_annotation_records,
24
+ generate_track_annotation,
25
+ list_audio_files,
26
+ read_prompt_file,
27
+ )
28
+
29
+
30
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for batch Qwen2-Audio captioning.

    Argument groups: data source (local dir or HF dataset repo), inference
    backend, prompt/generation controls, export options, and an optional
    upload of the exported folder to a HF dataset repo.
    """
    p = argparse.ArgumentParser(description="Qwen2-Audio batch captioning for LoRA datasets")

    # Data source: either --dataset-dir or --dataset-repo must be supplied
    # (enforced later in resolve_dataset_dir, not by argparse).
    p.add_argument("--dataset-dir", type=str, default="", help="Local dataset folder")
    p.add_argument("--dataset-repo", type=str, default="", help="HF dataset repo id")
    p.add_argument("--dataset-revision", type=str, default="main", help="HF dataset revision")
    p.add_argument("--dataset-subdir", type=str, default="", help="Subdirectory inside dataset")

    # Backend
    p.add_argument("--backend", type=str, default="local", choices=["local", "hf_endpoint"])
    p.add_argument("--model-id", type=str, default=DEFAULT_MODEL_ID)
    p.add_argument("--endpoint-url", type=str, default="")
    p.add_argument("--hf-token", type=str, default="", help="HF token (or use HF_TOKEN env var)")
    p.add_argument("--device", type=str, default="auto", choices=["auto", "cuda", "cpu", "mps"])
    p.add_argument("--torch-dtype", type=str, default="auto", choices=["auto", "float16", "bfloat16", "float32"])

    # Prompt + generation controls
    p.add_argument("--prompt", type=str, default=DEFAULT_ANALYSIS_PROMPT)
    p.add_argument("--prompt-file", type=str, default="", help="Text file to override --prompt")
    p.add_argument("--include-long-analysis", action="store_true", help="Also request long prose analysis")
    p.add_argument("--long-analysis-prompt", type=str, default=DEFAULT_LONG_ANALYSIS_PROMPT)
    p.add_argument("--long-analysis-prompt-file", type=str, default="", help="Text file to override --long-analysis-prompt")
    p.add_argument("--long-analysis-max-new-tokens", type=int, default=1200)
    p.add_argument("--long-analysis-temperature", type=float, default=0.1)
    p.add_argument("--segment-seconds", type=float, default=30.0)
    p.add_argument("--overlap-seconds", type=float, default=2.0)
    p.add_argument("--max-new-tokens", type=int, default=384)
    p.add_argument("--temperature", type=float, default=0.1)
    p.add_argument("--keep-raw-outputs", action="store_true", help="Store per-segment raw outputs in sidecar JSON")

    # Export
    p.add_argument("--output-dir", type=str, default="qwen_annotations")
    p.add_argument("--copy-audio", action="store_true", help="Copy audio files into output_dir/dataset")
    # BooleanOptionalAction auto-generates the paired --no-write-inplace-sidecars flag.
    p.add_argument(
        "--write-inplace-sidecars",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Write sidecars next to source audio (default: true). Use --no-write-inplace-sidecars to disable.",
    )

    # Optional upload of exported folder
    p.add_argument("--upload-repo", type=str, default="", help="Optional HF dataset repo to upload exports")
    p.add_argument("--upload-private", action="store_true", help="Create upload repo as private")
    p.add_argument("--upload-path", type=str, default="", help="Optional path inside upload repo")

    return p
77
+
78
+
79
def resolve_dataset_dir(args) -> str:
    """Return a local directory of audio to process.

    If --dataset-dir is set, validate and return it. Otherwise download
    --dataset-repo at --dataset-revision into a fresh temp directory and
    return it (or the --dataset-subdir within it). Raises FileNotFoundError
    for missing paths and ValueError when neither source is given.

    Note: the temp directory is not cleaned up here; it lives for the
    duration of the run.
    """
    if args.dataset_dir:
        if not Path(args.dataset_dir).is_dir():
            raise FileNotFoundError(f"Dataset folder not found: {args.dataset_dir}")
        return args.dataset_dir

    if not args.dataset_repo:
        raise ValueError("Provide --dataset-dir or --dataset-repo")

    token = args.hf_token or os.getenv("HF_TOKEN", "")
    temp_root = tempfile.mkdtemp(prefix="qwen_caption_dataset_")
    local_dir = os.path.join(temp_root, "dataset")
    logger.info(f"Downloading dataset {args.dataset_repo}@{args.dataset_revision} -> {local_dir}")
    snapshot_download(
        repo_id=args.dataset_repo,
        repo_type="dataset",
        revision=args.dataset_revision,
        local_dir=local_dir,
        # NOTE(review): local_dir_use_symlinks is deprecated/ignored in recent
        # huggingface_hub releases — confirm against the pinned version.
        local_dir_use_symlinks=False,
        token=token or None,
    )
    if args.dataset_subdir:
        sub = os.path.join(local_dir, args.dataset_subdir)
        if not Path(sub).is_dir():
            raise FileNotFoundError(f"Dataset subdir not found: {sub}")
        return sub
    return local_dir
106
+
107
+
108
def upload_export_if_requested(args, output_dir: str):
    """Upload *output_dir* to the HF dataset repo named by --upload-repo.

    No-op when --upload-repo is unset. Requires a token via --hf-token or the
    HF_TOKEN env var (RuntimeError otherwise). Creates the repo if needed
    (exist_ok), then uploads the folder under --upload-path (repo root when
    empty).
    """
    if not args.upload_repo:
        return
    token = args.hf_token or os.getenv("HF_TOKEN", "")
    if not token:
        raise RuntimeError("HF token missing. Set --hf-token or HF_TOKEN.")

    api = HfApi(token=token)
    api.create_repo(
        repo_id=args.upload_repo,
        repo_type="dataset",
        private=bool(args.upload_private),
        exist_ok=True,
    )
    # Normalize the in-repo path: strip whitespace and surrounding slashes.
    path_in_repo = args.upload_path.strip().strip("/") if args.upload_path else ""
    logger.info(f"Uploading {output_dir} -> {args.upload_repo}/{path_in_repo}")
    api.upload_folder(
        repo_id=args.upload_repo,
        repo_type="dataset",
        folder_path=output_dir,
        path_in_repo=path_in_repo,
        commit_message="Upload Qwen2-Audio annotations",
    )
    logger.info("Upload complete")
132
+
133
+
134
def main() -> int:
    """CLI entry point: caption every audio file in a dataset and export sidecars.

    Resolves the dataset (local dir or HF repo download), builds the captioner,
    annotates each file (failures are logged and collected, not fatal), exports
    the records, and optionally uploads the export folder. Returns 0 even when
    some files failed; failures are reported via the logger.
    """
    args = build_parser().parse_args()
    # Prompt files, when provided, take precedence over inline prompt strings.
    prompt = read_prompt_file(args.prompt_file) if args.prompt_file else args.prompt
    long_prompt = (
        read_prompt_file(args.long_analysis_prompt_file)
        if args.long_analysis_prompt_file
        else args.long_analysis_prompt
    )
    token = args.hf_token or os.getenv("HF_TOKEN", "")

    dataset_dir = resolve_dataset_dir(args)
    audio_files: List[str] = list_audio_files(dataset_dir)
    if not audio_files:
        raise RuntimeError(f"No audio files found in {dataset_dir}")
    logger.info(f"Found {len(audio_files)} audio files")

    captioner = build_captioner(
        backend=args.backend,
        model_id=args.model_id,
        endpoint_url=args.endpoint_url,
        token=token,
        device=args.device,
        torch_dtype=args.torch_dtype,
    )

    # Annotate each file; a failure on one file is recorded and the batch continues.
    records = []
    failed = []
    for path in tqdm(audio_files, desc="Captioning audio"):
        try:
            sidecar = generate_track_annotation(
                audio_path=path,
                captioner=captioner,
                prompt=prompt,
                segment_seconds=float(args.segment_seconds),
                overlap_seconds=float(args.overlap_seconds),
                max_new_tokens=int(args.max_new_tokens),
                temperature=float(args.temperature),
                keep_raw_outputs=bool(args.keep_raw_outputs),
                include_long_analysis=bool(args.include_long_analysis),
                long_analysis_prompt=long_prompt,
                long_analysis_max_new_tokens=int(args.long_analysis_max_new_tokens),
                long_analysis_temperature=float(args.long_analysis_temperature),
            )
            records.append({"audio_path": path, "sidecar": sidecar})
        except Exception as exc:
            failed.append(f"{Path(path).name}: {exc}")
            logger.exception(f"Failed: {path}")

    export_result = export_annotation_records(
        records=records,
        output_dir=args.output_dir,
        copy_audio=bool(args.copy_audio),
        write_inplace_sidecars=bool(args.write_inplace_sidecars),
    )

    logger.info(
        "Done. analyzed={} failed={} manifest={}",
        len(records),
        len(failed),
        export_result["manifest_path"],
    )
    if failed:
        # Cap the report at 20 entries to keep logs readable on large batches.
        logger.warning("First failures:\n" + "\n".join(failed[:20]))

    upload_export_if_requested(args, args.output_dir)
    return 0
200
+
201
+
202
+ if __name__ == "__main__":
203
+ raise SystemExit(main())
scripts/dev/run_af3_gui.ps1 ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
param(
    [string]$BindHost = "127.0.0.1",
    [int]$Port = 8008,
    [switch]$Reload,
    [switch]$NoBrowser,
    [switch]$SkipNpmInstall,
    [switch]$SkipBuild
)

# Launch the AF3 GUI via the Python entry point, forwarding each switch as
# the matching CLI flag of af3_gui_app.py.
#
# Fix: the original assembled the command into a variable named $args and
# spliced it apart to recover the executable. $args is a PowerShell automatic
# variable and must not be assigned to (PSScriptAnalyzer
# PSAvoidAssignmentToAutomaticVariable); use a dedicated list and splat it.
$launchArgs = @("af3_gui_app.py", "--host", $BindHost, "--port", "$Port")
if ($Reload) { $launchArgs += "--reload" }
if ($NoBrowser) { $launchArgs += "--no-browser" }
if ($SkipNpmInstall) { $launchArgs += "--skip-npm-install" }
if ($SkipBuild) { $launchArgs += "--skip-build" }

& python @launchArgs
scripts/dev/run_af3_gui.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """Build and launch the AF3 + ChatGPT GUI stack (API + React UI)."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import shutil
8
+ import subprocess
9
+ import sys
10
+ import threading
11
+ import webbrowser
12
+ from pathlib import Path
13
+
14
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
15
+ if str(PROJECT_ROOT) not in sys.path:
16
+ sys.path.insert(0, str(PROJECT_ROOT))
17
+
18
+ from utils.env_config import load_project_env
19
+
20
+
21
+ def _run(cmd: list[str], cwd: Path) -> None:
22
+ proc = subprocess.run(cmd, cwd=str(cwd), check=False)
23
+ if proc.returncode != 0:
24
+ raise RuntimeError(f"Command failed ({proc.returncode}): {' '.join(cmd)}")
25
+
26
+
27
+ def _build_frontend(skip_npm_install: bool, skip_build: bool) -> None:
28
+ react_dir = PROJECT_ROOT / "react-ui"
29
+ if not react_dir.exists():
30
+ raise FileNotFoundError(f"React UI folder missing: {react_dir}")
31
+
32
+ npm = shutil.which("npm")
33
+ if not npm:
34
+ raise RuntimeError("`npm` was not found. Install Node.js (includes npm) first.")
35
+
36
+ if not skip_npm_install and not (react_dir / "node_modules").exists():
37
+ _run([npm, "install"], cwd=react_dir)
38
+
39
+ if not skip_build:
40
+ _run([npm, "run", "build"], cwd=react_dir)
41
+
42
+
43
+ def build_parser() -> argparse.ArgumentParser:
44
+ p = argparse.ArgumentParser(description="Launch AF3 GUI (FastAPI + built React frontend)")
45
+ p.add_argument("--host", default="127.0.0.1")
46
+ p.add_argument("--port", type=int, default=8008)
47
+ p.add_argument("--reload", action="store_true", help="Enable uvicorn reload mode")
48
+ p.add_argument("--no-browser", action="store_true", help="Do not open browser automatically")
49
+ p.add_argument("--skip-npm-install", action="store_true", help="Skip npm install")
50
+ p.add_argument("--skip-build", action="store_true", help="Skip frontend build")
51
+ return p
52
+
53
+
54
+ def main() -> int:
55
+ args = build_parser().parse_args()
56
+ load_project_env()
57
+
58
+ _build_frontend(skip_npm_install=bool(args.skip_npm_install), skip_build=bool(args.skip_build))
59
+
60
+ url = f"http://{args.host}:{args.port}"
61
+ if not args.no_browser:
62
+ threading.Timer(1.0, lambda: webbrowser.open(url)).start()
63
+
64
+ import uvicorn
65
+
66
+ uvicorn.run(
67
+ "services.pipeline_api:app",
68
+ host=args.host,
69
+ port=int(args.port),
70
+ reload=bool(args.reload),
71
+ )
72
+ return 0
73
+
74
+
75
+ if __name__ == "__main__":
76
+ raise SystemExit(main())
scripts/endpoint/test_af3_caption_endpoint.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Send one audio file to an Audio Flamingo 3 endpoint and print/save the response.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import base64
10
+ import io
11
+ import json
12
+ import sys
13
+ from pathlib import Path
14
+ from urllib.error import HTTPError, URLError
15
+ from urllib.request import Request, urlopen
16
+
17
+ import soundfile as sf
18
+
19
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
20
+ if str(PROJECT_ROOT) not in sys.path:
21
+ sys.path.insert(0, str(PROJECT_ROOT))
22
+
23
+ from af3_chatgpt_pipeline import DEFAULT_AF3_PROMPT, DEFAULT_AF3_PROMPT_THINK_LONG
24
+ from qwen_audio_captioning import load_audio_mono
25
+ from utils.env_config import get_env, load_project_env
26
+
27
+
28
def load_audio_b64(audio_path: str, target_sr: int = 16000) -> str:
    """Decode *audio_path* to mono at *target_sr* and return it as base64 WAV."""
    samples, rate = load_audio_mono(audio_path, target_sr=target_sr)
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, samples, int(rate), format="WAV")
    return base64.b64encode(wav_buffer.getvalue()).decode("utf-8")
33
+
34
+
35
def send(url: str, token: str, payload: dict) -> dict:
    """POST *payload* as JSON to *url* and return the decoded JSON response.

    On HTTP errors, known AF3 endpoint failure fingerprints in the error body
    get an actionable troubleshooting hint appended before re-raising.
    """
    headers = {"Content-Type": "application/json"}
    if token:
        headers["Authorization"] = f"Bearer {token}"
    request = Request(
        url=url,
        method="POST",
        data=json.dumps(payload).encode("utf-8"),
        headers=headers,
    )
    try:
        with urlopen(request, timeout=600) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except HTTPError as exc:
        body = exc.read().decode("utf-8", errors="replace")
        lower = body.lower()
        # Every needle in a rule must appear in the body for its hint to apply.
        hint_rules = (
            (
                ("endpoint is in error",),
                "\nHint: open the endpoint page and restart/redeploy. "
                "This is a remote runtime failure, not a local script issue.",
            ),
            (
                ("no custom pipeline found",),
                "\nHint: endpoint repo root must contain handler.py; "
                "ensure you deployed templates/hf-af3-caption-endpoint files.",
            ),
            (
                ("audioflamingo3", "does not recognize"),
                "\nHint: runtime transformers is too old. "
                "Use templates/hf-af3-caption-endpoint/handler.py bootstrap runtime "
                "(AF3_TRANSFORMERS_SPEC=transformers==5.1.0) and redeploy.",
            ),
            (
                ("failed to load af3 processor classes after runtime bootstrap",),
                "\nHint: endpoint startup could not install/load AF3 runtime deps. "
                "Check startup logs for pip/network/disk issues and keep task=custom.",
            ),
        )
        for needles, hint in hint_rules:
            if all(needle in lower for needle in needles):
                body += hint
        raise RuntimeError(f"HTTP {exc.code}: {body}") from exc
    except URLError as exc:
        raise RuntimeError(f"Network error: {exc}") from exc
76
+
77
+
78
def main() -> int:
    """CLI entry point: send one audio file to an AF3 endpoint and show the reply."""
    load_project_env()
    parser = argparse.ArgumentParser(description="Test AF3 caption endpoint")
    parser.add_argument(
        "--url",
        default=get_env("HF_AF3_ENDPOINT_URL", "hf_af3_endpoint_url"),
        required=False,
    )
    parser.add_argument(
        "--token",
        default=get_env("HF_TOKEN", "hf_token"),
        required=False,
    )
    parser.add_argument("--audio", required=True, help="Path to local audio file")
    parser.add_argument("--prompt", default=DEFAULT_AF3_PROMPT)
    parser.add_argument(
        "--mode",
        choices=["auto", "think", "single"],
        default="auto",
        help="Optional AF3 mode selector for NVIDIA-stack endpoints.",
    )
    parser.add_argument(
        "--think-long",
        action="store_true",
        help="Use long-form AF3 prompt + higher token budget defaults.",
    )
    parser.add_argument("--max-new-tokens", type=int, default=1400)
    parser.add_argument("--temperature", type=float, default=0.1)
    parser.add_argument("--save-json", default="", help="Optional output JSON path")
    args = parser.parse_args()

    if not args.url:
        raise RuntimeError("Missing endpoint URL. Pass --url or set HF_AF3_ENDPOINT_URL.")
    if not Path(args.audio).is_file():
        raise FileNotFoundError(f"Audio file not found: {args.audio}")

    prompt = args.prompt
    budget = int(args.max_new_tokens)
    temp = float(args.temperature)
    if args.think_long:
        # Only upgrade values the user left at their CLI defaults.
        if prompt == DEFAULT_AF3_PROMPT:
            prompt = DEFAULT_AF3_PROMPT_THINK_LONG
        if budget == 1400:
            budget = 3200
        if abs(temp - 0.1) < 1e-9:
            temp = 0.2

    inputs = {
        "prompt": prompt,
        "audio_base64": load_audio_b64(args.audio, target_sr=16000),
        "sample_rate": 16000,
        "max_new_tokens": budget,
        "temperature": temp,
    }
    if args.mode != "auto":
        inputs["think_mode"] = bool(args.mode == "think")

    result = send(args.url, args.token, {"inputs": inputs})
    try:
        print(json.dumps(result, indent=2, ensure_ascii=False))
    except UnicodeEncodeError:
        # Fallback for Windows cp1252 terminals when model emits non-ASCII punctuation.
        print(json.dumps(result, indent=2, ensure_ascii=True))
    if args.save_json:
        Path(args.save_json).write_text(
            json.dumps(result, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        print(f"Saved: {args.save_json}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
scripts/endpoint/test_qwen_caption_endpoint.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Send one audio file to a Qwen caption endpoint and print/save the response.
4
+
5
+ Request contract expected by templates/hf-qwen-caption-endpoint/handler.py:
6
+ {
7
+ "inputs": {
8
+ "prompt": "...",
9
+ "audio_base64": "...",
10
+ "sample_rate": 16000,
11
+ "max_new_tokens": 384,
12
+ "temperature": 0.1
13
+ }
14
+ }
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import base64
21
+ import json
22
+ import os
23
+ from pathlib import Path
24
+ from urllib.error import HTTPError, URLError
25
+ from urllib.request import Request, urlopen
26
+
27
+ import soundfile as sf
28
+
29
+ from qwen_audio_captioning import DEFAULT_ANALYSIS_PROMPT, load_audio_mono
30
+
31
+
32
def read_dotenv_value(path: str, key: str) -> str:
    """Return *key*'s value from a dotenv-style file, or "" when absent.

    Blank lines, comment lines, and lines without "=" are skipped; surrounding
    single or double quotes are stripped from the returned value.
    """
    env_file = Path(path)
    if not env_file.exists():
        return ""
    for raw_line in env_file.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        name, _, value = entry.partition("=")
        if name.strip() == key:
            return value.strip().strip('"').strip("'")
    return ""
+
45
+
46
def load_audio_b64(audio_path: str, target_sr: int) -> str:
    """Decode *audio_path* to mono at *target_sr* and return base64 WAV bytes."""
    samples, rate = load_audio_mono(audio_path, target_sr=target_sr)

    import io

    wav_buf = io.BytesIO()
    sf.write(wav_buf, samples, int(rate), format="WAV")
    return base64.b64encode(wav_buf.getvalue()).decode("utf-8")
54
+
55
+
56
def send(url: str, token: str, payload: dict) -> dict:
    """POST *payload* as JSON to *url* with a bearer token; return the parsed reply."""
    request = Request(
        url=url,
        method="POST",
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
        },
    )
    try:
        with urlopen(request, timeout=600) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"HTTP {exc.code}: {detail}") from exc
    except URLError as exc:
        raise RuntimeError(f"Network error: {exc}") from exc
75
+
76
+
77
def main() -> int:
    """CLI entry point: send one audio file to a Qwen caption endpoint and show the reply."""
    parser = argparse.ArgumentParser(description="Test Qwen caption endpoint")
    parser.add_argument(
        "--url",
        default=os.getenv("HF_QWEN_ENDPOINT_URL", "") or read_dotenv_value(".env", "HF_QWEN_ENDPOINT_URL"),
        required=False,
    )
    parser.add_argument(
        "--token",
        default=(
            os.getenv("HF_TOKEN", "")
            or os.getenv("hf_token", "")
            or read_dotenv_value(".env", "HF_TOKEN")
            or read_dotenv_value(".env", "hf_token")
        ),
        required=False,
    )
    parser.add_argument("--audio", required=True, help="Path to local audio file")
    parser.add_argument("--prompt", default=DEFAULT_ANALYSIS_PROMPT)
    parser.add_argument("--sample-rate", type=int, default=16000)
    parser.add_argument("--max-new-tokens", type=int, default=384)
    parser.add_argument("--temperature", type=float, default=0.1)
    parser.add_argument("--save-json", default="", help="Optional output JSON path")
    args = parser.parse_args()

    # Fail early with actionable messages before doing any audio work.
    if not args.url:
        raise RuntimeError("Missing endpoint URL. Pass --url or set HF_QWEN_ENDPOINT_URL.")
    if not args.token:
        raise RuntimeError("Missing HF token. Pass --token or set HF_TOKEN.")
    if not Path(args.audio).is_file():
        raise FileNotFoundError(f"Audio file not found: {args.audio}")

    request_inputs = {
        "prompt": args.prompt,
        "audio_base64": load_audio_b64(args.audio, target_sr=args.sample_rate),
        "sample_rate": args.sample_rate,
        "max_new_tokens": args.max_new_tokens,
        "temperature": args.temperature,
    }
    result = send(args.url, args.token, {"inputs": request_inputs})
    rendered = json.dumps(result, indent=2, ensure_ascii=False)
    print(rendered)
    if args.save_json:
        Path(args.save_json).write_text(rendered, encoding="utf-8")
        print(f"Saved: {args.save_json}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
scripts/hf_clone.py CHANGED
@@ -5,6 +5,8 @@ Bootstrap this project into your own Hugging Face Space and/or Endpoint repo.
5
  Examples:
6
  python scripts/hf_clone.py space --repo-id your-name/ace-step-lora-studio
7
  python scripts/hf_clone.py endpoint --repo-id your-name/ace-step-endpoint
 
 
8
  python scripts/hf_clone.py all --space-repo-id your-name/ace-step-lora-studio --endpoint-repo-id your-name/ace-step-endpoint
9
  """
10
 
@@ -172,10 +174,95 @@ def _stage_endpoint_snapshot(staging_dir: Path) -> tuple[int, int]:
172
  return copied, bytes_total
173
 
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  def _resolve_token(arg_token: str) -> str | None:
176
  if arg_token:
177
  return arg_token
178
- return os.getenv("HF_TOKEN")
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
 
181
  def _ensure_repo(
@@ -266,6 +353,72 @@ def clone_endpoint(repo_id: str, private: bool, token: str | None, dry_run: bool
266
  print(f"[endpoint] uploaded to https://huggingface.co/{repo_id}")
267
 
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  def build_parser() -> argparse.ArgumentParser:
270
  parser = argparse.ArgumentParser(description="Clone this project into your own HF Space/Endpoint repos.")
271
  subparsers = parser.add_subparsers(dest="cmd", required=True)
@@ -282,6 +435,31 @@ def build_parser() -> argparse.ArgumentParser:
282
  p_endpoint.add_argument("--token", type=str, default="", help="HF token (default: HF_TOKEN env var).")
283
  p_endpoint.add_argument("--dry-run", action="store_true", help="Stage files only; do not upload.")
284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  p_all = subparsers.add_parser("all", help="Run both Space and Endpoint bootstrap.")
286
  p_all.add_argument("--space-repo-id", required=True, help="Target space repo id.")
287
  p_all.add_argument("--endpoint-repo-id", required=True, help="Target endpoint model repo id.")
@@ -305,6 +483,12 @@ def main() -> int:
305
  clone_space(args.repo_id, private=bool(args.private), token=token, dry_run=bool(args.dry_run))
306
  elif args.cmd == "endpoint":
307
  clone_endpoint(args.repo_id, private=bool(args.private), token=token, dry_run=bool(args.dry_run))
 
 
 
 
 
 
308
  else:
309
  clone_space(args.space_repo_id, private=bool(args.space_private), token=token, dry_run=bool(args.dry_run))
310
  clone_endpoint(
 
5
  Examples:
6
  python scripts/hf_clone.py space --repo-id your-name/ace-step-lora-studio
7
  python scripts/hf_clone.py endpoint --repo-id your-name/ace-step-endpoint
8
+ python scripts/hf_clone.py af3-endpoint --repo-id your-name/af3-caption-endpoint
9
+ python scripts/hf_clone.py af3-nvidia-endpoint --repo-id your-name/af3-nvidia-endpoint
10
  python scripts/hf_clone.py all --space-repo-id your-name/ace-step-lora-studio --endpoint-repo-id your-name/ace-step-endpoint
11
  """
12
 
 
174
  return copied, bytes_total
175
 
176
 
177
+ def _iter_qwen_endpoint_template_paths() -> Iterable[tuple[Path, Path]]:
178
+ template_dir = PROJECT_ROOT / "templates" / "hf-qwen-caption-endpoint"
179
+ mapping = {
180
+ "handler.py": Path("handler.py"),
181
+ "requirements.txt": Path("requirements.txt"),
182
+ "README.md": Path("README.md"),
183
+ }
184
+ for src_name, dst_rel in mapping.items():
185
+ src = template_dir / src_name
186
+ if src.exists():
187
+ yield src, dst_rel
188
+
189
+
190
+ def _stage_qwen_endpoint_snapshot(staging_dir: Path) -> tuple[int, int]:
191
+ copied = 0
192
+ bytes_total = 0
193
+ for src, rel_dst in _iter_qwen_endpoint_template_paths():
194
+ dst = staging_dir / rel_dst
195
+ _copy_file(src, dst)
196
+ copied += 1
197
+ bytes_total += src.stat().st_size
198
+ return copied, bytes_total
199
+
200
+
201
+ def _iter_af3_endpoint_template_paths() -> Iterable[tuple[Path, Path]]:
202
+ template_dir = PROJECT_ROOT / "templates" / "hf-af3-caption-endpoint"
203
+ mapping = {
204
+ "handler.py": Path("handler.py"),
205
+ "requirements.txt": Path("requirements.txt"),
206
+ "README.md": Path("README.md"),
207
+ }
208
+ for src_name, dst_rel in mapping.items():
209
+ src = template_dir / src_name
210
+ if src.exists():
211
+ yield src, dst_rel
212
+
213
+
214
+ def _stage_af3_endpoint_snapshot(staging_dir: Path) -> tuple[int, int]:
215
+ copied = 0
216
+ bytes_total = 0
217
+ for src, rel_dst in _iter_af3_endpoint_template_paths():
218
+ dst = staging_dir / rel_dst
219
+ _copy_file(src, dst)
220
+ copied += 1
221
+ bytes_total += src.stat().st_size
222
+ return copied, bytes_total
223
+
224
+
225
+ def _iter_af3_nvidia_endpoint_template_paths() -> Iterable[tuple[Path, Path]]:
226
+ template_dir = PROJECT_ROOT / "templates" / "hf-af3-nvidia-endpoint"
227
+ mapping = {
228
+ "handler.py": Path("handler.py"),
229
+ "requirements.txt": Path("requirements.txt"),
230
+ "README.md": Path("README.md"),
231
+ }
232
+ for src_name, dst_rel in mapping.items():
233
+ src = template_dir / src_name
234
+ if src.exists():
235
+ yield src, dst_rel
236
+
237
+
238
+ def _stage_af3_nvidia_endpoint_snapshot(staging_dir: Path) -> tuple[int, int]:
239
+ copied = 0
240
+ bytes_total = 0
241
+ for src, rel_dst in _iter_af3_nvidia_endpoint_template_paths():
242
+ dst = staging_dir / rel_dst
243
+ _copy_file(src, dst)
244
+ copied += 1
245
+ bytes_total += src.stat().st_size
246
+ return copied, bytes_total
247
+
248
+
249
  def _resolve_token(arg_token: str) -> str | None:
250
  if arg_token:
251
  return arg_token
252
+ env_token = os.getenv("HF_TOKEN") or os.getenv("hf_token")
253
+ if env_token:
254
+ return env_token
255
+
256
+ dotenv = PROJECT_ROOT / ".env"
257
+ if dotenv.exists():
258
+ for raw in dotenv.read_text(encoding="utf-8").splitlines():
259
+ line = raw.strip()
260
+ if not line or line.startswith("#") or "=" not in line:
261
+ continue
262
+ k, v = line.split("=", 1)
263
+ if k.strip() in {"HF_TOKEN", "hf_token"}:
264
+ return v.strip().strip('"').strip("'")
265
+ return None
266
 
267
 
268
  def _ensure_repo(
 
353
  print(f"[endpoint] uploaded to https://huggingface.co/{repo_id}")
354
 
355
 
356
+ def clone_qwen_endpoint(repo_id: str, private: bool, token: str | None, dry_run: bool) -> None:
357
+ with tempfile.TemporaryDirectory(prefix="hf_qwen_endpoint_clone_") as tmp:
358
+ staging = Path(tmp)
359
+ copied, bytes_total = _stage_qwen_endpoint_snapshot(staging)
360
+ print(f"[qwen-endpoint] staged files: {copied}, size: {_fmt_mb(bytes_total)}")
361
+
362
+ if dry_run:
363
+ print("[qwen-endpoint] dry-run complete (nothing uploaded).")
364
+ return
365
+
366
+ api = HfApi(token=token)
367
+ _ensure_repo(api, repo_id=repo_id, repo_type="model", private=private)
368
+ _upload_snapshot(
369
+ api,
370
+ repo_id=repo_id,
371
+ repo_type="model",
372
+ folder_path=staging,
373
+ commit_message="Bootstrap Qwen2-Audio custom endpoint repo",
374
+ )
375
+ print(f"[qwen-endpoint] uploaded to https://huggingface.co/{repo_id}")
376
+
377
+
378
+ def clone_af3_endpoint(repo_id: str, private: bool, token: str | None, dry_run: bool) -> None:
379
+ with tempfile.TemporaryDirectory(prefix="hf_af3_endpoint_clone_") as tmp:
380
+ staging = Path(tmp)
381
+ copied, bytes_total = _stage_af3_endpoint_snapshot(staging)
382
+ print(f"[af3-endpoint] staged files: {copied}, size: {_fmt_mb(bytes_total)}")
383
+
384
+ if dry_run:
385
+ print("[af3-endpoint] dry-run complete (nothing uploaded).")
386
+ return
387
+
388
+ api = HfApi(token=token)
389
+ _ensure_repo(api, repo_id=repo_id, repo_type="model", private=private)
390
+ _upload_snapshot(
391
+ api,
392
+ repo_id=repo_id,
393
+ repo_type="model",
394
+ folder_path=staging,
395
+ commit_message="Bootstrap Audio Flamingo 3 custom endpoint repo",
396
+ )
397
+ print(f"[af3-endpoint] uploaded to https://huggingface.co/{repo_id}")
398
+
399
+
400
+ def clone_af3_nvidia_endpoint(repo_id: str, private: bool, token: str | None, dry_run: bool) -> None:
401
+ with tempfile.TemporaryDirectory(prefix="hf_af3_nvidia_endpoint_clone_") as tmp:
402
+ staging = Path(tmp)
403
+ copied, bytes_total = _stage_af3_nvidia_endpoint_snapshot(staging)
404
+ print(f"[af3-nvidia-endpoint] staged files: {copied}, size: {_fmt_mb(bytes_total)}")
405
+
406
+ if dry_run:
407
+ print("[af3-nvidia-endpoint] dry-run complete (nothing uploaded).")
408
+ return
409
+
410
+ api = HfApi(token=token)
411
+ _ensure_repo(api, repo_id=repo_id, repo_type="model", private=private)
412
+ _upload_snapshot(
413
+ api,
414
+ repo_id=repo_id,
415
+ repo_type="model",
416
+ folder_path=staging,
417
+ commit_message="Bootstrap Audio Flamingo 3 NVIDIA-stack endpoint repo",
418
+ )
419
+ print(f"[af3-nvidia-endpoint] uploaded to https://huggingface.co/{repo_id}")
420
+
421
+
422
  def build_parser() -> argparse.ArgumentParser:
423
  parser = argparse.ArgumentParser(description="Clone this project into your own HF Space/Endpoint repos.")
424
  subparsers = parser.add_subparsers(dest="cmd", required=True)
 
435
  p_endpoint.add_argument("--token", type=str, default="", help="HF token (default: HF_TOKEN env var).")
436
  p_endpoint.add_argument("--dry-run", action="store_true", help="Stage files only; do not upload.")
437
 
438
+ p_qwen_endpoint = subparsers.add_parser("qwen-endpoint", help="Create/update Qwen2-Audio custom endpoint repo.")
439
+ p_qwen_endpoint.add_argument("--repo-id", required=True, help="Target model repo id, e.g. username/my-qwen-endpoint.")
440
+ p_qwen_endpoint.add_argument("--private", action="store_true", help="Create repo as private.")
441
+ p_qwen_endpoint.add_argument("--token", type=str, default="", help="HF token (default: HF_TOKEN env var).")
442
+ p_qwen_endpoint.add_argument("--dry-run", action="store_true", help="Stage files only; do not upload.")
443
+
444
+ p_af3_endpoint = subparsers.add_parser("af3-endpoint", help="Create/update Audio Flamingo 3 custom endpoint repo.")
445
+ p_af3_endpoint.add_argument("--repo-id", required=True, help="Target model repo id, e.g. username/my-af3-endpoint.")
446
+ p_af3_endpoint.add_argument("--private", action="store_true", help="Create repo as private.")
447
+ p_af3_endpoint.add_argument("--token", type=str, default="", help="HF token (default: HF_TOKEN env var).")
448
+ p_af3_endpoint.add_argument("--dry-run", action="store_true", help="Stage files only; do not upload.")
449
+
450
+ p_af3_nvidia_endpoint = subparsers.add_parser(
451
+ "af3-nvidia-endpoint",
452
+ help="Create/update AF3 NVIDIA-stack (llava+stage35) endpoint repo.",
453
+ )
454
+ p_af3_nvidia_endpoint.add_argument(
455
+ "--repo-id",
456
+ required=True,
457
+ help="Target model repo id, e.g. username/my-af3-nvidia-endpoint.",
458
+ )
459
+ p_af3_nvidia_endpoint.add_argument("--private", action="store_true", help="Create repo as private.")
460
+ p_af3_nvidia_endpoint.add_argument("--token", type=str, default="", help="HF token (default: HF_TOKEN env var).")
461
+ p_af3_nvidia_endpoint.add_argument("--dry-run", action="store_true", help="Stage files only; do not upload.")
462
+
463
  p_all = subparsers.add_parser("all", help="Run both Space and Endpoint bootstrap.")
464
  p_all.add_argument("--space-repo-id", required=True, help="Target space repo id.")
465
  p_all.add_argument("--endpoint-repo-id", required=True, help="Target endpoint model repo id.")
 
483
  clone_space(args.repo_id, private=bool(args.private), token=token, dry_run=bool(args.dry_run))
484
  elif args.cmd == "endpoint":
485
  clone_endpoint(args.repo_id, private=bool(args.private), token=token, dry_run=bool(args.dry_run))
486
+ elif args.cmd == "qwen-endpoint":
487
+ clone_qwen_endpoint(args.repo_id, private=bool(args.private), token=token, dry_run=bool(args.dry_run))
488
+ elif args.cmd == "af3-endpoint":
489
+ clone_af3_endpoint(args.repo_id, private=bool(args.private), token=token, dry_run=bool(args.dry_run))
490
+ elif args.cmd == "af3-nvidia-endpoint":
491
+ clone_af3_nvidia_endpoint(args.repo_id, private=bool(args.private), token=token, dry_run=bool(args.dry_run))
492
  else:
493
  clone_space(args.space_repo_id, private=bool(args.space_private), token=token, dry_run=bool(args.dry_run))
494
  clone_endpoint(
scripts/jobs/submit_hf_qwen_caption_job.ps1 ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Submit a Hugging Face Job that captions an audio dataset with Qwen2-Audio.
# The job clones $CodeRepo inside the container, installs requirements, and
# runs scripts/annotations/qwen_caption_dataset.py with the options below.
param(
    [string]$CodeRepo = "YOUR_USERNAME/ace-step-lora-studio",
    [string]$DatasetRepo = "",
    [string]$DatasetRevision = "main",
    [string]$DatasetSubdir = "",
    [string]$Backend = "local",
    [string]$ModelId = "Qwen/Qwen2-Audio-7B-Instruct",
    [string]$EndpointUrl = "",
    [string]$Device = "auto",
    [string]$TorchDtype = "auto",
    [string]$Prompt = "",
    [double]$SegmentSeconds = 30.0,
    [double]$OverlapSeconds = 2.0,
    [int]$MaxNewTokens = 384,
    [double]$Temperature = 0.1,
    [string]$OutputDir = "/workspace/qwen_annotations",
    [string]$UploadRepo = "",
    [switch]$UploadPrivate,
    [switch]$CopyAudio,
    [switch]$KeepRawOutputs,
    [switch]$WriteInplaceSidecars,
    [string]$Flavor = "a10g-large",
    [string]$Timeout = "8h",
    [switch]$Detach
)

$ErrorActionPreference = "Stop"

# Validate required/conditional parameters before building the job command.
if (-not $DatasetRepo) {
    throw "Provide -DatasetRepo (HF dataset repo containing audio files)."
}

if ($Backend -eq "hf_endpoint" -and -not $EndpointUrl) {
    throw "Backend hf_endpoint requires -EndpointUrl."
}

# HF_TOKEN is injected into the job as a secret, not passed on the CLI.
$secretArgs = @("--secrets", "HF_TOKEN")

# Optional flags are built as strings and interpolated into the bash
# here-string below; empty strings become harmless blank continuations.
$datasetSubdirArgs = ""
if ($DatasetSubdir) {
    $datasetSubdirArgs = "--dataset-subdir `"$DatasetSubdir`""
}

$endpointArgs = ""
if ($EndpointUrl) {
    $endpointArgs = "--endpoint-url `"$EndpointUrl`""
}

$uploadArgs = ""
if ($UploadRepo) {
    $uploadArgs = "--upload-repo `"$UploadRepo`""
    if ($UploadPrivate.IsPresent) {
        $uploadArgs += " --upload-private"
    }
}

$copyAudioArg = ""
if ($CopyAudio.IsPresent) {
    $copyAudioArg = "--copy-audio"
}

$keepRawArg = ""
if ($KeepRawOutputs.IsPresent) {
    $keepRawArg = "--keep-raw-outputs"
}

$writeInplaceArg = ""
if ($WriteInplaceSidecars.IsPresent) {
    $writeInplaceArg = "--write-inplace-sidecars"
}

$promptArg = ""
if ($Prompt) {
    # Escape double quotes so the prompt survives the bash -lc invocation.
    $escapedPrompt = $Prompt.Replace('"', '\"')
    $promptArg = "--prompt `"$escapedPrompt`""
}

$detachArg = ""
if ($Detach.IsPresent) {
    $detachArg = "--detach"
}

# Expandable here-string: PowerShell variables are substituted now, producing
# the bash script the job container will execute.
$jobCommand = @"
set -e
python -m pip install --no-cache-dir --upgrade pip
git clone https://huggingface.co/$CodeRepo /workspace/code
cd /workspace/code
python -m pip install --no-cache-dir -r requirements.txt
python scripts/annotations/qwen_caption_dataset.py \
  --dataset-repo "$DatasetRepo" \
  --dataset-revision "$DatasetRevision" \
  $datasetSubdirArgs \
  --backend "$Backend" \
  --model-id "$ModelId" \
  $endpointArgs \
  --device "$Device" \
  --torch-dtype "$TorchDtype" \
  --segment-seconds $SegmentSeconds \
  --overlap-seconds $OverlapSeconds \
  --max-new-tokens $MaxNewTokens \
  --temperature $Temperature \
  --output-dir "$OutputDir" \
  $promptArg \
  $copyAudioArg \
  $keepRawArg \
  $writeInplaceArg \
  $uploadArgs
"@

# Assemble the `hf jobs run` argument list; options precede the image name.
$argsList = @(
    "jobs", "run",
    "--flavor", $Flavor,
    "--timeout", $Timeout
) + $secretArgs

if ($detachArg) {
    $argsList += $detachArg
}

$argsList += @(
    "pytorch/pytorch:2.5.1-cuda12.1-cudnn9-runtime",
    "bash", "-lc", $jobCommand
)

Write-Host "Submitting Qwen caption HF Job with flavor=$Flavor timeout=$Timeout ..."
Write-Host "Dataset repo: $DatasetRepo"
Write-Host "Code repo: $CodeRepo"
if ($UploadRepo) {
    Write-Host "Will upload exported annotations to: $UploadRepo"
}

& hf @argsList
133
+
scripts/pipeline/refine_dataset_json_with_openai.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Refine existing dataset JSON annotations into Ace-Step 1.5 LoRA-ready sidecars.
4
+
5
+ This script:
6
+ 1. Reads existing JSON files (typically containing AF3 `generated_text`).
7
+ 2. Uses OpenAI cleanup (optionally with web search) to normalize/expand metadata.
8
+ 3. Writes normalized sidecar JSON in-place (or to an output directory).
9
+ 4. Creates backup copies before overwrite by default.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import json
16
+ import shutil
17
+ import sys
18
+ from pathlib import Path
19
+ from typing import Dict, Iterable, List, Optional, Tuple
20
+
21
+ from tqdm import tqdm
22
+
23
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
24
+ if str(PROJECT_ROOT) not in sys.path:
25
+ sys.path.insert(0, str(PROJECT_ROOT))
26
+
27
+ from af3_chatgpt_pipeline import ( # noqa: E402
28
+ DEFAULT_AF3_PROMPT,
29
+ DEFAULT_OPENAI_MODEL,
30
+ build_lora_sidecar,
31
+ cleanup_with_chatgpt,
32
+ )
33
+ from qwen_audio_captioning import AUDIO_EXTENSIONS # noqa: E402
34
+ from utils.env_config import get_env, load_project_env # noqa: E402
35
+
36
+
37
def _parse_args() -> argparse.Namespace:
    """Load project env vars, then parse CLI options for the refinement run."""
    load_project_env()
    parser = argparse.ArgumentParser(
        description="Refine dataset JSONs into Ace-Step 1.5 LoRA-ready metadata using OpenAI."
    )
    parser.add_argument("--dataset-dir", default="train-dataset", help="Directory containing source JSON files")
    parser.add_argument("--recursive", action="store_true", help="Include nested folders")
    parser.add_argument("--pattern", default="*.json", help="Filename glob pattern")
    parser.add_argument("--output-dir", default="", help="Optional output folder. Default: overwrite in place")
    parser.add_argument(
        "--backup-ext",
        default=".backup-before-openai.json",
        help="Backup extension for in-place writes",
    )
    parser.add_argument("--no-backup", action="store_true", help="Disable backup creation for in-place writes")
    parser.add_argument("--limit", type=int, default=0, help="Process only first N files (0 = all)")
    parser.add_argument("--artist-default", default="Andrew Spacey", help="Fallback artist if parsing fails")
    parser.add_argument("--user-context", default="", help="Extra guidance passed to OpenAI cleanup")
    parser.add_argument("--openai-api-key", default="", help="Overrides OPENAI_API_KEY from .env")
    parser.add_argument(
        "--openai-model",
        default=get_env("OPENAI_MODEL", "openai_model", default=DEFAULT_OPENAI_MODEL),
        help="OpenAI model id",
    )
    parser.add_argument(
        "--enable-web-search",
        action="store_true",
        help="Enable web search tool for artist/track context lookup",
    )
    parser.add_argument("--fail-fast", action="store_true", help="Stop on first failure")
    parser.add_argument("--dry-run", action="store_true", help="Do not write files")
    return parser.parse_args()
+ return p.parse_args()
69
+
70
+
71
+ def _iter_json_files(dataset_dir: Path, pattern: str, recursive: bool) -> List[Path]:
72
+ if recursive:
73
+ return sorted(dataset_dir.rglob(pattern))
74
+ return sorted(dataset_dir.glob(pattern))
75
+
76
+
77
+ def _load_json(path: Path) -> Dict:
78
+ # Handle both standard UTF-8 and UTF-8 with BOM.
79
+ text = path.read_text(encoding="utf-8-sig")
80
+ data = json.loads(text)
81
+ if not isinstance(data, dict):
82
+ raise ValueError("Top-level JSON is not an object")
83
+ return data
84
+
85
+
86
def _detect_audio_path(json_path: Path) -> Optional[Path]:
    """Locate the sibling audio file that shares *json_path*'s stem, if any."""
    stem = json_path.stem
    for ext in AUDIO_EXTENSIONS:
        sibling = json_path.with_suffix(ext)
        if sibling.exists():
            return sibling
    # No direct hit: scan the folder, matching extensions case-insensitively.
    for entry in json_path.parent.iterdir():
        if entry.is_file() and entry.stem == stem and entry.suffix.lower() in AUDIO_EXTENSIONS:
            return entry
    return None
98
+
99
+
100
+ def _try_duration_seconds(audio_path: Optional[Path], fallback: float = 0.0) -> float:
101
+ if audio_path is None or not audio_path.exists():
102
+ return float(fallback or 0.0)
103
+ try:
104
+ import soundfile as sf
105
+
106
+ info = sf.info(str(audio_path))
107
+ if info.samplerate and info.frames:
108
+ return float(info.frames) / float(info.samplerate)
109
+ except Exception:
110
+ pass
111
+ return float(fallback or 0.0)
112
+
113
+
114
+ def _parse_artist_track_from_stem(stem: str, artist_default: str) -> Tuple[str, str]:
115
+ parts = stem.split(" - ", 1)
116
+ if len(parts) == 2:
117
+ artist, track = parts[0].strip(), parts[1].strip()
118
+ if artist and track:
119
+ return artist, track
120
+ return artist_default.strip() or "Unknown Artist", stem.strip()
121
+
122
+
123
+ def _extract_raw_analysis(data: Dict) -> str:
124
+ checks: Iterable[object] = (
125
+ data.get("generated_text"),
126
+ data.get("af3_analysis"),
127
+ data.get("analysis_long"),
128
+ data.get("analysis_short"),
129
+ (data.get("music_analysis") or {}).get("summary_long") if isinstance(data.get("music_analysis"), dict) else None,
130
+ data.get("caption"),
131
+ )
132
+ for value in checks:
133
+ if isinstance(value, str) and value.strip():
134
+ return value.strip()
135
+ return ""
136
+
137
+
138
+ def _ensure_output_path(src_json: Path, output_dir: Optional[Path]) -> Path:
139
+ if output_dir:
140
+ output_dir.mkdir(parents=True, exist_ok=True)
141
+ return output_dir / src_json.name
142
+ return src_json
143
+
144
+
145
+ def _create_backup(src: Path, backup_ext: str) -> Optional[Path]:
146
+ backup_path = src.with_name(src.stem + backup_ext)
147
+ if backup_path.exists():
148
+ return backup_path
149
+ shutil.copy2(src, backup_path)
150
+ return backup_path
151
+
152
+
153
def _finalize_sidecar(
    *,
    cleaned: Dict,
    raw_analysis: str,
    duration: float,
    source_audio: Optional[Path],
    source_json: Path,
    artist: str,
    track_name: str,
    openai_model: str,
    web_search_used: bool,
) -> Dict:
    """Assemble the final LoRA sidecar dict for one refined JSON file.

    Wraps the ChatGPT-cleaned fields via ``build_lora_sidecar`` and stamps
    artist/track plus a provenance block on top.
    """
    # Empty string keeps the sidecar JSON-serializable when no audio was found.
    source_audio_str = str(source_audio) if source_audio else ""
    sidecar = build_lora_sidecar(
        cleaned,
        af3_text=raw_analysis,
        af3_prompt=DEFAULT_AF3_PROMPT,
        # Marks that this sidecar was refined from an existing JSON rather
        # than produced by a fresh AF3 inference run.
        af3_backend="existing_json_refine",
        af3_model_id="nvidia/audio-flamingo-3",
        source_audio=source_audio_str,
        duration=duration,
        chatgpt_model=openai_model,
        web_search_used=web_search_used,
    )
    sidecar["artist"] = artist
    sidecar["track_name"] = track_name
    # Provenance so any sidecar can be traced back to its input files.
    sidecar["source"] = {
        "input_json": str(source_json),
        "input_audio": source_audio_str,
        "refined_from_existing_json": True,
    }
    return sidecar
185
+
186
+
187
def main() -> int:
    """CLI entry point: refine every matched sidecar JSON via ChatGPT cleanup.

    Returns 0 when all files succeed, 2 when any file failed; prints a JSON
    summary to stdout either way.
    """
    args = _parse_args()
    dataset_dir = Path(args.dataset_dir)
    if not dataset_dir.exists():
        raise FileNotFoundError(f"Dataset directory not found: {dataset_dir}")

    openai_key = args.openai_api_key or get_env("OPENAI_API_KEY", "openai_api_key")
    if not openai_key:
        raise RuntimeError("Missing OPENAI_API_KEY (set in .env or pass --openai-api-key).")

    files = _iter_json_files(dataset_dir, pattern=args.pattern, recursive=bool(args.recursive))
    if args.limit and args.limit > 0:
        files = files[: int(args.limit)]
    if not files:
        raise RuntimeError(f"No files matched {args.pattern} in {dataset_dir}")

    output_dir = Path(args.output_dir) if args.output_dir else None
    failures: List[str] = []
    saved: List[str] = []
    backups: List[str] = []

    for json_path in tqdm(files, desc="Refine JSON"):
        try:
            data = _load_json(json_path)
            raw_analysis = _extract_raw_analysis(data)
            if not raw_analysis:
                raise ValueError("No analysis text found (generated_text/analysis/caption missing)")

            # Filename stem supplies defaults; explicit JSON fields override.
            artist, track_name = _parse_artist_track_from_stem(json_path.stem, args.artist_default)
            artist = str(data.get("artist") or artist).strip() or artist
            track_name = str(data.get("track_name") or data.get("title") or track_name).strip() or track_name

            source_audio = _detect_audio_path(json_path)
            duration = _try_duration_seconds(source_audio, fallback=float(data.get("duration") or 0.0))

            try:
                cleaned = cleanup_with_chatgpt(
                    raw_analysis,
                    openai_api_key=openai_key,
                    model=args.openai_model,
                    duration=duration,
                    user_context=args.user_context,
                    artist_name=artist,
                    track_name=track_name,
                    enable_web_search=bool(args.enable_web_search),
                )
                web_used = bool(args.enable_web_search)
            except Exception:
                # If web-search tool compatibility fails on this runtime, retry without it.
                if not args.enable_web_search:
                    raise
                cleaned = cleanup_with_chatgpt(
                    raw_analysis,
                    openai_api_key=openai_key,
                    model=args.openai_model,
                    duration=duration,
                    user_context=args.user_context,
                    artist_name=artist,
                    track_name=track_name,
                    enable_web_search=False,
                )
                web_used = False

            sidecar = _finalize_sidecar(
                cleaned=cleaned,
                raw_analysis=raw_analysis,
                duration=duration,
                source_audio=source_audio,
                source_json=json_path,
                artist=artist,
                track_name=track_name,
                openai_model=args.openai_model,
                web_search_used=web_used,
            )

            out_path = _ensure_output_path(json_path, output_dir)
            # Back up only when overwriting in place (no separate output dir).
            if not args.dry_run and output_dir is None and not args.no_backup:
                backup = _create_backup(json_path, args.backup_ext)
                if backup:
                    backups.append(str(backup))

            if not args.dry_run:
                out_path.write_text(json.dumps(sidecar, indent=2, ensure_ascii=False), encoding="utf-8")
                saved.append(str(out_path))
        except Exception as exc:
            # Per-file failures are collected so one bad file does not abort
            # the whole batch (unless --fail-fast is set).
            failures.append(f"{json_path.name}: {exc}")
            if args.fail_fast:
                break

    summary = {
        "processed": len(files),
        "saved": len(saved),
        "failed": len(failures),
        "backup_count": len(backups),
        "output_mode": "separate_dir" if output_dir else ("dry_run" if args.dry_run else "in_place"),
        "sample_saved": saved[:10],
        "sample_failures": failures[:10],
    }
    print(json.dumps(summary, indent=2, ensure_ascii=False))
    return 0 if not failures else 2
287
+
288
+
289
if __name__ == "__main__":
    # Propagate main()'s exit code (0 ok, 2 partial failure) to the shell.
    raise SystemExit(main())
291
+
scripts/pipeline/run_af3_chatgpt_pipeline.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Run AF3 -> ChatGPT cleanup pipeline on one file or a dataset folder.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import json
10
+ import sys
11
+ from pathlib import Path
12
+ from typing import List
13
+
14
+ from tqdm import tqdm
15
+
16
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
17
+ if str(PROJECT_ROOT) not in sys.path:
18
+ sys.path.insert(0, str(PROJECT_ROOT))
19
+
20
+ from af3_chatgpt_pipeline import (
21
+ DEFAULT_AF3_MODEL_ID,
22
+ DEFAULT_AF3_PROMPT,
23
+ DEFAULT_AF3_PROMPT_THINK_LONG,
24
+ DEFAULT_OPENAI_MODEL,
25
+ AF3EndpointClient,
26
+ AF3LocalClient,
27
+ run_af3_chatgpt_pipeline,
28
+ )
29
+ from qwen_audio_captioning import list_audio_files
30
+ from utils.env_config import get_env, load_project_env
31
+
32
+
33
+ def build_parser() -> argparse.ArgumentParser:
34
+ load_project_env()
35
+ p = argparse.ArgumentParser(description="AF3 + ChatGPT LoRA metadata pipeline")
36
+ p.add_argument("--audio", default="", help="Single audio path")
37
+ p.add_argument("--dataset-dir", default="", help="Dataset folder")
38
+ p.add_argument("--backend", default="hf_endpoint", choices=["hf_endpoint", "local"])
39
+ p.add_argument("--endpoint-url", default=get_env("HF_AF3_ENDPOINT_URL", "hf_af3_endpoint_url"))
40
+ p.add_argument("--hf-token", default="")
41
+ p.add_argument("--model-id", default=get_env("AF3_MODEL_ID", "af3_model_id", default=DEFAULT_AF3_MODEL_ID))
42
+ p.add_argument("--device", default="auto", choices=["auto", "cuda", "cpu", "mps"])
43
+ p.add_argument("--torch-dtype", default="auto", choices=["auto", "float16", "bfloat16", "float32"])
44
+ p.add_argument("--prompt", default=DEFAULT_AF3_PROMPT)
45
+ p.add_argument(
46
+ "--think-long",
47
+ action="store_true",
48
+ help="Use long-form AF3 prompt + higher token budget defaults.",
49
+ )
50
+ p.add_argument("--af3-max-new-tokens", type=int, default=1400)
51
+ p.add_argument("--af3-temperature", type=float, default=0.1)
52
+ p.add_argument("--openai-api-key", default="")
53
+ p.add_argument("--openai-model", default=get_env("OPENAI_MODEL", "openai_model", default=DEFAULT_OPENAI_MODEL))
54
+ p.add_argument("--user-context", default="")
55
+ p.add_argument("--artist-name", default="")
56
+ p.add_argument("--track-name", default="")
57
+ p.add_argument("--enable-web-search", action="store_true")
58
+ p.add_argument("--output-dir", default="", help="If set, save sidecars here instead of next to audio")
59
+ return p
60
+
61
+
62
+ def resolve_audio_paths(args) -> List[str]:
63
+ if args.audio:
64
+ p = Path(args.audio)
65
+ if not p.is_file():
66
+ raise FileNotFoundError(f"Audio file not found: {p}")
67
+ return [str(p)]
68
+ if args.dataset_dir:
69
+ files = list_audio_files(args.dataset_dir)
70
+ if not files:
71
+ raise RuntimeError(f"No audio files found in {args.dataset_dir}")
72
+ return files
73
+ raise ValueError("Provide --audio or --dataset-dir")
74
+
75
+
76
+ def main() -> int:
77
+ args = build_parser().parse_args()
78
+ hf_token = args.hf_token or get_env("HF_TOKEN", "hf_token")
79
+ openai_key = (
80
+ args.openai_api_key
81
+ or get_env("OPENAI_API_KEY", "openai_api_key")
82
+ )
83
+ if not openai_key:
84
+ raise RuntimeError("OPENAI_API_KEY is required for cleanup step.")
85
+
86
+ if args.backend == "hf_endpoint":
87
+ if not args.endpoint_url:
88
+ raise RuntimeError("HF endpoint backend requires --endpoint-url")
89
+ af3_client = AF3EndpointClient(
90
+ endpoint_url=args.endpoint_url,
91
+ token=hf_token,
92
+ model_id=args.model_id,
93
+ )
94
+ else:
95
+ af3_client = AF3LocalClient(
96
+ model_id=args.model_id,
97
+ device=args.device,
98
+ torch_dtype=args.torch_dtype,
99
+ )
100
+
101
+ af3_prompt = args.prompt
102
+ af3_max_new_tokens = int(args.af3_max_new_tokens)
103
+ af3_temperature = float(args.af3_temperature)
104
+ if args.think_long:
105
+ if af3_prompt == DEFAULT_AF3_PROMPT:
106
+ af3_prompt = DEFAULT_AF3_PROMPT_THINK_LONG
107
+ if af3_max_new_tokens == 1400:
108
+ af3_max_new_tokens = 3200
109
+ if abs(af3_temperature - 0.1) < 1e-9:
110
+ af3_temperature = 0.2
111
+
112
+ audio_paths = resolve_audio_paths(args)
113
+ failures = []
114
+ saved = []
115
+ for audio_path in tqdm(audio_paths, desc="AF3+ChatGPT"):
116
+ try:
117
+ result = run_af3_chatgpt_pipeline(
118
+ audio_path=audio_path,
119
+ af3_client=af3_client,
120
+ af3_prompt=af3_prompt,
121
+ af3_max_new_tokens=af3_max_new_tokens,
122
+ af3_temperature=af3_temperature,
123
+ openai_api_key=openai_key,
124
+ openai_model=args.openai_model,
125
+ user_context=args.user_context,
126
+ artist_name=args.artist_name,
127
+ track_name=args.track_name,
128
+ enable_web_search=bool(args.enable_web_search),
129
+ )
130
+ sidecar = result["sidecar"]
131
+ if args.output_dir:
132
+ out_path = Path(args.output_dir) / (Path(audio_path).stem + ".json")
133
+ else:
134
+ out_path = Path(audio_path).with_suffix(".json")
135
+ out_path.parent.mkdir(parents=True, exist_ok=True)
136
+ out_path.write_text(json.dumps(sidecar, indent=2, ensure_ascii=False), encoding="utf-8")
137
+ saved.append(str(out_path))
138
+ except Exception as exc:
139
+ failures.append(f"{Path(audio_path).name}: {exc}")
140
+
141
+ print(
142
+ json.dumps(
143
+ {
144
+ "processed": len(audio_paths),
145
+ "saved": len(saved),
146
+ "failed": len(failures),
147
+ "saved_paths": saved[:20],
148
+ "failures": failures[:20],
149
+ },
150
+ indent=2,
151
+ ensure_ascii=False,
152
+ )
153
+ )
154
+ return 0 if not failures else 2
155
+
156
+
157
+ if __name__ == "__main__":
158
+ raise SystemExit(main())
services/pipeline_api.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Local orchestration API for AF3 captioning + ChatGPT cleanup pipeline.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import tempfile
8
+ from pathlib import Path
9
+
10
+ from fastapi import FastAPI, File, Form, HTTPException, UploadFile
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from fastapi.responses import FileResponse, JSONResponse
13
+ from fastapi.staticfiles import StaticFiles
14
+ from pydantic import BaseModel
15
+
16
+ from af3_chatgpt_pipeline import (
17
+ DEFAULT_AF3_MODEL_ID,
18
+ DEFAULT_AF3_PROMPT,
19
+ DEFAULT_OPENAI_MODEL,
20
+ AF3EndpointClient,
21
+ AF3LocalClient,
22
+ run_af3_chatgpt_pipeline,
23
+ save_sidecar,
24
+ )
25
+ from utils.env_config import get_env, load_project_env
26
+
27
+
28
+ load_project_env()
29
+
30
+
31
def _resolve_token(name_upper: str, name_lower: str) -> str:
    """Resolve a config/secret value, accepting either env-var naming style."""
    return get_env(name_upper, name_lower)
33
+
34
+
35
def _build_af3_client(
    backend: str,
    endpoint_url: str,
    hf_token: str,
    model_id: str,
    device: str,
    torch_dtype: str,
):
    """Instantiate the AF3 client for the requested backend.

    "hf_endpoint" talks to a hosted HF Inference Endpoint (URL required,
    otherwise HTTP 400); any other backend value falls back to the local
    in-process model client.
    """
    if backend == "hf_endpoint":
        if not endpoint_url:
            raise HTTPException(status_code=400, detail="AF3 endpoint backend requires endpoint_url")
        return AF3EndpointClient(
            endpoint_url=endpoint_url,
            token=hf_token,
            model_id=model_id or DEFAULT_AF3_MODEL_ID,
        )
    return AF3LocalClient(
        model_id=model_id or DEFAULT_AF3_MODEL_ID,
        device=device,
        torch_dtype=torch_dtype,
    )
56
+
57
+
58
app = FastAPI(title="AF3 + ChatGPT Pipeline API", version="1.0.0")
# Wide-open CORS: this API is intended as a local/dev orchestration service.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Serve the built React UI (Vite dist output) when it exists in the repo.
FRONTEND_DIST = Path(__file__).resolve().parents[1] / "react-ui" / "dist"
FRONTEND_ASSETS = FRONTEND_DIST / "assets"
if FRONTEND_ASSETS.exists():
    app.mount("/assets", StaticFiles(directory=str(FRONTEND_ASSETS)), name="assets")
70
+
71
+
72
class PipelinePathRequest(BaseModel):
    """Request body for /api/pipeline/run-path (audio already on disk)."""

    # Path to the audio file to analyze (must exist on the server).
    audio_path: str
    # AF3 backend: "hf_endpoint" (hosted) or any other value for local loading.
    backend: str = "hf_endpoint"
    endpoint_url: str = ""
    # Secrets may be blank; the handler falls back to .env / environment.
    hf_token: str = ""
    model_id: str = DEFAULT_AF3_MODEL_ID
    # Local-backend options (ignored for the hosted endpoint backend).
    device: str = "auto"
    torch_dtype: str = "auto"
    # AF3 generation settings.
    af3_prompt: str = DEFAULT_AF3_PROMPT
    af3_max_new_tokens: int = 1400
    af3_temperature: float = 0.1
    # ChatGPT cleanup settings.
    openai_api_key: str = ""
    openai_model: str = DEFAULT_OPENAI_MODEL
    user_context: str = ""
    artist_name: str = ""
    track_name: str = ""
    enable_web_search: bool = False
    # Optional explicit output path; defaults to <audio>.json next to the file.
    output_json: str = ""
90
+
91
+
92
+ @app.get("/api/health")
93
+ def health():
94
+ return {"ok": True}
95
+
96
+
97
+ @app.get("/", include_in_schema=False)
98
+ def serve_root():
99
+ if FRONTEND_DIST.exists():
100
+ index = FRONTEND_DIST / "index.html"
101
+ if index.exists():
102
+ return FileResponse(index)
103
+ return JSONResponse(
104
+ {
105
+ "ok": True,
106
+ "message": "Frontend build not found. Run `python af3_gui_app.py` or `npm --prefix react-ui run build`.",
107
+ }
108
+ )
109
+
110
+
111
+ @app.get("/api/config")
112
+ def config():
113
+ return {
114
+ "defaults": {
115
+ "backend": "hf_endpoint",
116
+ "endpoint_url": _resolve_token("HF_AF3_ENDPOINT_URL", "hf_af3_endpoint_url"),
117
+ "model_id": _resolve_token("AF3_MODEL_ID", "af3_model_id") or DEFAULT_AF3_MODEL_ID,
118
+ "openai_model": _resolve_token("OPENAI_MODEL", "openai_model") or DEFAULT_OPENAI_MODEL,
119
+ "af3_prompt": DEFAULT_AF3_PROMPT,
120
+ }
121
+ }
122
+
123
+
124
+ @app.post("/api/pipeline/run-path")
125
+ def run_pipeline_path(req: PipelinePathRequest):
126
+ audio_path = Path(req.audio_path)
127
+ if not audio_path.is_file():
128
+ raise HTTPException(status_code=404, detail=f"Audio not found: {audio_path}")
129
+
130
+ hf_token = req.hf_token or _resolve_token("HF_TOKEN", "hf_token")
131
+ openai_key = req.openai_api_key or _resolve_token("OPENAI_API_KEY", "openai_api_key")
132
+ if not openai_key:
133
+ raise HTTPException(status_code=400, detail="OPENAI_API_KEY is required.")
134
+
135
+ endpoint_url = req.endpoint_url or _resolve_token("HF_AF3_ENDPOINT_URL", "hf_af3_endpoint_url")
136
+ af3_client = _build_af3_client(
137
+ backend=req.backend,
138
+ endpoint_url=endpoint_url,
139
+ hf_token=hf_token,
140
+ model_id=req.model_id,
141
+ device=req.device,
142
+ torch_dtype=req.torch_dtype,
143
+ )
144
+ try:
145
+ result = run_af3_chatgpt_pipeline(
146
+ audio_path=str(audio_path),
147
+ af3_client=af3_client,
148
+ af3_prompt=req.af3_prompt,
149
+ af3_max_new_tokens=req.af3_max_new_tokens,
150
+ af3_temperature=req.af3_temperature,
151
+ openai_api_key=openai_key,
152
+ openai_model=req.openai_model,
153
+ user_context=req.user_context,
154
+ artist_name=req.artist_name,
155
+ track_name=req.track_name,
156
+ enable_web_search=req.enable_web_search,
157
+ )
158
+ except Exception as exc:
159
+ raise HTTPException(status_code=500, detail=str(exc)) from exc
160
+
161
+ output_json = req.output_json or str(audio_path.with_suffix(".json"))
162
+ save_path = save_sidecar(result["sidecar"], output_json)
163
+ return {
164
+ "saved_to": save_path,
165
+ "af3_analysis": result["af3_analysis"],
166
+ "cleaned": result["cleaned"],
167
+ "sidecar": result["sidecar"],
168
+ }
169
+
170
+
171
+ @app.post("/api/pipeline/run-upload")
172
+ async def run_pipeline_upload(
173
+ audio_file: UploadFile = File(...),
174
+ backend: str = Form("hf_endpoint"),
175
+ endpoint_url: str = Form(""),
176
+ hf_token: str = Form(""),
177
+ model_id: str = Form(DEFAULT_AF3_MODEL_ID),
178
+ device: str = Form("auto"),
179
+ torch_dtype: str = Form("auto"),
180
+ af3_prompt: str = Form(DEFAULT_AF3_PROMPT),
181
+ af3_max_new_tokens: int = Form(1400),
182
+ af3_temperature: float = Form(0.1),
183
+ openai_api_key: str = Form(""),
184
+ openai_model: str = Form(DEFAULT_OPENAI_MODEL),
185
+ user_context: str = Form(""),
186
+ artist_name: str = Form(""),
187
+ track_name: str = Form(""),
188
+ enable_web_search: bool = Form(False),
189
+ output_json: str = Form(""),
190
+ ):
191
+ suffix = Path(audio_file.filename or "uploaded.wav").suffix or ".wav"
192
+ with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
193
+ temp_audio = Path(tmp.name)
194
+ try:
195
+ content = await audio_file.read()
196
+ temp_audio.write_bytes(content)
197
+
198
+ hf_token_val = hf_token or _resolve_token("HF_TOKEN", "hf_token")
199
+ openai_key = openai_api_key or _resolve_token("OPENAI_API_KEY", "openai_api_key")
200
+ if not openai_key:
201
+ raise HTTPException(status_code=400, detail="OPENAI_API_KEY is required.")
202
+
203
+ endpoint_url_val = endpoint_url or _resolve_token("HF_AF3_ENDPOINT_URL", "hf_af3_endpoint_url")
204
+ af3_client = _build_af3_client(
205
+ backend=backend,
206
+ endpoint_url=endpoint_url_val,
207
+ hf_token=hf_token_val,
208
+ model_id=model_id,
209
+ device=device,
210
+ torch_dtype=torch_dtype,
211
+ )
212
+
213
+ result = run_af3_chatgpt_pipeline(
214
+ audio_path=str(temp_audio),
215
+ af3_client=af3_client,
216
+ af3_prompt=af3_prompt,
217
+ af3_max_new_tokens=af3_max_new_tokens,
218
+ af3_temperature=af3_temperature,
219
+ openai_api_key=openai_key,
220
+ openai_model=openai_model,
221
+ user_context=user_context,
222
+ artist_name=artist_name,
223
+ track_name=track_name,
224
+ enable_web_search=enable_web_search,
225
+ )
226
+ default_out = Path("outputs") / "af3_chatgpt" / (Path(audio_file.filename or "track").stem + ".json")
227
+ save_path = save_sidecar(result["sidecar"], output_json or str(default_out))
228
+ return {
229
+ "saved_to": save_path,
230
+ "af3_analysis": result["af3_analysis"],
231
+ "cleaned": result["cleaned"],
232
+ "sidecar": result["sidecar"],
233
+ }
234
+ except HTTPException:
235
+ raise
236
+ except Exception as exc:
237
+ raise HTTPException(status_code=500, detail=str(exc)) from exc
238
+ finally:
239
+ try:
240
+ temp_audio.unlink(missing_ok=True)
241
+ except Exception:
242
+ pass
summaries/findings.md CHANGED
@@ -1,124 +1,204 @@
1
- # Improving ACE-Step LoRA with Time-Event-Based Annotation
2
 
3
  [Back to project README](../README.md)
4
 
5
- ## Baseline context in this repo
6
 
7
- This project already provides a solid end-to-end workflow:
8
 
9
- - Train LoRA adapters with `lora_train.py` and the Gradio UI (`app.py`, `lora_ui.py`).
10
- - Deploy generation through a custom endpoint runtime (`handler.py`, `acestep/`).
11
- - Test prompts and lyrics quickly with endpoint client scripts in `scripts/endpoint/`.
 
12
 
13
- Today, most conditioning in this pipeline is still global (caption, lyrics, BPM, key, tags). That is a strong baseline, but it does not explicitly teach *when* events happen inside a track.
14
 
15
- ## Core limitation
16
 
17
- Current annotations usually describe *what* a song is, not *when* events occur. The model can learn style and texture, but temporal structure is weaker:
 
 
 
 
 
 
18
 
19
- - Verse/chorus transitions are often less deliberate than human-produced songs.
20
- - Build-ups, drops, or effect changes can feel averaged or blurred.
21
- - Subgenre-specific arrangement timing is harder to reproduce consistently.
22
 
23
- ## Observed baseline behavior (working assumption)
24
 
25
- From current prompt and endpoint testing workflows in this repo, the baseline appears to do best on:
26
 
27
- - overall timbre/style conditioning from caption-like prompts,
28
- - short-form motif continuity,
29
- - broad genre direction.
30
 
31
- The baseline appears weaker on:
 
 
 
 
 
 
 
 
32
 
33
- - section-level planning across longer durations,
34
- - predictable timing of transitions (intro/verse/chorus/bridge),
35
- - reliable callback motifs that should reappear at known timestamps.
36
 
37
- These are expected gaps for globally conditioned generation and provide a clear target for time-event experiments.
 
 
 
 
 
 
 
 
 
 
38
 
39
- ## Why time-event labels are promising
40
 
41
- 1. Better musical structure: teach the model where sections start/end and where key transitions occur.
42
- 2. Better genre fidelity: encode timing differences between styles that share similar instruments.
43
- 3. Better control at inference: allow prompting for both content and structure (what + when).
44
 
45
- ## Practical direction for this codebase
46
 
47
- A useful next step is to extend the current sidecar metadata approach with optional timed events.
48
 
49
- Example direction:
50
 
51
- - Keep existing fields (`caption`, `lyrics`, `bpm`, etc.).
52
- - Add an `events` list with event type + start/end times.
53
- - Start with a small, high-quality subset before scaling.
54
 
55
- Illustrative shape:
56
 
57
- ```json
58
- {
59
- "caption": "emotional rnb pop with warm pads",
60
- "bpm": 92,
61
- "events": [
62
- {"type": "intro", "start": 0.0, "end": 8.0},
63
- {"type": "verse", "start": 8.0, "end": 32.0},
64
- {"type": "chorus", "start": 32.0, "end": 48.0}
65
- ]
66
- }
67
- ```
68
 
69
- Optional extension fields that may help later:
70
 
71
- - `intensity` (0-1) per event,
72
- - `instrument_focus` tags per section,
73
- - `transition_type` (hard cut, riser, filtered handoff, etc.).
74
 
75
- ## Early experiments worth running
76
 
77
- - Compare baseline LoRA vs time-event LoRA on the same curated mini-dataset.
78
- - Score structural accuracy (section order, transition timing tolerance).
79
- - Run blind listening tests for perceived musical arc and arrangement coherence.
80
- - Track whether time labels improve consistency without reducing creativity.
81
 
82
- ## Suggested evaluation rubric (v1)
83
 
84
- Use a simple shared scorecard to keep comparisons objective:
85
 
86
- 1. Structure match (0-5): generated section order vs target plan.
87
- 2. Timing adherence (0-5): transition timestamps within tolerance window.
88
- 3. Musical coherence (0-5): transitions feel intentional, not abrupt/noisy.
89
- 4. Genre fit (0-5): arrangement behavior matches requested subgenre.
90
- 5. Prompt fidelity (0-5): requested mood/style/lyrics alignment.
91
 
92
- This makes iteration easier than relying only on subjective listening notes.
93
 
94
- ## Incremental execution plan
 
95
 
96
- Phase 1: Data and schema
97
 
98
- - Define the minimal `events` schema and annotation guidelines.
99
- - Build a small seed set (for example 50-200 clips) with high label quality.
100
 
101
- Phase 2: Training and ablation
102
 
103
- - Train a baseline LoRA and an event-aware LoRA with matched settings.
104
- - Run ablations (with/without `events`, coarse vs fine event types).
105
 
106
- Phase 3: Inference controls
107
 
108
- - Add optional event-aware controls in the UI and endpoint payload.
109
- - Keep backward compatibility so existing prompts still work.
110
 
111
- Phase 4: Evaluation and docs
112
 
113
- - Publish scorecard results + examples.
114
- - Document tradeoffs (quality, speed, annotation effort).
115
 
116
- ## Expected outcomes
117
 
118
- If this works, this repo can evolve from "style-conditioned generation" toward "structure-aware generation":
 
 
 
119
 
120
- - More intentional song progression.
121
- - Stronger subgenre identity.
122
- - Better controllability for creators.
123
 
124
- This is still a baseline research note, but it gives a clear technical direction that fits the current project architecture.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ACE-Step 1.5 Annotation and LoRA Findings (My Notes)
2
 
3
  [Back to project README](../README.md)
4
 
5
+ ## What I was trying to build
6
 
7
+ I wanted a reliable pipeline to:
8
 
9
+ 1. Analyze my songs with AF3/Qwen-style timestamped musical detail.
10
+ 2. Clean and structure the results with ChatGPT.
11
+ 3. Save sidecar JSON files that ACE-Step 1.5 LoRA training can consume directly.
12
+ 4. Keep enough detail for future iteration (human edits, richer annotations, timeline/event work).
13
 
14
+ ## What ACE-Step 1.5 actually reads during LoRA training
15
 
16
+ Based on this repo's loader (`lora_train.py`), the training loop directly reads these JSON keys:
17
 
18
+ - `caption`
19
+ - `lyrics`
20
+ - `bpm`
21
+ - `keyscale`
22
+ - `timesignature`
23
+ - `vocal_language`
24
+ - `duration`
25
 
26
+ Anything else is effectively extra metadata for my own workflow. This is why I moved rich analysis detail into `caption` so it is not ignored by the model.
 
 
27
 
28
+ ## Endpoint stack comparison I observed
29
 
30
+ I tested two serving stacks on the same tracks/prompts.
31
 
32
+ ### Stack A (lower quality)
 
 
33
 
34
+ - Model path: `nvidia/audio-flamingo-3-hf`
35
+ - Runtime style: generic Transformers path with custom endpoint handler
36
+ - Behavior I observed:
37
+ - Often short outputs
38
+ - Sometimes repetitive segment text
39
+ - Less convincing section-by-section progression
40
+ - Latency I observed:
41
+ - Fast short runs
42
+ - Medium-length think runs
43
 
44
+ ### Stack B (higher quality)
 
 
45
 
46
+ - Model path:
47
+ - base: `nvidia/audio-flamingo-3`
48
+ - think adapter: `stage35`
49
+ - Runtime style: NVIDIA-style `llava`/`generate_content` stack
50
+ - Behavior I observed:
51
+ - Longer, richer timestamped prose
52
+ - Better flow across sections
53
+ - Better musical interaction detail (vocals + instruments + arrangement)
54
+ - Latency I observed:
55
+ - Slower than Stack A
56
+ - Roughly 1 minute per track in think/long-style runs
57
 
58
+ ### My conclusion
59
 
60
+ If I care about annotation quality, Stack B is clearly better even if it is slower.
 
 
61
 
62
+ ## Main issues I hit and how I resolved them
63
 
64
+ ### 1) Endpoint failed with `Unknown task custom`
65
 
66
+ Observed error:
67
 
68
+ - `KeyError: "Unknown task custom ..."`
 
 
69
 
70
+ What caused it:
71
 
72
+ - Endpoint fell back to default pipeline path instead of loading my custom `handler.py`.
73
+ - Log showed: `No custom pipeline found at /repository/handler.py`.
 
 
 
 
 
 
 
 
 
74
 
75
+ Fix:
76
 
77
+ - Ensure endpoint repo has top-level `handler.py`.
78
+ - Deploy using the custom endpoint template files exactly.
 
79
 
80
+ ### 2) AF3 architecture not recognized
81
 
82
+ Observed error:
 
 
 
83
 
84
+ - `model type audioflamingo3 not recognized`
85
 
86
+ What caused it:
87
 
88
+ - Endpoint base runtime had older Transformers stack that could not load AF3 model classes.
 
 
 
 
89
 
90
+ Fix:
91
 
92
+ - Bootstrap runtime dependencies compatible with AF3 in custom handler/template.
93
+ - Avoid relying on plain default endpoint image assumptions.
94
 
95
+ ### 3) Processor load failures for HF-converted AF3 repo
96
 
97
+ Observed error:
 
98
 
99
+ - `Unrecognized processing class in nvidia/audio-flamingo-3-hf`
100
 
101
+ What caused it:
 
102
 
103
+ - Mismatch between model repo packaging and runtime loader expectations.
104
 
105
+ Fix:
 
106
 
107
+ - Move to NVIDIA stack template path and serving format that matches expected classes/runtime behavior.
108
 
109
+ ### 4) Dependency conflicts after forced upgrades
 
110
 
111
+ Observed logs showed conflicts around:
112
 
113
+ - `transformers`
114
+ - `huggingface_hub`
115
+ - `torch`/`torchaudio`/`torchvision`
116
+ - `huggingface-inference-toolkit` pinned versions
117
 
118
+ What caused it:
 
 
119
 
120
+ - Upgrading one package in place inside endpoint image caused incompatibility with toolkit pins.
121
+
122
+ Fix:
123
+
124
+ - Use curated endpoint template/runtime setup instead of ad-hoc package upgrades.
125
+
126
+ ### 5) Token/auth confusion
127
+
128
+ Observed warning:
129
+
130
+ - Unauthenticated requests to HF Hub even though I had a token in `.env`.
131
+
132
+ What caused it:
133
+
134
+ - Variable name mismatch (`hf_token` vs expected runtime env var names like `HF_TOKEN`) in some contexts.
135
+
136
+ Fix:
137
+
138
+ - Normalize env variable names and pass token consistently in endpoint/runtime settings.
139
+
140
+ ### 6) Very short or repetitive analysis output
141
+
142
+ What caused it:
143
+
144
+ - Wrong stack path (HF-converted flow) and/or non-think-compatible runtime behavior.
145
+
146
+ Fix:
147
+
148
+ - Migrate to NVIDIA think-capable stack.
149
+ - Use longer token budgets and think-oriented prompt settings.
150
+
151
+ ## Dataset run results and quality checks
152
+
153
+ ### Batch throughput I observed
154
+
155
+ - 22 tracks processed in about 22 minutes.
156
+ - Roughly 60 seconds per track average.
157
+
158
+ ### Repetition audit outcome
159
+
160
+ - No exact duplicate full outputs across tracks.
161
+ - But strong template reuse in phrasing and sentence structures.
162
+
163
+ Interpretation:
164
+
165
+ - The model output varied by track, but stylistically collapsed into repeated wording patterns.
166
+
167
+ ## JSON shaping decisions I made
168
+
169
+ ### Flattening for LoRA compatibility
170
+
171
+ I flattened each sidecar to core fields used by `lora_train.py`:
172
+
173
+ - `artist`, `caption`, `lyrics`, `bpm`, `keyscale`, `timesignature`, `vocal_language`, `duration`, `source`
174
+
175
+ ### Keeping rich detail without losing trainability
176
+
177
+ I preserved detail under `source.rich_details` and then pushed high-value content into `caption` so training sees it.
178
+
179
+ ### Global normalization applied
180
+
181
+ - `timesignature`: `"4"`
182
+ - `vocal_language`: `"en"`
183
+ - Captions prefixed with `Andrew Spacey:`
184
+
185
+ ## Important remaining data limitations
186
+
187
+ Even after cleanup, these are still weak points in current sidecars:
188
+
189
+ - `bpm` is mostly null
190
+ - `keyscale` is mostly unknown/blank
191
+
192
+ These are optional for training, but adding reliable BPM/key would likely improve control and consistency.
193
+
194
+ ## My current recommendation
195
+
196
+ 1. Keep NVIDIA stack as default for annotation generation quality.
197
+ 2. Keep core LoRA fields simple and valid.
198
+ 3. Keep rich details in `source.rich_details` for traceability.
199
+ 4. Keep detail-rich caption text for actual conditioning.
200
+ 5. Add a BPM/key estimation pass next if I want stronger metadata conditioning.
201
+
202
+ ## Next technical step I want
203
+
204
+ I should run a structured event pass (`events` list with start/end/type/intensity) on a subset first, then test whether event-aware captions improve generated song structure over the current caption-only approach.
templates/hf-af3-caption-endpoint/README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audio Flamingo 3 Caption Endpoint Template
2
+
3
+ Use this as a custom `handler.py` runtime for a Hugging Face Dedicated Endpoint.
4
+
5
+ ## Request contract
6
+
7
+ ```json
8
+ {
9
+ "inputs": {
10
+ "prompt": "Analyze this full song and summarize arrangement changes.",
11
+ "audio_base64": "<base64-encoded WAV bytes>",
12
+ "max_new_tokens": 1200,
13
+ "temperature": 0.1
14
+ }
15
+ }
16
+ ```
17
+
18
+ ## Response contract
19
+
20
+ ```json
21
+ {
22
+ "generated_text": "..."
23
+ }
24
+ ```
25
+
26
+ ## Setup
27
+
28
+ Fastest path from this repo:
29
+
30
+ ```bash
31
+ python scripts/hf_clone.py af3-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_ENDPOINT_REPO
32
+ ```
33
+
34
+ Then deploy a Dedicated Endpoint from that model repo.
35
+
36
+ Important: make sure your endpoint repo contains top-level:
37
+ - `handler.py`
38
+ - `requirements.txt`
39
+ - `README.md`
40
+
41
+ Use endpoint task `custom` so the runtime loads `handler.py` instead of a default Transformers pipeline.
42
+
43
+ ## Endpoint env vars
44
+
45
+ Required:
46
+ - `AF3_MODEL_ID=nvidia/audio-flamingo-3-hf`
47
+
48
+ Optional runtime bootstrap (defaults shown):
49
+ - `AF3_BOOTSTRAP_RUNTIME=1`
50
+ - `AF3_TRANSFORMERS_SPEC=transformers==5.1.0`
51
+ - `AF3_RUNTIME_DIR=/tmp/af3_runtime`
52
+ - `AF3_STUB_TORCHVISION=1`
53
+
54
+ ## Notes
55
+
56
+ - Audio Flamingo 3 is large; use a GPU endpoint.
57
+ - First boot can take longer because the handler installs AF3-compatible runtime dependencies.
58
+ - This handler returns raw prose analysis. Use the local AF3+ChatGPT pipeline to normalize to LoRA sidecar JSON.
templates/hf-af3-caption-endpoint/handler.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import importlib
3
+ import importlib.machinery
4
+ import importlib.util
5
+ import io
6
+ import os
7
+ import subprocess
8
+ import sys
9
+ import types
10
+ from typing import Any, Dict, List, Tuple
11
+
12
+ import numpy as np
13
+ import soundfile as sf
14
+ import torch
15
+
16
+
17
+ def _resolve_model_id(model_dir: str) -> str:
18
+ default_id = os.getenv("AF3_MODEL_ID", "nvidia/audio-flamingo-3-hf")
19
+ if model_dir and os.path.isdir(model_dir):
20
+ has_local = os.path.exists(os.path.join(model_dir, "config.json"))
21
+ if has_local:
22
+ return model_dir
23
+ return default_id
24
+
25
+
26
+ def _log(msg: str) -> None:
27
+ print(f"[AF3 handler] {msg}", flush=True)
28
+
29
+
30
+ def _env_true(name: str, default: bool = False) -> bool:
31
+ raw = os.getenv(name)
32
+ if raw is None:
33
+ return default
34
+ return str(raw).strip().lower() in {"1", "true", "yes", "on"}
35
+
36
+
37
def _install_torchvision_stub() -> None:
    """Register a minimal fake ``torchvision`` in sys.modules.

    The AF3 runtime only touches ``torchvision.transforms.InterpolationMode``
    constants, so a tiny namespace is enough to satisfy the import without
    installing the real (heavy) package. Disabled by setting
    AF3_STUB_TORCHVISION to a falsy value (default: enabled).
    """
    if not _env_true("AF3_STUB_TORCHVISION", True):
        return
    modes = types.SimpleNamespace(
        NEAREST=0,
        LANCZOS=1,
        BILINEAR=2,
        BICUBIC=3,
        BOX=4,
        HAMMING=5,
    )
    transforms_mod = types.ModuleType("torchvision.transforms")
    transforms_mod.InterpolationMode = modes
    transforms_mod.__spec__ = importlib.machinery.ModuleSpec(
        name="torchvision.transforms", loader=None
    )
    root_mod = types.ModuleType("torchvision")
    root_mod.transforms = transforms_mod
    root_mod.__spec__ = importlib.machinery.ModuleSpec(name="torchvision", loader=None)
    sys.modules["torchvision"] = root_mod
    sys.modules["torchvision.transforms"] = transforms_mod
64
+
65
+
66
+ _FIND_SPEC_PATCHED = False
67
+
68
+
69
+ def _patch_optional_backend_discovery() -> None:
70
+ global _FIND_SPEC_PATCHED
71
+ if _FIND_SPEC_PATCHED:
72
+ return
73
+ blocked = {"torchvision", "librosa"}
74
+ original_find_spec = importlib.util.find_spec
75
+
76
+ def wrapped_find_spec(name: str, package: str | None = None):
77
+ root = name.split(".", 1)[0]
78
+ if root in blocked:
79
+ return None
80
+ return original_find_spec(name, package)
81
+
82
+ importlib.util.find_spec = wrapped_find_spec # type: ignore[assignment]
83
+ _FIND_SPEC_PATCHED = True
84
+
85
+
86
+ def _clear_python_modules(prefixes: Tuple[str, ...]) -> None:
87
+ for name in list(sys.modules.keys()):
88
+ if any(name == p or name.startswith(f"{p}.") for p in prefixes):
89
+ sys.modules.pop(name, None)
90
+
91
+
92
+ def _patch_torch_compat() -> None:
93
+ try:
94
+ import torch._dynamo._trace_wrapped_higher_order_op as dyn_wrap
95
+ except Exception:
96
+ return
97
+ if hasattr(dyn_wrap, "TransformGetItemToIndex"):
98
+ return
99
+
100
+ class TransformGetItemToIndex: # pragma: no cover - runtime compatibility shim
101
+ pass
102
+
103
+ setattr(dyn_wrap, "TransformGetItemToIndex", TransformGetItemToIndex)
104
+
105
+
106
+ def _af3_classes_available() -> tuple[bool, str]:
107
+ try:
108
+ from transformers import AudioFlamingo3ForConditionalGeneration # noqa: F401
109
+ from transformers import AudioFlamingo3Processor # noqa: F401
110
+
111
+ return True, ""
112
+ except Exception as exc:
113
+ return False, f"{type(exc).__name__}: {exc}"
114
+
115
+
116
def _bootstrap_runtime_transformers(target_dir: str) -> None:
    """pip-install an AF3-compatible transformers stack into *target_dir*.

    The endpoint image may bundle a transformers too old for AF3; this installs
    a pinned stack side-by-side so it can be preferred via sys.path. The pin is
    overridable through AF3_TRANSFORMERS_SPEC.
    """
    deps = [
        os.getenv("AF3_TRANSFORMERS_SPEC", "transformers==5.1.0"),
        "numpy<2",
        "accelerate>=1.1.0",
        "sentencepiece",
        "safetensors",
        "soxr",
    ]
    _log("Installing runtime deps for AF3 (first boot can take a few minutes).")
    subprocess.check_call(
        [
            sys.executable,
            "-m",
            "pip",
            "install",
            "--upgrade",
            "--no-cache-dir",
            "--target",
            target_dir,
            *deps,
        ]
    )
128
+
129
+
130
def _ensure_af3_transformers():
    """Return a transformers module that exposes the AF3 model/processor classes.

    Strategy: install the import-machinery patches, try the bundled
    transformers first, and if the AF3 classes are missing, pip-bootstrap a
    pinned runtime into a side directory, purge cached HF modules, and
    re-import from there. Raises RuntimeError when bootstrap is disabled or
    fails to produce the classes.
    """
    # Patches must be in place before transformers is (re)imported.
    _patch_optional_backend_discovery()
    _install_torchvision_stub()
    _patch_torch_compat()

    import transformers

    ok, err = _af3_classes_available()
    if ok:
        _log(f"Using bundled transformers={transformers.__version__}")
        return transformers

    # Bundled stack lacks AF3 classes; only proceed if bootstrap is allowed.
    if not _env_true("AF3_BOOTSTRAP_RUNTIME", True):
        raise RuntimeError(
            "AF3 classes are unavailable in bundled transformers "
            f"({transformers.__version__}) and AF3_BOOTSTRAP_RUNTIME is disabled. "
            f"Last import error: {err}"
        )

    target_dir = os.getenv("AF3_RUNTIME_DIR", "/tmp/af3_runtime")
    os.makedirs(target_dir, exist_ok=True)
    _bootstrap_runtime_transformers(target_dir)
    # Prefer the freshly installed stack over the bundled one.
    if target_dir not in sys.path:
        sys.path.insert(0, target_dir)

    # Drop cached HF modules so the re-import resolves against target_dir,
    # then re-apply the patches (the purge removed their effects on re-import).
    _clear_python_modules(("transformers", "tokenizers", "huggingface_hub", "safetensors"))
    _patch_optional_backend_discovery()
    _install_torchvision_stub()
    _patch_torch_compat()
    importlib.invalidate_caches()
    transformers = importlib.import_module("transformers")

    ok, err = _af3_classes_available()
    if not ok:
        raise RuntimeError(
            "Failed to load AF3 processor classes after runtime bootstrap. "
            f"transformers={getattr(transformers, '__version__', 'unknown')} "
            f"error={err}"
        )
    _log(f"Bootstrapped transformers={transformers.__version__}")
    return transformers
171
+
172
+
173
+ def _resample_audio_mono(audio: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray:
174
+ if src_sr == dst_sr:
175
+ return audio.astype(np.float32, copy=False)
176
+ if audio.size == 0:
177
+ return np.zeros((0,), dtype=np.float32)
178
+ src_idx = np.arange(audio.shape[0], dtype=np.float64)
179
+ dst_len = int(round(audio.shape[0] * float(dst_sr) / float(src_sr)))
180
+ dst_len = max(dst_len, 1)
181
+ dst_idx = np.linspace(0.0, float(max(audio.shape[0] - 1, 0)), dst_len, dtype=np.float64)
182
+ out = np.interp(dst_idx, src_idx, audio.astype(np.float64, copy=False))
183
+ return out.astype(np.float32, copy=False)
184
+
185
+
186
def _decode_audio_from_b64(audio_b64: str) -> tuple[np.ndarray, int]:
    """Decode base64 audio bytes into a mono float32 signal at 16 kHz.

    Stereo input is downmixed by channel averaging; anything higher-rank is
    flattened. Returns (samples, sample_rate) with sample_rate always 16000.
    """
    target_sr = 16000
    samples, rate = sf.read(
        io.BytesIO(base64.b64decode(audio_b64)), dtype="float32", always_2d=False
    )
    if samples.ndim == 2:
        samples = np.mean(samples, axis=1)
    if samples.ndim != 1:
        samples = np.asarray(samples).reshape(-1)
    rate = int(rate)
    if rate != target_sr:
        samples = _resample_audio_mono(samples, rate, target_sr)
        rate = target_sr
    return samples.astype(np.float32, copy=False), rate
198
+
199
+
200
class EndpointHandler:
    """
    Hugging Face Dedicated Endpoint custom handler for Audio Flamingo 3.

    Request:
        {
          "inputs": {
            "prompt": "...",
            "audio_base64": "...",
            "max_new_tokens": 1200,
            "temperature": 0.1
          }
        }

    Response:
        {"generated_text": "..."}
    """

    def __init__(self, model_dir: str = ""):
        # Prefer a checkpoint packaged in model_dir, else the AF3_MODEL_ID default.
        self.model_id = _resolve_model_id(model_dir)
        # May pip-bootstrap a pinned transformers stack on first boot.
        self.transformers = _ensure_af3_transformers()
        from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

        _log(
            f"torch={torch.__version__} cuda={torch.cuda.is_available()} "
            f"transformers={self.transformers.__version__} model_id={self.model_id}"
        )

        # fp16 on GPU, fp32 on CPU.
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        self.processor = AutoProcessor.from_pretrained(self.model_id, trust_remote_code=True)
        self.model = AudioFlamingo3ForConditionalGeneration.from_pretrained(
            self.model_id,
            torch_dtype=dtype,
            trust_remote_code=True,
        )
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def _build_inputs(self, audio: np.ndarray, sample_rate: int, prompt: str) -> Dict[str, Any]:
        """Tokenize one user turn (audio + text) via the processor's chat template."""
        conversation: List[Dict[str, Any]] = [
            {
                "role": "user",
                "content": [
                    {"type": "audio", "audio": audio},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        try:
            # Newer processors accept audio_kwargs; pass the true sampling rate.
            return self.processor.apply_chat_template(
                conversation,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
                audio_kwargs={"sampling_rate": int(sample_rate)},
            )
        except Exception:
            # Older processors reject audio_kwargs; retry without it.
            return self.processor.apply_chat_template(
                conversation,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
            )

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Run one generation request; failures come back as {"error": ...}."""
        # Accept both {"inputs": {...}} and a bare payload dict.
        payload = data.get("inputs", data) if isinstance(data, dict) else {}
        prompt = str(payload.get("prompt", "Analyze this full song and summarize arrangement changes.")).strip()
        audio_b64 = payload.get("audio_base64")
        if not audio_b64:
            return {"error": "audio_base64 is required"}

        max_new_tokens = int(payload.get("max_new_tokens", 1200))
        temperature = float(payload.get("temperature", 0.1))

        try:
            audio, sample_rate = _decode_audio_from_b64(audio_b64)
            inputs = self._build_inputs(audio, sample_rate, prompt)
            device = next(self.model.parameters()).device
            model_dtype = next(self.model.parameters()).dtype
            # Move tensors to the model device; cast float tensors to model dtype.
            for key, value in list(inputs.items()):
                if hasattr(value, "to"):
                    if hasattr(value, "dtype") and torch.is_floating_point(value):
                        inputs[key] = value.to(device=device, dtype=model_dtype)
                    else:
                        inputs[key] = value.to(device)

            # temperature <= 0 means greedy decoding.
            do_sample = bool(temperature > 0)
            gen_kwargs = {
                "max_new_tokens": max_new_tokens,
                "do_sample": do_sample,
            }
            if do_sample:
                gen_kwargs["temperature"] = max(temperature, 1e-5)

            with torch.no_grad():
                outputs = self.model.generate(**inputs, **gen_kwargs)

            # Decode only the newly generated tokens (skip the prompt prefix).
            start = int(inputs["input_ids"].shape[1])
            text = self.processor.batch_decode(outputs[:, start:], skip_special_tokens=True)[0].strip()
            if not text:
                # Fallback: some backends return only the new tokens already.
                text = self.processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
            return {"generated_text": text}
        except Exception as exc:
            return {"error": str(exc)}
templates/hf-af3-caption-endpoint/requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ numpy<2
2
+ soundfile
templates/hf-af3-nvidia-endpoint/README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audio Flamingo 3 NVIDIA-Stack Endpoint Template
2
+
3
+ This template uses the same core runtime pattern as NVIDIA's Space:
4
+ - `llava` code from `nvidia/audio-flamingo-3` (space repo)
5
+ - base checkpoint from `nvidia/audio-flamingo-3` (model repo)
6
+ - optional `stage35` think/long adapter
7
+
8
+ ## Request contract
9
+
10
+ ```json
11
+ {
12
+ "inputs": {
13
+ "prompt": "Please describe the audio in detail.",
14
+ "audio_base64": "<base64 WAV bytes>",
15
+ "think_mode": true,
16
+ "max_new_tokens": 2048,
17
+ "temperature": 0.2
18
+ }
19
+ }
20
+ ```
21
+
22
+ ## Response contract
23
+
24
+ ```json
25
+ {
26
+ "generated_text": "...",
27
+ "mode": "think"
28
+ }
29
+ ```
30
+
31
+ ## Bootstrap command
32
+
33
+ ```bash
34
+ python scripts/hf_clone.py af3-nvidia-endpoint --repo-id YOUR_USERNAME/YOUR_AF3_NVIDIA_ENDPOINT_REPO
35
+ ```
36
+
37
+ ## Endpoint settings
38
+
39
+ - Task: `custom`
40
+ - GPU instance required
41
+ - Secrets:
42
+ - `HF_TOKEN=<your_token>`
43
+
44
+ ## Optional env vars
45
+
46
+ - `AF3_NV_CODE_REPO_ID=nvidia/audio-flamingo-3`
47
+ - `AF3_NV_MODEL_REPO_ID=nvidia/audio-flamingo-3`
48
+ - `AF3_NV_CODE_REPO_TYPE=space`
49
+ - `AF3_NV_MODEL_REPO_TYPE=model`
50
+ - `AF3_NV_DEFAULT_MODE=think`
51
+ - `AF3_NV_LOAD_THINK=1`
52
+ - `AF3_NV_LOAD_SINGLE=0`
53
+
54
+ Default behavior loads think/long mode for higher-quality long-form reasoning.
templates/hf-af3-nvidia-endpoint/handler.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import copy
3
+ import os
4
+ import sys
5
+ import tempfile
6
+ from typing import Any, Dict
7
+
8
+ import torch
9
+ from huggingface_hub import snapshot_download
10
+ from peft import PeftModel
11
+
12
+
13
+ DEFAULT_PROMPT = "Please describe the audio in detail."
14
+
15
+
16
+ def _log(msg: str) -> None:
17
+ print(f"[AF3 NVIDIA handler] {msg}", flush=True)
18
+
19
+
20
+ def _env_true(name: str, default: bool = False) -> bool:
21
+ raw = os.getenv(name)
22
+ if raw is None:
23
+ return default
24
+ return str(raw).strip().lower() in {"1", "true", "yes", "on"}
25
+
26
+
27
+ def _strip_state_dict_prefixes(state_dict: Dict[str, Any]) -> Dict[str, Any]:
28
+ out: Dict[str, Any] = {}
29
+ for key, value in state_dict.items():
30
+ key2 = key[6:] if key.startswith("model.") else key
31
+ out[key2] = value
32
+ return out
33
+
34
+
35
class EndpointHandler:
    """
    NVIDIA AF3 stack endpoint handler (matches Space architecture closely).

    Request:
        {
          "inputs": {
            "prompt": "...",
            "audio_base64": "...",
            "think_mode": true,
            "max_new_tokens": 2048,
            "temperature": 0.2
          }
        }

    Response:
        {"generated_text": "...", "mode": "think|single"}
    """

    def __init__(self, model_dir: str = ""):
        # model_dir is ignored: code and weights are pulled from the Hub repos
        # configured via AF3_NV_* env vars instead.
        del model_dir
        self.hf_token = os.getenv("HF_TOKEN", "")
        self.code_repo_id = os.getenv("AF3_NV_CODE_REPO_ID", "nvidia/audio-flamingo-3")
        self.model_repo_id = os.getenv("AF3_NV_MODEL_REPO_ID", "nvidia/audio-flamingo-3")
        self.code_repo_type = os.getenv("AF3_NV_CODE_REPO_TYPE", "space")
        self.model_repo_type = os.getenv("AF3_NV_MODEL_REPO_TYPE", "model")
        self.default_mode = os.getenv("AF3_NV_DEFAULT_MODE", "think").strip().lower()
        if self.default_mode not in {"think", "single"}:
            self.default_mode = "think"

        # Which variants to keep resident; single defaults on only when it is
        # also the default mode.
        self.load_think = _env_true("AF3_NV_LOAD_THINK", True)
        self.load_single = _env_true("AF3_NV_LOAD_SINGLE", self.default_mode == "single")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        _log(f"torch={torch.__version__} cuda={torch.cuda.is_available()} device={self.device}")
        _log(
            f"code_repo={self.code_repo_type}:{self.code_repo_id} "
            f"model_repo={self.model_repo_type}:{self.model_repo_id} default_mode={self.default_mode}"
        )

        self.llava = self._load_llava_runtime()
        self.model_root = self._download_model_root()

        self.model_single = None
        self.model_think = None

        if self.load_single:
            self.model_single = self._load_single_model()
        if self.load_think:
            self.model_think = self._load_think_model()

        if self.model_single is None and self.model_think is None:
            raise RuntimeError("No model loaded. Enable AF3_NV_LOAD_THINK or AF3_NV_LOAD_SINGLE.")

    def _load_llava_runtime(self):
        """Download the `llava/` package from the code repo and import it."""
        code_root = snapshot_download(
            repo_id=self.code_repo_id,
            repo_type=self.code_repo_type,
            allow_patterns=["llava/**"],
            token=self.hf_token or None,
        )
        # The snapshot root must be importable for `import llava` to resolve.
        if code_root not in sys.path:
            sys.path.insert(0, code_root)
        import llava  # type: ignore

        _log(f"Loaded llava runtime from {code_root}")
        return llava

    def _download_model_root(self) -> str:
        """Snapshot the full model repo and return its local path."""
        model_root = snapshot_download(
            repo_id=self.model_repo_id,
            repo_type=self.model_repo_type,
            token=self.hf_token or None,
        )
        _log(f"Model root: {model_root}")
        return model_root

    def _load_single_model(self):
        """Load the base (single-turn) AF3 checkpoint onto self.device."""
        _log("Loading single-turn model...")
        model = self.llava.load(self.model_root, model_base=None)
        model = model.to(self.device)
        model.eval()
        return model

    def _load_think_model(self):
        """Load the base model plus the stage35 think/long LoRA adapter.

        Raises RuntimeError when the adapter's non-LoRA weights are absent.
        """
        _log("Loading think/long model (stage35 adapter)...")
        stage35_dir = os.path.join(self.model_root, "stage35")
        non_lora_path = os.path.join(stage35_dir, "non_lora_trainables.bin")
        if not os.path.exists(non_lora_path):
            raise RuntimeError(f"stage35 non_lora_trainables missing: {non_lora_path}")

        model = self.llava.load(self.model_root, model_base=None)
        model = model.to(self.device)

        # Apply the adapter's non-LoRA weights first ("model." prefix stripped
        # to match the loaded module names), then wrap with the LoRA adapter.
        non_lora_trainables = torch.load(non_lora_path, map_location="cpu")
        non_lora_trainables = _strip_state_dict_prefixes(non_lora_trainables)
        model.load_state_dict(non_lora_trainables, strict=False)

        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        model = PeftModel.from_pretrained(
            model,
            stage35_dir,
            device_map="auto" if torch.cuda.is_available() else None,
            torch_dtype=dtype,
        )
        model.eval()
        return model

    def _select_model(self, think_mode: bool):
        """Pick (model, mode) for the request, falling back to whatever loaded."""
        if think_mode and self.model_think is not None:
            return self.model_think, "think"
        if (not think_mode) and self.model_single is not None:
            return self.model_single, "single"
        if self.model_think is not None:
            return self.model_think, "think"
        return self.model_single, "single"

    def _build_generation_config(self, model, max_new_tokens: int, temperature: float):
        """Clone the model's default generation config with request overrides.

        Returns None when the model exposes no default config, in which case
        generation proceeds with the library defaults.
        """
        base_cfg = getattr(model, "default_generation_config", None)
        if base_cfg is None:
            return None
        cfg = copy.deepcopy(base_cfg)
        if max_new_tokens > 0:
            setattr(cfg, "max_new_tokens", int(max_new_tokens))
        # temperature <= 0 means greedy decoding.
        if temperature > 0:
            setattr(cfg, "temperature", float(temperature))
            setattr(cfg, "do_sample", True)
        else:
            setattr(cfg, "do_sample", False)
        return cfg

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Run one generation request; failures come back as {"error": ...}."""
        # Accept both {"inputs": {...}} and a bare payload dict.
        payload = data.get("inputs", data) if isinstance(data, dict) else {}
        audio_b64 = payload.get("audio_base64")
        if not audio_b64:
            return {"error": "audio_base64 is required"}

        prompt = str(payload.get("prompt", DEFAULT_PROMPT)).strip() or DEFAULT_PROMPT
        # Absent think_mode falls back to the configured default mode.
        think_mode_val = payload.get("think_mode")
        if think_mode_val is None:
            think_mode = self.default_mode == "think"
        else:
            think_mode = bool(think_mode_val)

        max_new_tokens = int(payload.get("max_new_tokens", 2048))
        temperature = float(payload.get("temperature", 0.2))
        model, mode = self._select_model(think_mode)

        # The llava runtime reads audio from a file path, so the decoded bytes
        # are written to a temp .wav that is always removed in `finally`.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(base64.b64decode(audio_b64))

        try:
            sound = self.llava.Sound(tmp_path)
            # <sound> is the llava placeholder token for the audio input.
            full_prompt = f"<sound>\n{prompt}"
            gen_cfg = self._build_generation_config(model, max_new_tokens=max_new_tokens, temperature=temperature)

            with torch.inference_mode():
                if gen_cfg is not None:
                    response = model.generate_content([sound, full_prompt], generation_config=gen_cfg)
                else:
                    response = model.generate_content([sound, full_prompt])
            return {"generated_text": str(response).strip(), "mode": mode}
        except Exception as exc:
            return {"error": str(exc), "mode": mode}
        finally:
            try:
                os.unlink(tmp_path)
            except Exception:
                pass
+ pass
templates/hf-af3-nvidia-endpoint/requirements.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers==4.46.0
2
+ accelerate==0.34.2
3
+ peft==0.14.0
4
+ numpy==1.26.4
5
+ Pillow
6
+ pydub
7
+ soundfile
8
+ librosa
9
+ openai-whisper
10
+ ftfy
11
+ jiwer
12
+ einops
13
+ hydra-core
14
+ loguru
15
+ matplotlib
16
+ pytorchvideo==0.1.5
17
+ deepspeed==0.15.4
18
+ kaldiio
19
+ wandb
20
+ opencv-python-headless==4.8.0.76
21
+ protobuf==3.20.*
22
+ termcolor
23
+ sentencepiece
templates/hf-qwen-caption-endpoint/README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Qwen2-Audio Caption Endpoint Template
2
+
3
+ Use this as a custom `handler.py` runtime for a Hugging Face Dedicated Endpoint.
4
+
5
+ ## Request contract
6
+
7
+ ```json
8
+ {
9
+ "inputs": {
10
+ "prompt": "Analyze and describe this music segment.",
11
+ "audio_base64": "<base64-encoded WAV bytes>",
12
+ "sample_rate": 16000,
13
+ "max_new_tokens": 384,
14
+ "temperature": 0.1
15
+ }
16
+ }
17
+ ```
18
+
19
+ ## Response contract
20
+
21
+ ```json
22
+ {
23
+ "generated_text": "..."
24
+ }
25
+ ```
26
+
27
+ ## Setup
28
+
29
+ Fastest way from this repo:
30
+
31
+ ```bash
32
+ python scripts/hf_clone.py qwen-endpoint --repo-id YOUR_USERNAME/YOUR_QWEN_ENDPOINT_REPO
33
+ ```
34
+
35
+ Then deploy a Dedicated Endpoint from that repo with task `custom`.
36
+
37
+ Manual path:
38
+
39
+ 1. Create a new model repo for your endpoint runtime.
40
+ 2. Copy `handler.py` from this folder into that repo as top-level `handler.py`.
41
+ 3. Add a `requirements.txt` containing at least:
42
+ - `torch`
43
+ - `torchaudio`
44
+ - `transformers>=4.53.0,<4.58.0`
45
+ - `soundfile`
46
+ - `numpy`
47
+ 4. Deploy a Dedicated Endpoint from that repo.
48
+ 5. Optional endpoint env var:
49
+ - `QWEN_MODEL_ID=Qwen/Qwen2-Audio-7B-Instruct`
50
+
51
+ Then point `qwen_caption_app.py` backend `hf_endpoint` at that endpoint URL.
52
+
53
+ ## Quick local test script
54
+
55
+ From this repo:
56
+
57
+ ```bash
58
+ python scripts/endpoint/test_qwen_caption_endpoint.py \
59
+ --url https://YOUR_ENDPOINT.endpoints.huggingface.cloud \
60
+ --token hf_xxx \
61
+ --audio path/to/song.wav
62
+ ```
templates/hf-qwen-caption-endpoint/handler.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import os
4
+ from typing import Any, Dict
5
+
6
+ import numpy as np
7
+ import soundfile as sf
8
+ import torch
9
+ from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
10
+
11
+
12
def _decode_audio_b64(audio_b64: str):
    """Decode base64 audio bytes to (mono float32 samples, sample rate).

    Multichannel input is downmixed by averaging across channels.
    """
    buffer = io.BytesIO(base64.b64decode(audio_b64))
    samples, rate = sf.read(buffer, dtype="float32", always_2d=True)
    return samples.mean(axis=1).astype(np.float32), int(rate)
17
+
18
+
19
class EndpointHandler:
    """
    HF Dedicated Endpoint custom handler for Qwen2-Audio captioning.

    request:
        {
          "inputs": {
            "prompt": "...",
            "audio_base64": "...",
            "sample_rate": 16000,
            "max_new_tokens": 384,
            "temperature": 0.1
          }
        }
    response:
        {"generated_text": "..."}
    """

    def __init__(self, model_dir: str = ""):
        # Hub model by default; QWEN_MODEL_ID env var overrides it.
        model_id = os.getenv("QWEN_MODEL_ID", "Qwen/Qwen2-Audio-7B-Instruct")
        # Only load from model_dir when actual weights/config are packaged there.
        if model_dir and os.path.isdir(model_dir):
            has_local_model = (
                os.path.exists(os.path.join(model_dir, "config.json"))
                and (
                    os.path.exists(os.path.join(model_dir, "model.safetensors"))
                    or any(name.endswith(".safetensors") for name in os.listdir(model_dir))
                )
            )
            if has_local_model:
                model_id = model_dir

        # fp16 on GPU, fp32 on CPU.
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
        self.model = Qwen2AudioForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=dtype,
            trust_remote_code=True,
        )
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Run one captioning request; missing audio yields {"error": ...}."""
        # Accept both {"inputs": {...}} and a bare payload dict.
        payload = data.get("inputs", data) if isinstance(data, dict) else {}
        prompt = str(payload.get("prompt", "Analyze this music audio.")).strip()
        audio_b64 = payload.get("audio_base64")
        if not audio_b64:
            return {"error": "audio_base64 is required"}

        max_new_tokens = int(payload.get("max_new_tokens", 384))
        temperature = float(payload.get("temperature", 0.1))

        audio, sr = _decode_audio_b64(audio_b64)
        # A caller-supplied sample_rate overrides the decoded rate.
        # NOTE(review): no resampling happens here — assumes the audio already
        # matches the rate handed to the processor; confirm for non-16k input.
        sampling_rate = int(payload.get("sample_rate", sr))

        # Use direct audio token format to force audio conditioning.
        chat_text = f"<|audio_bos|><|AUDIO|><|audio_eos|>\n{prompt}\n"
        inputs = self.processor(
            text=chat_text,
            audio=[audio],
            sampling_rate=sampling_rate,
            return_tensors="pt",
            padding=True,
        )

        # Move every tensor in the processed batch onto the model's device.
        device = next(self.model.parameters()).device
        for key, value in list(inputs.items()):
            if hasattr(value, "to"):
                inputs[key] = value.to(device)

        # temperature <= 0 means greedy decoding.
        do_sample = bool(temperature and temperature > 0)
        gen_kwargs = {
            "max_new_tokens": int(max_new_tokens),
            "do_sample": do_sample,
        }
        if do_sample:
            gen_kwargs["temperature"] = max(float(temperature), 1e-5)

        with torch.no_grad():
            generated_ids = self.model.generate(**inputs, **gen_kwargs)
        # Decode only the newly generated tokens (skip the prompt prefix).
        prompt_tokens = inputs["input_ids"].shape[1]
        generated_new = generated_ids[:, prompt_tokens:]
        text = self.processor.batch_decode(
            generated_new,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
        if not text.strip():
            # Some backends may return generated-only ids without prefix tokens.
            text = self.processor.batch_decode(
                generated_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )[0]
        return {"generated_text": text.strip()}
templates/hf-qwen-caption-endpoint/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ soundfile
4
+ numpy
5
+ transformers>=4.53.0,<4.58.0
6
+ accelerate
utils/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ 
utils/env_config.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Environment helpers for project-wide .env loading."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+
8
+ from dotenv import load_dotenv
9
+
10
+ _PROJECT_ROOT = Path(__file__).resolve().parents[1]
11
+ _DOTENV_PATH = _PROJECT_ROOT / ".env"
12
+ _DOTENV_LOADED = False
13
+
14
+
15
def load_project_env() -> None:
    """Load the repo-root .env at most once per process.

    Existing os.environ entries are never overridden (override=False).
    """
    global _DOTENV_LOADED
    if not _DOTENV_LOADED:
        load_dotenv(dotenv_path=_DOTENV_PATH, override=False)
        _DOTENV_LOADED = True
21
+
22
+
23
def get_env(*keys: str, default: str = "") -> str:
    """Return the first non-empty env value among *keys*, else *default*.

    Loads the project .env first so file-defined values are visible.
    """
    load_project_env()
    return next((os.getenv(key) for key in keys if os.getenv(key)), default)
30
+
31
+
32
def set_default_env_file_value(key: str, value: str) -> bool:
    """Append ``key=value`` to .env unless the key already exists.

    Blank keys are rejected; comment lines and non-assignment lines are
    ignored when scanning for an existing key. Returns True only when the
    file was actually modified.
    """
    normalized_key = (key or "").strip()
    if not normalized_key:
        return False

    existing_lines = (
        _DOTENV_PATH.read_text(encoding="utf-8").splitlines()
        if _DOTENV_PATH.exists()
        else []
    )

    for raw_line in existing_lines:
        entry = raw_line.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        existing_key, _ = entry.split("=", 1)
        if existing_key.strip() == normalized_key:
            # Key already present — leave the file untouched.
            return False

    existing_lines.append(f"{normalized_key}={value}")
    _DOTENV_PATH.write_text("\n".join(existing_lines).strip() + "\n", encoding="utf-8")
    return True
+ return True