Bootstrap Qwen2-Audio custom endpoint repo
- README.md +62 -0
- handler.py +110 -0
- requirements.txt +6 -0
README.md
ADDED

# Qwen2-Audio Caption Endpoint Template

Use this as a custom `handler.py` runtime for a Hugging Face Dedicated Endpoint.

## Request contract

```json
{
  "inputs": {
    "prompt": "Analyze and describe this music segment.",
    "audio_base64": "<base64-encoded WAV bytes>",
    "sample_rate": 16000,
    "max_new_tokens": 384,
    "temperature": 0.1
  }
}
```

## Response contract

```json
{
  "generated_text": "..."
}
```

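For reference, here is a minimal Python client sketch for this contract. The endpoint URL, token, and audio path are placeholders, and `requests` is assumed to be available on the client side.

```python
import base64

import requests

ENDPOINT_URL = "https://YOUR_ENDPOINT.endpoints.huggingface.cloud"  # placeholder
HF_TOKEN = "hf_xxx"  # placeholder

# Base64-encode the WAV bytes exactly as the request contract expects.
with open("path/to/song.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = requests.post(
    ENDPOINT_URL,
    headers={
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "application/json",
    },
    json={
        "inputs": {
            "prompt": "Analyze and describe this music segment.",
            "audio_base64": audio_b64,
            "sample_rate": 16000,
            "max_new_tokens": 384,
            "temperature": 0.1,
        }
    },
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["generated_text"])
```
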
## Setup

The fastest way, from this repo:

```bash
python scripts/hf_clone.py qwen-endpoint --repo-id YOUR_USERNAME/YOUR_QWEN_ENDPOINT_REPO
```

Then deploy a Dedicated Endpoint from that repo with the task set to `custom`.

Manual path:

1. Create a new model repo for your endpoint runtime.
2. Copy `handler.py` from this folder into that repo as a top-level `handler.py`.
3. Add a `requirements.txt` containing at least:
   - `torch`
   - `torchaudio`
   - `transformers>=4.53.0,<4.58.0`
   - `soundfile`
   - `numpy`
   - `accelerate`
4. Deploy a Dedicated Endpoint from that repo.
5. Optionally, set an endpoint env var:
   - `QWEN_MODEL_ID=Qwen/Qwen2-Audio-7B-Instruct`

Then point the `hf_endpoint` backend of `qwen_caption_app.py` at that endpoint URL.

## Quick local test script

From this repo:

```bash
python scripts/endpoint/test_qwen_caption_endpoint.py \
  --url https://YOUR_ENDPOINT.endpoints.huggingface.cloud \
  --token hf_xxx \
  --audio path/to/song.wav
```

handler.py
ADDED

```python
import base64
import io
import os
from typing import Any, Dict

import numpy as np
import soundfile as sf
import torch
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

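# Helper: decode a base64-encoded audio payload with soundfile (WAV, FLAC,
# and OGG all work) and downmix to mono float32, the layout Qwen2-Audio's
# feature extractor expects.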
def _decode_audio_b64(audio_b64: str):
    raw = base64.b64decode(audio_b64)
    audio, sr = sf.read(io.BytesIO(raw), dtype="float32", always_2d=True)
    mono = audio.mean(axis=1).astype(np.float32)
    return mono, int(sr)


class EndpointHandler:
    """
    HF Dedicated Endpoint custom handler contract:
    request:
        {
            "inputs": {
                "prompt": "...",
                "audio_base64": "...",
                "sample_rate": 16000,
                "max_new_tokens": 384,
                "temperature": 0.1
            }
        }
    response:
        {"generated_text": "..."}
    """

    def __init__(self, model_dir: str = ""):
        model_id = os.getenv("QWEN_MODEL_ID", "Qwen/Qwen2-Audio-7B-Instruct")
        if model_dir and os.path.isdir(model_dir):
            # Allows loading from files packaged in the endpoint model repo.
            model_id = model_dir

        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
        self.model = Qwen2AudioForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=dtype,
            trust_remote_code=True,
        )
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

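    # Called once per HTTP request. Accepts either {"inputs": {...}} or a bare
    # payload dict, mirroring how HF endpoint clients commonly wrap requests.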
    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        payload = data.get("inputs", data) if isinstance(data, dict) else {}
        prompt = str(payload.get("prompt", "Analyze this music audio.")).strip()
        audio_b64 = payload.get("audio_base64")
        if not audio_b64:
            return {"error": "audio_base64 is required"}

        max_new_tokens = int(payload.get("max_new_tokens", 384))
        temperature = float(payload.get("temperature", 0.1))

        audio, sr = _decode_audio_b64(audio_b64)
        sampling_rate = int(payload.get("sample_rate", sr))

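        # Qwen2-Audio's feature extractor expects 16 kHz mono input; no
        # resampling happens here, so callers should resample before encoding.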
        conversation = [
            {"role": "system", "content": "You are a precise music analysis assistant."},
            {
                "role": "user",
                "content": [
                    {"type": "audio", "audio_url": "local://audio.wav"},
                    {"type": "text", "text": prompt},
                ],
            },
        ]
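        # The audio_url above is only a placeholder that makes the chat
        # template emit the audio token; the actual waveform is handed to the
        # processor below. Note: recent transformers releases rename the
        # processor's `audios` kwarg to `audio`; if the pinned version rejects
        # `audios`, switch to `audio=[audio]`.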
        chat_text = self.processor.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=False,
        )
        inputs = self.processor(
            text=chat_text,
            audios=[audio],
            sampling_rate=sampling_rate,
            return_tensors="pt",
            padding=True,
        )

        device = next(self.model.parameters()).device
        for key, value in list(inputs.items()):
            if hasattr(value, "to"):
                inputs[key] = value.to(device)

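        # temperature <= 0 selects greedy decoding; any positive value enables
        # sampling at that temperature.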
        do_sample = bool(temperature and temperature > 0)
        gen_kwargs = {
            "max_new_tokens": int(max_new_tokens),
            "do_sample": do_sample,
        }
        if do_sample:
            gen_kwargs["temperature"] = max(float(temperature), 1e-5)

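        # Generate, then drop the prompt tokens so only newly generated text
        # is decoded.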
        with torch.no_grad():
            generated_ids = self.model.generate(**inputs, **gen_kwargs)
        prompt_tokens = inputs["input_ids"].shape[1]
        generated_ids = generated_ids[:, prompt_tokens:]
        text = self.processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
        return {"generated_text": text.strip()}
```

requirements.txt
ADDED

```
torch
torchaudio
soundfile
numpy
transformers>=4.53.0,<4.58.0
accelerate
```