Update handler.py

handler.py  CHANGED  (+200 −29)

@@ -1,48 +1,219 @@
Removed:

-    def __init__(self, path=""):
-            torch_dtype=torch.float16,
-            device_map="auto"
-            return_tensors="pt"
-        ).to(self.model.device)
-            skip_special_tokens=True
-        )[0]
-        return {
Added (handler.py, new version in full):

"""
Custom Inference Handler for VibeVoice-ASR on Hugging Face Inference Endpoints.

Setup:
1. Duplicate the microsoft/VibeVoice-ASR repo to your own HF account
2. Add this handler.py and the accompanying requirements.txt to the repo root
3. Deploy as an Inference Endpoint with a GPU instance (min ~18GB VRAM)
"""

import base64
import io
import os
import re
import tempfile
import logging
from typing import Any, Dict, List

import torch
import numpy as np

logger = logging.getLogger(__name__)


class EndpointHandler:
    def __init__(self, path: str = ""):
        """
        Initialize the VibeVoice-ASR model and processor.

        Args:
            path: Path to model weights (provided by HF Inference Endpoints).
        """
        from vibevoice.asr.modeling_vibevoice_asr import VibeVoiceASRForConditionalGeneration
        from vibevoice.asr.processing_vibevoice_asr import VibeVoiceASRProcessor

        logger.info(f"Loading VibeVoice-ASR model from: {path}")

        self.processor = VibeVoiceASRProcessor.from_pretrained(path)

        self.model = VibeVoiceASRForConditionalGeneration.from_pretrained(
            path,
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
            device_map="auto",
            trust_remote_code=True,
        )
        self.model.eval()

        self.device = next(self.model.parameters()).device
        logger.info(f"VibeVoice-ASR loaded on device: {self.device}")

    def _load_audio(self, audio_input) -> np.ndarray:
        """
        Load audio from various input formats.

        Supports:
        - base64-encoded string
        - raw bytes
        - file path string
        """
        import librosa

        if isinstance(audio_input, str):
            if os.path.isfile(audio_input):
                audio, _ = librosa.load(audio_input, sr=16000, mono=True)
                return audio
            else:
                # Assume base64
                audio_bytes = base64.b64decode(audio_input)
        elif isinstance(audio_input, bytes):
            audio_bytes = audio_input
        else:
            raise ValueError(
                f"Unsupported audio input type: {type(audio_input)}. "
                "Expected base64 string, bytes, or file path."
            )

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp.write(audio_bytes)
            tmp_path = tmp.name

        try:
            audio, _ = librosa.load(tmp_path, sr=16000, mono=True)
        finally:
            os.unlink(tmp_path)

        return audio

    def _parse_transcription(self, raw_text: str) -> List[Dict[str, Any]]:
        """
        Parse the raw model output into structured segments.

        VibeVoice-ASR outputs text in the format:
            <speaker:0><start:0.00><end:13.43> Hello, how are you?
        """
        segments = []
        pattern = r"<speaker:(\d+)><start:([\d.]+)><end:([\d.]+)>\s*(.*?)(?=<speaker:|\Z)"
        matches = re.finditer(pattern, raw_text, re.DOTALL)

        for match in matches:
            speaker_id = int(match.group(1))
            start_time = float(match.group(2))
            end_time = float(match.group(3))
            text = match.group(4).strip()

            if text:
                segments.append({
                    "speaker": f"Speaker {speaker_id}",
                    "start": start_time,
                    "end": end_time,
                    "timestamp": f"{start_time:.2f} - {end_time:.2f}",
                    "text": text,
                })

        return segments

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process an inference request.

        Request body:
            {
                "inputs": "<base64-encoded-audio>",
                "parameters": {  # all optional
                    "hotwords": "term1, term2",
                    "max_new_tokens": 8192,
                    "temperature": 0.0,
                    "top_p": 0.9,
                    "repetition_penalty": 1.0
                }
            }

        Returns:
            {
                "transcription": "plain text transcription",
                "raw": "raw model output with tags",
                "segments": [
                    {
                        "speaker": "Speaker 0",
                        "start": 0.0,
                        "end": 13.43,
                        "timestamp": "0.00 - 13.43",
                        "text": "Hello, how are you?"
                    }
                ],
                "duration": 78.3
            }
        """
        audio_input = data.get("inputs", data)
        parameters = data.get("parameters", {})

        hotwords = parameters.get("hotwords", "")
        max_new_tokens = parameters.get("max_new_tokens", 8192)
        temperature = parameters.get("temperature", 0.0)
        top_p = parameters.get("top_p", 0.9)
        repetition_penalty = parameters.get("repetition_penalty", 1.0)

        # Load audio
        try:
            audio = self._load_audio(audio_input)
        except Exception as e:
            return {"error": f"Failed to load audio: {str(e)}"}

        duration = len(audio) / 16000
        logger.info(f"Audio loaded: {duration:.1f}s")

        if duration > 3600:
            return {"error": "Audio exceeds 60 minute limit"}

        # Preprocess
        try:
            inputs = self.processor(
                audio=audio,
                sampling_rate=16000,
                context=hotwords if hotwords else None,
                return_tensors="pt",
            )
            inputs = {
                k: v.to(self.device) if isinstance(v, torch.Tensor) else v
                for k, v in inputs.items()
            }
        except Exception as e:
            return {"error": f"Failed to preprocess audio: {str(e)}"}

        # Generate
        try:
            generate_kwargs = {
                "max_new_tokens": max_new_tokens,
                "do_sample": temperature > 0,
            }
            if temperature > 0:
                generate_kwargs["temperature"] = temperature
                generate_kwargs["top_p"] = top_p
            if repetition_penalty != 1.0:
                generate_kwargs["repetition_penalty"] = repetition_penalty

            with torch.inference_mode():
                output_ids = self.model.generate(**inputs, **generate_kwargs)

            raw_text = self.processor.batch_decode(
                output_ids, skip_special_tokens=False
            )[0]

            for token in ["<s>", "</s>", "<pad>", "<eos>", "<bos>"]:
                raw_text = raw_text.replace(token, "")
            raw_text = raw_text.strip()

        except Exception as e:
            logger.error(f"Generation failed: {str(e)}")
            return {"error": f"Transcription failed: {str(e)}"}

        segments = self._parse_transcription(raw_text)
        plain_text = " ".join(seg["text"] for seg in segments) if segments else raw_text

        return {
            "transcription": plain_text,
            "raw": raw_text,
            "segments": segments,
            "duration": round(duration, 2),
        }
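The setup steps above reference an accompanying requirements.txt. A minimal sketch of what it might contain, inferred only from this handler's imports; package names are unpinned, and whether the vibevoice package itself must be listed or ships with the duplicated repo is an assumption to verify:

# requirements.txt — illustrative sketch; verify names and pin versions
# against the VibeVoice-ASR repo before deploying.
torch
numpy
librosa
soundfile    # audio decoding backend used by librosa
flash-attn   # only needed if attn_implementation="flash_attention_2" is kept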
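For reference, a hedged client-side sketch that exercises the request and response schema documented in __call__. The endpoint URL, token, and file name are placeholders; only fields shown in the docstring above are used:

import base64
import requests

ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
HF_TOKEN = "hf_..."  # placeholder

# Encode a local audio file as base64, matching the "inputs" contract.
with open("meeting.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = requests.post(
    ENDPOINT_URL,
    headers={
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "application/json",
    },
    json={
        "inputs": audio_b64,
        "parameters": {"hotwords": "VibeVoice, ASR", "temperature": 0.0},
    },
)
resp.raise_for_status()
result = resp.json()

print(result["transcription"])
for seg in result.get("segments", []):
    print(f'[{seg["timestamp"]}] {seg["speaker"]}: {seg["text"]}')

The bytes branch in _load_audio suggests the endpoint may also accept raw binary request bodies, depending on how the serving toolkit deserializes them, but base64 JSON matches the handler's documented contract.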