"""Transcription request handler built on ``VibeVoiceAsrForConditionalGeneration``."""

from typing import Any

import torch
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration


class EndpointHandler:
    """Callable that loads a VibeVoice ASR checkpoint once and transcribes requests.

    Attributes are set in ``__init__``:
        processor: result of ``AutoProcessor.from_pretrained(path)``.
        model: result of ``VibeVoiceAsrForConditionalGeneration.from_pretrained``.
    """

    def __init__(self, path: str = "") -> None:
        """Load the processor and the model from ``path``.

        Args:
            path: Location handed unchanged to both ``from_pretrained`` calls.
        """
        self.processor = AutoProcessor.from_pretrained(path)
        # NOTE(review): newer transformers releases reportedly prefer ``dtype=``
        # over ``torch_dtype=`` -- confirm against the pinned version before
        # renaming the keyword.
        self.model = VibeVoiceAsrForConditionalGeneration.from_pretrained(
            path,
            torch_dtype=torch.float16,
            device_map="auto",
        )

    def __call__(self, data: dict[str, Any]) -> dict[str, Any]:
        """Transcribe the audio described by ``data``.

        Args:
            data: Request payload. ``data["inputs"]`` is forwarded as the
                ``audio`` argument of ``apply_transcription_request``; when the
                key is absent, the payload itself (minus ``"prompt"``) is
                forwarded instead. The optional ``data["prompt"]`` is forwarded
                as ``prompt`` and defaults to ``None``.

        Returns:
            ``{"text": transcription}`` where ``transcription`` is the first
            element of the processor's decoded output.
        """
        # Work on a shallow copy so the caller's dict is left untouched; the
        # values handed to the processor are the same as popping from ``data``.
        payload = dict(data)
        audio = payload.pop("inputs", payload)
        prompt = payload.pop("prompt", None)

        request = self.processor.apply_transcription_request(
            audio=audio,
            prompt=prompt,
            return_tensors="pt",
        )
        # Move the prepared tensors to the model's device and dtype.
        model_inputs = request.to(self.model.device, self.model.dtype)

        with torch.no_grad():
            output_ids = self.model.generate(**model_inputs)

        # Drop the leading positions that correspond to the input ids
        # (presumably the echoed prompt -- confirm against generate()'s output
        # layout) so only the newly generated ids are decoded.
        prompt_length = model_inputs["input_ids"].shape[1]
        generated_ids = output_ids[:, prompt_length:]

        decoded = self.processor.decode(
            generated_ids,
            return_format="transcription_only",
        )
        return {"text": decoded[0]}