# handler.py (for handling asr with faster_whisper)
from typing import Any, Dict

from faster_whisper import WhisperModel, BatchedInferencePipeline
from transformers.pipelines.audio_utils import ffmpeg_read


class EndpointHandler:
    """Inference-endpoint handler that transcribes audio with faster-whisper.

    Loads the distilled Whisper large-v3 model once at startup (on CUDA,
    float16) and exposes a callable that accepts a request payload and
    returns the transcription as a single string.
    """

    def __init__(self, path: str = ""):
        """Load the model and wrap it in a batched inference pipeline.

        Args:
            path: Unused; kept for the inference-endpoint handler contract.
        """
        self.model_size = "distil-large-v3"  # the distilled whisper v3 model
        self.model = WhisperModel(self.model_size, device="cuda", compute_type="float16")
        self.batched_model = BatchedInferencePipeline(model=self.model)

    def __call__(self, data: Dict[str, Any]) -> str:
        """Transcribe the audio payload and return the joined segment text.

        Args:
            data: Request dict containing:
                - 'inputs': the raw (base64-decoded) audio bytes;
                - 'batched' (optional, default True): whether to use the
                  batched inference pipeline;
                - 'parameters' (optional): extra keyword arguments forwarded
                  to ``transcribe`` (e.g. ``beam_size``, ``language``).

        Returns:
            The transcribed segments joined with single spaces.
        """
        # process input; fall back to the whole payload if 'inputs' is absent
        inputs = data.pop("inputs", data)
        audio_nparray = ffmpeg_read(inputs, 16000)  # decode audio and resample to 16 kHz

        # Retrieve custom arguments
        batched = data.pop("batched", True)  # default is True if not specified
        params = data.pop("parameters", {})  # all parameters for the model

        if batched:
            segments, _info = self.batched_model.transcribe(audio_nparray, **params)
        else:
            # Forward user parameters here too (the original hard-coded
            # beam_size=5 and dropped 'parameters'); beam_size still
            # defaults to 5 for backward compatibility.
            params.setdefault("beam_size", 5)
            segments, _info = self.model.transcribe(audio_nparray, **params)

        return " ".join(segment.text for segment in segments)