from typing import Any, Dict

from faster_whisper import WhisperModel, BatchedInferencePipeline
from transformers.pipelines.audio_utils import ffmpeg_read


class EndpointHandler:
    def __init__(self, path=""):
        # distil-large-v3 is a distilled Whisper checkpoint that faster-whisper
        # can load by name; float16 on CUDA keeps memory use and latency low.
        self.model_size = "distil-large-v3"
        self.model = WhisperModel(self.model_size, device="cuda", compute_type="float16")
        # BatchedInferencePipeline wraps the model for batched transcription.
        self.batched_model = BatchedInferencePipeline(model=self.model)
    def __call__(self, data: Dict[str, Any]) -> str:
        """
        Args:
            data (:obj:`dict`):
                includes the base64-encoded audio file as ``inputs``,
                whether to use batching as the ``batched`` flag,
                and any additional transcription arguments as a ``parameters`` dict
        Return:
            the transcribed text, with segment texts joined by spaces
        """
        inputs = data.pop("inputs", data)
        # Decode the audio bytes into a float32 waveform resampled to 16 kHz.
        audio_nparray = ffmpeg_read(inputs, 16000)

        batched = data.pop("batched", True)
        params = data.pop("parameters", {})
        if batched:
            segments, info = self.batched_model.transcribe(audio_nparray, **params)
        else:
            # Default to a beam size of 5 unless the caller overrides it
            # via the "parameters" dict.
            params.setdefault("beam_size", 5)
            segments, info = self.model.transcribe(audio_nparray, **params)

        # Segments are yielded lazily; iterating here runs the transcription.
        return " ".join(segment.text for segment in segments)
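

# --- Usage sketch (not part of the deployed handler) ---
# A minimal local smoke test, assuming a CUDA-capable machine with
# faster-whisper and transformers installed, and an audio file at
# "sample.flac" (hypothetical path). In production, Hugging Face Inference
# Endpoints construct EndpointHandler and pass the decoded request payload;
# this mimics that call directly with raw audio bytes.
if __name__ == "__main__":
    with open("sample.flac", "rb") as f:
        audio_bytes = f.read()

    handler = EndpointHandler()
    text = handler({
        "inputs": audio_bytes,
        "batched": True,
        "parameters": {"beam_size": 5},
    })
    print(text)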