# handler.py (for handling asr with faster_whisper)
from faster_whisper import WhisperModel, BatchedInferencePipeline
from typing import Any, Dict, List
from transformers.pipelines.audio_utils import ffmpeg_read
class EndpointHandler:
    """Inference-endpoint handler that transcribes audio with faster-whisper.

    Loads the distil-large-v3 Whisper model once at startup and serves
    transcription requests via __call__.
    """

    def __init__(self, path: str = ""):
        # Distilled Whisper v3 checkpoint: faster than the full large-v3 model.
        self.model_size = "distil-large-v3"
        # float16 compute on GPU; NOTE(review): assumes a CUDA device is
        # available at load time — confirm against the deployment target.
        self.model = WhisperModel(self.model_size, device="cuda", compute_type="float16")
        # Batched pipeline wraps the same underlying model for higher throughput.
        self.batched_model = BatchedInferencePipeline(model=self.model)

    def __call__(self, data: Dict[str, Any]) -> str:
        """Transcribe one audio payload.

        Args:
            data:
                'inputs': the encoded audio file bytes
                'batched': whether to use the batched pipeline (default True)
                'parameters': dict of extra kwargs forwarded to transcribe()
        Return:
            The transcribed text — all segment texts joined with spaces.
        """
        # Process input; falls back to the whole dict if 'inputs' is absent.
        inputs = data.pop("inputs", data)
        # Decode the encoded audio and resample to 16 kHz.
        audio_nparray = ffmpeg_read(inputs, 16000)

        # Retrieve custom arguments.
        batched = data.pop("batched", True)   # default is True if not specified
        params = data.pop("parameters", {})   # all parameters for the model

        # Bug fix: the non-batched path previously ignored `params` and
        # hard-coded beam_size=5. Both paths now honor user parameters;
        # beam_size still defaults to 5 on the non-batched path when the
        # caller does not supply one (backward-compatible).
        if batched:
            segments, _info = self.batched_model.transcribe(audio_nparray, **params)
        else:
            segments, _info = self.model.transcribe(
                audio_nparray, **{"beam_size": 5, **params}
            )

        # Join segment texts lazily; `segments` is a generator in faster-whisper.
        return " ".join(segment.text for segment in segments)