# (Hub page metadata, not code — preserved as a comment so the file parses)
# lingdoc — Update handler.py — commit ce0bcc2 (verified)
# handler.py (for handling asr with faster_whisper)
from faster_whisper import WhisperModel, BatchedInferencePipeline
from typing import Any, Dict, List
from transformers.pipelines.audio_utils import ffmpeg_read
class EndpointHandler:
    """Inference Endpoints handler for ASR with faster-whisper.

    Loads a distilled Whisper v3 model onto CUDA once at startup and serves
    transcription requests, optionally through the batched inference pipeline.
    """

    def __init__(self, path=""):
        # `path` is passed by the endpoint framework but unused here: the
        # model is resolved by name, not from the repository path.
        self.model_size = "distil-large-v3"  # the distilled whisper v3 model
        self.model = WhisperModel(
            self.model_size, device="cuda", compute_type="float16"
        )
        # Batched pipeline wraps the same underlying model (no extra VRAM copy).
        self.batched_model = BatchedInferencePipeline(model=self.model)

    def __call__(self, data: Dict[str, Any]) -> str:
        """Transcribe one audio payload.

        Args:
            data: request dict containing
                - 'inputs': the raw (base64-decoded by the framework) audio bytes
                - 'batched' (optional, default True): use the batched pipeline
                - 'parameters' (optional): extra kwargs forwarded to transcribe()

        Returns:
            The transcribed segments' text joined with single spaces.
        """
        # Process input: fall back to the whole payload if 'inputs' is absent.
        inputs = data.pop("inputs", data)
        # Decode/resample the encoded audio to 16 kHz mono float PCM.
        audio_nparray = ffmpeg_read(inputs, 16000)

        # Retrieve custom arguments.
        batched = data.pop("batched", True)  # default is True if not specified
        params = data.pop("parameters", {})  # all parameters for the model

        if batched:
            segments, _info = self.batched_model.transcribe(audio_nparray, **params)
        else:
            # Bug fix: the sequential path previously ignored `params` and
            # hardcoded beam_size=5. Keep beam_size=5 as the default but let
            # user-supplied parameters override it, matching the batched path.
            kwargs = {"beam_size": 5, **params}
            segments, _info = self.model.transcribe(audio_nparray, **kwargs)

        # `segments` is a lazy generator; materializing it here runs inference.
        return " ".join(segment.text for segment in segments)