import json
from typing import Dict

import whisper
from huggingface_hub import hf_hub_download
from whisper import load_model, transcribe
from transformers.pipelines.audio_utils import ffmpeg_read

# Sample rate (Hz) Whisper models expect their input audio to use.
SAMPLE_RATE = 16000


class EndpointHandler():
    """Custom inference-endpoint handler that transcribes audio with Whisper."""

    def __init__(self, path: str = ""):
        # NOTE(review): `path` (the model repository directory supplied by the
        # endpoint runtime) is ignored and a fixed checkpoint is loaded instead —
        # confirm this is intentional.
        self.model = whisper.load_model("large-v3")

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """Transcribe the submitted audio and return word-level timestamps.

        Args:
            data: request payload; the raw audio bytes are expected under the
                "inputs" key (the payload itself is used as a fallback).

        Returns:
            A dict with a single "text" key whose value is a JSON-encoded list
            of segments, each carrying start/end timestamps, the segment text,
            and per-word timings.
        """
        # Accept either {"inputs": <bytes>} or the raw payload directly.
        inputs = data.pop("inputs", data)

        # Decode the raw bytes into a waveform resampled to Whisper's rate.
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)

        pred_out = transcribe(
            self.model,
            audio=audio_nparray,
            language="en",
            word_timestamps=True,
        )

        # Flatten the Whisper output into plain JSON-serializable records.
        result = [
            {
                "start": segment["start"],
                "end": segment["end"],
                "text": segment["text"],
                "words": [
                    {"start": word["start"], "end": word["end"], "word": word["word"]}
                    for word in segment["words"]
                ],
            }
            for segment in pred_out["segments"]
        ]

        return {"text": json.dumps(result, indent=2, ensure_ascii=False)}