| | import json |
| | import whisper |
| | from typing import Dict |
| |
|
| | from huggingface_hub import hf_hub_download |
| | from whisper import load_model, transcribe |
| | from transformers.pipelines.audio_utils import ffmpeg_read |
| |
|
| | SAMPLE_RATE = 16000 |
| |
|
| |
|
class EndpointHandler:
    """Hugging Face Inference Endpoint handler wrapping OpenAI Whisper.

    Loads the Whisper ``large-v3`` checkpoint once at startup and exposes
    word-level English transcription through ``__call__``.
    """

    def __init__(self, path: str = "") -> None:
        # NOTE(review): the repository `path` HF passes in is ignored and the
        # checkpoint name is hard-coded — confirm this is intentional.
        self.model = whisper.load_model("large-v3")

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """Transcribe an audio payload with word-level timestamps.

        Args:
            data: dict whose ``"inputs"`` key holds the raw audio file as
                bytes (any container ffmpeg can decode). If the key is
                absent, ``data`` itself is used as the payload.

        Returns:
            A dict with a single key ``"text"`` whose value is a JSON string
            encoding a list of segments, each a dict with ``start``, ``end``,
            ``text``, and ``words`` (a list of ``{start, end, word}`` dicts).
        """
        inputs = data.pop("inputs", data)
        # Decode the raw bytes into a waveform resampled to 16 kHz, the
        # sample rate Whisper expects.
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)

        pred_out = transcribe(
            self.model,
            audio=audio_nparray,
            language="en",
            word_timestamps=True,
        )

        # Project each segment down to the fields the client consumes.
        result = [
            {
                "start": segment["start"],
                "end": segment["end"],
                "text": segment["text"],
                "words": [
                    {"start": word["start"], "end": word["end"], "word": word["word"]}
                    for word in segment["words"]
                ],
            }
            for segment in pred_out["segments"]
        ]
        return {"text": json.dumps(result, indent=2, ensure_ascii=False)}