import json
import whisper
from typing import Dict
from huggingface_hub import hf_hub_download
from whisper import load_model, transcribe
from transformers.pipelines.audio_utils import ffmpeg_read
SAMPLE_RATE = 16000
class EndpointHandler():
    """Inference Endpoints handler: transcribe audio with Whisper large-v3.

    Loads the model once at startup; each request decodes the uploaded audio
    and returns English transcription segments with per-word timestamps.
    """

    def __init__(self, path=""):
        # `path` is the model directory Inference Endpoints passes in; it is
        # unused here because the checkpoint is fetched by name instead.
        self.model = whisper.load_model("large-v3")

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """Transcribe one audio payload.

        Args:
            data: request payload; the "inputs" key holds the raw audio file
                bytes (any container format ffmpeg can decode).

        Returns:
            ``{"text": <JSON string>}`` where the JSON encodes a list of
            segments, each with ``start``/``end`` times, ``text``, and a
            ``words`` list of per-word timestamps.
        """
        # Accept either {"inputs": <bytes>} or the raw bytes payload itself.
        inputs = data.pop("inputs", data)
        # Decode the container format to a 16 kHz float waveform.
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
        pred_out = transcribe(
            self.model,
            audio=audio_nparray,
            language="en",
            # Required so each segment carries a 'words' list with timings.
            word_timestamps=True,
        )
        # Keep only the fields callers need; drop Whisper's other metadata.
        # (Per-segment debug printing removed from the serving path.)
        result = [
            {
                "start": segment["start"],
                "end": segment["end"],
                "text": segment["text"],
                "words": [
                    {"start": w["start"], "end": w["end"], "word": w["word"]}
                    for w in segment["words"]
                ],
            }
            for segment in pred_out["segments"]
        ]
        return {"text": json.dumps(result, indent=2, ensure_ascii=False)}