# asr-model / handler.py
# (HuggingFace Inference Endpoints custom handler; last update commit 4d6b797)
import json
import whisper
from typing import Dict
from huggingface_hub import hf_hub_download
from whisper import load_model, transcribe
from transformers.pipelines.audio_utils import ffmpeg_read
SAMPLE_RATE = 16000
class EndpointHandler():
    def __init__(self, path=""):
        # Load the Whisper checkpoint once at endpoint startup; "large-v3"
        # is downloaded/cached on first use. Uses the `load_model` name
        # already imported at the top of the file for consistency.
        self.model = load_model("large-v3")

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """Transcribe an audio payload and return word-level timestamps.

        Args:
            data: request payload. ``data["inputs"]`` holds the raw audio
                file bytes (any container format ffmpeg can decode). An
                optional ``data["language"]`` key selects the transcription
                language; defaults to ``"en"`` for backward compatibility.

        Returns:
            A dict with a single ``"text"`` key whose value is a JSON string
            encoding a list of segments, each with ``start``/``end``/``text``
            and a ``words`` list of ``{start, end, word}`` entries.
        """
        # Optional language override; payloads that only carry "inputs"
        # behave exactly as before (English).
        language = data.pop("language", "en")
        # Fall back to treating the whole payload as the audio bytes when
        # no explicit "inputs" key is present (original behavior).
        inputs = data.pop("inputs", data)
        # Decode the compressed audio to a 16 kHz mono float waveform.
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
        pred_out = transcribe(
            self.model,
            audio=audio_nparray,
            language=language,
            word_timestamps=True,
        )
        result = [self._format_segment(segment) for segment in pred_out['segments']]
        return {"text": json.dumps(result, indent=2, ensure_ascii=False)}

    @staticmethod
    def _format_segment(segment: Dict) -> Dict:
        """Project one Whisper segment onto the response schema."""
        start = segment['start']
        end = segment['end']
        text = segment['text']
        print(f"Segment: {start} - {end}: {text}\n")
        return {
            "start": start,
            "end": end,
            "text": text,
            "words": [{
                "start": word['start'],
                "end": word['end'],
                "word": word['word']
            } for word in segment['words']],
        }