File size: 1,585 Bytes
5a7eea4
4d6b797
5a7eea4
 
d5ea96d
 
5a7eea4
 
 
 
 
 
dec48f3
5a7eea4
12ade36
5a7eea4
 
 
 
 
 
 
 
 
 
 
 
dec48f3
d5ea96d
 
 
 
 
 
 
5a7eea4
 
d5ea96d
 
 
 
 
5a7eea4
d5ea96d
 
 
5a7eea4
d5ea96d
 
 
 
5a7eea4
d5ea96d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import json
import whisper
from typing import Dict

from huggingface_hub import hf_hub_download
from whisper import load_model, transcribe
from transformers.pipelines.audio_utils import ffmpeg_read

SAMPLE_RATE = 16000


class EndpointHandler():
    """Inference endpoint handler: transcribes audio with Whisper large-v3.

    Returns English transcriptions with segment- and word-level timestamps,
    serialized as JSON in the response's "text" field.
    """

    def __init__(self, path=""):
        # Load the model once at startup. `path` is accepted for
        # endpoint-handler API compatibility but is not used: the model
        # name is fixed to "large-v3".
        self.model = whisper.load_model("large-v3")

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """Transcribe one audio payload.

        Args:
            data:
                dict whose "inputs" key holds the raw audio file as bytes;
                if the key is absent, the dict itself is treated as the payload.
        Return:
            A dict with a single "text" key whose value is a JSON string:
            a list of segments, each with "start", "end", "text", and a
            "words" list of {"start", "end", "word"} word timestamps.
        """
        # Accept either {"inputs": <bytes>} or the bare payload.
        inputs = data.pop("inputs", data)
        # Decode the compressed audio bytes into a waveform array at
        # Whisper's expected sample rate (16 kHz).
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)

        pred_out = transcribe(
            self.model,
            audio=audio_nparray,
            language="en",
            word_timestamps=True,
        )
        # Keep only the fields callers need from each predicted segment.
        result = [
            {
                "start": segment["start"],
                "end": segment["end"],
                "text": segment["text"],
                "words": [
                    {
                        "start": word["start"],
                        "end": word["end"],
                        "word": word["word"],
                    }
                    for word in segment["words"]
                ],
            }
            for segment in pred_out["segments"]
        ]
        return {"text": json.dumps(result, indent=2, ensure_ascii=False)}