File size: 1,585 Bytes
5a7eea4
4d6b797
5a7eea4
 
d5ea96d
 
5a7eea4
 
 
 
 
 
dec48f3
5a7eea4
12ade36
5a7eea4
 
 
 
 
 
 
 
 
 
 
 
dec48f3
d5ea96d
 
 
 
 
 
 
5a7eea4
 
d5ea96d
 
 
 
 
5a7eea4
d5ea96d
 
 
5a7eea4
d5ea96d
 
 
 
5a7eea4
d5ea96d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import json
import whisper
from typing import Dict

from huggingface_hub import hf_hub_download
from whisper import load_model, transcribe
from transformers.pipelines.audio_utils import ffmpeg_read

SAMPLE_RATE = 16000


class EndpointHandler():
    """Inference endpoint handler: transcribes audio with Whisper large-v3.

    Returns English transcriptions with segment- and word-level timestamps,
    serialized as JSON in the response's "text" field.
    """

    def __init__(self, path=""):
        # Load the model once at startup. `path` is accepted for
        # endpoint-handler API compatibility but is not used: the model
        # name is fixed to "large-v3".
        self.model = whisper.load_model("large-v3")

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """Transcribe one audio payload.

        Args:
            data:
                dict whose "inputs" key holds the raw audio file as bytes;
                if the key is absent, the dict itself is treated as the payload.
        Return:
            A dict with a single "text" key whose value is a JSON string:
            a list of segments, each with "start", "end", "text", and a
            "words" list of {"start", "end", "word"} word timestamps.
        """
        # Accept either {"inputs": <bytes>} or the bare payload.
        inputs = data.pop("inputs", data)
        # Decode the compressed audio bytes into a waveform array at
        # Whisper's expected sample rate (16 kHz).
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)

        pred_out = transcribe(
            self.model,
            audio=audio_nparray,
            language="en",
            word_timestamps=True,
        )
        # Keep only the fields callers need from each predicted segment.
        result = [
            {
                "start": segment["start"],
                "end": segment["end"],
                "text": segment["text"],
                "words": [
                    {
                        "start": word["start"],
                        "end": word["end"],
                        "word": word["word"],
                    }
                    for word in segment["words"]
                ],
            }
            for segment in pred_out["segments"]
        ]
        return {"text": json.dumps(result, indent=2, ensure_ascii=False)}