File size: 1,551 Bytes
090c3d1
 
 
cb4b944
090c3d1
 
 
 
 
 
 
 
 
 
 
 
ce0bcc2
 
 
090c3d1
ce0bcc2
090c3d1
 
a21a26c
ce0bcc2
744dbe2
ce0bcc2
 
090c3d1
744dbe2
090c3d1
cb4b944
090c3d1
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# handler.py (for handling asr with faster_whisper)
from faster_whisper import WhisperModel, BatchedInferencePipeline
from typing import Any, Dict, List
from transformers.pipelines.audio_utils import ffmpeg_read

class EndpointHandler:
    """Inference Endpoints handler for ASR with faster-whisper.

    Loads the distilled Whisper large-v3 model on GPU once at startup and
    transcribes base64-decoded audio payloads, optionally via the batched
    inference pipeline.
    """

    def __init__(self, path: str = ""):
        # distil-large-v3 is the distilled Whisper v3 checkpoint.
        self.model_size = "distil-large-v3"
        # NOTE(review): device/compute_type are hard-coded for a float16-capable GPU.
        self.model = WhisperModel(self.model_size, device="cuda", compute_type="float16")
        # Batched pipeline wraps (and shares weights with) the same model.
        self.batched_model = BatchedInferencePipeline(model=self.model)

    def __call__(self, data: Dict[str, Any]) -> str:
        """Transcribe one audio payload.

        Args:
            data: request dict containing
                - 'inputs': the encoded audio bytes,
                - 'batched' (optional, default True): use the batched pipeline,
                - 'parameters' (optional): extra kwargs forwarded to transcribe().

        Returns:
            The transcribed text: all segment texts joined with spaces.
        """
        # Process input: fall back to the whole dict if 'inputs' is absent,
        # matching the original behavior.
        inputs = data.pop("inputs", data)
        # Decode the audio and resample to 16 kHz for Whisper.
        audio_nparray = ffmpeg_read(inputs, 16000)
        # Retrieve custom arguments.
        batched = data.pop("batched", True)  # default is True if not specified
        params = data.pop("parameters", {})  # all parameters for the model
        if batched:
            segments, info = self.batched_model.transcribe(audio_nparray, **params)
        else:
            # Fix: previously user 'parameters' were ignored on this path and
            # beam_size was hard-coded. Keep beam_size=5 as the default (backward
            # compatible) but let callers override it via 'parameters'.
            segments, info = self.model.transcribe(
                audio_nparray, **{"beam_size": 5, **params}
            )

        # segments is a lazy generator of Segment objects; join their text.
        return " ".join(segment.text for segment in segments)