| | from typing import Dict, Any |
| |
|
| | import torch |
| | from transformers import pipeline |
| | from transformers.pipelines.audio_utils import ffmpeg_read |
| |
|
| |
|
class EndpointHandler:
    """Custom Inference Endpoints handler around a Whisper ASR pipeline.

    The model is loaded once at construction time; each call transcribes
    raw encoded audio bytes, with decoding forced to Vietnamese
    transcription.
    """

    def __init__(self, asr_model_path: str = "vphu123/whisper-endpoint"):
        """Build the speech-recognition pipeline.

        Args:
            asr_model_path: Hub id or local path of the Whisper checkpoint.
        """
        # First GPU when available, else CPU.
        device = 0 if torch.cuda.is_available() else "cpu"
        self.pipe = pipeline(
            task="automatic-speech-recognition",
            model=asr_model_path,
            # Long inputs are split into 30 s chunks, so per-chunk
            # generation never needs a huge token budget.
            chunk_length_s=30,
            device=device,
            # Whisper's decoder supports at most 448 target positions.
            # The previous value of 10000 exceeds that limit and makes
            # generation raise on recent transformers versions; 440
            # leaves headroom for the forced decoder prompt tokens.
            max_new_tokens=440,
        )
        # Force Vietnamese transcription regardless of the language the
        # model would otherwise auto-detect.
        self.pipe.model.config.forced_decoder_ids = (
            self.pipe.tokenizer.get_decoder_prompt_ids(
                language="vi", task="transcribe"
            )
        )

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """Transcribe one request.

        Args:
            data: Request payload. ``data["inputs"]`` holds the raw
                encoded audio bytes (any container/codec ffmpeg can
                decode). If the key is absent, the payload itself is
                used as the audio bytes.

        Returns:
            ``{"text": <transcription>}``.
        """
        inputs = data.pop("inputs", data)
        # Decode and resample to 16 kHz mono float32, the sampling rate
        # Whisper expects. The pipeline consumes the numpy array directly,
        # so no tensor conversion is needed here (the original built an
        # unused torch tensor).
        audio_nparray = ffmpeg_read(inputs, 16000)
        result = self.pipe(audio_nparray)
        return {"text": result["text"]}