|
|
""" |
|
|
Created By: ishwor subedi |
|
|
Date: 2024-07-31 |
|
|
""" |
|
|
import torch |
|
|
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline |
|
|
|
|
|
|
|
|
class SpeechToText:
    """Speech recognition wrapper around a Hugging Face Whisper ASR pipeline.

    On construction the model and processor are loaded and an
    ``automatic-speech-recognition`` pipeline is built and stored in
    ``self.speech_to_text_pipeline``.

    Attributes:
        device: ``"cuda:0"`` when CUDA is available, otherwise ``"cpu"``.
        torch_dtype: ``torch.float16`` when CUDA is available, otherwise
            ``torch.float32``.
        model_id: Checkpoint identifier the model and processor were loaded from.
        model: The loaded speech seq2seq model, moved to ``device``.
        processor: The processor loaded for ``model_id``; supplies the
            tokenizer and feature extractor used by the pipeline.
        speech_to_text_pipeline: Pipeline built by :meth:`pipeline` with its
            default arguments.
    """

    DEFAULT_MODEL_ID = "openai/whisper-large-v3"

    def __init__(self, model_id: str = DEFAULT_MODEL_ID):
        """Load the model and processor and build the default pipeline.

        Args:
            model_id: Checkpoint to load. Defaults to
                ``"openai/whisper-large-v3"``.
        """
        # Query CUDA once so the device and dtype are always chosen consistently.
        use_cuda = torch.cuda.is_available()
        self.device = "cuda:0" if use_cuda else "cpu"
        self.torch_dtype = torch.float16 if use_cuda else torch.float32
        self.model_id = model_id

        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=self.torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        ).to(self.device)
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.speech_to_text_pipeline = self.pipeline()

    def pipeline(self, max_new_tokens=128, chunk_length_s=30, batch_size=16):
        """Build an ``automatic-speech-recognition`` pipeline for this model.

        Args:
            max_new_tokens: Forwarded to the pipeline as ``max_new_tokens``.
            chunk_length_s: Forwarded to the pipeline as ``chunk_length_s``.
            batch_size: Forwarded to the pipeline as ``batch_size``.

        Returns:
            The pipeline object created from ``self.model`` and the tokenizer
            and feature extractor of ``self.processor``, with timestamps
            enabled.
        """
        # Inside a method the bare name `pipeline` resolves to the module-level
        # function imported from transformers, not to this method.
        return pipeline(
            "automatic-speech-recognition",
            model=self.model,
            tokenizer=self.processor.tokenizer,
            feature_extractor=self.processor.feature_extractor,
            max_new_tokens=max_new_tokens,
            chunk_length_s=chunk_length_s,
            batch_size=batch_size,
            return_timestamps=True,
            torch_dtype=self.torch_dtype,
            device=self.device,
        )

    def transcribe_audio(self, audio, language: str = "en", task: str = "translate"):
        """Run the speech-to-text pipeline on ``audio``.

        Args:
            audio: Input passed unchanged to the pipeline.
            language: Language code forwarded to generation as ``language``.
            task: Generation task forwarded as ``task``. Defaults to
                ``"translate"`` to preserve the historical behaviour of this
                method; pass ``"transcribe"`` to keep the output in the
                spoken language. NOTE(review): per the Whisper documentation
                ``"translate"`` produces English text -- confirm that this is
                the intended default for a method named ``transcribe_audio``.

        Returns:
            A ``(chunks, text)`` tuple taken from the ``"chunks"`` and
            ``"text"`` entries of the pipeline result.
        """
        generate_kwargs = {"language": language, "task": task}
        result = self.speech_to_text_pipeline(
            audio,
            return_timestamps=True,
            generate_kwargs=generate_kwargs,
        )
        return result["chunks"], result["text"]
|
|
|