# tools/speech_recognition_tool.py — speech-to-text tool (Whisper large-v3-turbo).
# Provenance: Final_Assignment_Template repo, commit 25b2219 (uploaded by FD900).
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, logging
import torch
import warnings
from tools.base_tool import BaseTool
class SpeechRecognitionTool(BaseTool):
    """Transcribe speech from an audio file using Whisper large-v3-turbo.

    Wraps the transformers automatic-speech-recognition pipeline with
    30-second chunking, and converts the per-chunk (window-relative)
    timestamps back into absolute times for timestamped output.
    """

    name = 'speech_to_text'
    description = 'Transcribes speech from audio input.'

    def __init__(self):
        # Prefer GPU when available; fp16 is only worthwhile on CUDA.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        dtype = torch.float16 if device == 'cuda' else torch.float32
        model_id = 'openai/whisper-large-v3-turbo'
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        ).to(device)
        self.processor = AutoProcessor.from_pretrained(model_id)
        # Silence transformers' chatty logging/deprecation noise.
        logging.set_verbosity_error()
        warnings.filterwarnings("ignore", category=FutureWarning)
        self.pipeline = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            tokenizer=self.processor.tokenizer,
            feature_extractor=self.processor.feature_extractor,
            torch_dtype=dtype,
            device=device,
            chunk_length_s=30,
            return_timestamps=True,
        )

    def transcribe(self, audio_path: str, with_timestamps: bool = False) -> str:
        """Transcribe *audio_path* and return the text.

        When ``with_timestamps`` is true, each segment is rendered as
        ``[start]\\ntext\\n[end]`` with absolute seconds (two decimals).
        """
        result = self.pipeline(audio_path)
        if not with_timestamps:
            return result['text'].strip()
        # FIX: use .get() — the pipeline result may lack a 'chunks' key,
        # and build via join instead of quadratic string +=.
        lines = []
        for chunk in self._parse_timed_chunks(result.get('chunks', [])):
            lines.append(
                f"[{chunk['start']:.2f}]\n{chunk['text']}\n[{chunk['end']:.2f}]"
            )
        return "\n".join(lines)

    def _parse_timed_chunks(self, chunks):
        """Convert window-relative chunk timestamps to absolute times.

        The pipeline resets timestamps every ``chunk_length_s`` (30 s)
        window; a timestamp that moves backwards signals a wrap, so we
        accumulate a 30 s offset each time. Returns a list of dicts
        with ``start``, ``end`` (absolute seconds) and stripped ``text``;
        empty-text chunks are dropped.
        """
        absolute_offset = 0.0
        current_offset = 0.0
        max_chunk = 30.0  # must match chunk_length_s used by the pipeline
        normalized = []
        for c in chunks:
            start, end = c['timestamp']
            if start is None:
                # No usable timing information for this chunk; skip it.
                continue
            # Start jumped backwards -> the 30 s window wrapped around.
            if start < current_offset:
                absolute_offset += max_chunk
            current_offset = start
            start_time = absolute_offset + start
            # FIX: Whisper can emit (start, None) for the final chunk;
            # the original `end < start` then raised TypeError. Treat a
            # missing end as a zero-length segment at `start`.
            if end is None:
                end = start
            if end < start:
                absolute_offset += max_chunk
            end_time = absolute_offset + end
            current_offset = end
            text = c['text'].strip()
            if text:
                normalized.append(
                    {"start": start_time, "end": end_time, "text": text}
                )
        return normalized