File size: 2,350 Bytes
a6908e4
fa9bc69
a6908e4
25b2219
a6908e4
25b2219
a6908e4
fa9bc69
a6908e4
fa9bc69
 
 
a6908e4
 
fa9bc69
a6908e4
 
 
 
 
 
fa9bc69
a6908e4
 
 
 
fa9bc69
 
 
 
 
a6908e4
 
fa9bc69
a6908e4
 
 
fa9bc69
 
a6908e4
fa9bc69
a6908e4
 
fa9bc69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, logging
import torch
import warnings
from tools.base_tool import BaseTool

class SpeechRecognitionTool(BaseTool):
    """Tool that transcribes speech from an audio file using Whisper.

    Wraps the Hugging Face automatic-speech-recognition pipeline around
    ``openai/whisper-large-v3-turbo`` and optionally reconstructs absolute
    timestamps from the pipeline's per-chunk (relative) timestamps.
    """

    name = 'speech_to_text'
    description = 'Transcribes speech from audio input.'

    # Pipeline chunk size in seconds. Whisper timestamps reset to zero at
    # every chunk boundary, so _parse_timed_chunks needs this same value to
    # rebuild absolute times — keep the two in sync via this one constant.
    CHUNK_LENGTH_S = 30.0

    def __init__(self):
        """Load the Whisper model/processor and build the ASR pipeline.

        Uses CUDA with float16 when a GPU is available, otherwise CPU with
        float32.
        """
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        dtype = torch.float16 if device == 'cuda' else torch.float32
        model_id = 'openai/whisper-large-v3-turbo'

        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        ).to(device)

        self.processor = AutoProcessor.from_pretrained(model_id)

        # Silence transformers log spam and FutureWarnings.
        # NOTE(review): both calls mutate process-global state, not just this
        # tool instance — confirm that is acceptable for the host application.
        logging.set_verbosity_error()
        warnings.filterwarnings("ignore", category=FutureWarning)

        self.pipeline = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            tokenizer=self.processor.tokenizer,
            feature_extractor=self.processor.feature_extractor,
            torch_dtype=dtype,
            device=device,
            chunk_length_s=int(self.CHUNK_LENGTH_S),
            return_timestamps=True,
        )

    def transcribe(self, audio_path: str, with_timestamps: bool = False) -> str:
        """Transcribe the audio file at *audio_path*.

        Returns the plain transcript, or — when *with_timestamps* is True —
        one ``[start]\\ntext\\n[end]`` section per recognized segment, with
        times in seconds formatted to two decimals.
        """
        result = self.pipeline(audio_path)

        if not with_timestamps:
            return result['text'].strip()

        # Build the pieces once and join, instead of quadratic ``+=``.
        parts = [
            f"[{chunk['start']:.2f}]\n{chunk['text']}\n[{chunk['end']:.2f}]\n"
            for chunk in self._parse_timed_chunks(result['chunks'])
        ]
        return "".join(parts).strip()

    def _parse_timed_chunks(self, chunks):
        """Convert per-chunk relative timestamps into absolute seconds.

        The pipeline's timestamps restart at zero each CHUNK_LENGTH_S window;
        a timestamp that jumps backwards signals a new window, so we add the
        window length to a running offset. Returns a list of
        ``{"start": float, "end": float, "text": str}`` dicts, skipping
        chunks whose text is empty after stripping.
        """
        absolute_offset = 0.0
        current_offset = 0.0
        normalized = []
        max_chunk = self.CHUNK_LENGTH_S

        for c in chunks:
            start, end = c['timestamp']
            # Start moved backwards: a new chunk window began.
            if start < current_offset:
                absolute_offset += max_chunk
                current_offset = start
            start_time = absolute_offset + start

            # BUG FIX: Whisper can emit ``end is None`` for the final chunk;
            # the original ``end < start`` comparison (and the later ``:.2f``
            # formatting) raised TypeError. Fall back to the chunk's start.
            if end is None:
                end = start
            elif end < start:
                # End wrapped into the next window within this same chunk.
                absolute_offset += max_chunk
            end_time = absolute_offset + end
            current_offset = end

            text = c['text'].strip()
            if text:
                normalized.append({"start": start_time, "end": end_time, "text": text})

        return normalized