Update tools/speech_recognition_tool.py

tools/speech_recognition_tool.py (CHANGED, +45 -82)
@@ -1,107 +1,70 @@
-from smolagents import Tool
-import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, logging
+import torch
 import warnings
 
-
-class SpeechRecognitionTool(Tool):
+class SpeechRecognitionTool:
     name = 'speech_to_text'
-    description = 'Transcribes
+    description = 'Transcribes speech from audio input.'
 
-    inputs = {
-        'audio': {
-            'type': 'string',
-            'description': 'Local path to the audio file to transcribe.',
-        },
-        'with_time_markers': {
-            'type': 'boolean',
-            'description': 'Include timestamps in output.',
-            'nullable': True,
-            'default': False,
-        },
-    }
-
-    output_type = 'string'
-
-    chunk_length_s = 30  # chunk length for inference
-
-    def __new__(cls, *args, **kwargs):
-        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
-        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+    def __init__(self):
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        dtype = torch.float16 if device == 'cuda' else torch.float32
         model_id = 'openai/whisper-large-v3-turbo'
 
-        model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
             model_id,
             torch_dtype=dtype,
             low_cpu_mem_usage=True,
             use_safetensors=True,
         ).to(device)
 
-        processor = AutoProcessor.from_pretrained(model_id)
+        self.processor = AutoProcessor.from_pretrained(model_id)
 
         logging.set_verbosity_error()
         warnings.filterwarnings("ignore", category=FutureWarning)
 
-
-        ...
-            model=model,
-            tokenizer=processor.tokenizer,
-            feature_extractor=processor.feature_extractor,
+        self.pipeline = pipeline(
+            "automatic-speech-recognition",
+            model=self.model,
+            tokenizer=self.processor.tokenizer,
+            feature_extractor=self.processor.feature_extractor,
             torch_dtype=dtype,
             device=device,
-            chunk_length_s=
+            chunk_length_s=30,
             return_timestamps=True,
         )
 
-
-
-    def forward(self, audio: str, with_time_markers: bool = False) -> str:
-        """
-        Run speech recognition on the input audio file.
-
-        Args:
-            audio (str): Path to a local .wav or .mp3 file
-            with_time_markers (bool): Whether to return chunked timestamps
-
-        Returns:
-            str: Transcript or chunked transcript with [start]\n[text]\n[end]
-        """
-        result = self.pipe(audio)
-
-        if not with_time_markers:
+    def transcribe(self, audio_path: str, with_timestamps: bool = False) -> str:
+        result = self.pipeline(audio_path)
+
+        if not with_timestamps:
             return result['text'].strip()
 
-        ...
-            'end': end,
-            'text': chunk['text'].strip(),
-        })
-
-        return norm_chunks
+        formatted = ""
+        for chunk in self._parse_timed_chunks(result['chunks']):
+            formatted += f"[{chunk['start']:.2f}]\n{chunk['text']}\n[{chunk['end']:.2f}]\n"
+        return formatted.strip()
+
+    def _parse_timed_chunks(self, chunks):
+        absolute_offset = 0.0
+        current_offset = 0.0
+        normalized = []
+        max_chunk = 30.0
+
+        for c in chunks:
+            start, end = c['timestamp']
+            if start < current_offset:
+                absolute_offset += max_chunk
+            current_offset = start
+            start_time = absolute_offset + start
+
+            if end < start:
+                absolute_offset += max_chunk
+            end_time = absolute_offset + end
+            current_offset = end
+
+            text = c['text'].strip()
+            if text:
+                normalized.append({"start": start_time, "end": end_time, "text": text})
+
+        return normalized
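
For orientation, here is a minimal usage sketch of the class after this change. It is not part of the commit: the import path follows the file location above, while the 'sample.wav' filename is a made-up placeholder, and decoding a .wav/.mp3 path through the Hugging Face pipeline requires ffmpeg to be installed.

    # Hypothetical driver script, assuming tools/speech_recognition_tool.py
    # is importable and a local file sample.wav exists.
    from tools.speech_recognition_tool import SpeechRecognitionTool

    tool = SpeechRecognitionTool()  # loads openai/whisper-large-v3-turbo once

    print(tool.transcribe('sample.wav'))                        # plain transcript
    print(tool.transcribe('sample.wav', with_timestamps=True))  # [start]/text/[end] blocks

Since model loading happens in __init__, constructing the tool once and reusing it avoids reloading the weights on every call.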
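The new _parse_timed_chunks helper compensates for how the chunked pipeline reports timestamps: with chunk_length_s=30, times restart near zero at every 30-second window, so a start that goes backwards (or an end smaller than its start) signals a window boundary and bumps a running offset. The sketch below replicates that logic standalone so the arithmetic can be checked without loading the model; the indentation follows the reconstruction above and the chunk values are invented for illustration.

    # Standalone replica of _parse_timed_chunks (assumed indentation),
    # runnable without the Whisper model.
    def parse_timed_chunks(chunks, max_chunk=30.0):
        absolute_offset = current_offset = 0.0
        normalized = []
        for c in chunks:
            start, end = c['timestamp']
            if start < current_offset:   # timestamp went backwards: new 30 s window
                absolute_offset += max_chunk
            current_offset = start
            start_time = absolute_offset + start
            if end < start:              # chunk straddles a window boundary
                absolute_offset += max_chunk
            end_time = absolute_offset + end
            current_offset = end
            text = c['text'].strip()
            if text:
                normalized.append({'start': start_time, 'end': end_time, 'text': text})
        return normalized

    # Fabricated pipeline output: the third chunk's timestamps have reset.
    chunks = [
        {'timestamp': (0.0, 12.5), 'text': ' first'},
        {'timestamp': (12.5, 29.0), 'text': ' second'},
        {'timestamp': (1.0, 8.0), 'text': ' third'},  # 1.0 < 29.0 => +30 s offset
    ]
    print(parse_timed_chunks(chunks))
    # [{'start': 0.0, 'end': 12.5, 'text': 'first'},
    #  {'start': 12.5, 'end': 29.0, 'text': 'second'},
    #  {'start': 31.0, 'end': 38.0, 'text': 'third'}]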