Spaces: Sleeping
Sidak Singh committed · Commit 7b7174c · 1 Parent(s): 8b3bbb3
transcribing works

Browse files:
- __pycache__/transcriber.cpython-310.pyc +0 -0
- app.py +13 -12
- nodemon.json +27 -0
- transcriber.py +219 -71
__pycache__/transcriber.cpython-310.pyc CHANGED
Binary files a/__pycache__/transcriber.cpython-310.pyc and b/__pycache__/transcriber.cpython-310.pyc differ
app.py CHANGED
@@ -13,15 +13,16 @@ def process_mic_audio(audio):
    """Process audio from Gradio microphone and update transcription"""
    if audio is None:
        return gr.update(), gr.update()

    sr, y = audio

    # Add to processor and possibly trigger transcription
    buffer_size = processor.add_audio(y, sr)

    # Get current transcription
    transcription = processor.get_transcription()
+    print(transcription)

    # Return status update and transcription
    buffer_seconds = buffer_size / processor.sample_rate
    return (

@@ -45,29 +46,29 @@ def force_transcribe():
# Create Gradio interface
with gr.Blocks(title="Live Speech Transcription") as demo:
    gr.Markdown("# Live Speech Recognition with Buffer Playback")

    with gr.Row():
        audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Microphone Input")

    with gr.Row():
        status_output = gr.Textbox(label="Buffer Status", interactive=False)
        buffer_audio = gr.Audio(label="Current Buffer (Click to Play)", interactive=False)

    with gr.Row():
        clear_btn = gr.Button("Clear Buffer")
        play_btn = gr.Button("Get Buffer for Playback")
        force_btn = gr.Button("Force Transcribe")

    with gr.Row():
        transcription_output = gr.Textbox(label="Live Transcription", lines=5, interactive=False)

    # Connect components
    audio_input.stream(
        process_mic_audio,
        audio_input,
        [status_output, transcription_output]
    )

    clear_btn.click(clear_audio_buffer, None, [status_output, buffer_audio, transcription_output])
    play_btn.click(get_current_buffer, None, buffer_audio)
    force_btn.click(force_transcribe, None, transcription_output)

(The only substantive change in app.py is the new `print(transcription)` debug line; the remaining additions and deletions behind the +13/-12 counts are whitespace-only edits to blank lines and to the indentation of the `stream(...)` arguments.)
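For reference, the names the diff relies on (`processor`, `clear_audio_buffer`, `get_current_buffer`, `force_transcribe`) are defined outside the hunks shown. A minimal sketch of that scaffolding, assuming `AudioProcessor` is imported from `transcriber.py`; the constructor arguments below are illustrative, not values taken from this commit:

import gradio as gr
from transcriber import AudioProcessor

# Module-level processor shared by all callbacks (constructor args are assumed)
processor = AudioProcessor(model_size="base", device="cpu", compute_type="int8")

def clear_audio_buffer():
    # Reset the buffer and blank out the playback and transcription widgets
    status = processor.clear_buffer()  # returns "Buffers cleared"
    return status, None, ""

def get_current_buffer():
    # (sample_rate, np.ndarray) tuple or None, as gr.Audio expects
    return processor.get_playback_audio()

def force_transcribe():
    # Surface whatever has been transcribed so far
    return processor.get_transcription()

# The gr.Blocks layout in the diff above defines `demo`; launching it would
# follow here, e.g. demo.launch()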
nodemon.json ADDED
@@ -0,0 +1,27 @@
+{
+    "watch": [
+        "*.py",
+        "**/*.py"
+    ],
+    "ext": "py",
+    "ignore": [
+        "__pycache__/",
+        "*.pyc",
+        ".git/",
+        "node_modules/",
+        "venv/",
+        "env/",
+        ".pytest_cache/",
+        "*.log"
+    ],
+    "exec": "python3 transcriber.py",
+    "env": {
+        "PYTHONPATH": ".",
+        "PYTHONUNBUFFERED": "1"
+    },
+    "delay": 1000,
+    "verbose": true,
+    "restartable": "rs",
+    "colours": true,
+    "legacy-watch": false
+}
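With this file in the repo root, running `nodemon` should pick up the config automatically: it watches every `.py` file, waits out the 1000 ms debounce, then re-runs `python3 transcriber.py` with `PYTHONUNBUFFERED=1` so prints appear immediately; typing `rs` in the terminal forces a manual restart.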
transcriber.py CHANGED
@@ -11,25 +11,197 @@ class AudioProcessor:
        self.processed_length = 0  # Length of audio already processed
        self.sample_rate = 16000  # Default sample rate for whisper
        self.lock = threading.Lock()  # Thread safety for buffer access
-        self.transcription = ['']  # List of transcription segments
        self.min_process_length = 1 * self.sample_rate  # Process at least 1 second
        self.max_buffer_size = 30 * self.sample_rate  # Maximum buffer size (30 seconds)
+        self.overlap_size = 3 * self.sample_rate  # Keep 3 seconds of overlap when trimming
        self.last_process_time = time.time()
        self.process_interval = 1.0  # Process every 1 second
        self.is_processing = False  # Flag to prevent concurrent processing

+        self.full_transcription = ""  # Complete history of transcription
+        self.last_segment_text = ""  # Last segment that was transcribed
+        self.confirmed_transcription = ""  # Transcription that won't change (beyond overlap zone)
+
        # Initialize the whisper model
        self.audio_model = WhisperModel(model_size, device=device, compute_type=compute_type)
        print(f"Initialized {model_size} model on {device}")
+
+    def _trim_buffer_intelligently(self):
+        """
+        Trim the buffer while preserving transcription continuity.
+        Keep some overlap to maintain context for the next processing.
+        """
+        if len(self.audio_buffer) <= self.max_buffer_size:
+            return
+
+        # Calculate how much to trim (keep overlap_size for context)
+        trim_amount = len(self.audio_buffer) - self.max_buffer_size + self.overlap_size
+
+        # Make sure we don't trim more than we have
+        trim_amount = min(trim_amount, len(self.audio_buffer) - self.overlap_size)
+
+        if trim_amount > 0:
+            # Before trimming, finalize the transcription for the part we're
+            # removing, so we don't lose confirmed text
+            if self.processed_length > trim_amount:
+                # We're removing audio that was already processed; the
+                # transcription for this part is final
+                pass  # full_transcription already contains this
+
+            # Trim the buffer
+            self.audio_buffer = self.audio_buffer[trim_amount:]
+
+            # Adjust processed_length to account for trimmed audio
+            self.processed_length = max(0, self.processed_length - trim_amount)
+
+            # Reset last_segment_text since our context has changed; this forces
+            # the next processing pass to start fresh with overlap handling
+            self.last_segment_text = ""
+
+    def _process_audio_chunk(self):
+        """Process the current audio buffer and return new transcription"""
+        try:
+            with self.lock:
+                # Check if there's enough new content to process
+                unprocessed_length = len(self.audio_buffer) - self.processed_length
+                if unprocessed_length < self.min_process_length:
+                    self.is_processing = False
+                    return None
+
+                # Determine what portion to process: include some overlap
+                # from already processed audio for context
+                overlap_samples = min(self.overlap_size, self.processed_length)
+                start_pos = max(0, self.processed_length - overlap_samples)
+
+                # Process from start_pos to end of buffer
+                audio_to_process = self.audio_buffer[start_pos:].copy()
+                end_pos = len(self.audio_buffer)
+
+                # Normalize for transcription
+                audio_norm = audio_to_process.astype(np.float32)
+                if np.max(np.abs(audio_norm)) > 0:
+                    audio_norm = audio_norm / np.max(np.abs(audio_norm))
+
+                # Transcribe with faster settings for real-time processing
+                segments, info = self.audio_model.transcribe(
+                    audio_norm,
+                    beam_size=1,
+                    word_timestamps=False,
+                    vad_filter=True,
+                    vad_parameters=dict(min_silence_duration_ms=500)
+                )
+
+                result = list(segments)
+
+                if result:
+                    # Get the new text from all segments
+                    current_segment_text = " ".join([seg.text.strip() for seg in result if seg.text.strip()])
+
+                    if not current_segment_text:
+                        self.is_processing = False
+                        return None
+
+                    # Handle overlap and merge with existing transcription
+                    new_text = self._merge_transcription_intelligently(current_segment_text)
+
+                    if new_text:
+                        # Append new text to full transcription
+                        if self.full_transcription and not self.full_transcription.endswith(' '):
+                            self.full_transcription += " "
+                        self.full_transcription += new_text
+
+                    # Update state
+                    self.last_segment_text = current_segment_text
+                    self.processed_length = end_pos
+
+                    return self.full_transcription
+
+                return None
+
+        except Exception as e:
+            print(f"Transcription error: {e}")
+            return None
+        finally:
+            self.is_processing = False
+
+    def _merge_transcription_intelligently(self, new_segment_text):
+        """
+        Intelligently merge new transcription with existing text.
+        Handles overlap detection and prevents duplication.
+        """
+        if not new_segment_text or not new_segment_text.strip():
+            return ""
+
+        # If this is the first transcription or we reset context, use it directly
+        if not self.last_segment_text:
+            return new_segment_text
+
+        # Normalize text for comparison
+        import re
+
+        def normalize_for_comparison(text):
+            # Convert to lowercase and remove punctuation for comparison
+            text = text.lower()
+            text = re.sub(r'[^\w\s]', '', text)
+            return text.strip()
+
+        norm_prev = normalize_for_comparison(self.last_segment_text)
+        norm_new = normalize_for_comparison(new_segment_text)
+
+        if not norm_prev or not norm_new:
+            return new_segment_text
+
+        # Split into words for overlap detection
+        prev_words = norm_prev.split()
+        new_words = norm_new.split()
+
+        # Find the longest overlap between end of previous and start of new
+        max_overlap = min(len(prev_words), len(new_words), 15)  # Check up to 15 words
+        overlap_found = 0
+
+        for i in range(max_overlap, 2, -1):  # Minimum 3 words to consider overlap
+            if prev_words[-i:] == new_words[:i]:
+                overlap_found = i
+                break
+
+        # Handle special cases for numbers (counting sequences)
+        if overlap_found == 0:
+            # Check if we have a counting sequence
+            prev_numbers = [int(x) for x in re.findall(r'\b\d+\b', norm_prev)]
+            new_numbers = [int(x) for x in re.findall(r'\b\d+\b', norm_new)]
+
+            if prev_numbers and new_numbers:
+                max_prev = max(prev_numbers)
+                min_new = min(new_numbers)
+
+                # If there's a logical continuation, find where it starts
+                if min_new <= max_prev + 5:  # Allow some gap in counting
+                    new_text_words = new_segment_text.split()
+                    for i, word in enumerate(new_text_words):
+                        if re.search(r'\b\d+\b', word):
+                            num = int(re.search(r'\d+', word).group())
+                            if num > max_prev:
+                                return " ".join(new_text_words[i:])
+
+        # Apply overlap removal if found
+        if overlap_found > 0:
+            new_text_words = new_segment_text.split()
+            return " ".join(new_text_words[overlap_found:])
+        else:
+            # Check if new text is completely contained in previous (avoid duplication)
+            if norm_new in norm_prev:
+                return ""
+            # No overlap found, return the full new text
+            return new_segment_text
+
    def add_audio(self, audio_data, sr):
        """
        Add audio to the buffer and process if needed

        Args:
            audio_data (numpy.ndarray): Audio data to add
            sr (int): Sample rate of the audio data

        Returns:
            int: Current buffer size in samples
        """

@@ -37,11 +209,11 @@ class AudioProcessor:
        # Convert to mono if stereo
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)

-        #
+        # Convert to float32
        audio_data = audio_data.astype(np.float32)

-        # Resample
+        # Resample if needed
        if sr != self.sample_rate:
            try:
                # Use scipy for proper resampling

@@ -49,106 +221,82 @@ class AudioProcessor:
                audio_data = signal.resample(audio_data, number_of_samples)
            except Exception as e:
                print(f"Resampling error: {e}")
-                # Fallback
+                # Fallback resampling
                ratio = self.sample_rate / sr
                audio_data = np.interp(
                    np.arange(0, len(audio_data) * ratio, ratio),
                    np.arange(0, len(audio_data)),
                    audio_data
                )

-        # Apply fade-in to prevent clicks
+        # Apply fade-in to prevent clicks (5ms fade)
        fade_samples = min(int(0.005 * self.sample_rate), len(audio_data))
        if fade_samples > 0:
            fade_in = np.linspace(0, 1, fade_samples)
-            audio_data[:fade_samples]
+            audio_data[:fade_samples] *= fade_in

        # Add to buffer
        if len(self.audio_buffer) == 0:
            self.audio_buffer = audio_data
        else:
            self.audio_buffer = np.concatenate([self.audio_buffer, audio_data])

-        # Trim buffer if it gets too large
-        if len(self.audio_buffer) > self.max_buffer_size:
-            excess = len(self.audio_buffer) - self.max_buffer_size
-            self.audio_buffer = self.audio_buffer[excess:]
-            # Adjust processed length when trimming
-            self.processed_length = max(0, self.processed_length - excess)
+        # Intelligently trim buffer if it gets too large
+        self._trim_buffer_intelligently()

        # Check if we should process now
        should_process = (
            len(self.audio_buffer) >= self.min_process_length and
            time.time() - self.last_process_time >= self.process_interval and
            not self.is_processing
        )

        if should_process:
            self.last_process_time = time.time()
            self.is_processing = True
-            # Process
-            threading.Thread(target=self._process_audio, daemon=True).start()
+            # Process in a separate thread
+            threading.Thread(target=self._process_audio_chunk, daemon=True).start()

        return len(self.audio_buffer)

-    def _process_audio(self):
-        """Process the current audio buffer (should be called in a separate thread)"""
-        try:
-            with self.lock:
-                # Get unprocessed portion of the buffer
-                if self.processed_length >= len(self.audio_buffer):
-                    self.is_processing = False
-                    return
-
-                # Make a copy of the full buffer for processing
-                audio = self.audio_buffer.copy()
-
-            # Normalize for transcription
-            audio_norm = audio.astype(np.float32)
-            if np.max(np.abs(audio_norm)) > 0:
-                audio_norm = audio_norm / np.max(np.abs(audio_norm))
-
-            # Transcribe with whisper
-            segments, info = self.audio_model.transcribe(audio_norm, beam_size=5)
-            result = list(segments)
-
-            if result:
-                with self.lock:
-                    # Update the transcription
-                    self.transcription = [seg.text for seg in result]
-                    # Mark the whole buffer as processed
-                    self.processed_length = len(self.audio_buffer)
-        except Exception as e:
-            print(f"Transcription error: {e}")
-        finally:
-            # Reset processing flag
-            self.is_processing = False
-
-    def get_transcription(self):
-        """Get the current transcription text"""
-        with self.lock:
-            return " ".join(self.transcription)
-
    def clear_buffer(self):
-        """Clear the audio buffer"""
+        """Clear the audio buffer and transcription"""
        with self.lock:
            self.audio_buffer = np.array([])
            self.processed_length = 0
-            self.transcription = ['']
+            self.full_transcription = ""
+            self.last_segment_text = ""
+            self.confirmed_transcription = ""
            self.is_processing = False
        return "Buffers cleared"

+    def get_transcription(self):
+        """Get the current transcription text"""
+        with self.lock:
+            return self.full_transcription
+
    def get_playback_audio(self):
        """Get properly formatted audio for Gradio playback"""
        with self.lock:
            if len(self.audio_buffer) == 0:
                return None

            # Make a copy and ensure proper format for Gradio
            audio = self.audio_buffer.copy()

            # Ensure audio is in the correct range for playback (-1 to 1)
            if np.max(np.abs(audio)) > 0:
                audio = audio / max(1.0, np.max(np.abs(audio)))

            return (self.sample_rate, audio)

+    def get_buffer_info(self):
+        """Get information about the current buffer state"""
+        with self.lock:
+            return {
+                "buffer_length_seconds": len(self.audio_buffer) / self.sample_rate,
+                "processed_length_seconds": self.processed_length / self.sample_rate,
+                "unprocessed_length_seconds": (len(self.audio_buffer) - self.processed_length) / self.sample_rate,
+                "is_processing": self.is_processing,
+                "transcription_length": len(self.full_transcription)
+            }
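As a quick sanity check on the reworked API, here is a minimal usage sketch. The constructor arguments are assumed rather than taken from this commit, and `_merge_transcription_intelligently` is called directly only to illustrate the overlap handling:

import numpy as np
from transcriber import AudioProcessor

# Constructor args are illustrative; the actual defaults are not shown in this diff
proc = AudioProcessor(model_size="base", device="cpu", compute_type="int8")

# One second of 44.1 kHz audio: add_audio resamples to 16 kHz, applies the
# fade-in, appends to the rolling buffer, and may start a background transcription
chunk = np.zeros(44100, dtype=np.float32)
n = proc.add_audio(chunk, sr=44100)
print(f"buffer holds {n / proc.sample_rate:.1f}s of audio")

# The merge step drops a repeated prefix when consecutive windows overlap
proc.last_segment_text = "one two three four five"
print(proc._merge_transcription_intelligently("three four five six seven"))
# -> "six seven" (the three-word overlap is removed)

# Poll the merged transcription and buffer state from the UI thread
print(proc.get_transcription())
print(proc.get_buffer_info())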