Spaces:

Scrapyard-Brampton
/

Testing

Configuration error

App Files Files Community

Scrapyard committed on Aug 8, 2025

Commit

8b3bbb3

1 Parent(s): bc075a6

smoother voice experience

Browse files

Files changed (3) hide show

__pycache__/transcriber.cpython-310.pyc +0 -0
app.py +19 -121
transcriber.py +154 -0

__pycache__/transcriber.cpython-310.pyc ADDED Viewed

Binary file (4.33 kB). View file

app.py CHANGED Viewed

@@ -1,123 +1,13 @@
 import gradio as gr
 import numpy as np
-from faster_whisper import WhisperModel
-import threading
-import time
-import scipy.signal as signal
-# Initialize the WhisperModel
-audio_model = WhisperModel("tiny.en", device="cpu", compute_type="int8")
-class AudioProcessor:
-    def __init__(self):
-        self.audio_buffer = np.array([])  # Stores raw audio for playback
-        self.sample_rate = 16000          # Default sample rate for whisper
-        self.lock = threading.Lock()      # Thread safety for buffer access
-        self.transcription = ['']         # List of transcription segments
-        self.min_process_length = 1 * self.sample_rate  # Process at least 1 second
-        self.max_buffer_size = 30 * self.sample_rate  # Maximum buffer size (30 seconds)
-        self.last_process_time = time.time()
-        self.process_interval = 1.0       # Process every 1 second
-    def add_audio(self, audio_data, sr):
-        """Add audio to the buffer and process if needed"""
-        with self.lock:
-            # Convert to mono if stereo
-            if audio_data.ndim > 1:
-                audio_data = audio_data.mean(axis=1)
-            # Keep original format without normalization
-            audio_data = audio_data.astype(np.float32)
-            # Resample properly if needed
-            if sr != self.sample_rate:
-                try:
-                    number_of_samples = int(len(audio_data) * self.sample_rate / sr)
-                    audio_data = signal.resample(audio_data, number_of_samples)
-                except Exception as e:
-                    print(f"Resampling error: {e}")
-                    ratio = self.sample_rate / sr
-                    audio_data = np.interp(
-                        np.arange(0, len(audio_data) * ratio, ratio),
-                        np.arange(0, len(audio_data)),
-                        audio_data
-                    )
-            # Add to buffer without renormalizing
-            if len(self.audio_buffer) == 0:
-                self.audio_buffer = audio_data
-            else:
-                self.audio_buffer = np.concatenate([self.audio_buffer, audio_data])
-            # Trim buffer if it gets too large
-            if len(self.audio_buffer) > self.max_buffer_size:
-                self.audio_buffer = self.audio_buffer[-self.max_buffer_size:]
-            # Check if we should process now
-            should_process = (
-                len(self.audio_buffer) >= self.min_process_length and
-                time.time() - self.last_process_time >= self.process_interval
-            )
-            if should_process:
-                self.last_process_time = time.time()
-                # Process the buffer in a separate thread to avoid blocking
-                threading.Thread(target=self._process_audio).start()
-            return len(self.audio_buffer)
-    def _process_audio(self):
-        """Process the current audio buffer (should be called in a separate thread)"""
-        with self.lock:
-            # Make a copy for processing
-            audio = self.audio_buffer.copy()
-        # Normalize for transcription
-        audio_norm = audio.astype(np.float32)
-        if np.max(np.abs(audio_norm)) > 0:
-            audio_norm = audio_norm / np.max(np.abs(audio_norm))
-        try:
-            # Transcribe with whisper
-            segments, info = audio_model.transcribe(audio_norm, beam_size=5)
-            result = list(segments)
-            if result:
-                with self.lock:
-                    # Update the transcription
-                    self.transcription = [seg.text for seg in result]
-        except Exception as e:
-            print(f"Transcription error: {e}")
-    def get_transcription(self):
-        """Get the current transcription text"""
-        with self.lock:
-            return " ".join(self.transcription)
-    def clear_buffer(self):
-        """Clear the audio buffer"""
-        with self.lock:
-            self.audio_buffer = np.array([])
-            self.transcription = ['']
-            return "Buffers cleared"
-    def get_playback_audio(self):
-        """Get properly formatted audio for Gradio playback"""
-        with self.lock:
-            if len(self.audio_buffer) == 0:
-                return None
-            # Make a copy and ensure proper format for Gradio
-            audio = self.audio_buffer.copy()
-            # Ensure audio is in the correct range for playback (-1 to 1)
-            if np.max(np.abs(audio)) > 0:
-                audio = audio / max(1.0, np.max(np.abs(audio)))
-            return (self.sample_rate, audio)
-# Create processor instance
-processor = AudioProcessor()
 def process_mic_audio(audio):
     """Process audio from Gradio microphone and update transcription"""
@@ -135,7 +25,7 @@ def process_mic_audio(audio):
     # Return status update and transcription
     buffer_seconds = buffer_size / processor.sample_rate
     return (
-        f"Buffer size: {buffer_size} samples ({buffer_seconds:.2f} seconds)",
         transcription
     )
@@ -147,8 +37,13 @@ def get_current_buffer():
     """Get the current buffer for playback"""
     return processor.get_playback_audio()
 # Create Gradio interface
-with gr.Blocks() as demo:
     gr.Markdown("# Live Speech Recognition with Buffer Playback")
     with gr.Row():
@@ -161,11 +56,12 @@ with gr.Blocks() as demo:
     with gr.Row():
         clear_btn = gr.Button("Clear Buffer")
         play_btn = gr.Button("Get Buffer for Playback")
     with gr.Row():
         transcription_output = gr.Textbox(label="Live Transcription", lines=5, interactive=False)
-    # Connect components - removed the 'every' parameter for compatibility
     audio_input.stream(
         process_mic_audio,
         audio_input,
@@ -174,6 +70,8 @@ with gr.Blocks() as demo:
     clear_btn.click(clear_audio_buffer, None, [status_output, buffer_audio, transcription_output])
     play_btn.click(get_current_buffer, None, buffer_audio)
-# Launch the interface
-demo.launch()

 import gradio as gr
 import numpy as np
+from transcriber import AudioProcessor
+# Create processor instance with more conservative settings
+processor = AudioProcessor(model_size="tiny.en", device="cpu")
+# Adjust some settings for better quality
+processor.min_process_length = 2 * processor.sample_rate  # Need at least 2 seconds before processing
+processor.process_interval = 1.5  # Process at most every 1.5 seconds
 def process_mic_audio(audio):
     """Process audio from Gradio microphone and update transcription"""
     # Return status update and transcription
     buffer_seconds = buffer_size / processor.sample_rate
     return (
+        f"Buffer: {buffer_seconds:.1f}s | Processed: {processor.processed_length/processor.sample_rate:.1f}s",
         transcription
     )
     """Get the current buffer for playback"""
     return processor.get_playback_audio()
+def force_transcribe():
+    """Force transcription of current buffer"""
+    processor._process_audio()
+    return processor.get_transcription()
 # Create Gradio interface
+with gr.Blocks(title="Live Speech Transcription") as demo:
     gr.Markdown("# Live Speech Recognition with Buffer Playback")
     with gr.Row():
     with gr.Row():
         clear_btn = gr.Button("Clear Buffer")
         play_btn = gr.Button("Get Buffer for Playback")
+        force_btn = gr.Button("Force Transcribe")
     with gr.Row():
         transcription_output = gr.Textbox(label="Live Transcription", lines=5, interactive=False)
+    # Connect components
     audio_input.stream(
         process_mic_audio,
         audio_input,
     clear_btn.click(clear_audio_buffer, None, [status_output, buffer_audio, transcription_output])
     play_btn.click(get_current_buffer, None, buffer_audio)
+    force_btn.click(force_transcribe, None, transcription_output)
+if __name__ == "__main__":
+    # Launch the interface
+    demo.launch()

transcriber.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import numpy as np
+import threading
+import time
+from faster_whisper import WhisperModel
+import scipy.signal as signal
class AudioProcessor:
    """Buffer streamed audio and keep a running Whisper transcription of it.

    Incoming chunks are converted to mono float32 at ``sample_rate`` and
    appended to a rolling buffer capped at ``max_buffer_size`` samples.
    ``add_audio`` schedules a background transcription of the whole buffer
    at most once per ``process_interval`` seconds; ``_process_audio`` may
    also be called directly to transcribe synchronously.

    Public attributes (``min_process_length``, ``process_interval``,
    ``max_buffer_size``, ``processed_length``, ...) may be read or tuned by
    callers after construction.
    """

    def __init__(self, model_size="tiny.en", device="cpu", compute_type="int8"):
        """Initialize the audio processor with configurable parameters.

        Args:
            model_size (str): Whisper model name passed to ``WhisperModel``.
            device (str): Device passed to ``WhisperModel``.
            compute_type (str): Compute type passed to ``WhisperModel``.
        """
        self._init_state()
        # Initialize the whisper model
        self.audio_model = WhisperModel(model_size, device=device, compute_type=compute_type)
        print(f"Initialized {model_size} model on {device}")

    def _init_state(self):
        """Set up buffer, timing, and synchronization state (no model load)."""
        self.audio_buffer = np.array([], dtype=np.float32)  # Raw audio kept for playback
        self.processed_length = 0         # Leading samples of the buffer already transcribed
        self.sample_rate = 16000          # Sample rate the buffer is stored at
        self.lock = threading.Lock()      # Guards buffer/transcription state
        self.transcription = ['']         # List of transcription segments
        self.min_process_length = 1 * self.sample_rate  # Process at least 1 second
        self.max_buffer_size = 30 * self.sample_rate    # Maximum buffer size (30 seconds)
        self.last_process_time = time.time()
        self.process_interval = 1.0       # Schedule at most one run per interval (seconds)
        self.is_processing = False        # True while a scheduled worker is pending/running
        self._transcribe_lock = threading.Lock()  # Serializes transcription runs
        self._generation = 0              # Bumped by clear_buffer() to invalidate in-flight runs
        self._trimmed_total = 0           # Total samples ever dropped from the buffer front

    def _resample(self, audio_data, sr):
        """Return ``audio_data`` (1-D float array at rate ``sr``) at ``self.sample_rate``."""
        target_len = int(len(audio_data) * self.sample_rate / sr)
        if target_len <= 0:
            # Empty (or too short to yield one output sample) chunk.
            return np.zeros(0, dtype=np.float32)
        try:
            # Use scipy for proper resampling
            resampled = signal.resample(audio_data, target_len)
        except Exception as e:
            print(f"Resampling error: {e}")
            # Fallback: linear interpolation across the *whole* chunk, sampled
            # at target_len evenly spaced positions.
            positions = np.linspace(0, len(audio_data) - 1, target_len)
            resampled = np.interp(positions, np.arange(len(audio_data)), audio_data)
        return np.asarray(resampled, dtype=np.float32)

    def _prepare_chunk(self, audio_data, sr):
        """Convert one incoming chunk to mono float32 at ``self.sample_rate``."""
        chunk = np.asarray(audio_data)
        # Convert to mono if stereo
        if chunk.ndim > 1:
            chunk = chunk.mean(axis=1)
        # Keep original amplitude scale (no normalization); astype also gives
        # us a private copy that is safe to modify in place below.
        chunk = chunk.astype(np.float32)
        if sr != self.sample_rate:
            chunk = self._resample(chunk, sr)
        # Apply a 5 ms fade-in at the start of the chunk
        fade_samples = min(int(0.005 * self.sample_rate), len(chunk))
        if fade_samples > 0:
            chunk[:fade_samples] *= np.linspace(0, 1, fade_samples, dtype=np.float32)
        return chunk

    def add_audio(self, audio_data, sr):
        """
        Add audio to the buffer and schedule processing if needed

        Args:
            audio_data (numpy.ndarray): Audio data to add; 2-D input is
                averaged over axis 1 to produce mono
            sr (int): Sample rate of the audio data

        Returns:
            int: Current buffer size in samples
        """
        # Preprocess before taking the lock so resampling does not block
        # readers of the buffer.
        chunk = self._prepare_chunk(audio_data, sr)

        with self.lock:
            self.audio_buffer = np.concatenate([self.audio_buffer, chunk])

            # Trim buffer if it gets too large, keeping the newest samples
            excess = len(self.audio_buffer) - self.max_buffer_size
            if excess > 0:
                self.audio_buffer = self.audio_buffer[excess:]
                # Adjust processed length when trimming
                self.processed_length = max(0, self.processed_length - excess)
                self._trimmed_total += excess

            # Check if we should process now
            now = time.time()
            should_process = (
                len(self.audio_buffer) >= self.min_process_length
                and now - self.last_process_time >= self.process_interval
                and not self.is_processing
            )
            if should_process:
                self.last_process_time = now
                self.is_processing = True
                # Process the buffer in a separate thread to avoid blocking
                threading.Thread(target=self._run_scheduled).start()

            return len(self.audio_buffer)

    def _run_scheduled(self):
        """Background-thread entry point: transcribe, then release the flag."""
        try:
            self._process_audio()
        finally:
            with self.lock:
                self.is_processing = False

    def _process_audio(self):
        """Transcribe the current audio buffer and store the result.

        Safe to call from a worker thread or directly: runs are serialized,
        so a direct call waits for any transcription already in progress.
        Errors raised during transcription are printed, not propagated.
        """
        with self._transcribe_lock:
            with self.lock:
                # Nothing new since the last successful run
                if self.processed_length >= len(self.audio_buffer):
                    return
                # Snapshot the full buffer plus the counters needed to tell
                # afterwards whether the buffer was cleared or trimmed.
                audio = self.audio_buffer.copy()
                generation = self._generation
                trimmed_before = self._trimmed_total

            try:
                # Normalize for transcription
                audio_norm = audio.astype(np.float32)
                peak = np.max(np.abs(audio_norm))
                if peak > 0:
                    audio_norm = audio_norm / peak
                # Transcribe with whisper (the lock is NOT held here)
                segments, _info = self.audio_model.transcribe(audio_norm, beam_size=5)
                result = list(segments)
            except Exception as e:
                print(f"Transcription error: {e}")
                return

            if not result:
                return

            with self.lock:
                if generation != self._generation:
                    # Buffer was cleared while transcribing; result is stale.
                    return
                # Update the transcription
                self.transcription = [seg.text for seg in result]
                # Only the snapshot was transcribed; audio appended meanwhile
                # is still unprocessed, and samples trimmed from the front
                # meanwhile no longer count.
                dropped = self._trimmed_total - trimmed_before
                self.processed_length = max(0, len(audio) - dropped)

    def get_transcription(self):
        """Get the current transcription text (segments joined by spaces)"""
        with self.lock:
            return " ".join(self.transcription)

    def clear_buffer(self):
        """Clear the audio buffer and transcription.

        Returns:
            str: The status message "Buffers cleared"
        """
        with self.lock:
            self.audio_buffer = np.array([], dtype=np.float32)
            self.processed_length = 0
            self.transcription = ['']
            # Invalidate any transcription still in flight; a running worker
            # releases is_processing itself when it finishes.
            self._generation += 1
            return "Buffers cleared"

    def get_playback_audio(self):
        """Get properly formatted audio for Gradio playback.

        Returns:
            tuple | None: ``(sample_rate, samples)`` with samples scaled into
            [-1, 1] when their peak exceeds 1, or None if the buffer is empty
        """
        with self.lock:
            if len(self.audio_buffer) == 0:
                return None
            # Work on a copy so the stored buffer keeps its original scale
            audio = self.audio_buffer.copy()
            peak = np.max(np.abs(audio))
            # Only scale down; audio already within [-1, 1] is left as is
            if peak > 0:
                audio = audio / max(1.0, peak)
            return (self.sample_rate, audio)