Spaces:

lyimo
/

speech_separation

Runtime error

App Files Community

lyimo committed on Oct 28, 2024

Commit

17508a1

verified ·

1 Parent(s): 59d91ab

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -74

app.py CHANGED Viewed

@@ -5,84 +5,133 @@ import os
 from pydub import AudioSegment
 import tempfile
 from speechbrain.pretrained.separation import SepformerSeparation
-class AudioDenoiser:
     def __init__(self):
-        # Initialize the SepFormer model for audio enhancement
         self.model = SepformerSeparation.from_hparams(
             source="speechbrain/sepformer-dns4-16k-enhancement",
             savedir='pretrained_models/sepformer-dns4-16k-enhancement'
         )
-        # Create output directory if it doesn't exist
-        os.makedirs("enhanced_audio", exist_ok=True)
-    def convert_audio_to_wav(self, input_path):
-        """
-        Convert any audio format to WAV with proper settings
-        Args:
-            input_path (str): Path to input audio file
-        Returns:
-            str: Path to converted WAV file
-        """
         try:
-            # Create a temporary file for the converted audio
-            temp_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
-            temp_wav_path = temp_wav.name
-            # Load audio using pydub (supports multiple formats)
-            audio = AudioSegment.from_file(input_path)
-            # Convert to mono if stereo
-            if audio.channels > 1:
-                audio = audio.set_channels(1)
-            # Export as WAV with proper settings
-            audio.export(
-                temp_wav_path,
-                format='wav',
-                parameters=[
-                    '-ar', '16000',  # Set sample rate to 16kHz
-                    '-ac', '1'       # Set channels to mono
-                ]
-            )
-            return temp_wav_path
         except Exception as e:
-            raise gr.Error(f"Error converting audio format: {str(e)}")
-    def enhance_audio(self, audio_path):
         """
-        Process the input audio file and return the enhanced version
-        Args:
-            audio_path (str): Path to the input audio file
-        Returns:
-            str: Path to the enhanced audio file
         """
         try:
-            # Convert input audio to proper WAV format
-            wav_path = self.convert_audio_to_wav(audio_path)
-            # Separate and enhance the audio
-            est_sources = self.model.separate_file(path=wav_path)
-            # Generate output filename
-            output_path = os.path.join("enhanced_audio", "enhanced_audio.wav")
-            # Save the enhanced audio
-            torchaudio.save(
-                output_path,
-                est_sources[:, :, 0].detach().cpu(),
-                16000  # Sample rate
-            )
-            # Clean up temporary file
-            os.unlink(wav_path)
             return output_path
@@ -91,11 +140,11 @@ class AudioDenoiser:
 def create_gradio_interface():
     # Initialize the denoiser
-    denoiser = AudioDenoiser()
     # Create the Gradio interface
     interface = gr.Interface(
-        fn=denoiser.enhance_audio,
         inputs=gr.Audio(
             type="filepath",
             label="Upload Noisy Audio"
@@ -104,21 +153,10 @@ def create_gradio_interface():
             label="Enhanced Audio",
             type="filepath"
         ),
-        title="Audio Denoising using SepFormer",
         description="""
-        This application uses the SepFormer model from SpeechBrain to enhance audio quality
-        by removing background noise. Supports various audio formats including MP3 and WAV.
-        """,
-        article="""
-        Supported audio formats:
-        - MP3
-        - WAV
-        - OGG
-        - FLAC
-        - M4A
-        and more...
-        The audio will automatically be converted to the correct format for processing.
         """
     )

 from pydub import AudioSegment
 import tempfile
 from speechbrain.pretrained.separation import SepformerSeparation
+import numpy as np
+import threading
+from queue import Queue
+import time
+class RealtimeAudioDenoiser:
     def __init__(self):
+        # Initialize the model
         self.model = SepformerSeparation.from_hparams(
             source="speechbrain/sepformer-dns4-16k-enhancement",
             savedir='pretrained_models/sepformer-dns4-16k-enhancement'
         )
+        # Move model to GPU if available
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model.to(self.device)
+        # Enable inference mode for better performance
+        self.model.eval()
+        torch.set_grad_enabled(False)
+        # Set chunk size for streaming (500ms chunks)
+        self.chunk_duration = 0.5  # seconds
+        self.sample_rate = 16000
+        self.chunk_size = int(self.sample_rate * self.chunk_duration)
+        # Initialize processing queue and buffer
+        self.processing_queue = Queue()
+        self.output_buffer = Queue()
+        self.is_processing = False
+        # Start processing thread
+        self.processing_thread = threading.Thread(target=self._process_queue)
+        self.processing_thread.daemon = True
+        self.processing_thread.start()
+        # Create output directory
+        os.makedirs("enhanced_audio", exist_ok=True)
+    def _optimize_model(self):
+        """Optimize model for inference"""
+        if self.device.type == 'cuda':
+            # Use mixed precision for faster processing
+            self.model = torch.quantization.quantize_dynamic(
+                self.model, {torch.nn.Linear}, dtype=torch.qint8
+            )
+            torch.backends.cudnn.benchmark = True
+    def _process_queue(self):
+        """Background thread for processing audio chunks"""
+        while True:
+            if not self.processing_queue.empty():
+                chunk = self.processing_queue.get()
+                if chunk is None:
+                    continue
+                # Process audio chunk
+                enhanced_chunk = self._enhance_chunk(chunk)
+                self.output_buffer.put(enhanced_chunk)
+            else:
+                time.sleep(0.01)  # Small delay to prevent CPU overuse
+    def _enhance_chunk(self, audio_chunk):
+        """Process a single chunk of audio"""
         try:
+            # Convert to tensor and move to device
+            chunk_tensor = torch.FloatTensor(audio_chunk).to(self.device)
+            chunk_tensor = chunk_tensor.unsqueeze(0)  # Add batch dimension
+            # Process with model
+            with torch.inference_mode():
+                enhanced = self.model.separate_batch(chunk_tensor)
+                enhanced = enhanced.squeeze(0).cpu().numpy()
+            return enhanced
         except Exception as e:
+            print(f"Error processing chunk: {str(e)}")
+            return audio_chunk
+    def process_stream(self, audio_path):
         """
+        Process audio in streaming fashion
         """
         try:
+            # Convert input audio to proper format
+            audio = AudioSegment.from_file(audio_path)
+            audio = audio.set_frame_rate(self.sample_rate)
+            audio = audio.set_channels(1)
+            # Convert to numpy array
+            samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
+            samples = samples / np.max(np.abs(samples))  # Normalize
+            # Process in chunks
+            enhanced_chunks = []
+            for i in range(0, len(samples), self.chunk_size):
+                chunk = samples[i:i + self.chunk_size]
+                # Pad last chunk if necessary
+                if len(chunk) < self.chunk_size:
+                    chunk = np.pad(chunk, (0, self.chunk_size - len(chunk)))
+                # Add to processing queue
+                self.processing_queue.put(chunk)
+            # Wait for all chunks to be processed
+            while self.processing_queue.qsize() > 0 or self.output_buffer.qsize() > 0:
+                if not self.output_buffer.empty():
+                    enhanced_chunks.append(self.output_buffer.get())
+                time.sleep(0.01)
+            # Combine chunks
+            enhanced_audio = np.concatenate(enhanced_chunks)
+            # Save enhanced audio
+            output_path = os.path.join("enhanced_audio", "enhanced_realtime.wav")
+            enhanced_audio = enhanced_audio * 32767  # Convert to int16 range
+            enhanced_audio = enhanced_audio.astype(np.int16)
+            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
+                torchaudio.save(
+                    f.name,
+                    torch.tensor(enhanced_audio).unsqueeze(0),
+                    self.sample_rate
+                )
+                os.replace(f.name, output_path)
             return output_path
 def create_gradio_interface():
     # Initialize the denoiser
+    denoiser = RealtimeAudioDenoiser()
     # Create the Gradio interface
     interface = gr.Interface(
+        fn=denoiser.process_stream,
         inputs=gr.Audio(
             type="filepath",
             label="Upload Noisy Audio"
             label="Enhanced Audio",
             type="filepath"
         ),
+        title="Real-time Audio Denoising using SepFormer",
         description="""
+        Optimized for real-time processing with low latency.
+        Processes audio in 500ms chunks for streaming applications.
         """
     )