liuyang commited on
Commit
7cf016f
·
1 Parent(s): aaba71b

Add batched inference support in WhisperTranscriber for improved transcription performance. Update methods to accept batch size parameters and adjust output formatting accordingly.

Browse files
Files changed (1) hide show
  1. app.py +68 -40
app.py CHANGED
@@ -28,7 +28,7 @@ import subprocess
28
  import os
29
  import tempfile
30
  import spaces
31
- from faster_whisper import WhisperModel
32
  from faster_whisper.vad import VadOptions
33
  import requests
34
  import base64
@@ -64,6 +64,7 @@ model_cache_path = LOCAL_DIR # <-- this is what we pass to WhisperModel
64
 
65
  # Lazy global holder ----------------------------------------------------------
66
  _whisper = None
 
67
  _diarizer = None
68
 
69
  # Create global diarization pipeline
@@ -87,7 +88,7 @@ except Exception as e:
87
 
88
  @spaces.GPU # GPU is guaranteed to exist *inside* this function
89
  def _load_models():
90
- global _whisper, _diarizer
91
  if _whisper is None:
92
  print("Loading Whisper model...")
93
  _whisper = WhisperModel(
@@ -95,8 +96,11 @@ def _load_models():
95
  device="cuda",
96
  compute_type="float16",
97
  )
98
- print("Whisper model loaded successfully")
99
- return _whisper, _diarizer
 
 
 
100
 
101
  # -----------------------------------------------------------------------------
102
  class WhisperTranscriber:
@@ -121,18 +125,18 @@ class WhisperTranscriber:
121
  raise RuntimeError(f"Audio conversion failed: {e}")
122
 
123
  @spaces.GPU # each call gets a GPU slice
124
- def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None):
125
- """Transcribe the entire audio file without speaker diarization"""
126
- whisper, _ = _load_models() # models live on the GPU
127
 
128
- print("Transcribing full audio...")
129
  start_time = time.time()
130
 
131
- # Prepare options
132
  options = dict(
133
  language=language,
134
  beam_size=5,
135
- vad_filter=True,
136
  vad_parameters=VadOptions(
137
  max_speech_duration_s=whisper.feature_extractor.chunk_length,
138
  min_speech_duration_ms=100,
@@ -146,8 +150,12 @@ class WhisperTranscriber:
146
  task="translate" if translate else "transcribe",
147
  )
148
 
149
- # Transcribe the entire audio
150
- segments, transcript_info = whisper.transcribe(audio_path, batch_size=24, **options)
 
 
 
 
151
  segments = list(segments)
152
 
153
  detected_language = transcript_info.language
@@ -176,9 +184,9 @@ class WhisperTranscriber:
176
  "words": words_list,
177
  "duration": float(seg.end - seg.start)
178
  })
179
- print(results)
180
  transcription_time = time.time() - start_time
181
- print(f"Full audio transcribed in {transcription_time:.2f} seconds")
182
 
183
  return results, detected_language
184
 
@@ -214,14 +222,14 @@ class WhisperTranscriber:
214
  return audio_segments
215
 
216
  @spaces.GPU # each call gets a GPU slice
217
- def transcribe_audio_segments(self, audio_segments, language=None, translate=False, prompt=None):
218
- """Transcribe multiple audio segments using faster_whisper"""
219
- whisper, diarizer = _load_models() # models live on the GPU
220
 
221
- print(f"Transcribing {len(audio_segments)} audio segments...")
222
  start_time = time.time()
223
 
224
- # Prepare options similar to replicate.py
225
  options = dict(
226
  language=language,
227
  beam_size=5,
@@ -245,8 +253,12 @@ class WhisperTranscriber:
245
  for i, segment in enumerate(audio_segments):
246
  print(f"Processing segment {i+1}/{len(audio_segments)}")
247
 
248
- # Transcribe this segment
249
- segments, transcript_info = whisper.transcribe(segment["audio_path"], batch_size=24, **options)
 
 
 
 
250
  segments = list(segments)
251
 
252
  # Get detected language from first segment
@@ -255,7 +267,7 @@ class WhisperTranscriber:
255
 
256
  # Process each transcribed segment
257
  for seg in segments:
258
- # Create result entry with detailed format like replicate.py
259
  words_list = []
260
  if seg.words:
261
  for word in seg.words:
@@ -283,14 +295,14 @@ class WhisperTranscriber:
283
  os.unlink(segment["audio_path"])
284
 
285
  transcription_time = time.time() - start_time
286
- print(f"All segments transcribed in {transcription_time:.2f} seconds")
287
 
288
  return results, detected_language
289
 
290
  @spaces.GPU # each call gets a GPU slice
291
  def perform_diarization(self, audio_path, num_speakers=None):
292
  """Perform speaker diarization"""
293
- whisper, diarizer = _load_models() # models live on the GPU
294
 
295
  if diarizer is None:
296
  print("Diarization model not available, creating single speaker segment")
@@ -376,7 +388,7 @@ class WhisperTranscriber:
376
  return grouped_segments
377
 
378
  @spaces.GPU # each call gets a GPU slice
379
- def process_audio_full(self, audio_file, language=None, translate=False, prompt=None, group_segments=True):
380
  """Process audio with full transcription (no speaker diarization)"""
381
  if audio_file is None:
382
  return {"error": "No audio file provided"}
@@ -389,9 +401,9 @@ class WhisperTranscriber:
389
  print("Converting audio format...")
390
  converted_audio_path = self.convert_audio_format(audio_file)
391
 
392
- # Step 2: Transcribe the entire audio
393
  transcription_results, detected_language = self.transcribe_full_audio(
394
- converted_audio_path, language, translate, prompt
395
  )
396
 
397
  # Step 3: Group segments if requested (based on time gaps and sentence endings)
@@ -403,7 +415,8 @@ class WhisperTranscriber:
403
  "segments": transcription_results,
404
  "language": detected_language,
405
  "num_speakers": 1, # Single speaker assumption
406
- "transcription_method": "full_audio"
 
407
  }
408
 
409
  except Exception as e:
@@ -418,7 +431,7 @@ class WhisperTranscriber:
418
 
419
  @spaces.GPU # each call gets a GPU slice
420
  def process_audio(self, audio_file, num_speakers=None, language=None,
421
- translate=False, prompt=None, group_segments=True):
422
  """Main processing function - diarization first, then transcription"""
423
  if audio_file is None:
424
  return {"error": "No audio file provided"}
@@ -439,21 +452,22 @@ class WhisperTranscriber:
439
  # Step 3: Cut audio into segments based on diarization
440
  audio_segments = self.cut_audio_segments(converted_audio_path, diarization_segments)
441
 
442
- # Step 4: Transcribe each segment
443
  transcription_results, detected_language = self.transcribe_audio_segments(
444
- audio_segments, language, translate, prompt
445
  )
446
 
447
  # Step 5: Group segments if requested
448
  if group_segments:
449
  transcription_results = self.group_segments_by_speaker(transcription_results)
450
 
451
- # Step 6: Return in replicate.py format
452
  return {
453
  "segments": transcription_results,
454
  "language": detected_language,
455
  "num_speakers": detected_num_speakers,
456
- "transcription_method": "diarized_segments"
 
457
  }
458
 
459
  except Exception as e:
@@ -478,12 +492,14 @@ def format_segments_for_display(result):
478
  language = result.get("language", "unknown")
479
  num_speakers = result.get("num_speakers", 1)
480
  method = result.get("transcription_method", "unknown")
 
481
 
482
  output = f"🎯 **Detection Results:**\n"
483
  output += f"- Language: {language}\n"
484
  output += f"- Speakers: {num_speakers}\n"
485
  output += f"- Segments: {len(segments)}\n"
486
- output += f"- Method: {method}\n\n"
 
487
 
488
  output += "📝 **Transcription:**\n\n"
489
 
@@ -499,7 +515,7 @@ def format_segments_for_display(result):
499
  return output
500
 
501
  @spaces.GPU
502
- def process_audio_gradio(audio_file, num_speakers, language, translate, prompt, group_segments, use_diarization):
503
  """Gradio interface function"""
504
  if use_diarization:
505
  result = transcriber.process_audio(
@@ -508,7 +524,8 @@ def process_audio_gradio(audio_file, num_speakers, language, translate, prompt,
508
  language=language if language != "auto" else None,
509
  translate=translate,
510
  prompt=prompt if prompt and prompt.strip() else None,
511
- group_segments=group_segments
 
512
  )
513
  else:
514
  result = transcriber.process_audio_full(
@@ -516,7 +533,8 @@ def process_audio_gradio(audio_file, num_speakers, language, translate, prompt,
516
  language=language if language != "auto" else None,
517
  translate=translate,
518
  prompt=prompt if prompt and prompt.strip() else None,
519
- group_segments=group_segments
 
520
  )
521
 
522
  formatted_output = format_segments_for_display(result)
@@ -533,7 +551,7 @@ with demo:
533
  # πŸŽ™οΈ Advanced Audio Transcription & Speaker Diarization
534
 
535
  Upload an audio file to get accurate transcription with speaker identification, powered by:
536
- - **Whisper Large V3 Turbo** with Flash Attention for fast transcription
537
  - **Pyannote 3.1** for speaker diarization
538
  - **ZeroGPU** acceleration for optimal performance
539
  """)
@@ -552,6 +570,15 @@ with demo:
552
  info="Uncheck for faster transcription without speaker identification"
553
  )
554
 
 
 
 
 
 
 
 
 
 
555
  num_speakers = gr.Slider(
556
  minimum=0,
557
  maximum=20,
@@ -613,7 +640,8 @@ with demo:
613
  translate,
614
  prompt,
615
  group_segments,
616
- use_diarization
 
617
  ],
618
  outputs=[output_text, output_json]
619
  )
@@ -622,7 +650,7 @@ with demo:
622
  gr.Markdown("### 📋 Usage Tips:")
623
  gr.Markdown("""
624
  - **Supported formats**: MP3, WAV, M4A, FLAC, OGG, and more
625
- - **Max duration**: Recommended under 10 minutes for optimal performance
626
  - **Speaker diarization**: Enable for speaker identification (slower), disable for faster transcription
627
  - **Languages**: Supports 100+ languages with auto-detection
628
  - **Vocabulary**: Add names and technical terms in the prompt for better accuracy
 
28
  import os
29
  import tempfile
30
  import spaces
31
+ from faster_whisper import WhisperModel, BatchedInferencePipeline
32
  from faster_whisper.vad import VadOptions
33
  import requests
34
  import base64
 
64
 
65
  # Lazy global holder ----------------------------------------------------------
66
  _whisper = None
67
+ _batched_whisper = None
68
  _diarizer = None
69
 
70
  # Create global diarization pipeline
 
88
 
89
  @spaces.GPU # GPU is guaranteed to exist *inside* this function
90
  def _load_models():
91
+ global _whisper, _batched_whisper, _diarizer
92
  if _whisper is None:
93
  print("Loading Whisper model...")
94
  _whisper = WhisperModel(
 
96
  device="cuda",
97
  compute_type="float16",
98
  )
99
+
100
+ # Create batched inference pipeline for improved performance
101
+ _batched_whisper = BatchedInferencePipeline(model=_whisper)
102
+ print("Whisper model and batched pipeline loaded successfully")
103
+ return _whisper, _batched_whisper, _diarizer
104
 
105
  # -----------------------------------------------------------------------------
106
  class WhisperTranscriber:
 
125
  raise RuntimeError(f"Audio conversion failed: {e}")
126
 
127
  @spaces.GPU # each call gets a GPU slice
128
+ def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None, batch_size=16):
129
+ """Transcribe the entire audio file without speaker diarization using batched inference"""
130
+ whisper, batched_whisper, _ = _load_models() # models live on the GPU
131
 
132
+ print(f"Transcribing full audio with batch size {batch_size}...")
133
  start_time = time.time()
134
 
135
+ # Prepare options for batched inference
136
  options = dict(
137
  language=language,
138
  beam_size=5,
139
+ vad_filter=True, # VAD is enabled by default for batched transcription
140
  vad_parameters=VadOptions(
141
  max_speech_duration_s=whisper.feature_extractor.chunk_length,
142
  min_speech_duration_ms=100,
 
150
  task="translate" if translate else "transcribe",
151
  )
152
 
153
+ # Use batched inference for better performance
154
+ segments, transcript_info = batched_whisper.transcribe(
155
+ audio_path,
156
+ batch_size=batch_size,
157
+ **options
158
+ )
159
  segments = list(segments)
160
 
161
  detected_language = transcript_info.language
 
184
  "words": words_list,
185
  "duration": float(seg.end - seg.start)
186
  })
187
+
188
  transcription_time = time.time() - start_time
189
+ print(f"Full audio transcribed in {transcription_time:.2f} seconds using batch size {batch_size}")
190
 
191
  return results, detected_language
192
 
 
222
  return audio_segments
223
 
224
  @spaces.GPU # each call gets a GPU slice
225
+ def transcribe_audio_segments(self, audio_segments, language=None, translate=False, prompt=None, batch_size=8):
226
+ """Transcribe multiple audio segments using faster_whisper with batching"""
227
+ whisper, batched_whisper, _ = _load_models() # models live on the GPU
228
 
229
+ print(f"Transcribing {len(audio_segments)} audio segments with batch size {batch_size}...")
230
  start_time = time.time()
231
 
232
+ # Prepare options
233
  options = dict(
234
  language=language,
235
  beam_size=5,
 
253
  for i, segment in enumerate(audio_segments):
254
  print(f"Processing segment {i+1}/{len(audio_segments)}")
255
 
256
+ # Use batched inference for each segment
257
+ segments, transcript_info = batched_whisper.transcribe(
258
+ segment["audio_path"],
259
+ batch_size=batch_size,
260
+ **options
261
+ )
262
  segments = list(segments)
263
 
264
  # Get detected language from first segment
 
267
 
268
  # Process each transcribed segment
269
  for seg in segments:
270
+ # Create result entry with detailed format
271
  words_list = []
272
  if seg.words:
273
  for word in seg.words:
 
295
  os.unlink(segment["audio_path"])
296
 
297
  transcription_time = time.time() - start_time
298
+ print(f"All segments transcribed in {transcription_time:.2f} seconds using batch size {batch_size}")
299
 
300
  return results, detected_language
301
 
302
  @spaces.GPU # each call gets a GPU slice
303
  def perform_diarization(self, audio_path, num_speakers=None):
304
  """Perform speaker diarization"""
305
+ _, _, diarizer = _load_models() # models live on the GPU
306
 
307
  if diarizer is None:
308
  print("Diarization model not available, creating single speaker segment")
 
388
  return grouped_segments
389
 
390
  @spaces.GPU # each call gets a GPU slice
391
+ def process_audio_full(self, audio_file, language=None, translate=False, prompt=None, group_segments=True, batch_size=16):
392
  """Process audio with full transcription (no speaker diarization)"""
393
  if audio_file is None:
394
  return {"error": "No audio file provided"}
 
401
  print("Converting audio format...")
402
  converted_audio_path = self.convert_audio_format(audio_file)
403
 
404
+ # Step 2: Transcribe the entire audio with batching
405
  transcription_results, detected_language = self.transcribe_full_audio(
406
+ converted_audio_path, language, translate, prompt, batch_size
407
  )
408
 
409
  # Step 3: Group segments if requested (based on time gaps and sentence endings)
 
415
  "segments": transcription_results,
416
  "language": detected_language,
417
  "num_speakers": 1, # Single speaker assumption
418
+ "transcription_method": "full_audio_batched",
419
+ "batch_size": batch_size
420
  }
421
 
422
  except Exception as e:
 
431
 
432
  @spaces.GPU # each call gets a GPU slice
433
  def process_audio(self, audio_file, num_speakers=None, language=None,
434
+ translate=False, prompt=None, group_segments=True, batch_size=8):
435
  """Main processing function - diarization first, then transcription"""
436
  if audio_file is None:
437
  return {"error": "No audio file provided"}
 
452
  # Step 3: Cut audio into segments based on diarization
453
  audio_segments = self.cut_audio_segments(converted_audio_path, diarization_segments)
454
 
455
+ # Step 4: Transcribe each segment with batching
456
  transcription_results, detected_language = self.transcribe_audio_segments(
457
+ audio_segments, language, translate, prompt, batch_size
458
  )
459
 
460
  # Step 5: Group segments if requested
461
  if group_segments:
462
  transcription_results = self.group_segments_by_speaker(transcription_results)
463
 
464
+ # Step 6: Return results
465
  return {
466
  "segments": transcription_results,
467
  "language": detected_language,
468
  "num_speakers": detected_num_speakers,
469
+ "transcription_method": "diarized_segments_batched",
470
+ "batch_size": batch_size
471
  }
472
 
473
  except Exception as e:
 
492
  language = result.get("language", "unknown")
493
  num_speakers = result.get("num_speakers", 1)
494
  method = result.get("transcription_method", "unknown")
495
+ batch_size = result.get("batch_size", "N/A")
496
 
497
  output = f"🎯 **Detection Results:**\n"
498
  output += f"- Language: {language}\n"
499
  output += f"- Speakers: {num_speakers}\n"
500
  output += f"- Segments: {len(segments)}\n"
501
+ output += f"- Method: {method}\n"
502
+ output += f"- Batch Size: {batch_size}\n\n"
503
 
504
  output += "📝 **Transcription:**\n\n"
505
 
 
515
  return output
516
 
517
  @spaces.GPU
518
+ def process_audio_gradio(audio_file, num_speakers, language, translate, prompt, group_segments, use_diarization, batch_size):
519
  """Gradio interface function"""
520
  if use_diarization:
521
  result = transcriber.process_audio(
 
524
  language=language if language != "auto" else None,
525
  translate=translate,
526
  prompt=prompt if prompt and prompt.strip() else None,
527
+ group_segments=group_segments,
528
+ batch_size=batch_size
529
  )
530
  else:
531
  result = transcriber.process_audio_full(
 
533
  language=language if language != "auto" else None,
534
  translate=translate,
535
  prompt=prompt if prompt and prompt.strip() else None,
536
+ group_segments=group_segments,
537
+ batch_size=batch_size
538
  )
539
 
540
  formatted_output = format_segments_for_display(result)
 
551
  # πŸŽ™οΈ Advanced Audio Transcription & Speaker Diarization
552
 
553
  Upload an audio file to get accurate transcription with speaker identification, powered by:
554
+ - **Faster-Whisper Large V3 Turbo** with batched inference for optimal performance
555
  - **Pyannote 3.1** for speaker diarization
556
  - **ZeroGPU** acceleration for optimal performance
557
  """)
 
570
  info="Uncheck for faster transcription without speaker identification"
571
  )
572
 
573
+ batch_size = gr.Slider(
574
+ minimum=1,
575
+ maximum=32,
576
+ value=16,
577
+ step=1,
578
+ label="Batch Size",
579
+ info="Higher values = faster processing but more GPU memory usage. Recommended: 8-24"
580
+ )
581
+
582
  num_speakers = gr.Slider(
583
  minimum=0,
584
  maximum=20,
 
640
  translate,
641
  prompt,
642
  group_segments,
643
+ use_diarization,
644
+ batch_size
645
  ],
646
  outputs=[output_text, output_json]
647
  )
 
650
  gr.Markdown("### 📋 Usage Tips:")
651
  gr.Markdown("""
652
  - **Supported formats**: MP3, WAV, M4A, FLAC, OGG, and more
653
+ - **Batch Size**: Higher values (16-24) = faster processing but more GPU memory
654
  - **Speaker diarization**: Enable for speaker identification (slower), disable for faster transcription
655
  - **Languages**: Supports 100+ languages with auto-detection
656
  - **Vocabulary**: Add names and technical terms in the prompt for better accuracy