liuyang committed
Commit 99ff812 · 1 Parent(s): 77abe68

fast whisper

Files changed (2)
  1. app.py +118 -73
  2. requirements.txt +6 -3
app.py CHANGED
@@ -28,64 +28,38 @@ import subprocess
 import os
 import tempfile
 import spaces
-from transformers import pipeline
+from faster_whisper import WhisperModel
+from faster_whisper.vad import VadOptions
 from pyannote.audio import Pipeline
 import requests
 import base64
 
-# Install flash attention for acceleration
-'''
-try:
-    subprocess.run(
-        "pip install flash-attn --no-build-isolation",
-        env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-        shell=True,
-        check=True
-    )
-except subprocess.CalledProcessError:
-    print("Warning: Could not install flash-attn, falling back to default attention")
-'''
-
-# Create global Whisper pipeline
-pipe = pipeline(
-    "automatic-speech-recognition",
-    model="openai/whisper-large-v3-turbo",
-    torch_dtype=torch.float16,
+# Create global Whisper model
+print("Loading Whisper model...")
+model = WhisperModel(
+    "large-v3-turbo",
     device="cuda",
-    model_kwargs={"attn_implementation": "flash_attention_2"},
-    return_timestamps=True,
+    compute_type="float16",
 )
+print("Whisper model loaded successfully")
 
 # Create global diarization pipeline
 diarization_pipe = None
 try:
     print("Loading diarization model...")
-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
-    torch.set_float32_matmul_precision('high')
-
     diarization_pipe = Pipeline.from_pretrained(
         "pyannote/speaker-diarization-3.1",
         use_auth_token=os.getenv("HF_TOKEN"),
         torch_dtype=torch.float16,
     ).to(torch.device("cuda"))
-    pipe.model.half()  # FP16
-
-    for m in pipe.model.modules():  # compact LSTM weights
-        if isinstance(m, torch.nn.LSTM):
-            m.flatten_parameters()
-
-    pipe.model = torch.compile(pipe.model, mode="reduce-overhead")
     print("Diarization model loaded successfully")
 except Exception as e:
-    import traceback
-    traceback.print_exc()
     print(f"Could not load diarization model: {e}")
     diarization_pipe = None
 
 class WhisperTranscriber:
     def __init__(self):
-        self.pipe = pipe  # Use global Whisper pipeline
+        self.model = model  # Use global Whisper model
         self.diarization_model = diarization_pipe  # Use global diarization pipeline
 
     def convert_audio_format(self, audio_path):
@@ -137,42 +111,65 @@ class WhisperTranscriber:
 
     @spaces.GPU
     def transcribe_audio_segments(self, audio_segments, language=None, translate=False, prompt=None):
-        """Transcribe multiple audio segments"""
+        """Transcribe multiple audio segments using faster_whisper"""
         print(f"Transcribing {len(audio_segments)} audio segments...")
         start_time = time.time()
 
-        # Prepare generation kwargs
-        generate_kwargs = {}
-        if language:
-            generate_kwargs["language"] = language
-        if translate:
-            generate_kwargs["task"] = "translate"
-        if prompt:
-            generate_kwargs["prompt_ids"] = self.pipe.tokenizer.encode(prompt)
+        # Prepare options similar to replicate.py
+        options = dict(
+            language=language,
+            beam_size=5,
+            vad_filter=True,
+            vad_parameters=VadOptions(
+                max_speech_duration_s=self.model.feature_extractor.chunk_length,
+                min_speech_duration_ms=100,
+                speech_pad_ms=100,
+                threshold=0.25,
+                neg_threshold=0.2,
+            ),
+            word_timestamps=True,
+            initial_prompt=prompt,
+            language_detection_segments=1,
+            task="translate" if translate else "transcribe",
+        )
 
         results = []
+        detected_language = None
+
         for i, segment in enumerate(audio_segments):
             print(f"Processing segment {i+1}/{len(audio_segments)}")
 
             # Transcribe this segment
-            result = self.pipe(
-                segment["audio_path"],
-                return_timestamps=True,
-                generate_kwargs=generate_kwargs,
-                chunk_length_s=30,
-                batch_size=128,
-            )
+            segments, transcript_info = self.model.transcribe(segment["audio_path"], **options)
+            segments = list(segments)
 
-            # Extract text
-            text = result["text"].strip() if "text" in result else ""
+            # Get detected language from the first segment
+            if detected_language is None:
+                detected_language = transcript_info.language
 
-            # Create result entry
-            results.append({
-                "start_time": segment["start"],
-                "end_time": segment["end"],
-                "speaker_label": segment["speaker"],
-                "text": text
-            })
+            # Process each transcribed segment
+            for seg in segments:
+                # Create result entry with detailed format like replicate.py
+                words_list = []
+                if seg.words:
+                    for word in seg.words:
+                        words_list.append({
+                            "start": float(word.start) + segment["start"],
+                            "end": float(word.end) + segment["start"],
+                            "word": word.word,
+                            "probability": word.probability,
+                            "speaker": segment["speaker"]
+                        })
+
+                results.append({
+                    "start": float(seg.start) + segment["start"],
+                    "end": float(seg.end) + segment["start"],
+                    "text": seg.text,
+                    "speaker": segment["speaker"],
+                    "avg_logprob": seg.avg_logprob,
+                    "words": words_list,
+                    "duration": float(seg.end - seg.start)
+                })
 
         # Clean up temporary files
         for segment in audio_segments:
@@ -182,7 +179,7 @@ class WhisperTranscriber:
 
         transcription_time = time.time() - start_time
         print(f"All segments transcribed in {transcription_time:.2f} seconds")
 
-        return results
+        return results, detected_language
 
     def perform_diarization(self, audio_path, num_speakers=None):
         """Perform speaker diarization"""
@@ -228,6 +225,47 @@ class WhisperTranscriber:
 
         return diarize_segments, detected_num_speakers
 
+    def group_segments_by_speaker(self, segments, max_gap=1.0, max_duration=30.0):
+        """Group consecutive segments from the same speaker"""
+        if not segments:
+            return segments
+
+        grouped_segments = []
+        current_group = segments[0].copy()
+        sentence_end_pattern = r"[.!?]+"
+
+        for segment in segments[1:]:
+            time_gap = segment["start"] - current_group["end"]
+            current_duration = current_group["end"] - current_group["start"]
+
+            # Conditions for combining segments
+            can_combine = (
+                segment["speaker"] == current_group["speaker"] and
+                time_gap <= max_gap and
+                current_duration < max_duration and
+                not re.search(sentence_end_pattern, current_group["text"][-1:])
+            )
+
+            if can_combine:
+                # Merge segments
+                current_group["end"] = segment["end"]
+                current_group["text"] += " " + segment["text"]
+                current_group["words"].extend(segment["words"])
+                current_group["duration"] = current_group["end"] - current_group["start"]
+            else:
+                # Start a new group
+                grouped_segments.append(current_group)
+                current_group = segment.copy()
+
+        grouped_segments.append(current_group)
+
+        # Clean up text
+        for segment in grouped_segments:
+            segment["text"] = re.sub(r"\s+", " ", segment["text"]).strip()
+            segment["text"] = re.sub(r"\s+([.,!?])", r"\1", segment["text"])
+
+        return grouped_segments
+
     @spaces.GPU
     def process_audio(self, audio_file, num_speakers=None, language=None,
                       translate=False, prompt=None, group_segments=True):
@@ -252,14 +290,19 @@ class WhisperTranscriber:
 
             audio_segments = self.cut_audio_segments(converted_audio_path, diarization_segments)
 
             # Step 4: Transcribe each segment
-            transcription_results = self.transcribe_audio_segments(
+            transcription_results, detected_language = self.transcribe_audio_segments(
                 audio_segments, language, translate, prompt
             )
 
-            # Step 5: Return in requested format
+            # Step 5: Group segments if requested
+            if group_segments:
+                transcription_results = self.group_segments_by_speaker(transcription_results)
+
+            # Step 6: Return in replicate.py format
             return {
-                "speaker_count": detected_num_speakers,
-                "transcription": transcription_results
+                "segments": transcription_results,
+                "language": detected_language,
+                "num_speakers": detected_num_speakers
             }
 
         except Exception as e:
@@ -280,19 +323,21 @@ def format_segments_for_display(result):
 
     if "error" in result:
        return f"❌ Error: {result['error']}"
 
-    speaker_count = result.get("speaker_count", 1)
-    transcription = result.get("transcription", [])
+    segments = result.get("segments", [])
+    language = result.get("language", "unknown")
+    num_speakers = result.get("num_speakers", 1)
 
     output = f"🎯 **Detection Results:**\n"
-    output += f"- Speakers: {speaker_count}\n"
-    output += f"- Segments: {len(transcription)}\n\n"
+    output += f"- Language: {language}\n"
+    output += f"- Speakers: {num_speakers}\n"
+    output += f"- Segments: {len(segments)}\n\n"
 
     output += "📝 **Transcription:**\n\n"
 
-    for i, segment in enumerate(transcription, 1):
-        start_time = str(datetime.timedelta(seconds=int(segment["start_time"])))
-        end_time = str(datetime.timedelta(seconds=int(segment["end_time"])))
-        speaker = segment.get("speaker_label", "SPEAKER_00")
+    for i, segment in enumerate(segments, 1):
+        start_time = str(datetime.timedelta(seconds=int(segment["start"])))
+        end_time = str(datetime.timedelta(seconds=int(segment["end"])))
+        speaker = segment.get("speaker", "SPEAKER_00")
         text = segment["text"]
 
         output += f"**{speaker}** ({start_time} → {end_time})\n"
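For reference, a minimal sketch of the faster-whisper call pattern this commit switches to. It is not part of the commit: the audio path "sample.wav" and the CPU/int8 settings are illustrative assumptions for local testing, while the Space itself loads the model on CUDA with float16 as shown above.

    # Sketch only: transcribe() returns a lazy generator of segments plus a
    # TranscriptionInfo; decoding runs as the generator is consumed.
    from faster_whisper import WhisperModel

    # CPU/int8 is an assumed local-testing configuration; app.py uses cuda/float16
    model = WhisperModel("large-v3-turbo", device="cpu", compute_type="int8")

    segments, info = model.transcribe("sample.wav", beam_size=5, word_timestamps=True)
    print(f"Detected language: {info.language} (p={info.language_probability:.2f})")

    for seg in segments:  # iteration triggers the actual transcription
        print(f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}")
        for word in seg.words or []:  # per-word timings, as app.py consumes them
            print(f"  {word.word!r} {word.start:.2f}-{word.end:.2f} p={word.probability:.2f}")

Note that materializing the generator with list(segments), as app.py does, forces the full decode for each diarized segment before the results are merged back onto the global timeline.
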
requirements.txt CHANGED
@@ -1,11 +1,14 @@
 # 1. Do NOT pin torch/torchaudio here – keep the CUDA builds that come with the image
 torch==2.4.0
 transformers==4.48.0
-# pre-built wheel that matches torch+cu126
-https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.0.8/flash_attn-2.7.4.post1+cu126torch2.4-cp310-cp310-linux_x86_64.whl
+# Removed flash-attention since faster-whisper handles this internally
+# https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.0.8/flash_attn-2.7.4.post1+cu126torch2.4-cp310-cp310-linux_x86_64.whl
 pydantic==2.10.6
 
-# 2. Extra libs your app really needs
+# 2. Main whisper model
+faster-whisper>=1.0.0
+
+# 3. Extra libs your app really needs
 gradio==5.0.1
 spaces>=0.19.0
 pyannote.audio>=3.1.0
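
A quick sanity check for the new dependency (a sketch; the try/except CPU fallback is an assumption for machines without CUDA, while the Space itself takes the float16/CUDA path from app.py):

    # pip install "faster-whisper>=1.0.0"
    from faster_whisper import WhisperModel

    try:
        # Same settings as app.py: GPU with half-precision weights
        model = WhisperModel("large-v3-turbo", device="cuda", compute_type="float16")
    except Exception:
        # Assumed fallback for CPU-only environments: int8 quantization
        model = WhisperModel("large-v3-turbo", device="cpu", compute_type="int8")
    print("faster-whisper model ready")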