Commit d36869b by liuyang · 1 Parent(s): 62ed41c

switch to whisperX

Files changed (1): app.py (+147 -126)
app.py CHANGED
@@ -84,6 +84,7 @@ import tempfile
 import spaces
 from faster_whisper import WhisperModel, BatchedInferencePipeline
 from faster_whisper.vad import VadOptions
+import whisperx
 import requests
 import base64
 from pyannote.audio import Pipeline, Inference, Model
@@ -154,39 +155,19 @@ from huggingface_hub import snapshot_download
 # -----------------------------------------------------------------------------
 MODELS = {
     "large-v3-turbo": {
-        "repo_id": "deepdml/faster-whisper-large-v3-turbo-ct2",
-        "local_dir": f"{CACHE_ROOT}/whisper_turbo_v3"
+        "whisperx_name": "large-v3-turbo",
     },
     "large-v3": {
-        "repo_id": "Systran/faster-whisper-large-v3",
-        "local_dir": f"{CACHE_ROOT}/whisper_large_v3"
+        "whisperx_name": "large-v3",
     },
     "large-v2": {
-        "repo_id": "Systran/faster-whisper-large-v2",
-        "local_dir": f"{CACHE_ROOT}/whisper_large_v2"
+        "whisperx_name": "large-v2",
     },
 }
 DEFAULT_MODEL = "large-v3-turbo"
 
-def _download_model(model_name: str):
-    """Downloads a model from the hub if not already present."""
-    if model_name not in MODELS:
-        raise ValueError(f"Model '{model_name}' not found in MODELS registry.")
-
-    model_info = MODELS[model_name]
-    if not os.path.exists(model_info["local_dir"]):
-        print(f"Downloading model '{model_name}' from {model_info['repo_id']}...")
-        snapshot_download(
-            repo_id=model_info["repo_id"],
-            local_dir=model_info["local_dir"],
-            local_dir_use_symlinks=True,
-            resume_download=True
-        )
-    return model_info["local_dir"]
-
-# Download the default model on startup
-for model in MODELS:
-    _download_model(model)
+# Supported languages for alignment models (whisperX)
+ALIGN_LANGUAGES = ["en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh", "ar", "nl", "tr", "pl", "cs", "sv", "da", "fi", "no", "uk"]
 
 
 # -----------------------------------------------------------------------------
@@ -412,60 +393,88 @@ def _process_single_chunk(task: dict, out_dir: str) -> dict:
 # model_cache_path = LOCAL_DIR # <‑‑ this is what we pass to WhisperModel
 
 # Lazy global holder ----------------------------------------------------------
-_whisper_models = {}
-_batched_whisper_models = {}
 _whipser_x_transcribe_models = {}
 _whipser_x_align_models = {}
 
 _diarizer = None
 _embedder = None
 
-# Create global diarization pipeline
-try:
-    print("Loading diarization model...")
-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
-    torch.set_float32_matmul_precision('high')
-
-    _diarizer = Pipeline.from_pretrained(
-        "pyannote/speaker-diarization-3.1",
-        use_auth_token=os.getenv("HF_TOKEN"),
-    ).to(torch.device("cuda"))
-
-    print("Diarization model loaded successfully")
-except Exception as e:
-    import traceback
-    traceback.print_exc()
-    print(f"Could not load diarization model: {e}")
-    _diarizer = None
-
-@spaces.GPU # GPU is guaranteed to exist *inside* this function
-def _load_models(model_name: str = DEFAULT_MODEL):
-    global _whisper_models, _batched_whisper_models, _diarizer
-
-    if model_name not in _whisper_models:
-        print(f"Loading Whisper model '{model_name}'...")
-
-        model_cache_path = _download_model(model_name)
-
-        model = WhisperModel(
-            model_cache_path,
-            device="cuda",
-            compute_type="float16",
-        )
-
-        # Create batched inference pipeline for improved performance
-        batched_model = BatchedInferencePipeline(model=model)
-
-        _whisper_models[model_name] = model
-        _batched_whisper_models[model_name] = batched_model
-
-        print(f"Whisper model '{model_name}' and batched pipeline loaded successfully")
-
-    whisper = _whisper_models[model_name]
-    batched_whisper = _batched_whisper_models[model_name]
-
-    return whisper, batched_whisper, _diarizer
+# Preload alignment and diarization models at startup (no GPU decorator)
+def _preload_alignment_and_diarization_models():
+    """Preload WhisperX alignment and diarization models on CUDA device"""
+    global _whipser_x_align_models, _diarizer
+
+    print("Preloading all WhisperX alignment models...")
+    for lang in ALIGN_LANGUAGES:
+        try:
+            print(f"Loading alignment model for language '{lang}'...")
+            device = "cuda"
+
+            align_model, align_metadata = whisperx.load_align_model(
+                language_code=lang,
+                device=device,
+                model_dir=CACHE_ROOT
+            )
+            _whipser_x_align_models[lang] = {
+                "model": align_model,
+                "metadata": align_metadata
+            }
+            print(f"Alignment model for '{lang}' loaded successfully")
+        except Exception as e:
+            print(f"Could not load alignment model for '{lang}': {e}")
+
+    # Create global diarization pipeline
+    try:
+        print("Loading diarization model...")
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32 = True
+        torch.set_float32_matmul_precision('high')
+
+        _diarizer = Pipeline.from_pretrained(
+            "pyannote/speaker-diarization-3.1",
+            use_auth_token=os.getenv("HF_TOKEN"),
+        ).to(torch.device("cuda"))
+
+        print("Diarization model loaded successfully")
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        print(f"Could not load diarization model: {e}")
+        _diarizer = None
+
+    print("WhisperX alignment and diarization models preloaded successfully!")
+
+# Call preload function at startup
+_preload_alignment_and_diarization_models()
+
+# Preload WhisperX transcribe models with GPU decorator
+@spaces.GPU
+def _preload_whisperx_transcribe_models():
+    """Preload all WhisperX transcribe models on GPU"""
+    global _whipser_x_transcribe_models
+
+    print("Preloading all WhisperX transcribe models on GPU...")
+    for model_name in MODELS.keys():
+        try:
+            print(f"Loading WhisperX transcribe model '{model_name}'...")
+            whisperx_model_name = MODELS[model_name]["whisperx_name"]
+            device = "cuda"
+            compute_type = "float16"
+
+            model = whisperx.load_model(
+                whisperx_model_name,
+                device=device,
+                compute_type=compute_type,
+                download_root=CACHE_ROOT
+            )
+            _whipser_x_transcribe_models[model_name] = model
+            print(f"WhisperX transcribe model '{model_name}' loaded successfully")
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            print(f"Could not load WhisperX transcribe model '{model_name}': {e}")
+
+    print("All WhisperX transcribe models preloaded successfully!")
 
 # -----------------------------------------------------------------------------
 class WhisperTranscriber:
@@ -494,81 +503,84 @@ class WhisperTranscriber:
 
     @spaces.GPU # each call gets a GPU slice
     def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None, batch_size=16, base_offset_s: float = 0.0, clip_timestamps=None, model_name: str = DEFAULT_MODEL, transcribe_options: dict = None):
-        """Transcribe the entire audio file without speaker diarization using batched inference"""
-        whisper, batched_whisper, _ = _load_models(model_name) # models live on the GPU
+        """Transcribe the entire audio file using WhisperX with alignment"""
+        global _whipser_x_transcribe_models, _whipser_x_align_models
+
+        # Get preloaded whisperX model
+        if model_name not in _whipser_x_transcribe_models:
+            raise ValueError(f"WhisperX model '{model_name}' not preloaded. Available models: {list(_whipser_x_transcribe_models.keys())}")
 
-        print(f"Transcribing full audio with '{model_name}' and batch size {batch_size}...")
+        whisper_model = _whipser_x_transcribe_models[model_name]
+
+        print(f"Transcribing full audio with WhisperX model '{model_name}' and batch size {batch_size}...")
         start_time = time.time()
 
-        # Prepare options for batched inference
-        options = dict(
+        # Load audio with whisperx
+        audio = whisperx.load_audio(audio_path)
+
+        # Transcribe with whisperx
+        result = whisper_model.transcribe(
+            audio,
             language=language,
-            beam_size=5,
-            word_timestamps=True,
-            initial_prompt=prompt,
-            condition_on_previous_text=False, # avoid runaway context
-            language_detection_segments=1,
-            task="translate" if translate else "transcribe",
+            batch_size=batch_size,
+            #initial_prompt=prompt,
+            #task="translate" if translate else "transcribe"
         )
-        if clip_timestamps:
-            options["vad_filter"] = False
-            options["clip_timestamps"] = clip_timestamps
-        else:
-            vad_options = transcribe_options.get("vad_parameters", None)
-            options["vad_filter"] = True # VAD is enabled by default for batched transcription
-            options["vad_parameters"] = VadOptions(**vad_options) if vad_options else VadOptions(
-                max_speech_duration_s=whisper.feature_extractor.chunk_length,
-                min_speech_duration_ms=180, # ignore ultra-short blips
-                min_silence_duration_ms=120, # split on short Mandarin pauses (if supported)
-                speech_pad_ms=120,
-                threshold=0.35,
-                neg_threshold=0.2,
-            )
-        if batch_size > 1:
-            # Use batched inference for better performance
-            segments, transcript_info = batched_whisper.transcribe(
-                audio_path,
-                batch_size=batch_size,
-                **options
-            )
-        else:
-            segments, transcript_info = whisper.transcribe(
-                audio_path,
-                **options
-            )
-        segments = list(segments)
 
-        detected_language = transcript_info.language
-        print("Detected language: ", detected_language, "segments: ", len(segments))
+        detected_language = result.get("language", language if language else "unknown")
+        segments = result.get("segments", [])
+
+        print(f"Detected language: {detected_language}, segments: {len(segments)}, transcribing done in {time.time() - start_time:.2f} seconds")
 
-        # Process segments
+        # Align whisper output with alignment model if language is supported
+        if detected_language in _whipser_x_align_models:
+            print(f"Performing WhisperX alignment for language '{detected_language}'...")
+            align_start = time.time()
+            try:
+                align_info = _whipser_x_align_models[detected_language]
+
+                result = whisperx.align(
+                    result["segments"],
+                    align_info["model"],
+                    align_info["metadata"],
+                    audio,
+                    "cuda",
+                    return_char_alignments=False
+                )
+                segments = result.get("segments", segments)
+                print(f"WhisperX alignment completed in {time.time() - align_start:.2f} seconds")
+            except Exception as e:
+                print(f"WhisperX alignment failed: {e}, using original timestamps")
+        else:
+            print(f"No WhisperX alignment model available for language '{detected_language}', using original timestamps")
+
+        # Process segments into the expected format
         results = []
         for seg in segments:
            # Create result entry with detailed format
            words_list = []
-            if seg.words:
-                for word in seg.words:
+            if "words" in seg:
+                for word in seg["words"]:
                     words_list.append({
-                        "start": float(word.start) + float(base_offset_s),
-                        "end": float(word.end) + float(base_offset_s),
-                        "word": word.word,
-                        "probability": word.probability,
+                        "start": float(word.get("start", 0.0)) + float(base_offset_s),
+                        "end": float(word.get("end", 0.0)) + float(base_offset_s),
+                        "word": word.get("word", ""),
+                        "probability": word.get("score", 1.0),
                         "speaker": "SPEAKER_00" # No speaker identification in full transcription
                     })
 
             results.append({
-                "start": float(seg.start) + float(base_offset_s),
-                "end": float(seg.end) + float(base_offset_s),
-                "text": seg.text,
+                "start": float(seg.get("start", 0.0)) + float(base_offset_s),
+                "end": float(seg.get("end", 0.0)) + float(base_offset_s),
+                "text": seg.get("text", ""),
                 "speaker": "SPEAKER_00", # Single speaker assumption
-                "avg_logprob": seg.avg_logprob,
+                "avg_logprob": seg.get("avg_logprob", 0.0) if "avg_logprob" in seg else 0.0,
                 "words": words_list,
-                "duration": float(seg.end - seg.start)
+                "duration": float(seg.get("end", 0.0)) - float(seg.get("start", 0.0))
             })
 
         transcription_time = time.time() - start_time
-        print(f"Full audio transcribed in {transcription_time:.2f} seconds using batch size {batch_size}")
-        print(results)
+        print(f"Full audio transcribed and aligned in {transcription_time:.2f} seconds using batch size {batch_size}")
        return results, detected_language
 
     # Removed audio cutting; transcription is done once on the full (preprocessed) audio
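whisperX returns plain dicts rather than faster_whisper's typed segment objects, which is why the parsing loop in the hunk above switches from attribute access to .get(). A representative aligned segment, with hypothetical values throughout, looks like this:

# Hypothetical shape of one aligned whisperX segment consumed by the loop above.
seg = {
    "start": 0.52,
    "end": 1.40,
    "text": " Hello there.",
    "words": [
        {"word": "Hello", "start": 0.52, "end": 0.90, "score": 0.97},
        {"word": "there.", "start": 1.02, "end": 1.40, "score": 0.94},
    ],
}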
@@ -576,9 +588,9 @@ class WhisperTranscriber:
     @spaces.GPU # each call gets a GPU slice
     def perform_diarization(self, audio_path, num_speakers=None, base_offset_s: float = 0.0):
         """Perform speaker diarization; return segments with global timestamps and per-speaker embeddings."""
-        _, _, diarizer = _load_models() # models live on the GPU
+        global _diarizer
 
-        if diarizer is None:
+        if _diarizer is None:
             print("Diarization model not available, creating single speaker segment")
             # Load audio to get duration
             waveform, sample_rate = torchaudio.load(audio_path)
@@ -611,7 +623,7 @@ class WhisperTranscriber:
         waveform, sample_rate = torchaudio.load(audio_path)
 
         # Perform diarization
-        diarization = diarizer(
+        diarization = _diarizer(
             {"waveform": waveform, "sample_rate": sample_rate},
             num_speakers=num_speakers,
         )
@@ -1526,5 +1538,14 @@ with demo:
     - Vocabulary: Add names and technical terms in the prompt for better accuracy
     """)
 
+# Preload all WhisperX transcribe models once at service initialization
+print("Preloading all WhisperX transcribe models at startup...")
+try:
+    _preload_whisperx_transcribe_models()
+    print("All WhisperX transcribe models preloaded at startup!")
+except Exception as e:
+    print(f"Warning: Could not preload WhisperX transcribe models at startup: {e}")
+    print("Models will be loaded on first use instead.")
+
 if __name__ == "__main__":
     demo.launch(debug=True)
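
For reference, the transcribe-then-align whisperX flow this commit switches to, as a minimal standalone sketch (not part of the commit): the model name, audio path, and batch size are illustrative placeholders, and it assumes whisperx is installed and a CUDA device is available.

# Minimal sketch of the whisperX flow adopted above; placeholders throughout.
import whisperx

device = "cuda"
audio = whisperx.load_audio("sample.wav")  # placeholder input file

# Batched transcription with a CTranslate2-backed Whisper model.
model = whisperx.load_model("large-v3-turbo", device=device, compute_type="float16")
result = model.transcribe(audio, batch_size=16)  # -> {"segments": [...], "language": ...}

# Forced alignment with a language-specific model for word-level timestamps.
align_model, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], align_model, metadata, audio, device,
                        return_char_alignments=False)

for seg in result["segments"]:
    print(seg["start"], seg["end"], seg["text"])  # each segment also carries "words" with scores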
 