Commit 3de05cb
liuyang committed
Parent(s): d2ef882

switch to whisperX

Files changed:
- app.py (+115 -100)
- requirements.txt (+2 -3)
app.py
CHANGED
@@ -35,8 +35,7 @@ import subprocess
 import os
 import tempfile
 import spaces
-from faster_whisper import WhisperModel, BatchedInferencePipeline
-from faster_whisper.vad import VadOptions
+import whisperx
 import requests
 import base64
 from pyannote.audio import Pipeline, Inference, Model
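Note on the import swap: whisperx wraps a faster-whisper backend, so the explicit faster_whisper imports go away. A minimal sketch of the whisperx flow this commit moves to (file name and model choice are illustrative, not from the commit):

    import whisperx

    model = whisperx.load_model("large-v3-turbo", device="cuda", compute_type="float16")
    audio = whisperx.load_audio("sample.wav")        # 16 kHz mono float32 numpy array
    result = model.transcribe(audio, batch_size=16)  # dict with "segments" and "language"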
@@ -118,39 +117,19 @@ from huggingface_hub import snapshot_download
 # -----------------------------------------------------------------------------
 MODELS = {
     "large-v3-turbo": {
-        "repo_id": …,
-        "local_dir": f"{CACHE_ROOT}/whisper_turbo_v3"
+        "whisperx_name": "large-v3-turbo",
     },
     "large-v3": {
-        "repo_id": …,
-        "local_dir": f"{CACHE_ROOT}/whisper_large_v3"
+        "whisperx_name": "large-v3",
     },
     "large-v2": {
-        "repo_id": …,
-        "local_dir": f"{CACHE_ROOT}/whisper_large_v2"
+        "whisperx_name": "large-v2",
     },
 }
 DEFAULT_MODEL = "large-v3-turbo"

-
-def _download_model(model_name):
-    if model_name not in MODELS:
-        raise ValueError(f"Model '{model_name}' not found in MODELS registry.")
-
-    model_info = MODELS[model_name]
-    if not os.path.exists(model_info["local_dir"]):
-        print(f"Downloading model '{model_name}' from {model_info['repo_id']}...")
-        snapshot_download(
-            repo_id=model_info["repo_id"],
-            local_dir=model_info["local_dir"],
-            local_dir_use_symlinks=True,
-            resume_download=True
-        )
-    return model_info["local_dir"]
-
-# Download the default model on startup
-for model in MODELS:
-    _download_model(model)
+# Supported languages for alignment models
+ALIGN_LANGUAGES = ["en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh", "ar", "nl", "tr", "pl", "cs", "sv", "da", "fi", "no", "uk"]


 # -----------------------------------------------------------------------------
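Note on the registry change: whisperx resolves and downloads checkpoints itself, so the repo_id/local_dir bookkeeping and the _download_model() helper are dropped; each entry keeps only the name whisperx understands. A hypothetical helper, mirroring how the new registry is consumed later in the file:

    def resolve_whisperx_name(model_name: str) -> str:
        # hypothetical lookup helper, not part of the commit
        if model_name not in MODELS:
            raise ValueError(f"Model '{model_name}' not found in MODELS registry.")
        return MODELS[model_name]["whisperx_name"]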
@@ -378,9 +357,54 @@ def _process_single_chunk(task: dict, out_dir: str) -> dict:
 # Lazy global holder ----------------------------------------------------------
 _whisper_models = {}
 _batched_whisper_models = {}
+_whipser_x_transcribe_models = {}
+_whipser_x_align_models = {}
+
 _diarizer = None
 _embedder = None

+# Preload all WhisperX transcribe models
+print("Preloading all WhisperX transcribe models...")
+for model_name in MODELS.keys():
+    try:
+        print(f"Loading WhisperX model '{model_name}'...")
+        whisperx_model_name = MODELS[model_name]["whisperx_name"]
+        device = "cpu"  # Load on CPU initially, will move to GPU when needed
+        compute_type = "float16"
+
+        model = whisperx.load_model(
+            whisperx_model_name,
+            device=device,
+            compute_type=compute_type,
+            download_root=CACHE_ROOT
+        )
+        _whipser_x_transcribe_models[model_name] = model
+        print(f"WhisperX model '{model_name}' loaded successfully")
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        print(f"Could not load WhisperX model '{model_name}': {e}")
+
+# Preload all alignment models for supported languages
+print("Preloading all WhisperX alignment models...")
+for lang in ALIGN_LANGUAGES:
+    try:
+        print(f"Loading alignment model for language '{lang}'...")
+        device = "cpu"  # Load on CPU initially, will move to GPU when needed
+
+        align_model, align_metadata = whisperx.load_align_model(
+            language_code=lang,
+            device=device,
+            model_dir=CACHE_ROOT
+        )
+        _whipser_x_align_models[lang] = {
+            "model": align_model,
+            "metadata": align_metadata
+        }
+        print(f"Alignment model for '{lang}' loaded successfully")
+    except Exception as e:
+        print(f"Could not load alignment model for '{lang}': {e}")
+
 # Create global diarization pipeline
 try:
     print("Loading diarization model...")
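Note on the preload loops: on ZeroGPU Spaces there is no GPU at import time, so every model is loaded on CPU here and promoted to CUDA later inside a @spaces.GPU function. A condensed sketch of that pattern (assuming the pipeline object supports .to(), as the diff itself does below):

    model = _whipser_x_transcribe_models[DEFAULT_MODEL]  # lives on CPU after import
    # ...later, inside a @spaces.GPU function:
    model = model.to("cuda")                             # promote once
    _whipser_x_transcribe_models[DEFAULT_MODEL] = model  # cache the GPU copy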
@@ -402,31 +426,22 @@ except Exception as e:

 @spaces.GPU  # GPU is guaranteed to exist *inside* this function
 def _load_models(model_name: str = DEFAULT_MODEL):
-    global …
+    global _whipser_x_transcribe_models, _whipser_x_align_models, _diarizer

-    if model_name not in …
-        …
+    if model_name not in _whipser_x_transcribe_models:
+        raise ValueError(f"Model '{model_name}' not preloaded. Available models: {list(_whipser_x_transcribe_models.keys())}")

-    …
-    model = WhisperModel(
-        model_cache_path,
-        device="cuda",
-        compute_type="float16",
-    )
-
-    # Create batched inference pipeline for improved performance
-    batched_model = BatchedInferencePipeline(model=model)
-
-    _whisper_models[model_name] = model
-    _batched_whisper_models[model_name] = batched_model
-
-    print(f"Whisper model '{model_name}' and batched pipeline loaded successfully")
-
-    whisper = _whisper_models[model_name]
-    batched_whisper = _batched_whisper_models[model_name]
+    whisper_model = _whipser_x_transcribe_models[model_name]

-    …
+    # Move model to GPU if not already
+    if hasattr(whisper_model, 'model') and hasattr(whisper_model.model, 'device'):
+        current_device = str(whisper_model.model.device)
+        if 'cpu' in current_device:
+            print(f"Moving WhisperX model '{model_name}' to GPU...")
+            whisper_model = whisper_model.to("cuda")
+            _whipser_x_transcribe_models[model_name] = whisper_model
+
+    return whisper_model, _diarizer

 # -----------------------------------------------------------------------------
 class WhisperTranscriber:
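A hypothetical caller, to show the loader's new contract (handler name and model choice are illustrative, not from the commit):

    @spaces.GPU
    def handler(audio_path: str):
        # raises ValueError if "large-v3" was not preloaded at import time
        whisper_model, diarizer = _load_models("large-v3")
        audio = whisperx.load_audio(audio_path)
        return whisper_model.transcribe(audio, batch_size=16)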
@@ -455,76 +470,76 @@ class WhisperTranscriber:

     @spaces.GPU  # each call gets a GPU slice
     def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None, batch_size=16, base_offset_s: float = 0.0, clip_timestamps=None, model_name: str = DEFAULT_MODEL, transcribe_options: dict = None):
-        """Transcribe the entire audio file without speaker diarization using …"""
-        …
+        """Transcribe the entire audio file without speaker diarization using WhisperX"""
+        whisper_model, _ = _load_models(model_name)  # models live on the GPU

-        print(f"Transcribing full audio with '{model_name}' and batch size {batch_size}...")
+        print(f"Transcribing full audio with WhisperX model '{model_name}' and batch size {batch_size}...")
         start_time = time.time()

-        # …
-        …
+        # Load audio with whisperx
+        audio = whisperx.load_audio(audio_path)
+
+        # Transcribe with whisperx
+        result = whisper_model.transcribe(
+            audio,
             language=language,
-            …
-            word_timestamps=True,
+            batch_size=batch_size,
             initial_prompt=prompt,
-            …
-            language_detection_segments=1,
-            task="translate" if translate else "transcribe",
+            task="translate" if translate else "transcribe"
         )
-        …
-        )
-        …
+
+        detected_language = result.get("language", language if language else "unknown")
+        segments = result.get("segments", [])
+
+        print(f"Detected language: {detected_language}, segments: {len(segments)}")
+
+        # Align whisper output with alignment model if language is supported
+        if detected_language in _whipser_x_align_models:
+            print(f"Performing alignment for language '{detected_language}'...")
+            align_info = _whipser_x_align_models[detected_language]
+
+            # Move alignment model to GPU if needed
+            align_model = align_info["model"]
+            if hasattr(align_model, 'to'):
+                align_model = align_model.to("cuda")
+                _whipser_x_align_models[detected_language]["model"] = align_model
+
+            result = whisperx.align(
+                result["segments"],
+                align_info["model"],
+                align_info["metadata"],
+                audio,
+                "cuda",
+                return_char_alignments=False
             )
+            segments = result.get("segments", segments)
+            print(f"Alignment completed")
         else:
-            …
-                audio_path,
-                **options
-            )
-            segments = list(segments)
-
-        detected_language = transcript_info.language
-        print("Detected language: ", detected_language, "segments: ", len(segments))
+            print(f"No alignment model available for language '{detected_language}', using original timestamps")

-        # Process segments
+        # Process segments into the expected format
         results = []
         for seg in segments:
             # Create result entry with detailed format
             words_list = []
-            if seg.words:
-                for word in seg.words:
+            if "words" in seg:
+                for word in seg["words"]:
                     words_list.append({
-                        "start": float(word.start) + float(base_offset_s),
-                        "end": float(word.end) + float(base_offset_s),
-                        "word": word.word,
-                        "probability": word.probability,
+                        "start": float(word.get("start", 0.0)) + float(base_offset_s),
+                        "end": float(word.get("end", 0.0)) + float(base_offset_s),
+                        "word": word.get("word", ""),
+                        "probability": word.get("score", 1.0),
                         "speaker": "SPEAKER_00"  # No speaker identification in full transcription
                     })

             results.append({
-                "start": float(seg.start) + float(base_offset_s),
-                "end": float(seg.end) + float(base_offset_s),
-                "text": seg.text,
+                "start": float(seg.get("start", 0.0)) + float(base_offset_s),
+                "end": float(seg.get("end", 0.0)) + float(base_offset_s),
+                "text": seg.get("text", ""),
                 "speaker": "SPEAKER_00",  # Single speaker assumption
-                "avg_logprob": seg.avg_logprob,
+                "avg_logprob": seg.get("avg_logprob", 0.0) if "avg_logprob" in seg else 0.0,
                 "words": words_list,
-                "duration": float(seg.end - seg.start)
+                "duration": float(seg.get("end", 0.0)) - float(seg.get("start", 0.0))
             })

         transcription_time = time.time() - start_time
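For reference, each entry appended to `results` has this shape (values illustrative; whisperx aligned segments usually carry no avg_logprob, hence the 0.0 default):

    {
        "start": 12.34,
        "end": 15.02,
        "text": " hello world",
        "speaker": "SPEAKER_00",
        "avg_logprob": 0.0,
        "words": [
            {"start": 12.34, "end": 12.60, "word": " hello",
             "probability": 0.98, "speaker": "SPEAKER_00"},
        ],
        "duration": 2.68,
    }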
@@ -549,7 +564,7 @@ class WhisperTranscriber:
         try:
             embedder = self._load_embedder()
             # Provide waveform as (channel, time) and pad if too short
-            min_embed_duration_sec = …
+            min_embed_duration_sec = 1.0
             min_samples = int(min_embed_duration_sec * sample_rate)
             if waveform.shape[1] < min_samples:
                 pad_len = min_samples - waveform.shape[1]
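The hunk ends before the actual padding call; a typical continuation (an assumption, not shown in this commit) zero-pads the (channel, time) waveform on the right:

    import torch.nn.functional as F

    if waveform.shape[1] < min_samples:
        pad_len = min_samples - waveform.shape[1]
        waveform = F.pad(waveform, (0, pad_len))  # right-pad the time axis up to 1.0 s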
requirements.txt
CHANGED
@@ -4,9 +4,8 @@ transformers==4.48.0
 # https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.0.8/flash_attn-2.7.4.post1+cu126torch2.4-cp310-cp310-linux_x86_64.whl
 pydantic==2.10.6

-# 2. Main whisper model
-faster-whisper==…
-ctranslate2==4.5.0
+# 2. Main whisper model - using whisperx instead of faster-whisper
+whisperx
 torch

 # 3. Extra libs your app really needs