Spaces:

nsfwalex
/

whisper-transcribe-new

Runtime error

App Files Community

liuyang committed on Oct 15

Commit

25a2b6b

1 Parent(s): 57aeeb0

Enhance audio transcription by adding support for the 'faster_whisper' engine alongside 'whisperx'. Implement lazy loading for both transcription models and improve the handling of transcribe options. Update the transcribe_full_audio method to accommodate engine selection and adjust the alignment process accordingly.

Browse files

Files changed (1) hide show

app.py +178 -54

app.py CHANGED Viewed

@@ -395,6 +395,8 @@ def _process_single_chunk(task: dict, out_dir: str) -> dict:
 # Lazy global holder ----------------------------------------------------------
 _whipser_x_transcribe_models = {}
 _whipser_x_align_models = {}
 _diarizer = None
 _embedder = None
@@ -502,77 +504,198 @@ class WhisperTranscriber:
         return meta
     @spaces.GPU           # each call gets a GPU slice
-    def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None, batch_size=16, base_offset_s: float = 0.0, clip_timestamps=None, model_name: str = DEFAULT_MODEL, transcribe_options: dict = None):
-        """Transcribe the entire audio file using WhisperX with alignment"""
-        global _whipser_x_transcribe_models, _whipser_x_align_models
-        # Load whisperX model lazily on first use (within GPU context)
-        if model_name not in _whipser_x_transcribe_models:
-            print(f"Loading WhisperX transcribe model '{model_name}' on GPU...")
-            if model_name not in MODELS:
-                raise ValueError(f"Model '{model_name}' not found in MODELS registry. Available: {list(MODELS.keys())}")
-            whisperx_model_name = MODELS[model_name]["whisperx_name"]
-            device = "cuda"
-            compute_type = "float16"
-            whisper_model = whisperx.load_model(
-                whisperx_model_name,
-                device=device,
-                compute_type=compute_type,
-                download_root=CACHE_ROOT
-            )
-            _whipser_x_transcribe_models[model_name] = whisper_model
-            print(f"WhisperX transcribe model '{model_name}' loaded successfully")
-        else:
-            whisper_model = _whipser_x_transcribe_models[model_name]
-        print(f"Transcribing full audio with WhisperX model '{model_name}' and batch size {batch_size}...")
         start_time = time.time()
-        # Load audio with whisperx
         audio = whisperx.load_audio(audio_path)
         print(audio_path)
-        # Transcribe with whisperx
-        result = whisper_model.transcribe(
-            audio,
-            language=language,
-            batch_size=batch_size,
-            #initial_prompt=prompt,
-            #task="translate" if translate else "transcribe"
-        )
-        detected_language = result.get("language", language if language else "unknown")
-        segments = result.get("segments", [])
-        print(f"Detected language: {detected_language}, segments: {len(segments)}, transcribing done in {time.time() - start_time:.2f} seconds")
-        print(segments)
-        # Align whisper output with alignment model if language is supported
         if detected_language in _whipser_x_align_models:
             print(f"Performing WhisperX alignment for language '{detected_language}'...")
             align_start = time.time()
             try:
                 align_info = _whipser_x_align_models[detected_language]
-                result = whisperx.align(
-                    result["segments"],
                     align_info["model"],
                     align_info["metadata"],
                     audio,
                     "cuda",
                     return_char_alignments=False
                 )
-                segments = result.get("segments", segments)
                 print(f"WhisperX alignment completed in {time.time() - align_start:.2f} seconds")
             except Exception as e:
                 print(f"WhisperX alignment failed: {e}, using original timestamps")
         else:
             print(f"No WhisperX alignment model available for language '{detected_language}', using original timestamps")
         # Process segments into the expected format
         results = []
         for seg in segments:
-            # Create result entry with detailed format
             words_list = []
             if "words" in seg:
                 for word in seg["words"]:
@@ -581,18 +704,19 @@ class WhisperTranscriber:
                         "end": float(word.get("end", 0.0)) + float(base_offset_s),
                         "word": word.get("word", ""),
                         "probability": word.get("score", 1.0),
-                        "speaker": "SPEAKER_00"  # No speaker identification in full transcription
                     })
             results.append({
                 "start": float(seg.get("start", 0.0)) + float(base_offset_s),
                 "end": float(seg.get("end", 0.0)) + float(base_offset_s),
                 "text": seg.get("text", ""),
-                "speaker": "SPEAKER_00",  # Single speaker assumption
                 "avg_logprob": seg.get("avg_logprob", 0.0) if "avg_logprob" in seg else 0.0,
                 "words": words_list,
                 "duration": float(seg.get("end", 0.0)) - float(seg.get("start", 0.0))
             })
         print(results)
         transcription_time = time.time() - start_time
         print(f"Full audio transcribed and aligned in {transcription_time:.2f} seconds using batch size {batch_size}")
@@ -1043,7 +1167,7 @@ class WhisperTranscriber:
             # Step 2: Transcribe full audio once
             transcription_results, detected_language = self.transcribe_full_audio(
-                wav_path, language, translate, prompt, batch_size, base_offset_s=base_offset_s, clip_timestamps=None, model_name=model_name, transcribe_options=transcribe_options
             )
             # Step 6: Return results
@@ -1094,7 +1218,7 @@ class WhisperTranscriber:
                 # Step 2: Transcribe full audio once
                 transcription_result, detected_language = self.transcribe_full_audio(
-                    wav_path, language, translate, prompt, batch_size, base_offset_s=base_offset_s, clip_timestamps=None, model_name=model_name, transcribe_options=transcribe_options
                 )
                 # Step 6: Return results

 # Lazy global holder ----------------------------------------------------------
 _whipser_x_transcribe_models = {}
 _whipser_x_align_models = {}
+_faster_whisper_transcribe_models = {}
+_faster_whisper_batched_pipelines = {}
 _diarizer = None
 _embedder = None
         return meta
     @spaces.GPU           # each call gets a GPU slice
+    def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None, batch_size=16, base_offset_s: float = 0.0, clip_timestamps=None, engine="whisperx", model_name: str = DEFAULT_MODEL, transcribe_options: dict = None):
+        """Transcribe the entire audio file using selected engine, then align with WhisperX.
+        engine: "whisperx" | "faster_whisper"
+        Always uses WhisperX alignment regardless of transcription engine.
+        """
+        global _whipser_x_transcribe_models, _whipser_x_align_models, _faster_whisper_transcribe_models
         start_time = time.time()
+        # Load audio (float32, 16k) once
         audio = whisperx.load_audio(audio_path)
         print(audio_path)
+        # Resolve engine (allow override from transcribe_options)
+        if transcribe_options and isinstance(transcribe_options, dict) and transcribe_options.get("engine"):
+            engine = str(transcribe_options.get("engine")).strip().lower()
+        # Transcribe using the selected engine
+        initial_segments = []
+        detected_language = language if language else "unknown"
+        if engine == "whisperx":
+            # Lazy-load WhisperX model on first use
+            if model_name not in _whipser_x_transcribe_models:
+                print(f"Loading WhisperX transcribe model '{model_name}' on GPU...")
+                if model_name not in MODELS:
+                    raise ValueError(f"Model '{model_name}' not found in MODELS registry. Available: {list(MODELS.keys())}")
+                whisperx_model_name = MODELS[model_name]["whisperx_name"]
+                device = "cuda"
+                compute_type = "float16"
+                whisper_model = whisperx.load_model(
+                    whisperx_model_name,
+                    device=device,
+                    compute_type=compute_type,
+                    download_root=CACHE_ROOT,
+                    asr_options=transcribe_options
+                )
+                _whipser_x_transcribe_models[model_name] = whisper_model
+                print(f"WhisperX transcribe model '{model_name}' loaded successfully")
+            else:
+                whisper_model = _whipser_x_transcribe_models[model_name]
+            print(f"Transcribing full audio with WhisperX model '{model_name}' and batch size {batch_size}...")
+            result = whisper_model.transcribe(
+                audio,
+                language=language,
+                batch_size=batch_size,
+                #initial_prompt=prompt,
+                #task="translate" if translate else "transcribe"
+            )
+            detected_language = result.get("language", detected_language)
+            initial_segments = result.get("segments", [])
+        elif engine == "faster_whisper":
+            # Lazy-load Faster-Whisper model on first use
+            if model_name not in _faster_whisper_transcribe_models:
+                print(f"Loading Faster-Whisper transcribe model '{model_name}' on GPU...")
+                # Use the same name by default; extend MODELS with specific mapping if needed
+                faster_name = MODELS.get(model_name, {}).get("whisperx_name", model_name)
+                fw_model = WhisperModel(
+                    faster_name,
+                    device="cuda",
+                    compute_type="float16",
+                    download_root=CACHE_ROOT,
+                )
+                _faster_whisper_transcribe_models[model_name] = fw_model
+                print(f"Faster-Whisper transcribe model '{model_name}' loaded successfully")
+            else:
+                fw_model = _faster_whisper_transcribe_models[model_name]
+            print(f"Transcribing full audio with Faster-Whisper model '{model_name}' and batch size {batch_size}...")
+            task = "translate" if translate else "transcribe"
+            # Build kwargs from transcribe_options for Faster-Whisper's transcribe API
+            fw_kwargs = {}
+            if isinstance(transcribe_options, dict):
+                allowed = {
+                    "log_progress",
+                    "beam_size",
+                    "best_of",
+                    "patience",
+                    "length_penalty",
+                    "repetition_penalty",
+                    "no_repeat_ngram_size",
+                    "temperature",
+                    "compression_ratio_threshold",
+                    "log_prob_threshold",
+                    "no_speech_threshold",
+                    "condition_on_previous_text",
+                    "prompt_reset_on_temperature",
+                    "initial_prompt",
+                    "prefix",
+                    "suppress_blank",
+                    "suppress_tokens",
+                    "without_timestamps",
+                    "max_initial_timestamp",
+                    #"word_timestamps",
+                    #"prepend_punctuations",
+                    #"append_punctuations",
+                    "multilingual",
+                    "vad_filter",
+                    "vad_parameters",
+                    "max_new_tokens",
+                    "chunk_length",
+                    "clip_timestamps",
+                    "hallucination_silence_threshold",
+                    "batch_size",
+                    "hotwords",
+                    "language_detection_threshold",
+                    "language_detection_segments",
+                }
+                for k in allowed:
+                    if k in transcribe_options and transcribe_options[k] is not None:
+                        fw_kwargs[k] = transcribe_options[k]
+            # Ensure sensible defaults and avoid duplicates
+            if "initial_prompt" not in fw_kwargs and prompt is not None:
+                fw_kwargs["initial_prompt"] = prompt
+            if "batch_size" not in fw_kwargs and batch_size is not None:
+                fw_kwargs["batch_size"] = batch_size
+            if "vad_filter" not in fw_kwargs:
+                fw_kwargs["vad_filter"] = False  # preserve boundaries for alignment
+            # language and task are passed explicitly; do not include in fw_kwargs
+            fw_kwargs.pop("language", None)
+            fw_kwargs.pop("task", None)
+            fw_kwargs["prepend_punctuations"] = "\"'“¿([{-"
+            fw_kwargs["append_punctuations"] = "\"'.。,，!！?？:：”)]}、"
+            fw_kwargs["without_timestamps"] = True
+            fw_kwargs["max_initial_timestamp"] = 0.0
+            fw_kwargs["word_timestamps"] = False
+            # Choose between single and batched transcription per docs
+            effective_bs = int(fw_kwargs.get("batch_size", batch_size if batch_size is not None else 8))
+            use_batched = effective_bs > 1
+            # Note: pass numpy audio
+            if use_batched:
+                if model_name not in _faster_whisper_batched_pipelines:
+                    _faster_whisper_batched_pipelines[model_name] = BatchedInferencePipeline(model=fw_model)
+                batched_model = _faster_whisper_batched_pipelines[model_name]
+                segments_iter, info = batched_model.transcribe(
+                    audio,
+                    language=language,
+                    task=task,
+                    **fw_kwargs,
+                )
+            else:
+                segments_iter, info = fw_model.transcribe(
+                    audio,
+                    language=language,
+                    task=task,
+                    **fw_kwargs,
+                )
+            detected_language = getattr(info, "language", detected_language)
+            # Convert to WhisperX-like segment dicts
+            initial_segments = [{
+                "start": float(s.start),
+                "end": float(s.end),
+                "text": s.text or "",
+            } for s in segments_iter]
+        else:
+            raise ValueError(f"Unknown engine '{engine}'. Supported: 'whisperx', 'faster_whisper'")
+        print(f"Detected language: {detected_language}, segments: {len(initial_segments)}, transcribing done in {time.time() - start_time:.2f} seconds")
+        # Align with WhisperX if supported for detected language (always attempt when available)
+        segments = initial_segments
         if detected_language in _whipser_x_align_models:
             print(f"Performing WhisperX alignment for language '{detected_language}'...")
             align_start = time.time()
             try:
                 align_info = _whipser_x_align_models[detected_language]
+                align_result = whisperx.align(
+                    initial_segments,
                     align_info["model"],
                     align_info["metadata"],
                     audio,
                     "cuda",
                     return_char_alignments=False
                 )
+                segments = align_result.get("segments", segments)
                 print(f"WhisperX alignment completed in {time.time() - align_start:.2f} seconds")
             except Exception as e:
                 print(f"WhisperX alignment failed: {e}, using original timestamps")
         else:
             print(f"No WhisperX alignment model available for language '{detected_language}', using original timestamps")
         # Process segments into the expected format
         results = []
         for seg in segments:
             words_list = []
             if "words" in seg:
                 for word in seg["words"]:
                         "end": float(word.get("end", 0.0)) + float(base_offset_s),
                         "word": word.get("word", ""),
                         "probability": word.get("score", 1.0),
+                        "speaker": "SPEAKER_00"
                     })
             results.append({
                 "start": float(seg.get("start", 0.0)) + float(base_offset_s),
                 "end": float(seg.get("end", 0.0)) + float(base_offset_s),
                 "text": seg.get("text", ""),
+                "speaker": "SPEAKER_00",
                 "avg_logprob": seg.get("avg_logprob", 0.0) if "avg_logprob" in seg else 0.0,
                 "words": words_list,
                 "duration": float(seg.get("end", 0.0)) - float(seg.get("start", 0.0))
             })
         print(results)
         transcription_time = time.time() - start_time
         print(f"Full audio transcribed and aligned in {transcription_time:.2f} seconds using batch size {batch_size}")
             # Step 2: Transcribe full audio once
             transcription_results, detected_language = self.transcribe_full_audio(
+                wav_path, language, translate, prompt, batch_size, base_offset_s=base_offset_s, engine=transcribe_options.get("engine", "whisperx"), model_name=model_name, transcribe_options=transcribe_options
             )
             # Step 6: Return results
                 # Step 2: Transcribe full audio once
                 transcription_result, detected_language = self.transcribe_full_audio(
+                    wav_path, language, translate, prompt, batch_size, base_offset_s=base_offset_s, engine=transcribe_options.get("engine", "faster_whisper"), model_name=model_name, transcribe_options=transcribe_options
                 )
                 # Step 6: Return results