Commit: Update app.py
File: app.py (CHANGED)
@@ -137,28 +137,32 @@ def transcribe_video_with_speakers(video_path):
     # Set up device
     device = "cuda" if torch.cuda.is_available() else "cpu"
     logger.info(f"Using device: {device}")
-    [old lines 140-161: 22 lines removed — their content appears blank in this extraction
-     and is not recoverable here; verify against the original commit]
+
+    try:
+        # Load a medium model with float32 for broader compatibility
+        model = whisperx.load_model("medium", device=device, compute_type="float32")
+        logger.info("WhisperX model loaded")
+
+        # Transcribe
+        result = model.transcribe(audio_path)
+        logger.info("Audio transcription completed")
+
+        # Alignment
+        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+        result = whisperx.align(result["segments"], model_a, metadata, audio_path, device)
+        logger.info("Transcription alignment completed")
+
+        # Diarization (works independently of Whisper model size)
+        diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
+        diarize_segments = diarize_model(audio_path)
+        logger.info("Speaker diarization completed")
+
+        # Assign speakers
+        result = whisperx.assign_word_speakers(diarize_segments, result)
+        logger.info("Speakers assigned to transcribed segments")
+
+    except Exception as e:
+        logger.error(f"❌ WhisperX pipeline failed: {e}")

     # Extract timestamps, text, and speaker IDs
     transcript_with_speakers = [