Commit: Update app.py
File: app.py (CHANGED)
@@ -137,28 +137,32 @@ def transcribe_video_with_speakers(video_path):
     # Set up device
     device = "cuda" if torch.cuda.is_available() else "cpu"
     logger.info(f"Using device: {device}")
-    [old lines 140-161: 22 lines removed — their content appears blank in this extraction
-     and is not recoverable here; verify against the original commit]
+
+    try:
+        # Load a medium model with float32 for broader compatibility
+        model = whisperx.load_model("medium", device=device, compute_type="float32")
+        logger.info("WhisperX model loaded")
+
+        # Transcribe
+        result = model.transcribe(audio_path)
+        logger.info("Audio transcription completed")
+
+        # Alignment
+        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+        result = whisperx.align(result["segments"], model_a, metadata, audio_path, device)
+        logger.info("Transcription alignment completed")
+
+        # Diarization (works independently of Whisper model size)
+        diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
+        diarize_segments = diarize_model(audio_path)
+        logger.info("Speaker diarization completed")
+
+        # Assign speakers
+        result = whisperx.assign_word_speakers(diarize_segments, result)
+        logger.info("Speakers assigned to transcribed segments")
+
+    except Exception as e:
+        logger.error(f"❌ WhisperX pipeline failed: {e}")

     # Extract timestamps, text, and speaker IDs
     transcript_with_speakers = [