Update app.py
app.py CHANGED
@@ -279,87 +279,42 @@ def transcribe_video_with_speakers(video_path):
 
     return transcript_with_speakers, detected_language
 
-def segment_audio_from_video(video_path, frame_duration_ms=30):
-    """
-    Segments speech from a video's audio track using WebRTC VAD.
-
-
-    Args:
-        video_path (str): The path to the input video file.
-        frame_duration_ms (int): The duration of a frame in milliseconds for VAD (10, 20, or 30).
-                                 Lower values are more precise but computationally intensive.
-
-    Returns:
-        tuple: A tuple containing:
-            - audio_path (str): Path to the extracted temporary audio file.
-            - speech_segments (list): A list of dictionaries, where each dictionary
-              represents a speech segment with 'start' and 'end' timestamps in seconds.
-            - error_message (str, optional): An error message if processing fails.
-    """
-    audio_path = "temp_extracted_audio.wav"
-    speech_segments = []
-    error_message = None
+def segment_audio_from_video(video_path):
+    # Extract audio from video
+    video = VideoFileClip(video_path)
+    audio_path = "audio.wav"
+    video.audio.write_audiofile(audio_path)
+    logger.info(f"Audio extracted from video: {audio_path}")
 
+    segment_result, speech_audio_path = segment_background_audio(audio_path)
+    print(f"Saved non-speech (background) audio to local")
+
+    # Set up device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    logger.info(f"Using device: {device}")
+
     try:
-        # 1. Extract audio from the video
-
-
-
-        #
-
-
-        logger.info(f"Audio extracted to: {audio_path}")
-
-        # 2. Load audio for VAD
-        audio = AudioSegment.from_wav(audio_path)
-        sample_rate = audio.frame_rate
-        audio_data = np.array(audio.get_array_of_samples())
-
-        # WebRTC VAD operates on 16-bit mono audio at 8kHz, 16kHz, or 32kHz.
-        # We already saved at 16kHz, so we can proceed.
-        if sample_rate not in [8000, 16000, 32000]:
-            error_message = f"Unsupported sample rate for VAD: {sample_rate} Hz. Must be 8kHz, 16kHz, or 32kHz."
-            logger.error(error_message)
-            return audio_path, [], error_message
-
-        vad = webrtcvad.Vad(3)  # Aggressiveness mode (0-3, 3 is most aggressive)
-        frames = []
-        offset = 0
-        while offset + frame_duration_ms <= len(audio):
-            frame_start = offset
-            frame_end = offset + frame_duration_ms
-            frame = audio[frame_start:frame_end]
-            frames.append(frame)
-            offset += frame_duration_ms
-
-        logger.info(f"Running WebRTC VAD on {len(frames)} frames...")
-
-        current_segment_start = None
-        for i, frame in enumerate(frames):
-            is_speech = vad.is_speech(frame.raw_data, sample_rate)
-
-            frame_start_time = (i * frame_duration_ms) / 1000.0
-            frame_end_time = ((i + 1) * frame_duration_ms) / 1000.0
+        # Load a medium model with float32 for broader compatibility
+        model = whisperx.load_model("large-v3", device=device, compute_type="float32")
+        logger.info("WhisperX model loaded")
+
+        # Transcribe
+        result = model.transcribe(speech_audio_path, chunk_size=4, print_progress = True)
+        logger.info("Audio transcription completed")
 
-            if is_speech:
-                if current_segment_start is None:
-                    current_segment_start = frame_start_time
-            else:
-                if current_segment_start is not None:
-                    speech_segments.append({"start": current_segment_start, "end": frame_end_time})
-                    current_segment_start = None
-
-        # Add the last segment if it ended with speech
-        if current_segment_start is not None:
-            speech_segments.append({"start": current_segment_start, "end": len(audio) / 1000.0})
+    except Exception as e:
+        logger.error(f"❌ WhisperX pipeline failed: {e}")
 
-
+    # Extract timestamps, text, and speaker IDs
+    transcript_with_speakers = [
+        {
+            "start": segment["start"],
+            "end": segment["end"]
+        }
+        for segment in result["segments"]
+    ]
 
-    except Exception as e:
-        error_message = f"An error occurred during audio segmentation: {e}"
-        logger.error(error_message)
-
-    return audio_path, speech_segments, error_message
+    return audio_path, transcript_with_speakers
 
 def transcribe_segments_with_scribe(full_audio_path, segments):
     """
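A note on the rewritten helper in the hunk above: the new `except` branch only logs, so if `whisperx.load_model` or `model.transcribe` raises, `result` is never bound and the list comprehension after the `try` block fails with a `NameError`. The comprehension also keeps only `start`/`end` even though its comment promises text and speaker IDs, and the "medium model" comment does not match the `large-v3` checkpoint actually loaded. Below is a minimal hardened sketch, not the committed code: it assumes app.py's existing `segment_background_audio` helper and the moviepy 1.x import path.

import logging

import torch
import whisperx
from moviepy.editor import VideoFileClip  # moviepy 1.x import path (assumption)

logger = logging.getLogger(__name__)

def segment_audio_from_video(video_path):
    # Extract the audio track to a WAV file, as the committed code does.
    video = VideoFileClip(video_path)
    audio_path = "audio.wav"
    video.audio.write_audiofile(audio_path)

    # segment_background_audio() is app.py's own helper; its return shape
    # (result, path of the speech-only audio) is assumed from the diff.
    _, speech_audio_path = segment_background_audio(audio_path)  # noqa: F821

    device = "cuda" if torch.cuda.is_available() else "cpu"
    try:
        model = whisperx.load_model("large-v3", device=device, compute_type="float32")
        result = model.transcribe(speech_audio_path, chunk_size=4, print_progress=True)
        segments = [{"start": s["start"], "end": s["end"]} for s in result["segments"]]
    except Exception as e:
        # Fall back to an empty list: in the committed version `result` is
        # read outside the try block, so any transcription failure resurfaces
        # as a NameError rather than as "no speech segments".
        logger.error(f"WhisperX pipeline failed: {e}")
        segments = []
    return audio_path, segments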
@@ -1373,9 +1328,7 @@ def upload_and_manage(file, target_language, process_mode):
 
         # Step 1: Segment audio from the uploaded video/audio file
         logger.info("Segmenting audio...")
-        temp_audio_for_vad, speech_segments, seg_error = segment_audio_from_video(file.name)
-        if seg_error:
-            raise Exception(f"Audio segmentation failed: {seg_error}")
+        temp_audio_for_vad, speech_segments = segment_audio_from_video(file.name)
         if not speech_segments:
             raise Exception("No speech segments detected in the audio.")
         logger.info(f"Audio segmentation completed. Found {len(speech_segments)} segments.")
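The call-site change in this hunk drops the old three-value contract: the deleted helper returned `(audio_path, speech_segments, error_message)` and the caller raised on `seg_error`, while the rewritten helper returns two values and only logs failures. If the explicit failure path is still wanted, one option is to let the helper raise and wrap the call; a sketch with illustrative names mirroring `upload_and_manage`:

try:
    temp_audio_for_vad, speech_segments = segment_audio_from_video(file.name)
except Exception as seg_exc:
    raise Exception(f"Audio segmentation failed: {seg_exc}") from seg_exc
if not speech_segments:
    raise Exception("No speech segments detected in the audio.")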