Update app.py
Browse files
app.py
CHANGED
|
@@ -126,55 +126,31 @@ def handle_feedback(feedback):
|
|
| 126 |
return "Thank you for your feedback!", None
|
| 127 |
|
| 128 |
def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
|
| 129 |
-
|
| 130 |
-
"""
|
| 131 |
-
Detects and extracts non-speech (background) segments from audio using pyannote VAD.
|
| 132 |
-
|
| 133 |
-
Parameters:
|
| 134 |
-
- audio_path (str): Path to input audio (.wav).
|
| 135 |
-
- segment_audio_path (str): Path to save the output non-speech audio.
|
| 136 |
-
- hf_token (str): Hugging Face auth token for pyannote.
|
| 137 |
-
|
| 138 |
-
Returns:
|
| 139 |
-
- List of non-speech timestamp tuples (start, end) in seconds.
|
| 140 |
-
"""
|
| 141 |
-
|
| 142 |
-
# Step 1: Load pipeline
|
| 143 |
pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
|
| 144 |
-
|
| 145 |
-
# Step 2: Apply VAD to get speech segments
|
| 146 |
vad_result = pipeline(audio_path)
|
| 147 |
-
print("✅ Speech segments detected.")
|
| 148 |
|
| 149 |
-
# Step 3: Get full duration of the audio
|
| 150 |
full_audio = AudioSegment.from_wav(audio_path)
|
| 151 |
full_duration_sec = len(full_audio) / 1000.0
|
| 152 |
|
| 153 |
-
# Step 4: Compute non-speech segments
|
| 154 |
-
background_segments = []
|
| 155 |
current_time = 0.0
|
|
|
|
| 156 |
|
| 157 |
for segment in vad_result.itersegments():
|
|
|
|
| 158 |
if current_time < segment.start:
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
current_time = segment.end
|
| 161 |
|
|
|
|
| 162 |
if current_time < full_duration_sec:
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
print(f"🕒 Non-speech segments: {background_segments}")
|
| 166 |
-
|
| 167 |
-
# Step 5: Extract and combine non-speech segments
|
| 168 |
-
non_speech_audio = AudioSegment.empty()
|
| 169 |
-
for start, end in background_segments:
|
| 170 |
-
segment = full_audio[int(start * 1000):int(end * 1000)]
|
| 171 |
-
non_speech_audio += segment
|
| 172 |
-
|
| 173 |
-
# Step 6: Export the non-speech audio
|
| 174 |
-
non_speech_audio.export(background_audio_path, format="wav")
|
| 175 |
-
print(f"🎵 Non-speech audio saved to: {background_audio_path}")
|
| 176 |
|
| 177 |
-
|
|
|
|
| 178 |
|
| 179 |
def transcribe_video_with_speakers(video_path):
|
| 180 |
# Extract audio from video
|
|
@@ -427,9 +403,11 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
|
|
| 427 |
speaker = entry.get("speaker", "default")
|
| 428 |
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
| 429 |
|
| 430 |
-
|
|
|
|
|
|
|
|
|
|
| 431 |
generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
|
| 432 |
-
|
| 433 |
else:
|
| 434 |
generate_voiceover_OpenAI(entry['translated'], target_language, desired_speed, segment_audio_path)
|
| 435 |
|
|
@@ -504,9 +482,9 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
|
|
| 504 |
voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
|
| 505 |
|
| 506 |
if background_audio_path and os.path.exists(background_audio_path):
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
final_audio = voice_audio
|
| 510 |
logger.info("✅ Background audio loaded and merged with voiceover.")
|
| 511 |
else:
|
| 512 |
final_audio = voice_audio
|
|
|
|
| 126 |
return "Thank you for your feedback!", None
|
| 127 |
|
| 128 |
def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
    """
    Build a background-only track from the input audio using pyannote VAD.

    Speech regions detected by the VAD pipeline are replaced with silence of
    equal duration while non-speech (background) regions are kept, so the
    output preserves the original timeline length and alignment.

    Parameters:
    - audio_path (str): Path to the input audio file (.wav).
    - background_audio_path (str): Path where the background-only audio is written.

    Returns:
    - str: The path the background audio was exported to (background_audio_path).

    NOTE(review): relies on a module-level `hf_api_key` for pyannote
    authentication — confirm it is defined before this function is called.
    """
    # Load the voice-activity-detection pipeline (may hit the network/auth
    # on first use).
    pipeline = Pipeline.from_pretrained(
        "pyannote/voice-activity-detection", use_auth_token=hf_api_key
    )

    # Detect speech segments in the input audio.
    vad_result = pipeline(audio_path)

    # Full audio; pydub measures length in milliseconds.
    full_audio = AudioSegment.from_wav(audio_path)
    full_duration_sec = len(full_audio) / 1000.0

    current_time = 0.0
    result_audio = AudioSegment.empty()

    for segment in vad_result.itersegments():
        # Keep the background audio that precedes this speech segment.
        if current_time < segment.start:
            result_audio += full_audio[int(current_time * 1000):int(segment.start * 1000)]
        # Replace the speech itself with silence of the same duration so the
        # timeline stays aligned with the original.
        silence_ms = int((segment.end - segment.start) * 1000)
        result_audio += AudioSegment.silent(duration=silence_ms)
        # max() guards against out-of-order or overlapping segments so the
        # cursor never moves backwards (which would duplicate audio in the
        # tail slice below).
        current_time = max(current_time, segment.end)

    # Keep any background remaining after the final speech segment.
    if current_time < full_duration_sec:
        result_audio += full_audio[int(current_time * 1000):]

    result_audio.export(background_audio_path, format="wav")
    return background_audio_path
|
| 154 |
|
| 155 |
def transcribe_video_with_speakers(video_path):
|
| 156 |
# Extract audio from video
|
|
|
|
| 403 |
speaker = entry.get("speaker", "default")
|
| 404 |
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
| 405 |
|
| 406 |
+
# Assume this is the list of supported languages for the TTS model
|
| 407 |
+
supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
|
| 408 |
+
|
| 409 |
+
if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
|
| 410 |
generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
|
|
|
|
| 411 |
else:
|
| 412 |
generate_voiceover_OpenAI(entry['translated'], target_language, desired_speed, segment_audio_path)
|
| 413 |
|
|
|
|
| 482 |
voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
|
| 483 |
|
| 484 |
if background_audio_path and os.path.exists(background_audio_path):
|
| 485 |
+
background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
|
| 486 |
+
final_audio = CompositeAudioClip([voice_audio, background_audio])
|
| 487 |
+
# final_audio = voice_audio
|
| 488 |
logger.info("✅ Background audio loaded and merged with voiceover.")
|
| 489 |
else:
|
| 490 |
final_audio = voice_audio
|