Update app.py
Browse files
app.py
CHANGED
|
@@ -126,59 +126,57 @@ def handle_feedback(feedback):
|
|
| 126 |
return "Thank you for your feedback!", None
|
| 127 |
|
| 128 |
def segment_background_audio(audio_path, output_path="background_segments.wav", hf_token=None):
|
| 129 |
-
return 10
|
| 130 |
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
# Parameters:
|
| 135 |
-
# - audio_path (str): Path to input audio (.wav).
|
| 136 |
-
# - output_path (str): Path to save the output non-speech audio.
|
| 137 |
-
# - hf_token (str): Hugging Face auth token for pyannote.
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
# raise ValueError("Hugging Face token is required for pyannote pipeline.")
|
| 144 |
|
| 145 |
-
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
-
#
|
| 149 |
-
|
| 150 |
-
# print("✅ Speech segments detected.")
|
| 151 |
|
| 152 |
-
#
|
| 153 |
-
|
| 154 |
-
|
| 155 |
|
| 156 |
-
#
|
| 157 |
-
|
| 158 |
-
|
| 159 |
|
| 160 |
-
#
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
# current_time = segment.end
|
| 164 |
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
| 167 |
|
| 168 |
-
|
|
|
|
| 169 |
|
| 170 |
-
|
| 171 |
-
# non_speech_audio = AudioSegment.empty()
|
| 172 |
-
# for start, end in background_segments:
|
| 173 |
-
# segment = full_audio[int(start * 1000):int(end * 1000)]
|
| 174 |
-
# non_speech_audio += segment
|
| 175 |
|
| 176 |
-
#
|
| 177 |
-
|
| 178 |
-
|
|
|
|
|
|
|
| 179 |
|
| 180 |
-
#
|
|
|
|
|
|
|
| 181 |
|
|
|
|
| 182 |
|
| 183 |
def transcribe_video_with_speakers(video_path):
|
| 184 |
# Extract audio from video
|
|
@@ -476,7 +474,7 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
|
|
| 476 |
## Need to implement backup option.
|
| 477 |
|
| 478 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 479 |
-
futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths)
|
| 480 |
for i, entry in enumerate(translated_json)]
|
| 481 |
|
| 482 |
results = []
|
|
@@ -500,15 +498,27 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
|
|
| 500 |
final_video = CompositeVideoClip([video] + text_clips)
|
| 501 |
|
| 502 |
if add_voiceover and audio_segments:
|
| 503 |
-
|
| 504 |
-
|
| 505 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
logger.info(f"Saving the final video to: {output_path}")
|
| 507 |
final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
|
| 508 |
|
| 509 |
logger.info("Video processing completed successfully.")
|
| 510 |
|
| 511 |
-
# Optional: return errors
|
| 512 |
if error_messages:
|
| 513 |
logger.warning("⚠️ Errors encountered during processing:")
|
| 514 |
for msg in error_messages:
|
|
@@ -516,7 +526,7 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
|
|
| 516 |
|
| 517 |
return error_messages
|
| 518 |
|
| 519 |
-
def generate_voiceover_clone(translated_json, tts_model, desired_duration, target_language, speaker_wav_path, output_audio_path):
|
| 520 |
try:
|
| 521 |
full_text = " ".join(entry["translated"] for entry in translated_json if "translated" in entry and entry["translated"].strip())
|
| 522 |
if not full_text.strip():
|
|
@@ -529,13 +539,6 @@ def generate_voiceover_clone(translated_json, tts_model, desired_duration, targe
|
|
| 529 |
logger.error(msg)
|
| 530 |
return None, msg, msg
|
| 531 |
|
| 532 |
-
# # Truncate text based on max token assumption (~60 tokens)
|
| 533 |
-
# MAX_TTS_TOKENS = 60
|
| 534 |
-
# tokens = full_text.split() # crude token count
|
| 535 |
-
# if len(tokens) > MAX_TTS_TOKENS:
|
| 536 |
-
# logger.warning(f"⚠️ Text too long for TTS model ({len(tokens)} tokens). Truncating to {MAX_TTS_TOKENS} tokens.")
|
| 537 |
-
# full_text = " ".join(tokens[:MAX_TTS_TOKENS])
|
| 538 |
-
|
| 539 |
speed_tts = calibrated_speed(full_text, desired_duration)
|
| 540 |
tts_model.tts_to_file(
|
| 541 |
text=full_text,
|
|
|
|
| 126 |
return "Thank you for your feedback!", None
|
| 127 |
|
| 128 |
def segment_background_audio(audio_path, output_path="background_segments.wav", hf_token=None):
|
|
|
|
| 129 |
|
| 130 |
+
"""
|
| 131 |
+
Detects and extracts non-speech (background) segments from audio using pyannote VAD.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
+
Parameters:
|
| 134 |
+
- audio_path (str): Path to input audio (.wav).
|
| 135 |
+
- output_path (str): Path to save the output non-speech audio.
|
| 136 |
+
- hf_token (str): Hugging Face auth token for pyannote.
|
|
|
|
| 137 |
|
| 138 |
+
Returns:
|
| 139 |
+
- List of non-speech timestamp tuples (start, end) in seconds.
|
| 140 |
+
"""
|
| 141 |
+
if not hf_token:
|
| 142 |
+
raise ValueError("Hugging Face token is required for pyannote pipeline.")
|
| 143 |
|
| 144 |
+
# Step 1: Load pipeline
|
| 145 |
+
pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_token)
|
|
|
|
| 146 |
|
| 147 |
+
# Step 2: Apply VAD to get speech segments
|
| 148 |
+
vad_result = pipeline(audio_path)
|
| 149 |
+
print("✅ Speech segments detected.")
|
| 150 |
|
| 151 |
+
# Step 3: Get full duration of the audio
|
| 152 |
+
full_audio = AudioSegment.from_wav(audio_path)
|
| 153 |
+
full_duration_sec = len(full_audio) / 1000.0
|
| 154 |
|
| 155 |
+
# Step 4: Compute non-speech segments
|
| 156 |
+
background_segments = []
|
| 157 |
+
current_time = 0.0
|
|
|
|
| 158 |
|
| 159 |
+
for segment in vad_result.itersegments():
|
| 160 |
+
if current_time < segment.start:
|
| 161 |
+
background_segments.append((current_time, segment.start))
|
| 162 |
+
current_time = segment.end
|
| 163 |
|
| 164 |
+
if current_time < full_duration_sec:
|
| 165 |
+
background_segments.append((current_time, full_duration_sec))
|
| 166 |
|
| 167 |
+
print(f"🕒 Non-speech segments: {background_segments}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
+
# Step 5: Extract and combine non-speech segments
|
| 170 |
+
non_speech_audio = AudioSegment.empty()
|
| 171 |
+
for start, end in background_segments:
|
| 172 |
+
segment = full_audio[int(start * 1000):int(end * 1000)]
|
| 173 |
+
non_speech_audio += segment
|
| 174 |
|
| 175 |
+
# Step 6: Export the non-speech audio
|
| 176 |
+
non_speech_audio.export(output_path, format="wav")
|
| 177 |
+
print(f"🎵 Non-speech audio saved to: {output_path}")
|
| 178 |
|
| 179 |
+
return background_segments
|
| 180 |
|
| 181 |
def transcribe_video_with_speakers(video_path):
|
| 182 |
# Extract audio from video
|
|
|
|
| 474 |
## Need to implement backup option.
|
| 475 |
|
| 476 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 477 |
+
futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths, background_audio_path="background_segments.wav")
|
| 478 |
for i, entry in enumerate(translated_json)]
|
| 479 |
|
| 480 |
results = []
|
|
|
|
| 498 |
final_video = CompositeVideoClip([video] + text_clips)
|
| 499 |
|
| 500 |
if add_voiceover and audio_segments:
|
| 501 |
+
try:
|
| 502 |
+
voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
|
| 503 |
|
| 504 |
+
if background_audio_path and os.path.exists(background_audio_path):
|
| 505 |
+
background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
|
| 506 |
+
final_audio = CompositeAudioClip([voice_audio, background_audio])
|
| 507 |
+
logger.info("✅ Background audio loaded and merged with voiceover.")
|
| 508 |
+
else:
|
| 509 |
+
final_audio = voice_audio
|
| 510 |
+
logger.info("⚠️ No background audio found. Using voiceover only.")
|
| 511 |
+
|
| 512 |
+
final_video = final_video.set_audio(final_audio)
|
| 513 |
+
|
| 514 |
+
except Exception as e:
|
| 515 |
+
logger.error(f"❌ Failed to set audio: {e}")
|
| 516 |
+
|
| 517 |
logger.info(f"Saving the final video to: {output_path}")
|
| 518 |
final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
|
| 519 |
|
| 520 |
logger.info("Video processing completed successfully.")
|
| 521 |
|
|
|
|
| 522 |
if error_messages:
|
| 523 |
logger.warning("⚠️ Errors encountered during processing:")
|
| 524 |
for msg in error_messages:
|
|
|
|
| 526 |
|
| 527 |
return error_messages
|
| 528 |
|
| 529 |
+
def generate_voiceover_clone(translated_json, tts_model, desired_duration, target_language, speaker_wav_path, output_audio_path, use_clone=False):
|
| 530 |
try:
|
| 531 |
full_text = " ".join(entry["translated"] for entry in translated_json if "translated" in entry and entry["translated"].strip())
|
| 532 |
if not full_text.strip():
|
|
|
|
| 539 |
logger.error(msg)
|
| 540 |
return None, msg, msg
|
| 541 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
speed_tts = calibrated_speed(full_text, desired_duration)
|
| 543 |
tts_model.tts_to_file(
|
| 544 |
text=full_text,
|