Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -82,7 +82,7 @@ class AudioProcessor:
|
|
| 82 |
|
| 83 |
return results
|
| 84 |
|
| 85 |
-
def translate_segments_batch(self, segments):
|
| 86 |
"""Translate all text segments in a single batch request"""
|
| 87 |
try:
|
| 88 |
# Filter out None segments (pauses)
|
|
@@ -94,14 +94,14 @@ class AudioProcessor:
|
|
| 94 |
print(f"Translating {len(text_segments)} segments in batch...")
|
| 95 |
|
| 96 |
# Prepare the prompt with clear formatting instructions
|
| 97 |
-
prompt = f"""Translate the following
|
| 98 |
|
| 99 |
{chr(10).join(text_segments)}
|
| 100 |
|
| 101 |
IMPORTANT INSTRUCTIONS:
|
| 102 |
1. Maintain the EXACT same order and number of segments
|
| 103 |
2. Each line must be a separate translation
|
| 104 |
-
3. Use natural conversational
|
| 105 |
4. Preserve meaning/context
|
| 106 |
5. Leave proper nouns unchanged
|
| 107 |
6. Match original word count where possible
|
|
@@ -122,7 +122,7 @@ class AudioProcessor:
|
|
| 122 |
messages=[
|
| 123 |
{
|
| 124 |
"role": "system",
|
| 125 |
-
"content": "You are a professional translator from
|
| 126 |
},
|
| 127 |
{
|
| 128 |
"role": "user",
|
|
@@ -165,7 +165,7 @@ def get_audio_duration(audio_path):
|
|
| 165 |
print(f"Duration error: {e}")
|
| 166 |
return None
|
| 167 |
|
| 168 |
-
async def synthesize_tts_to_wav(text, voice):
|
| 169 |
import edge_tts
|
| 170 |
temp_mp3 = "temp_tts.mp3"
|
| 171 |
communicate = edge_tts.Communicate(text, voice)
|
|
@@ -205,8 +205,8 @@ def cleanup_files(file_list):
|
|
| 205 |
if os.path.exists(file):
|
| 206 |
os.remove(file)
|
| 207 |
|
| 208 |
-
# --- Main
|
| 209 |
-
async def process_audio_chunks(input_audio_path, voice="hi-IN-MadhurNeural"):
|
| 210 |
audio_processor = AudioProcessor()
|
| 211 |
|
| 212 |
print("🔎 Separating vocals and music using Demucs...")
|
|
@@ -222,7 +222,7 @@ async def process_audio_chunks(input_audio_path, voice="hi-IN-MadhurNeural"):
|
|
| 222 |
segment_texts = [seg[2] if seg[2] is not None else None for seg in segments]
|
| 223 |
|
| 224 |
# Batch translate all segments at once
|
| 225 |
-
translated_texts = audio_processor.translate_segments_batch(segment_texts)
|
| 226 |
|
| 227 |
chunk_files = []
|
| 228 |
chunk_idx = 0
|
|
@@ -239,7 +239,7 @@ async def process_audio_chunks(input_audio_path, voice="hi-IN-MadhurNeural"):
|
|
| 239 |
print(f"🔤 {chunk_idx}: Translated: {translated}")
|
| 240 |
|
| 241 |
# Synthesize TTS audio
|
| 242 |
-
raw_tts = await synthesize_tts_to_wav(translated, voice)
|
| 243 |
|
| 244 |
# Stretch the audio to match the target duration
|
| 245 |
stretched = stretch_audio(raw_tts, duration)
|
|
@@ -267,7 +267,8 @@ async def process_audio_chunks(input_audio_path, voice="hi-IN-MadhurNeural"):
|
|
| 267 |
shutil.rmtree(temp_dir, ignore_errors=True)
|
| 268 |
return final_audio_path, final_background_path
|
| 269 |
|
| 270 |
-
def gradio_interface(video_file, voice):
|
|
|
|
| 271 |
try:
|
| 272 |
# Create temporary directory for processing
|
| 273 |
temp_dir = Path(tempfile.mkdtemp())
|
|
@@ -286,7 +287,7 @@ def gradio_interface(video_file, voice):
|
|
| 286 |
return None
|
| 287 |
|
| 288 |
# Process audio chunks
|
| 289 |
-
audio_output_path, background_path = asyncio.run(process_audio_chunks(audio_path, voice))
|
| 290 |
|
| 291 |
if audio_output_path is None or background_path is None:
|
| 292 |
return None
|
|
@@ -345,6 +346,51 @@ def combine_video_audio(video_path, audio_path, output_path):
|
|
| 345 |
print(f"Video combining error: {e}")
|
| 346 |
return False
|
| 347 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
# Create Gradio interface
|
| 349 |
with gr.Blocks() as demo:
|
| 350 |
gr.Markdown("# Video Dubbing Application")
|
|
@@ -352,8 +398,13 @@ with gr.Blocks() as demo:
|
|
| 352 |
|
| 353 |
with gr.Row():
|
| 354 |
video_input = gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".avi", ".mkv"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
voice_dropdown = gr.Dropdown(
|
| 356 |
-
["
|
| 357 |
label="Select Voice",
|
| 358 |
value="hi-IN-MadhurNeural"
|
| 359 |
)
|
|
@@ -362,9 +413,14 @@ with gr.Blocks() as demo:
|
|
| 362 |
|
| 363 |
submit_btn = gr.Button("Start Dubbing")
|
| 364 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
submit_btn.click(
|
| 366 |
gradio_interface,
|
| 367 |
-
inputs=[video_input, voice_dropdown],
|
| 368 |
outputs=output_video
|
| 369 |
)
|
| 370 |
|
|
|
|
| 82 |
|
| 83 |
return results
|
| 84 |
|
| 85 |
+
def translate_segments_batch(self, segments, target_language):
|
| 86 |
"""Translate all text segments in a single batch request"""
|
| 87 |
try:
|
| 88 |
# Filter out None segments (pauses)
|
|
|
|
| 94 |
print(f"Translating {len(text_segments)} segments in batch...")
|
| 95 |
|
| 96 |
# Prepare the prompt with clear formatting instructions
|
| 97 |
+
prompt = f"""Translate the following text segments to {target_language} while maintaining EXACTLY the same format and order:
|
| 98 |
|
| 99 |
{chr(10).join(text_segments)}
|
| 100 |
|
| 101 |
IMPORTANT INSTRUCTIONS:
|
| 102 |
1. Maintain the EXACT same order and number of segments
|
| 103 |
2. Each line must be a separate translation
|
| 104 |
+
3. Use natural conversational {target_language}
|
| 105 |
4. Preserve meaning/context
|
| 106 |
5. Leave proper nouns unchanged
|
| 107 |
6. Match original word count where possible
|
|
|
|
| 122 |
messages=[
|
| 123 |
{
|
| 124 |
"role": "system",
|
| 125 |
+
"content": f"You are a professional translator from English to {target_language}. Translate exactly as requested."
|
| 126 |
},
|
| 127 |
{
|
| 128 |
"role": "user",
|
|
|
|
| 165 |
print(f"Duration error: {e}")
|
| 166 |
return None
|
| 167 |
|
| 168 |
+
async def synthesize_tts_to_wav(text, voice, target_language):
|
| 169 |
import edge_tts
|
| 170 |
temp_mp3 = "temp_tts.mp3"
|
| 171 |
communicate = edge_tts.Communicate(text, voice)
|
|
|
|
| 205 |
if os.path.exists(file):
|
| 206 |
os.remove(file)
|
| 207 |
|
| 208 |
+
# --- Main Process Function ---
|
| 209 |
+
async def process_audio_chunks(input_audio_path, voice, target_language):
|
| 210 |
audio_processor = AudioProcessor()
|
| 211 |
|
| 212 |
print("🔎 Separating vocals and music using Demucs...")
|
|
|
|
| 222 |
segment_texts = [seg[2] if seg[2] is not None else None for seg in segments]
|
| 223 |
|
| 224 |
# Batch translate all segments at once
|
| 225 |
+
translated_texts = audio_processor.translate_segments_batch(segment_texts, target_language)
|
| 226 |
|
| 227 |
chunk_files = []
|
| 228 |
chunk_idx = 0
|
|
|
|
| 239 |
print(f"🔤 {chunk_idx}: Translated: {translated}")
|
| 240 |
|
| 241 |
# Synthesize TTS audio
|
| 242 |
+
raw_tts = await synthesize_tts_to_wav(translated, voice, target_language)
|
| 243 |
|
| 244 |
# Stretch the audio to match the target duration
|
| 245 |
stretched = stretch_audio(raw_tts, duration)
|
|
|
|
| 267 |
shutil.rmtree(temp_dir, ignore_errors=True)
|
| 268 |
return final_audio_path, final_background_path
|
| 269 |
|
| 270 |
+
# --- Gradio Interface ---
|
| 271 |
+
def gradio_interface(video_file, voice, target_language):
|
| 272 |
try:
|
| 273 |
# Create temporary directory for processing
|
| 274 |
temp_dir = Path(tempfile.mkdtemp())
|
|
|
|
| 287 |
return None
|
| 288 |
|
| 289 |
# Process audio chunks
|
| 290 |
+
audio_output_path, background_path = asyncio.run(process_audio_chunks(audio_path, voice, target_language))
|
| 291 |
|
| 292 |
if audio_output_path is None or background_path is None:
|
| 293 |
return None
|
|
|
|
| 346 |
print(f"Video combining error: {e}")
|
| 347 |
return False
|
| 348 |
|
| 349 |
+
# Voice options for each language
|
| 350 |
+
# Selectable TTS voices, keyed by the target-language label offered in the UI.
# Each list holds the male voices first, followed by the female voices; the
# first entry of a list is what the voice dropdown preselects for that language.
# NOTE(review): some IDs (e.g. "en-US-BenjaminRUS", "en-US-JessaNeural") do not
# look like current edge-tts voice names -- confirm every entry against
# `edge-tts --list-voices` before relying on it.
voice_options = {
    "Hindi": [
        "hi-IN-MadhurNeural",  # male
        "hi-IN-SwaraNeural",  # female
    ],
    "English": [
        # male
        "en-US-GuyNeural", "en-US-BenjaminRUS", "en-US-ChristopherNeural",
        # female
        "en-US-AriaNeural", "en-US-JessaNeural", "en-US-JennyNeural",
    ],
    "Spanish": [
        # male
        "es-ES-AlvaroNeural", "es-MX-JorgeNeural", "es-US-AlonsoNeural",
        # female
        "es-ES-ElviraNeural", "es-MX-DaliaNeural", "es-US-PalomaNeural",
    ],
    "French": [
        # male
        "fr-FR-HenriNeural", "fr-FR-RemyMultilingualNeural", "fr-CA-AntoineNeural",
        # female
        "fr-FR-DeniseNeural", "fr-FR-JulieNeural", "fr-FR-VivienneMultilingualNeural",
    ],
    "Japanese": [
        # male
        "ja-JP-KeitaNeural", "ja-JP-DaichiNeural", "ja-JP-RikuNeural",
        # female
        "ja-JP-AoiNeural", "ja-JP-NanamiNeural", "ja-JP-ShioriNeural",
    ],
    "Korean": [
        "ko-KR-InJoonNeural",  # male
        "ko-KR-SunHiNeural",  # female
    ],
}
|
| 392 |
+
|
| 393 |
+
|
| 394 |
# Create Gradio interface
|
| 395 |
with gr.Blocks() as demo:
|
| 396 |
gr.Markdown("# Video Dubbing Application")
|
|
|
|
| 398 |
|
| 399 |
with gr.Row():
|
| 400 |
video_input = gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".avi", ".mkv"])
|
| 401 |
+
language_dropdown = gr.Dropdown(
|
| 402 |
+
list(voice_options.keys()),
|
| 403 |
+
label="Translate to",
|
| 404 |
+
value="Hindi"
|
| 405 |
+
)
|
| 406 |
voice_dropdown = gr.Dropdown(
|
| 407 |
+
voice_options["Hindi"],
|
| 408 |
label="Select Voice",
|
| 409 |
value="hi-IN-MadhurNeural"
|
| 410 |
)
|
|
|
|
| 413 |
|
| 414 |
submit_btn = gr.Button("Start Dubbing")
|
| 415 |
|
| 416 |
+
def update_voice_options(language):
    """Refresh the voice dropdown after the target language changes.

    Args:
        language: Key into ``voice_options``, as delivered by the language
            dropdown's change event.

    Returns:
        A ``gr.update`` payload that replaces the voice dropdown's choices with
        the voices listed for ``language`` and preselects the first of them.
        When ``language`` has no voices (``None``, an unknown key, or an empty
        list) the choices are emptied and the selection is cleared.
    """
    # Look up with .get() rather than indexing: a cleared or unexpected
    # dropdown value would otherwise raise KeyError inside the event handler.
    voices = voice_options.get(language) or []
    # Guard the [0] access so an empty voice list cannot raise IndexError.
    return gr.update(choices=voices, value=voices[0] if voices else None)
|
| 418 |
+
|
| 419 |
+
language_dropdown.change(update_voice_options, inputs=[language_dropdown], outputs=[voice_dropdown])
|
| 420 |
+
|
| 421 |
submit_btn.click(
|
| 422 |
gradio_interface,
|
| 423 |
+
inputs=[video_input, voice_dropdown, language_dropdown],
|
| 424 |
outputs=output_video
|
| 425 |
)
|
| 426 |
|