Spaces:

mohitrai76
/

DeepDub

Running

App Files Files Community

mohitrai76 committed on Jun 1, 2025

Commit

b071cfb

verified ·

1 Parent(s): 56c4c63

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -13

app.py CHANGED Viewed

@@ -82,7 +82,7 @@ class AudioProcessor:
         return results
-    def translate_segments_batch(self, segments):
         """Translate all text segments in a single batch request"""
         try:
             # Filter out None segments (pauses)
@@ -94,14 +94,14 @@ class AudioProcessor:
             print(f"Translating {len(text_segments)} segments in batch...")
             # Prepare the prompt with clear formatting instructions
-            prompt = f"""Translate the following Given language text segments to Hindi while maintaining EXACTLY the same format and order:
             {chr(10).join(text_segments)}
             IMPORTANT INSTRUCTIONS:
             1. Maintain the EXACT same order and number of segments
             2. Each line must be a separate translation
-            3. Use natural conversational Hindi
             4. Preserve meaning/context
             5. Leave proper nouns unchanged
             6. Match original word count where possible
@@ -122,7 +122,7 @@ class AudioProcessor:
                 messages=[
                     {
                         "role": "system",
-                        "content": "You are a professional translator from Given language to Hindi. Translate exactly as requested."
                     },
                     {
                         "role": "user",
@@ -165,7 +165,7 @@ def get_audio_duration(audio_path):
         print(f"Duration error: {e}")
         return None
-async def synthesize_tts_to_wav(text, voice):
     import edge_tts
     temp_mp3 = "temp_tts.mp3"
     communicate = edge_tts.Communicate(text, voice)
@@ -205,8 +205,8 @@ def cleanup_files(file_list):
         if os.path.exists(file):
             os.remove(file)
-# --- Main Gradio Interface ---
-async def process_audio_chunks(input_audio_path, voice="hi-IN-MadhurNeural"):
     audio_processor = AudioProcessor()
     print("🔎 Separating vocals and music using Demucs...")
@@ -222,7 +222,7 @@ async def process_audio_chunks(input_audio_path, voice="hi-IN-MadhurNeural"):
     segment_texts = [seg[2] if seg[2] is not None else None for seg in segments]
     # Batch translate all segments at once
-    translated_texts = audio_processor.translate_segments_batch(segment_texts)
     chunk_files = []
     chunk_idx = 0
@@ -239,7 +239,7 @@ async def process_audio_chunks(input_audio_path, voice="hi-IN-MadhurNeural"):
             print(f"🔤 {chunk_idx}: Translated: {translated}")
             # Synthesize TTS audio
-            raw_tts = await synthesize_tts_to_wav(translated, voice)
             # Stretch the audio to match the target duration
             stretched = stretch_audio(raw_tts, duration)
@@ -267,7 +267,8 @@ async def process_audio_chunks(input_audio_path, voice="hi-IN-MadhurNeural"):
     shutil.rmtree(temp_dir, ignore_errors=True)
     return final_audio_path, final_background_path
-def gradio_interface(video_file, voice):
     try:
         # Create temporary directory for processing
         temp_dir = Path(tempfile.mkdtemp())
@@ -286,7 +287,7 @@ def gradio_interface(video_file, voice):
             return None
         # Process audio chunks
-        audio_output_path, background_path = asyncio.run(process_audio_chunks(audio_path, voice))
         if audio_output_path is None or background_path is None:
             return None
@@ -345,6 +346,51 @@ def combine_video_audio(video_path, audio_path, output_path):
         print(f"Video combining error: {e}")
         return False
 # Create Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# Video Dubbing Application")
@@ -352,8 +398,13 @@ with gr.Blocks() as demo:
     with gr.Row():
         video_input = gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".avi", ".mkv"])
         voice_dropdown = gr.Dropdown(
-            ["hi-IN-MadhurNeural", "hi-IN-RekhaNeural", "hi-IN-SwaraNeural"],
             label="Select Voice",
             value="hi-IN-MadhurNeural"
         )
@@ -362,9 +413,14 @@ with gr.Blocks() as demo:
     submit_btn = gr.Button("Start Dubbing")
     submit_btn.click(
         gradio_interface,
-        inputs=[video_input, voice_dropdown],
         outputs=output_video
     )

         return results
+    def translate_segments_batch(self, segments, target_language):
         """Translate all text segments in a single batch request"""
         try:
             # Filter out None segments (pauses)
             print(f"Translating {len(text_segments)} segments in batch...")
             # Prepare the prompt with clear formatting instructions
+            prompt = f"""Translate the following text segments to {target_language} while maintaining EXACTLY the same format and order:
             {chr(10).join(text_segments)}
             IMPORTANT INSTRUCTIONS:
             1. Maintain the EXACT same order and number of segments
             2. Each line must be a separate translation
+            3. Use natural conversational {target_language}
             4. Preserve meaning/context
             5. Leave proper nouns unchanged
             6. Match original word count where possible
                 messages=[
                     {
                         "role": "system",
+                        "content": f"You are a professional translator from English to {target_language}. Translate exactly as requested."
                     },
                     {
                         "role": "user",
         print(f"Duration error: {e}")
         return None
+async def synthesize_tts_to_wav(text, voice, target_language):
     import edge_tts
     temp_mp3 = "temp_tts.mp3"
     communicate = edge_tts.Communicate(text, voice)
         if os.path.exists(file):
             os.remove(file)
+# --- Main Process Function ---
+async def process_audio_chunks(input_audio_path, voice, target_language):
     audio_processor = AudioProcessor()
     print("🔎 Separating vocals and music using Demucs...")
     segment_texts = [seg[2] if seg[2] is not None else None for seg in segments]
     # Batch translate all segments at once
+    translated_texts = audio_processor.translate_segments_batch(segment_texts, target_language)
     chunk_files = []
     chunk_idx = 0
             print(f"🔤 {chunk_idx}: Translated: {translated}")
             # Synthesize TTS audio
+            raw_tts = await synthesize_tts_to_wav(translated, voice, target_language)
             # Stretch the audio to match the target duration
             stretched = stretch_audio(raw_tts, duration)
     shutil.rmtree(temp_dir, ignore_errors=True)
     return final_audio_path, final_background_path
+# --- Gradio Interface ---
+def gradio_interface(video_file, voice, target_language):
     try:
         # Create temporary directory for processing
         temp_dir = Path(tempfile.mkdtemp())
             return None
         # Process audio chunks
+        audio_output_path, background_path = asyncio.run(process_audio_chunks(audio_path, voice, target_language))
         if audio_output_path is None or background_path is None:
             return None
         print(f"Video combining error: {e}")
         return False
+# Voice choices per target language: the keys populate the "Translate to" dropdown, and the first voice in each list is preselected when that language is chosen.
+voice_options = {
+    "Hindi": [
+        "hi-IN-MadhurNeural",   # Male
+        "hi-IN-SwaraNeural"     # Female
+    ],
+    "English": [
+        "en-US-GuyNeural",      # Male
+        "en-US-BenjaminRUS",    # Male — NOTE(review): only entry without the "Neural" suffix; confirm edge-tts actually offers this voice
+        "en-US-ChristopherNeural",  # Male
+        "en-US-AriaNeural",     # Female
+        "en-US-JessaNeural",    # Female — NOTE(review): verify this voice is still offered by edge-tts
+        "en-US-JennyNeural"     # Female
+    ],
+    "Spanish": [
+        "es-ES-AlvaroNeural",   # Male
+        "es-MX-JorgeNeural",    # Male
+        "es-US-AlonsoNeural",   # Male
+        "es-ES-ElviraNeural",   # Female
+        "es-MX-DaliaNeural",    # Female
+        "es-US-PalomaNeural"    # Female
+    ],
+    "French": [
+        "fr-FR-HenriNeural",    # Male
+        "fr-FR-RemyMultilingualNeural", # Male
+        "fr-CA-AntoineNeural",  # Male
+        "fr-FR-DeniseNeural",   # Female
+        "fr-FR-JulieNeural",    # Female
+        "fr-FR-VivienneMultilingualNeural"  # Female
+    ],
+    "Japanese": [
+        "ja-JP-KeitaNeural",    # Male
+        "ja-JP-DaichiNeural",   # Male
+        "ja-JP-RikuNeural",     # Male
+        "ja-JP-AoiNeural",      # Female
+        "ja-JP-NanamiNeural",   # Female
+        "ja-JP-ShioriNeural"    # Female
+    ],
+    "Korean": [
+        "ko-KR-InJoonNeural",   # Male
+         "ko-KR-SunHiNeural" # Female
+    ]
+}
 # Create Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# Video Dubbing Application")
     with gr.Row():
         video_input = gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".avi", ".mkv"])
+        language_dropdown = gr.Dropdown(
+            list(voice_options.keys()),
+            label="Translate to",
+            value="Hindi"
+        )
         voice_dropdown = gr.Dropdown(
+            voice_options["Hindi"],
             label="Select Voice",
             value="hi-IN-MadhurNeural"
         )
     submit_btn = gr.Button("Start Dubbing")
+    def update_voice_options(language):  # Change handler for language_dropdown; its return value is applied to voice_dropdown.
+        return gr.update(choices=voice_options[language], value=voice_options[language][0])  # Swap in the chosen language's voices and preselect the first one; a language missing from voice_options raises KeyError.
+    language_dropdown.change(update_voice_options, inputs=[language_dropdown], outputs=[voice_dropdown])
     submit_btn.click(
         gradio_interface,
+        inputs=[video_input, voice_dropdown, language_dropdown],
         outputs=output_video
     )