Spaces:

dooyum
/

AI_Video_Dubber

Runtime error

App Files Community

dooyum committed on Sep 13, 2025

Commit

63ffd97

verified ·

1 Parent(s): 65f0f6f

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -346

app.py CHANGED Viewed

@@ -2,7 +2,6 @@ import os
 import sys
 import subprocess
 import tempfile
-import logging
 from pathlib import Path
 from dotenv import load_dotenv
 import whisper
@@ -10,11 +9,11 @@ import gradio as gr
 import azure.cognitiveservices.speech as speechsdk
 import requests
 from pydub import AudioSegment
 import shutil
 import io
 import asyncio
 import json
-from langdetect import detect
 # Limit OMP threads (fix libgomp issue)
 os.environ["OMP_NUM_THREADS"] = os.getenv("OMP_NUM_THREADS", "1")
@@ -42,9 +41,6 @@ if not AZURE_REGION:
 if missing:
     sys.exit(f"❌ Missing environment variables: {', '.join(missing)}")
-# Setup logging
-logging.basicConfig(filename="dubbing.log", level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 # --- Language map ---
 LANGUAGE_MAP = {
     "French": "fr",
@@ -56,34 +52,6 @@ LANGUAGE_MAP = {
     "Spanish": "es",
     "Polish": "pl",
     "Arabic": "ar",
-    "Chinese (Mandarin, Simplified)": "zh-Hans",
-    "Chinese (Mandarin, Traditional)": "zh-Hant",
-    "Czech": "cs",
-    "Danish": "da",
-    "English (US)": "en",
-    "English (UK)": "en",
-    "Estonian": "et",
-    "Finnish": "fi",
-    "Greek": "el",
-    "Hebrew": "he",
-    "Hindi": "hi",
-    "Hungarian": "hu",
-    "Indonesian": "id",
-    "Korean": "ko",
-    "Latvian": "lv",
-    "Lithuanian": "lt",
-    "Malay": "ms",
-    "Norwegian": "nb",
-    "Portuguese (Brazil)": "pt",
-    "Portuguese (Portugal)": "pt-pt",
-    "Romanian": "ro",
-    "Russian": "ru",
-    "Slovak": "sk",
-    "Slovenian": "sl",
-    "Thai": "th",
-    "Turkish": "tr",
-    "Ukrainian": "uk",
-    "Vietnamese": "vi",
 }
 # --- Helper function for SRT formatting ---
@@ -96,193 +64,51 @@ def _format_time(seconds):
     return f"{h:02}:{m:02}:{s:02},{ms:03}"
 # --- Async TTS helper function ---
-async def _synthesize_tts_async(speech_config, text, line_index):
-    """Synthesize TTS with better error handling and debugging"""
-    try:
-        # Create audio config to capture output
-        audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=False)
-        synthesizer = speechsdk.SpeechSynthesizer(
-            speech_config=speech_config,
-            audio_config=audio_config
-        )
-        # Log what we're trying to synthesize
-        logging.info(f"Line {line_index+1}: Attempting TTS for text: '{text}'")
-        print(f"🔊 Line {line_index+1}: Synthesizing: '{text}'")
-        # Use synchronous call within executor
-        loop = asyncio.get_running_loop()
-        result = await loop.run_in_executor(
-            None,
-            lambda: synthesizer.speak_text_async(text).get()
-        )
-        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
-            audio_data = result.audio_data
-            if not audio_data:
-                error_msg = f"Line {line_index+1}: Empty audio data returned"
-                logging.error(error_msg)
-                print(f"❌ {error_msg}")
-                # Fallback: create silent audio of estimated length
-                estimated_duration = len(text.split()) * 0.3  # 0.3s per word
-                return AudioSegment.silent(duration=int(estimated_duration * 1000))
-            logging.info(f"Line {line_index+1}: TTS synthesis successful")
-            print(f"✅ Line {line_index+1}: TTS synthesis successful")
-            # Convert to AudioSegment
-            try:
-                # Save to temp file for pydub processing
-                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
-                    temp_wav.write(audio_data)
-                    temp_wav_path = temp_wav.name
-                # Load with pydub
-                audio_segment = AudioSegment.from_wav(temp_wav_path)
-                # Clean up temp file
-                os.unlink(temp_wav_path)
-                return audio_segment
-            except Exception as e:
-                error_msg = f"Line {line_index+1}: Failed to convert audio data: {str(e)}"
-                logging.error(error_msg)
-                print(f"❌ {error_msg}")
-                # Fallback: create silent audio of estimated length
-                estimated_duration = len(text.split()) * 0.3
-                return AudioSegment.silent(duration=int(estimated_duration * 1000))
-        else:
-            cancellation_details = speechsdk.SpeechSynthesisCancellationDetails(result)
-            error_msg = f"Line {line_index+1}: TTS failed - Reason: {cancellation_details.reason}, Error: {cancellation_details.error_details}"
-            logging.error(error_msg)
-            print(f"❌ {error_msg}")
-            # Fallback: create silent audio of estimated length
-            estimated_duration = len(text.split()) * 0.3
-            return AudioSegment.silent(duration=int(estimated_duration * 1000))
-    except Exception as e:
-        error_msg = f"Line {line_index+1}: TTS synthesis error: {str(e)}"
-        logging.error(error_msg)
-        print(f"❌ {error_msg}")
-        # Fallback: create silent audio of estimated length
-        estimated_duration = len(text.split()) * 0.3
-        return AudioSegment.silent(duration=int(estimated_duration * 1000))
-def translate_with_azure(texts, target_lang_code):
-    """Translate text using Azure Translator REST API"""
     try:
-        endpoint = f"https://{AZURE_TRANSLATOR_REGION}.api.cognitive.microsoft.com"
-        headers = {
-            "Ocp-Apim-Subscription-Key": AZURE_TRANSLATOR_KEY,
-            "Ocp-Apim-Subscription-Region": AZURE_TRANSLATOR_REGION,
-            "Content-Type": "application/json",
-            "Accept": "application/json"
-        }
-        # Prepare the request body
-        body = [{'text': text} for text in texts]
-        # Make the request
-        response = requests.post(
-            f"{endpoint}/translator/text/v3.0/translate?api-version=3.0&from=en&to={target_lang_code}",
-            headers=headers,
-            json=body
-        )
-        response.raise_for_status()
-        # Parse the response
-        result = response.json()
-        translated_texts = []
-        for item in result:
-            if 'translations' in item and len(item['translations']) > 0:
-                translated_texts.append(item['translations'][0]['text'])
-            else:
-                translated_texts.append("")  # Fallback for failed translations
-        return translated_texts
     except Exception as e:
-        logging.error(f"Azure Translator error: {str(e)}")
-        print(f"❌ Azure Translator error: {str(e)}")
-        # Fallback: return original texts if translation fails
-        return texts
-def fix_outlier_lines(translated_lines, english_lines, target_lang_code):
-    """Fix lines that are not in the target language"""
-    corrected_lines = translated_lines.copy()
-    fixed_indices = []
-    expected_lang = target_lang_code.split("-")[0]
-    for i, line in enumerate(translated_lines):
-        if not line.strip() or len(line.strip()) < 3:
-            corrected_lines[i] = english_lines[i]  # Fallback for short/empty lines
-            logging.info(f"Line {i+1}: Used English fallback for short/empty line: {english_lines[i]}")
-            continue
         try:
-            detected_lang = detect(line)
-        except Exception:
-            detected_lang = None
-        if detected_lang != expected_lang:
-            # For Azure Translator, we'll retry the translation
-            retry_translation = translate_with_azure([english_lines[i]], target_lang_code)
-            if retry_translation and retry_translation[0]:
-                fixed_line = retry_translation[0].strip()
-                # Verify re-translated line
-                try:
-                    if detect(fixed_line) != expected_lang:
-                        logging.warning(f"Line {i+1}: Re-translation still not in {target_lang_code}: {fixed_line}")
-                        corrected_lines[i] = english_lines[i]  # Fallback to English
-                    else:
-                        corrected_lines[i] = fixed_line
-                        fixed_indices.append(i)
-                        logging.info(f"Line {i+1}: Fixed from {detected_lang} to {target_lang_code}: {fixed_line}")
-                except Exception:
-                    corrected_lines[i] = english_lines[i]  # Fallback to English
-                    logging.warning(f"Line {i+1}: Language detection failed for re-translated line: {fixed_line}")
-            else:
-                corrected_lines[i] = english_lines[i]  # Fallback to English
-                logging.warning(f"Line {i+1}: Re-translation failed for text: {english_lines[i]}")
-        else:
-            logging.info(f"Line {i+1}: Correct language detected: {detected_lang}")
-    return corrected_lines, fixed_indices
-def test_azure_tts_connection():
-    """Test Azure TTS connection before starting"""
-    try:
-        speech_config = speechsdk.SpeechConfig(subscription=AZURE_KEY, region=AZURE_REGION)
-        speech_config.speech_synthesis_voice_name = "en-US-JennyNeural"
-        synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
-        result = synthesizer.speak_text_async("Test connection").get()
-        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
-            print("✅ Azure TTS connection test successful")
-            logging.info("Azure TTS connection test successful")
-            return True
-        else:
-            print("❌ Azure TTS connection test failed")
-            logging.error("Azure TTS connection test failed")
-            return False
-    except Exception as e:
-        print(f"❌ Azure TTS connection error: {str(e)}")
-        logging.error(f"Azure TTS connection error: {str(e)}")
-        return False
 # --- Main dubbing function ---
 async def dub_video(uploaded_video_path, target_lang_name, voice_gender):
-    print(f"🎬 Video={uploaded_video_path}, Lang={target_lang_name}, Voice={voice_gender}")
-    logging.info(f"Starting dubbing: video={uploaded_video_path}, lang={target_lang_name}, voice={voice_gender}")
-    # Test Azure TTS connection first
-    if not test_azure_tts_connection():
-        return None, None, "❌ Error: Azure TTS connection failed. Please check your Azure credentials and region."
     target_lang_code = LANGUAGE_MAP.get(target_lang_name)
     if not target_lang_code:
-        logging.error(f"Invalid language selected: {target_lang_name}")
         return None, None, "❌ Error: Invalid language selected."
     with tempfile.TemporaryDirectory() as temp_dir:
@@ -295,44 +121,67 @@ async def dub_video(uploaded_video_path, target_lang_name, voice_gender):
         shutil.copy(uploaded_video_path, video_in)
         print("🎧 Extracting audio...")
-        logging.info("Extracting audio from video")
         subprocess.run(["ffmpeg", "-y", "-i", video_in, "-ac", "1", "-ar", "16000", audio_wav])
         print("📝 Transcribing (Whisper)...")
-        logging.info("Transcribing audio with Whisper base model")
-        model = whisper.load_model("base")  # Using base for faster testing
         result = model.transcribe(str(audio_wav), language="en")
         segments = result["segments"]
-        print(f"🌐 Translating to {target_lang_name} using Azure Translator...")
-        logging.info(f"Translating to {target_lang_name} using Azure Translator")
         english_lines = [seg["text"].strip() for seg in segments]
-        # For testing, use simple translations or keep original
-        if not english_lines:
-            english_lines = ["Hello world", "This is a test"]
-        # Translate using Azure Translator
-        translated_lines = translate_with_azure(english_lines, target_lang_code)
         print(f"Translated lines:\n{translated_lines}")
-        logging.info(f"Initial translation: {translated_lines}")
-        # Validate line count
-        if len(translated_lines) != len(english_lines):
-            logging.warning(f"Translation line count mismatch: got {len(translated_lines)}, expected {len(english_lines)}")
-            translated_lines = translated_lines[:len(english_lines)] + [""] * (len(english_lines) - len(translated_lines))
-        # --- Language detection + auto-fix ---
-        translated_lines, fixed_indices = fix_outlier_lines(
-            translated_lines=translated_lines,
-            english_lines=english_lines,
-            target_lang_code=target_lang_code
-        )
         print("🔊 Generating speech with Azure Neural TTS...")
-        logging.info(f"Generating TTS with voice: {voice_gender}")
         voice_map = {
             "fr": {"female": "fr-FR-DeniseNeural", "male": "fr-FR-HenriNeural"},
             "de": {"female": "de-DE-KatjaNeural", "male": "de-DE-ConradNeural"},
@@ -343,114 +192,24 @@ async def dub_video(uploaded_video_path, target_lang_name, voice_gender):
             "es": {"female": "es-ES-ElviraNeural", "male": "es-ES-AlvaroNeural"},
             "pl": {"female": "pl-PL-AgnieszkaNeural", "male": "pl-PL-MarekNeural"},
             "ar": {"female": "ar-SA-ZariyahNeural", "male": "ar-SA-HamedNeural"},
-            "zh-Hans": {"female": "zh-CN-XiaoxiaoNeural", "male": "zh-CN-YunyangNeural"},
-            "zh-Hant": {"female": "zh-TW-HsiaoChenNeural", "male": "zh-TW-YunJheNeural"},
-            "cs": {"female": "cs-CZ-VlastaNeural", "male": "cs-CZ-AntoninNeural"},
-            "da": {"female": "da-DK-ChristelNeural", "male": "da-DK-JeppeNeural"},
-            "en": {"female": "en-US-JennyNeural", "male": "en-US-GuyNeural"},
-            "et": {"female": "et-EE-AnuNeural", "male": "et-EE-KertNeural"},
-            "fi": {"female": "fi-FI-NooraNeural", "male": "fi-FI-HarriNeural"},
-            "el": {"female": "el-GR-AthinaNeural", "male": "el-GR-NestorasNeural"},
-            "he": {"female": "he-IL-HilaNeural", "male": "he-IL-AvriNeural"},
-            "hi": {"female": "hi-IN-SwaraNeural", "male": "hi-IN-MadhurNeural"},
-            "hu": {"female": "hu-HU-NoemiNeural", "male": "hu-HU-TamasNeural"},
-            "id": {"female": "id-ID-GadisNeural", "male": "id-ID-ArdiNeural"},
-            "ko": {"female": "ko-KR-SunHiNeural", "male": "ko-KR-InJoonNeural"},
-            "lv": {"female": "lv-LV-EveritaNeural", "male": "lv-LV-NilsNeural"},
-            "lt": {"female": "lt-LT-OnaNeural", "male": "lt-LT-LeonasNeural"},
-            "ms": {"female": "ms-MY-YasminNeural", "male": "ms-MY-OsmanNeural"},
-            "nb": {"female": "nb-NO-IselinNeural", "male": "nb-NO-FinnNeural"},
-            "pt": {"female": "pt-BR-FranciscaNeural", "male": "pt-BR-AntonioNeural"},
-            "pt-pt": {"female": "pt-PT-FernandaNeural", "male": "pt-PT-DuarteNeural"},
-            "ro": {"female": "ro-RO-AlinaNeural", "male": "ro-RO-EmilNeural"},
-            "ru": {"female": "ru-RU-DariyaNeural", "male": "ru-RU-DmitryNeural"},
-            "sk": {"female": "sk-SK-ViktoriaNeural", "male": "sk-SK-LukasNeural"},
-            "sl": {"female": "sl-SI-PetraNeural", "male": "sl-SI-RokNeural"},
-            "th": {"female": "th-TH-AcharaNeural", "male": "th-TH-NiwatNeural"},
-            "tr": {"female": "tr-TR-EmelNeural", "male": "tr-TR-AhmetNeural"},
-            "uk": {"female": "uk-UA-PolinaNeural", "male": "uk-UA-OstapNeural"},
-            "vi": {"female": "vi-VN-HoaiMyNeural", "male": "vi-VN-NamMinhNeural"},
         }
         selected_voice = voice_map.get(target_lang_code, {}).get(voice_gender)
         if not selected_voice:
-            # Fallback to English if voice not found
-            selected_voice = "en-US-JennyNeural" if voice_gender == "female" else "en-US-GuyNeural"
-            print(f"⚠ Voice not found for {target_lang_name}, using fallback: {selected_voice}")
-            logging.warning(f"Voice not found for {target_lang_name}, using fallback: {selected_voice}")
         print(f"Using TTS voice: {selected_voice}")
-        logging.info(f"Using TTS voice: {selected_voice}")
-        # Configure speech with detailed error reporting
         speech_config = speechsdk.SpeechConfig(subscription=AZURE_KEY, region=AZURE_REGION)
         speech_config.speech_synthesis_voice_name = selected_voice
-        # Set output format to ensure compatibility
-        speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm)
-        # Generate TTS for each line with timeout
-        tasks = []
-        for i, text in enumerate(translated_lines):
-            if text.strip():  # Only process non-empty text
-                tasks.append(_synthesize_tts_async(speech_config, text, i))
-            else:
-                # For empty text, create a short silent segment
-                tasks.append(asyncio.sleep(0))
-        segment_audios = await asyncio.gather(*tasks, return_exceptions=True)
-        # Handle failed TTS segments
-        valid_audios = []
-        valid_segments = []
-        valid_lines = []
-        for i, (audio, seg, line) in enumerate(zip(segment_audios, segments, translated_lines)):
-            if audio is not None and not isinstance(audio, Exception):
-                valid_audios.append(audio)
-                valid_segments.append(seg)
-                valid_lines.append(line)
-                print(f"✅ Successfully processed line {i+1}")
-            else:
-                print(f"❌ Failed to process line {i+1}: '{line}'")
-                logging.warning(f"Line {i+1}: Skipping due to TTS failure: {line}")
-                # Add silent audio as fallback
-                estimated_duration = len(line.split()) * 0.3 * 1000 if line.strip() else 1000
-                valid_audios.append(AudioSegment.silent(duration=int(estimated_duration)))
-                valid_segments.append(seg)
-                valid_lines.append(line)
-        if not valid_audios:
-            error_msg = "❌ Error: No valid audio segments generated. Check Azure TTS configuration."
-            logging.error(error_msg)
-            return None, None, error_msg
-        print(f"✅ Successfully generated {len(valid_audios)} audio segments")
-        logging.info(f"Successfully generated {len(valid_audios)} audio segments")
-        # Adjust timestamps based on audio durations
-        adjusted_segments = []
-        current_time = 0
-        for i, audio in enumerate(valid_audios):
-            start = current_time / 1000
-            duration = len(audio)  # Duration in milliseconds
-            end = (current_time + duration) / 1000
-            adjusted_segments.append({"start": start, "end": end, "text": valid_lines[i]})
-            current_time += duration
-            logging.info(f"Segment {i+1}: Start={start:.2f}s, End={end:.2f}s, Text={valid_lines[i]}")
-        print("🎥 Merging dubbed audio into video...")
-        logging.info("Merging dubbed audio into video")
-        # Create silent audio of total duration
-        full_audio = AudioSegment.silent(duration=current_time)
-        for seg, segment_audio in zip(adjusted_segments, valid_audios):
             start_ms = int(seg["start"] * 1000)
             full_audio = full_audio.overlay(segment_audio, position=start_ms)
         full_audio.export(str(dubbed_audio_path), format="wav")
-        # Merge audio with video
         subprocess.run([
             "ffmpeg", "-y",
             "-i", str(video_in),
@@ -458,23 +217,21 @@ async def dub_video(uploaded_video_path, target_lang_name, voice_gender):
             "-c:v", "copy",
             "-map", "0:v:0",
             "-map", "1:a:0",
-            "-shortest",  # Ensure output duration matches the shortest input
             str(output_video_temp)
-        ], check=True)
         print("📄 Generating subtitle file...")
-        logging.info("Generating subtitle file")
         srt_content = ""
-        for i, seg in enumerate(adjusted_segments):
             start_time = _format_time(seg["start"])
             end_time = _format_time(seg["end"])
-            srt_content += f"{i + 1}\n{start_time} --> {end_time}\n{seg['text']}\n\n"
         output_subtitles_temp.write_text(srt_content, encoding="utf-8")
-        print("✅ Dubbing completed successfully!")
-        logging.info("Dubbing completed successfully")
-        # Copy to output directory
         output_dir = Path(tempfile.mkdtemp(prefix="dubbed_output_"))
         output_video_path = output_dir / "output_dubbed.mp4"
         output_subtitles_path = output_dir / "subtitles.srt"
@@ -482,7 +239,7 @@ async def dub_video(uploaded_video_path, target_lang_name, voice_gender):
         shutil.copy(output_video_temp, output_video_path)
         shutil.copy(output_subtitles_temp, output_subtitles_path)
-        return str(output_video_path), str(output_subtitles_path), "✅ Dubbing completed successfully!"
 # --- Gradio UI setup ---
 with gr.Blocks(title="AI Video Dubber") as demo:
@@ -492,18 +249,22 @@ with gr.Blocks(title="AI Video Dubber") as demo:
     with gr.Row():
         with gr.Column():
             uploaded_video = gr.Video(label="📤 Upload your video")
             target_lang_choices = list(LANGUAGE_MAP.keys())
             target_lang_dropdown = gr.Dropdown(
                 label="🌍 Target language",
                 choices=target_lang_choices,
-                value="English (US)",
             )
             voice_gender_dropdown = gr.Dropdown(
                 label="🎙️ Voice Gender",
                 choices=["female", "male"],
                 value="female"
             )
             run_button = gr.Button("🚀 Start Dubbing")
         with gr.Column():
             dubbed_video_out = gr.Video(label="Dubbed Video")
             download_subtitles = gr.File(label="Download Subtitle File")
@@ -516,4 +277,4 @@ with gr.Blocks(title="AI Video Dubber") as demo:
     )
 if __name__ == "__main__":
-    demo.launch(debug=True)

 import sys
 import subprocess
 import tempfile
 from pathlib import Path
 from dotenv import load_dotenv
 import whisper
 import azure.cognitiveservices.speech as speechsdk
 import requests
 from pydub import AudioSegment
+from pydub.utils import make_chunks
 import shutil
 import io
 import asyncio
 import json
 # Limit OMP threads (fix libgomp issue)
 os.environ["OMP_NUM_THREADS"] = os.getenv("OMP_NUM_THREADS", "1")
 if missing:
     sys.exit(f"❌ Missing environment variables: {', '.join(missing)}")
 # --- Language map ---
 LANGUAGE_MAP = {
     "French": "fr",
     "Spanish": "es",
     "Polish": "pl",
     "Arabic": "ar",
 }
 # --- Helper function for SRT formatting ---
     return f"{h:02}:{m:02}:{s:02},{ms:03}"
 # --- Async TTS helper function ---
+async def _synthesize_tts_async(speech_config, text):
+    loop = asyncio.get_running_loop()
+    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
+    # Run blocking .get() in thread executor
+    result = await loop.run_in_executor(
+        None, lambda: synthesizer.speak_text_async(text).get()
+    )
+    if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
+        print(f"TTS synthesis failed with reason: {result.reason}")
+        return AudioSegment.silent(duration=1000)  # Return silent audio as fallback
+    audio_data = result.audio_data
+    if not audio_data:
+        print("No audio data received from TTS")
+        return AudioSegment.silent(duration=1000)
     try:
+        # Save to temp file for pydub processing
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
+            temp_wav.write(audio_data)
+            temp_wav_path = temp_wav.name
+        # Load with pydub
+        audio_segment = AudioSegment.from_wav(temp_wav_path)
+        # Clean up temp file
+        os.unlink(temp_wav_path)
+        return audio_segment
     except Exception as e:
+        print(f"Error processing TTS audio: {e}")
+        # Fallback: try to create silent audio of estimated length
         try:
+            estimated_duration = len(text.split()) * 0.3  # Rough estimate: 0.3s per word
+            return AudioSegment.silent(duration=int(estimated_duration * 1000))
+        except:
+            return AudioSegment.silent(duration=1000)
 # --- Main dubbing function ---
 async def dub_video(uploaded_video_path, target_lang_name, voice_gender):
+    print(f"Received inputs: video={uploaded_video_path}, lang={target_lang_name}, voice={voice_gender}")
     target_lang_code = LANGUAGE_MAP.get(target_lang_name)
     if not target_lang_code:
         return None, None, "❌ Error: Invalid language selected."
     with tempfile.TemporaryDirectory() as temp_dir:
         shutil.copy(uploaded_video_path, video_in)
         print("🎧 Extracting audio...")
         subprocess.run(["ffmpeg", "-y", "-i", video_in, "-ac", "1", "-ar", "16000", audio_wav])
         print("📝 Transcribing (Whisper)...")
+        model = whisper.load_model("large")
         result = model.transcribe(str(audio_wav), language="en")
         segments = result["segments"]
+        print(f"🌐 Translating to {target_lang_name}...")
         english_lines = [seg["text"].strip() for seg in segments]
+        translated_lines = []
+        endpoint = f"https://{AZURE_TRANSLATOR_REGION}.api.cognitive.microsoft.com"
+        headers = {
+            "Ocp-Apim-Subscription-Key": AZURE_TRANSLATOR_KEY,
+            "Ocp-Apim-Subscription-Region": AZURE_TRANSLATOR_REGION,
+            "Content-Type": "application/json",
+            "Accept": "application/json"
+        }
+        for line in english_lines:
+            if line:  # Only translate non-empty lines
+                body = [{"text": line}]
+                response = requests.post(
+                    f"{endpoint}/translator/text/v3.0/translate?api-version=3.0&from=en&to={target_lang_code}",
+                    headers=headers,
+                    json=body
+                )
+                if response.status_code == 200:
+                    translations = response.json()
+                    translated_text = translations[0]["translations"][0]["text"]
+                    translated_lines.append(translated_text)
+                else:
+                    print(f"Translation error: {response.status_code} - {response.text}")
+                    translated_lines.append(line)  # Fallback to original
+            else:
+                translated_lines.append("")
         print(f"Translated lines:\n{translated_lines}")
+        # --- LANGUAGE DETECTION + AUTO-CORRECTION ---
+        from langdetect import detect
+        for i, line in enumerate(translated_lines):
+            if line:  # Skip empty lines
+                detected_lang = detect(line)
+                if detected_lang != target_lang_code:
+                    print(f"⚠️ Warning: Detected {detected_lang}, correcting to {target_lang_code}...")
+                    try:
+                        body = [{"text": line}]
+                        response = requests.post(
+                            f"{endpoint}/translator/text/v3.0/translate?api-version=3.0&from={detected_lang}&to={target_lang_code}",
+                            headers=headers,
+                            json=body
+                        )
+                        if response.status_code == 200:
+                            translations = response.json()
+                            corrected_text = translations[0]["translations"][0]["text"]
+                            translated_lines[i] = corrected_text
+                            print(f"✅ Corrected: {corrected_text}")
+                        else:
+                            print(f"❌ Correction failed ({response.status_code}) - keeping original line.")
+                    except Exception as e:
+                        print(f"❌ Error correcting translation: {e}")
         print("🔊 Generating speech with Azure Neural TTS...")
         voice_map = {
             "fr": {"female": "fr-FR-DeniseNeural", "male": "fr-FR-HenriNeural"},
             "de": {"female": "de-DE-KatjaNeural", "male": "de-DE-ConradNeural"},
             "es": {"female": "es-ES-ElviraNeural", "male": "es-ES-AlvaroNeural"},
             "pl": {"female": "pl-PL-AgnieszkaNeural", "male": "pl-PL-MarekNeural"},
             "ar": {"female": "ar-SA-ZariyahNeural", "male": "ar-SA-HamedNeural"},
         }
         selected_voice = voice_map.get(target_lang_code, {}).get(voice_gender)
         if not selected_voice:
+            return None, None, f"❌ Error: Voice for {target_lang_name} ({voice_gender}) not found."
         print(f"Using TTS voice: {selected_voice}")
         speech_config = speechsdk.SpeechConfig(subscription=AZURE_KEY, region=AZURE_REGION)
         speech_config.speech_synthesis_voice_name = selected_voice
+        tasks = [_synthesize_tts_async(speech_config, translated_text) for translated_text in translated_lines]
+        segment_audios = await asyncio.gather(*tasks)
+        full_audio = AudioSegment.silent(duration=segments[-1]["end"] * 1000)
+        for seg, segment_audio in zip(segments, segment_audios):
             start_ms = int(seg["start"] * 1000)
             full_audio = full_audio.overlay(segment_audio, position=start_ms)
+        print("🎥 Merging dubbed audio into video...")
         full_audio.export(str(dubbed_audio_path), format="wav")
         subprocess.run([
             "ffmpeg", "-y",
             "-i", str(video_in),
             "-c:v", "copy",
             "-map", "0:v:0",
             "-map", "1:a:0",
+            "-map", "-0:a",
             str(output_video_temp)
+        ])
         print("📄 Generating subtitle file...")
         srt_content = ""
+        for i, (seg, translated_text) in enumerate(zip(segments, translated_lines)):
             start_time = _format_time(seg["start"])
             end_time = _format_time(seg["end"])
+            srt_content += f"{i + 1}\n"
+            srt_content += f"{start_time} --> {end_time}\n"
+            srt_content += f"{translated_text}\n\n"
         output_subtitles_temp.write_text(srt_content, encoding="utf-8")
+        print("✅ Done!")
         output_dir = Path(tempfile.mkdtemp(prefix="dubbed_output_"))
         output_video_path = output_dir / "output_dubbed.mp4"
         output_subtitles_path = output_dir / "subtitles.srt"
         shutil.copy(output_video_temp, output_video_path)
         shutil.copy(output_subtitles_temp, output_subtitles_path)
+        return str(output_video_path), str(output_subtitles_path), "✅ Done! Your video and subtitles are ready."
 # --- Gradio UI setup ---
 with gr.Blocks(title="AI Video Dubber") as demo:
     with gr.Row():
         with gr.Column():
             uploaded_video = gr.Video(label="📤 Upload your video")
             target_lang_choices = list(LANGUAGE_MAP.keys())
             target_lang_dropdown = gr.Dropdown(
                 label="🌍 Target language",
                 choices=target_lang_choices,
+                value=target_lang_choices[0],
             )
             voice_gender_dropdown = gr.Dropdown(
                 label="🎙️ Voice Gender",
                 choices=["female", "male"],
                 value="female"
             )
             run_button = gr.Button("🚀 Start Dubbing")
         with gr.Column():
             dubbed_video_out = gr.Video(label="Dubbed Video")
             download_subtitles = gr.File(label="Download Subtitle File")
     )
 if __name__ == "__main__":
+    demo.launch()