""" 🎬 Commentary Video Dubbing App β€” English to Arabic / German """ import os import base64 import shutil import struct import subprocess import tempfile import time import gradio as gr from openai import OpenAI # ────────────────────────────────────────────── # Configuration # ────────────────────────────────────────────── MODEL = "qwen3.5-omni-plus" BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" LANGUAGES = { "Arabic (Ψ§Ω„ΨΉΨ±Ψ¨ΩŠΨ© الفءحى)": { "code": "ar", "system_prompt": ( "You are a professional video dubbing translator. You will receive audio in English.\n" "Your task:\n" "1. Listen carefully to the English speech.\n" "2. Translate it into natural, fluent Modern Standard Arabic (Ψ§Ω„ΨΉΨ±Ψ¨ΩŠΨ© الفءحى).\n" "3. Respond ONLY with the Arabic translation spoken aloud β€” no English, no commentary,\n" " no meta-text, no transliteration. Speak entirely in Arabic.\n" "4. Match the tone, emotion, and pacing of the original speaker as closely as possible.\n" "5. If there are pauses or silence in the original audio, maintain similar pacing.\n" "6. Translate idioms and cultural references into their Arabic equivalents.\n" "7. Use clear, professional Arabic pronunciation suitable for a broad Arab audience." ), "user_prompt": "Translate this English speech into Arabic. Respond only with the spoken Arabic translation. Use Modern Standard Arabic (Ψ§Ω„ΨΉΨ±Ψ¨ΩŠΨ© الفءحى).", }, "German (Deutsch)": { "code": "de", "system_prompt": ( "You are a professional video dubbing translator. You will receive audio in English.\n" "Your task:\n" "1. Listen carefully to the English speech.\n" "2. Translate it into natural, fluent German.\n" "3. Respond ONLY with the German translation spoken aloud β€” no English, no commentary,\n" " no meta-text. Match the tone, emotion, and pacing of the original speaker as closely\n" " as possible.\n" "4. If there are pauses or silence in the original audio, maintain similar pacing.\n" "5. Translate idioms and cultural references into their German equivalents rather than\n" " translating literally." ), "user_prompt": "Translate this English speech into German. Respond only with the spoken German translation.", }, "French (FranΓ§ais)": { "code": "fr", "system_prompt": ( "You are a professional video dubbing translator. You will receive audio in English.\n" "Your task:\n" "1. Listen carefully to the English speech.\n" "2. Translate it into natural, fluent French.\n" "3. Respond ONLY with the French translation spoken aloud β€” no English, no commentary,\n" " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n" "4. If there are pauses or silence in the original audio, maintain similar pacing.\n" "5. Translate idioms and cultural references into their French equivalents." ), "user_prompt": "Translate this English speech into French. Respond only with the spoken French translation.", }, "Spanish (EspaΓ±ol)": { "code": "es", "system_prompt": ( "You are a professional video dubbing translator. You will receive audio in English.\n" "Your task:\n" "1. Listen carefully to the English speech.\n" "2. Translate it into natural, fluent Spanish.\n" "3. Respond ONLY with the Spanish translation spoken aloud β€” no English, no commentary,\n" " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n" "4. If there are pauses or silence in the original audio, maintain similar pacing.\n" "5. Translate idioms and cultural references into their Spanish equivalents." ), "user_prompt": "Translate this English speech into Spanish. Respond only with the spoken Spanish translation.", }, "Russian (Русский)": { "code": "ru", "system_prompt": ( "You are a professional video dubbing translator. You will receive audio in English.\n" "Your task:\n" "1. Listen carefully to the English speech.\n" "2. Translate it into natural, fluent Russian.\n" "3. Respond ONLY with the Russian translation spoken aloud β€” no English, no commentary,\n" " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n" "4. If there are pauses or silence in the original audio, maintain similar pacing.\n" "5. Translate idioms and cultural references into their Russian equivalents." ), "user_prompt": "Translate this English speech into Russian. Respond only with the spoken Russian translation.", }, "Japanese (ζ—₯本θͺž)": { "code": "ja", "system_prompt": ( "You are a professional video dubbing translator. You will receive audio in English.\n" "Your task:\n" "1. Listen carefully to the English speech.\n" "2. Translate it into natural, fluent Japanese.\n" "3. Respond ONLY with the Japanese translation spoken aloud β€” no English, no commentary,\n" " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n" "4. If there are pauses or silence in the original audio, maintain similar pacing.\n" "5. Translate idioms and cultural references into their Japanese equivalents." ), "user_prompt": "Translate this English speech into Japanese. Respond only with the spoken Japanese translation.", }, "Korean (ν•œκ΅­μ–΄)": { "code": "ko", "system_prompt": ( "You are a professional video dubbing translator. You will receive audio in English.\n" "Your task:\n" "1. Listen carefully to the English speech.\n" "2. Translate it into natural, fluent Korean.\n" "3. Respond ONLY with the Korean translation spoken aloud β€” no English, no commentary,\n" " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n" "4. If there are pauses or silence in the original audio, maintain similar pacing.\n" "5. Translate idioms and cultural references into their Korean equivalents." ), "user_prompt": "Translate this English speech into Korean. Respond only with the spoken Korean translation.", }, "Portuguese (PortuguΓͺs)": { "code": "pt", "system_prompt": ( "You are a professional video dubbing translator. You will receive audio in English.\n" "Your task:\n" "1. Listen carefully to the English speech.\n" "2. Translate it into natural, fluent Portuguese.\n" "3. Respond ONLY with the Portuguese translation spoken aloud β€” no English, no commentary,\n" " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n" "4. If there are pauses or silence in the original audio, maintain similar pacing.\n" "5. Translate idioms and cultural references into their Portuguese equivalents." ), "user_prompt": "Translate this English speech into Portuguese. Respond only with the spoken Portuguese translation.", }, "Italian (Italiano)": { "code": "it", "system_prompt": ( "You are a professional video dubbing translator. You will receive audio in English.\n" "Your task:\n" "1. Listen carefully to the English speech.\n" "2. Translate it into natural, fluent Italian.\n" "3. Respond ONLY with the Italian translation spoken aloud β€” no English, no commentary,\n" " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n" "4. If there are pauses or silence in the original audio, maintain similar pacing.\n" "5. Translate idioms and cultural references into their Italian equivalents." ), "user_prompt": "Translate this English speech into Italian. Respond only with the spoken Italian translation.", }, "Chinese (δΈ­ζ–‡)": { "code": "zh", "system_prompt": ( "You are a professional video dubbing translator. You will receive audio in English.\n" "Your task:\n" "1. Listen carefully to the English speech.\n" "2. Translate it into natural, fluent Mandarin Chinese.\n" "3. Respond ONLY with the Chinese translation spoken aloud β€” no English, no commentary,\n" " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n" "4. If there are pauses or silence in the original audio, maintain similar pacing.\n" "5. Translate idioms and cultural references into their Chinese equivalents." ), "user_prompt": "Translate this English speech into Mandarin Chinese. Respond only with the spoken Chinese translation.", }, } VOICES = [ "Cherry", "Serena", "Ethan", "Chelsie", "Momo", "Vivian", "Moon", "Maia", "Kai", "Nofish", "Bella", "Jennifer", "Ryan", "Katerina", "Aiden", "Eldric Sage", "Mia", "Mochi", "Bellona", "Vincent", "Bunny", "Neil", "Elias", "Arthur", "Seren", "Bodega", "Sonrisa", "Alek", "Dolce", "Sohee", "Ono Anna", "Lenn", "Emilien", "Andre", ] # ────────────────────────────────────────────── # Audio helpers # ────────────────────────────────────────────── def get_duration(filepath: str) -> float: result = subprocess.run( ["ffprobe", "-v", "quiet", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", filepath], capture_output=True, text=True, ) return float(result.stdout.strip()) def extract_audio_chunk(video_path, output_wav, start_sec, duration_sec): subprocess.run( ["ffmpeg", "-y", "-ss", str(start_sec), "-t", str(duration_sec), "-i", video_path, "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", output_wav], capture_output=True, check=True, ) def wav_to_base64(wav_path): with open(wav_path, "rb") as f: return base64.b64encode(f.read()).decode("utf-8") def base64_to_wav(b64_data, output_path): audio_bytes = base64.b64decode(b64_data) sample_rate = 24000 num_channels = 1 bits_per_sample = 16 byte_rate = sample_rate * num_channels * bits_per_sample // 8 block_align = num_channels * bits_per_sample // 8 data_size = len(audio_bytes) with open(output_path, "wb") as f: f.write(b"RIFF") f.write(struct.pack(" 3600: raise gr.Error("Video is longer than 1 hour. Please use a shorter clip.") # ── Split ── progress(0.1, desc="Extracting audio chunks...") num_chunks = max( 1, int(total_duration // chunk_seconds) + (1 if total_duration % chunk_seconds > 0 else 0), ) input_chunks = [] for i in range(num_chunks): start = i * chunk_seconds duration = min(chunk_seconds, total_duration - start) chunk_path = os.path.join(tmp_dir, f"chunk_{i:03d}.wav") extract_audio_chunk(video_file, chunk_path, start, duration) input_chunks.append(chunk_path) # ── Translate ── output_chunks = [] all_transcripts = [] for i, chunk_path in enumerate(input_chunks): frac = 0.15 + 0.7 * (i / num_chunks) progress(frac, desc=f"Translating chunk {i+1}/{num_chunks}...") result_path, transcript = translate_chunk( client, chunk_path, voice, lang_config, i ) if transcript: all_transcripts.append(transcript) if result_path: output_chunks.append(result_path) else: # Silence fallback duration = get_duration(chunk_path) silence_path = os.path.join(tmp_dir, f"silence_{i:03d}.wav") subprocess.run( ["ffmpeg", "-y", "-f", "lavfi", "-i", "anullsrc=r=24000:cl=mono", "-t", str(duration), "-acodec", "pcm_s16le", silence_path], capture_output=True, check=True, ) output_chunks.append(silence_path) # ── Concatenate ── progress(0.88, desc="Assembling audio...") full_audio = os.path.join(tmp_dir, "full_dubbed_audio.wav") concatenate_wavs(output_chunks, full_audio) # ── Mux ── progress(0.93, desc="Muxing audio onto video...") ext = os.path.splitext(video_file)[1] or ".mp4" output_video = os.path.join(tmp_dir, f"dubbed_{lang_config['code']}{ext}") mux_audio_to_video(video_file, full_audio, output_video) progress(1.0, desc="Done!") transcript_text = "\n\n".join( f"**Chunk {i+1}:**\n{t}" for i, t in enumerate(all_transcripts) ) or "No transcript available." return output_video, transcript_text except Exception as e: # Clean up on error shutil.rmtree(tmp_dir, ignore_errors=True) raise gr.Error(str(e)) # ────────────────────────────────────────────── # Gradio UI # ────────────────────────────────────────────── DESCRIPTION = """ # 🎬 Commentary Video Dubbing β€” English to Any Language Upload an English video and get it dubbed into Arabic, German, French, Spanish, and more. The model translates the speech and generates natural-sounding voice output in the target language. **Supported output languages:** Arabic, Chinese, German, French, Spanish, Portuguese, Italian, Russian, Japanese, Korean """ with gr.Blocks( title="Video Dubbing β€” Qwen3.5-Omni", theme=gr.themes.Soft( primary_hue="amber", secondary_hue="orange", neutral_hue="stone", ), ) as demo: gr.Markdown(DESCRIPTION) with gr.Row(): with gr.Column(scale=1): video_input = gr.Video(label="Upload English Video", sources=["upload"]) target_lang = gr.Dropdown( choices=list(LANGUAGES.keys()), value="Arabic (Ψ§Ω„ΨΉΨ±Ψ¨ΩŠΨ© الفءحى)", label="Target Language", ) voice_select = gr.Dropdown( choices=VOICES, value="Ethan", label="Voice", info="All voices support all output languages.", ) chunk_slider = gr.Slider( minimum=30, maximum=300, value=120, step=10, label="Chunk Duration (seconds)", info="Shorter chunks = more API calls but less risk of timeout.", ) dub_btn = gr.Button("πŸŽ™οΈ Start Dubbing", variant="primary", size="lg") with gr.Column(scale=1): video_output = gr.Video(label="Dubbed Video") transcript_output = gr.Markdown(label="Translation Transcript") dub_btn.click( fn=dub_video, inputs=[video_input, target_lang, voice_select, chunk_slider], outputs=[video_output, transcript_output], ) gr.Markdown( "---\n" "**Built by:** Plotweaver " ) if __name__ == "__main__": demo.launch()