| """ |
| 🎬 Commentary Video Dubbing App — English to Arabic, German, and more |
| |
| """ |
|
|
| import os |
| import base64 |
| import shutil |
| import struct |
| import subprocess |
| import tempfile |
| import time |
|
|
| import gradio as gr |
| from openai import OpenAI |
|
|
| |
| |
| |
# Model name and the OpenAI-compatible endpoint it is requested from.
MODEL = "qwen3.5-omni-plus"
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"


# Prompt pair shared by every language that needs no special instructions.
# {full_name} is how the language is named when asking for the translation
# (e.g. "Mandarin Chinese"); {name} is the short adjective form ("Chinese").
_STANDARD_SYSTEM_PROMPT = (
    "You are a professional video dubbing translator. You will receive audio in English.\n"
    "Your task:\n"
    "1. Listen carefully to the English speech.\n"
    "2. Translate it into natural, fluent {full_name}.\n"
    "3. Respond ONLY with the {name} translation spoken aloud — no English, no commentary,\n"
    " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
    "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
    "5. Translate idioms and cultural references into their {name} equivalents."
)
_STANDARD_USER_PROMPT = (
    "Translate this English speech into {full_name}. "
    "Respond only with the spoken {name} translation."
)


def _standard_language(code, name, full_name=None):
    """Build a LANGUAGES entry from the shared prompt template."""
    full_name = full_name or name
    return {
        "code": code,
        "system_prompt": _STANDARD_SYSTEM_PROMPT.format(name=name, full_name=full_name),
        "user_prompt": _STANDARD_USER_PROMPT.format(name=name, full_name=full_name),
    }


# Target languages, in the order they are offered to the user.
#
# Each value carries:
#   code          - short language code, used to name output files
#   system_prompt - instructions sent as the system message
#   user_prompt   - text sent alongside the audio in the user message
#
# NOTE: the keys are the lookup interface of this table (they are shown as
# labels and handed back verbatim by the caller, and other code may repeat one
# of them as a literal). They are therefore kept exactly as found, even though
# the native-language names in parentheses look mis-encoded. Repair them only
# together with every place that spells a key out.
LANGUAGES = {
    # Arabic has extra rules (no transliteration, pronunciation guidance).
    "Arabic (Ψ§ΩΨΉΨ±Ψ¨ΩΨ© Ψ§ΩΩΨ΅ΨΩ)": {
        "code": "ar",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Modern Standard Arabic (العربية الفصحى).\n"
            "3. Respond ONLY with the Arabic translation spoken aloud — no English, no commentary,\n"
            " no meta-text, no transliteration. Speak entirely in Arabic.\n"
            "4. Match the tone, emotion, and pacing of the original speaker as closely as possible.\n"
            "5. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "6. Translate idioms and cultural references into their Arabic equivalents.\n"
            "7. Use clear, professional Arabic pronunciation suitable for a broad Arab audience."
        ),
        "user_prompt": (
            "Translate this English speech into Arabic. "
            "Respond only with the spoken Arabic translation. "
            "Use Modern Standard Arabic (العربية الفصحى)."
        ),
    },
    # German keeps its own wording ("rather than translating literally").
    "German (Deutsch)": {
        "code": "de",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent German.\n"
            "3. Respond ONLY with the German translation spoken aloud — no English, no commentary,\n"
            " no meta-text. Match the tone, emotion, and pacing of the original speaker as closely\n"
            " as possible.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their German equivalents rather than\n"
            " translating literally."
        ),
        "user_prompt": (
            "Translate this English speech into German. "
            "Respond only with the spoken German translation."
        ),
    },
    "French (FranΓ§ais)": _standard_language("fr", "French"),
    "Spanish (EspaΓ±ol)": _standard_language("es", "Spanish"),
    "Russian (Π ΡΡΡΠΊΠΈΠΉ)": _standard_language("ru", "Russian"),
    "Japanese (ζ₯ζ¬θͺ)": _standard_language("ja", "Japanese"),
    "Korean (νκ΅μ΄)": _standard_language("ko", "Korean"),
    "Portuguese (PortuguΓͺs)": _standard_language("pt", "Portuguese"),
    "Italian (Italiano)": _standard_language("it", "Italian"),
    "Chinese (δΈζ)": _standard_language("zh", "Chinese", full_name="Mandarin Chinese"),
}


# Voice names offered in the UI and passed through to the API unchanged.
VOICES = [
    "Cherry", "Serena", "Ethan", "Chelsie", "Momo", "Vivian", "Moon", "Maia",
    "Kai", "Nofish", "Bella", "Jennifer", "Ryan", "Katerina", "Aiden",
    "Eldric Sage", "Mia", "Mochi", "Bellona", "Vincent", "Bunny", "Neil",
    "Elias", "Arthur", "Seren", "Bodega", "Sonrisa", "Alek", "Dolce",
    "Sohee", "Ono Anna", "Lenn", "Emilien", "Andre",
]
|
|
| |
| |
| |
def get_duration(filepath: str) -> float:
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        filepath: Path to an audio or video file.

    Returns:
        The container-level duration (``format=duration``) in seconds.

    Raises:
        ValueError: If ffprobe exits with an error, or prints something that
            cannot be parsed as a number (for example ``N/A``).
        FileNotFoundError: If the ``ffprobe`` executable is not on PATH.
    """
    # "-v error" (rather than "quiet") keeps ffprobe's own explanation on
    # stderr so it can be included in the exception below.
    result = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", filepath],
        capture_output=True, text=True,
    )
    if result.returncode != 0:
        detail = result.stderr.strip()[-500:] or "no error output"
        raise ValueError(f"ffprobe could not read {filepath!r}: {detail}")

    reported = result.stdout.strip()
    try:
        return float(reported)
    except ValueError:
        raise ValueError(
            f"ffprobe reported no usable duration for {filepath!r} (got {reported!r})"
        ) from None
|
|
|
|
def extract_audio_chunk(video_path, output_wav, start_sec, duration_sec):
    """Cut one time span of a file's audio into a mono 16 kHz PCM WAV.

    Args:
        video_path: Input media file handed to ffmpeg.
        output_wav: Destination path; overwritten if it exists (``-y``).
        start_sec: Offset into the input at which to start, in seconds.
        duration_sec: Length of the span to extract, in seconds.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    span_args = ["-ss", str(start_sec), "-t", str(duration_sec)]
    # -vn drops the video stream; audio becomes 16-bit PCM, 16 kHz, one channel.
    audio_args = ["-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1"]
    command = ["ffmpeg", "-y", *span_args, "-i", video_path, *audio_args, output_wav]
    subprocess.run(command, capture_output=True, check=True)
|
|
|
|
def wav_to_base64(wav_path):
    """Read the file at *wav_path* and return its full contents as base64 text."""
    with open(wav_path, "rb") as source:
        raw_bytes = source.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
|
|
|
|
def base64_to_wav(b64_data, output_path, sample_rate=24000, num_channels=1,
                  bits_per_sample=16):
    """Decode base64 PCM audio and save it as a WAV file.

    The decoded bytes are treated as headerless PCM samples; a canonical
    44-byte RIFF/WAVE header describing them is written in front.

    Args:
        b64_data: Base64 text (or bytes) holding raw PCM samples.
        output_path: Destination file path; overwritten if it exists.
        sample_rate: Samples per second recorded in the header.
        num_channels: Channel count recorded in the header.
        bits_per_sample: Sample width in bits recorded in the header.

    Raises:
        binascii.Error: If *b64_data* is not valid base64.
    """
    # Imported here so the function does not depend on the file's import block.
    import wave

    pcm_bytes = base64.b64decode(b64_data)
    # The standard library writes the same PCM header the code used to
    # assemble by hand, and fixes up the size fields on close.
    with wave.open(output_path, "wb") as wav_file:
        wav_file.setnchannels(num_channels)
        wav_file.setsampwidth(bits_per_sample // 8)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm_bytes)
|
|
|
|
def concatenate_wavs(wav_files, output_path):
    """Join WAV files end to end into *output_path*.

    A single input is simply copied. Several inputs are joined with ffmpeg's
    concat demuxer using stream copy, so they are expected to share one audio
    format.

    Args:
        wav_files: Paths of the files to join, in playback order.
        output_path: Destination file; overwritten if it exists.

    Raises:
        ValueError: If *wav_files* is empty.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    wav_files = list(wav_files)
    if not wav_files:
        raise ValueError("concatenate_wavs() needs at least one input file")
    if len(wav_files) == 1:
        shutil.copy2(wav_files[0], output_path)
        return

    list_file = output_path + ".txt"
    try:
        with open(list_file, "w", encoding="utf-8") as f:
            for wav in wav_files:
                # The concat demuxer resolves relative entries against the
                # list file's directory, not the working directory, so write
                # absolute paths. A single quote inside a quoted entry must be
                # spelled '\'' .
                entry = os.path.abspath(wav).replace("'", "'\\''")
                f.write(f"file '{entry}'\n")
        subprocess.run(
            ["ffmpeg", "-y", "-f", "concat", "-safe", "0",
             "-i", list_file, "-c", "copy", output_path],
            capture_output=True, check=True,
        )
    finally:
        # Remove the scratch list even when ffmpeg fails.
        if os.path.exists(list_file):
            os.remove(list_file)
|
|
|
|
def mux_audio_to_video(original_video, new_audio, output_video):
    """Write a copy of a video whose audio track is replaced by *new_audio*.

    The first video stream of *original_video* is stream-copied and paired
    with the first audio stream of *new_audio*; output stops at the shorter
    of the two (``-shortest``).

    Args:
        original_video: Source of the video stream.
        new_audio: Source of the replacement audio stream.
        output_video: Destination path; overwritten if it exists.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status. The message
            carries the last 500 characters of ffmpeg's stderr.
    """
    command = [
        "ffmpeg", "-y",
        "-i", original_video,
        "-i", new_audio,
        "-c:v", "copy",
        "-map", "0:v:0",
        "-map", "1:a:0",
        "-shortest",
        output_video,
    ]
    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode == 0:
        return
    stderr_tail = completed.stderr[-500:]
    raise RuntimeError(f"FFmpeg mux failed:\n{stderr_tail}")
|
|
|
|
| |
| |
| |
def _audio_fragment(delta):
    """Return the base64 audio fragment carried by a stream delta, or None."""
    audio = getattr(delta, "audio", None)
    if not audio:
        return None
    # The payload has been seen both as a plain dict and as an object.
    if isinstance(audio, dict):
        return audio.get("data")
    return getattr(audio, "data", None)


def translate_chunk(client, wav_path, voice, lang_config, chunk_index):
    """Send one audio chunk to the model and save the spoken translation.

    Args:
        client: OpenAI-compatible client used for the streaming request.
        wav_path: Path of the source-language WAV chunk to translate.
        voice: Voice name forwarded in the request's ``audio`` options.
        lang_config: A LANGUAGES entry; its ``code``, ``system_prompt`` and
            ``user_prompt`` are used.
        chunk_index: Position of the chunk. Currently unused; kept so existing
            callers keep working.

    Returns:
        ``(output_wav, transcript)`` when audio was streamed back, where
        ``output_wav`` sits next to *wav_path* with ``_<code>`` appended to
        the file stem; ``(None, transcript)`` when the stream had no audio.
    """
    audio_b64 = wav_to_base64(wav_path)
    # Derive the output name from the file stem only. A plain str.replace on
    # ".wav" would also rewrite directory names, and would return the input
    # path unchanged (overwriting it) if the suffix were missing.
    stem, _ = os.path.splitext(wav_path)
    output_wav = f"{stem}_{lang_config['code']}.wav"

    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": lang_config["system_prompt"]},
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_audio",
                        "input_audio": {
                            "data": f"data:audio/wav;base64,{audio_b64}",
                            "format": "wav",
                        },
                    },
                    {"type": "text", "text": lang_config["user_prompt"]},
                ],
            },
        ],
        modalities=["text", "audio"],
        audio={"voice": voice, "format": "wav"},
        stream=True,
        stream_options={"include_usage": True},
    )

    audio_chunks = []
    transcript_parts = []

    for event in completion:
        # Events without choices carry nothing to collect here.
        if not event.choices:
            continue
        delta = event.choices[0].delta
        text = getattr(delta, "content", None)
        if text:
            transcript_parts.append(text)
        fragment = _audio_fragment(delta)
        # Skip empty or None fragments so the join below cannot fail.
        if fragment:
            audio_chunks.append(fragment)

    transcript = "".join(transcript_parts)

    if not audio_chunks:
        return None, transcript

    # NOTE(review): this assumes the streamed fragments form one valid base64
    # string when concatenated; confirm against the API documentation.
    base64_to_wav("".join(audio_chunks), output_wav)
    return output_wav, transcript
|
|
|
|
| |
| |
| |
def _write_silence(path, duration_sec):
    """Create a mono 24 kHz 16-bit PCM WAV of silence lasting *duration_sec*."""
    subprocess.run(
        ["ffmpeg", "-y", "-f", "lavfi",
         "-i", "anullsrc=r=24000:cl=mono",
         "-t", str(duration_sec), "-acodec", "pcm_s16le", path],
        capture_output=True, check=True,
    )


def _discard_intermediates(directory, keep):
    """Best-effort removal of every file in *directory* except *keep*."""
    try:
        names = os.listdir(directory)
    except OSError:
        return
    for name in names:
        candidate = os.path.join(directory, name)
        if candidate == keep:
            continue
        try:
            os.remove(candidate)
        except OSError:
            # Leftover scratch files are harmless; a finished job must not
            # fail because one of them could not be deleted.
            pass


def dub_video(video_file, target_language, voice, chunk_seconds, progress=gr.Progress()):
    """Dub an uploaded video into the selected language.

    The audio is cut into chunks of *chunk_seconds*, each chunk is translated
    to speech by the model, the results are concatenated and muxed back onto
    the original video stream.

    Args:
        video_file: Path of the uploaded video, or None if nothing was uploaded.
        target_language: A key of LANGUAGES.
        voice: Voice name forwarded to the model.
        chunk_seconds: Length of each audio chunk, in seconds.
        progress: Gradio progress reporter.

    Returns:
        ``(output_video_path, transcript_markdown)``.

    Raises:
        gr.Error: For missing input, a missing API key, an unknown language,
            a video longer than one hour, or any failure while processing.
    """
    if video_file is None:
        raise gr.Error("Please upload a video file.")

    api_key = os.environ.get("DASHSCOPE_API_KEY", "")
    if not api_key:
        raise gr.Error(
            "DASHSCOPE_API_KEY not set. Add it as a Space Secret "
            "(Settings → Secrets → New Secret)."
        )

    lang_config = LANGUAGES.get(target_language)
    if lang_config is None:
        raise gr.Error(f"Unsupported target language: {target_language!r}")
    if not chunk_seconds or chunk_seconds <= 0:
        raise gr.Error("Chunk duration must be a positive number of seconds.")

    client = OpenAI(api_key=api_key, base_url=BASE_URL)
    tmp_dir = tempfile.mkdtemp(prefix="dub_")

    try:
        progress(0.05, desc="Analyzing video...")
        total_duration = get_duration(video_file)

        if total_duration > 3600:
            raise gr.Error("Video is longer than 1 hour. Please use a shorter clip.")

        progress(0.1, desc="Extracting audio chunks...")
        # Ceiling division, with at least one chunk even for a zero duration.
        num_chunks = max(
            1,
            int(total_duration // chunk_seconds)
            + (1 if total_duration % chunk_seconds > 0 else 0),
        )

        input_chunks = []
        for i in range(num_chunks):
            start = i * chunk_seconds
            duration = min(chunk_seconds, total_duration - start)
            chunk_path = os.path.join(tmp_dir, f"chunk_{i:03d}.wav")
            extract_audio_chunk(video_file, chunk_path, start, duration)
            input_chunks.append(chunk_path)

        output_chunks = []
        # (chunk number, text) pairs: the number is stored so that labels stay
        # correct when some chunk comes back without a transcript.
        numbered_transcripts = []

        for i, chunk_path in enumerate(input_chunks):
            frac = 0.15 + 0.7 * (i / num_chunks)
            progress(frac, desc=f"Translating chunk {i+1}/{num_chunks}...")

            result_path, transcript = translate_chunk(
                client, chunk_path, voice, lang_config, i
            )
            if transcript:
                numbered_transcripts.append((i + 1, transcript))

            if result_path:
                output_chunks.append(result_path)
            else:
                # No audio came back: fill the gap with silence as long as the
                # source chunk.
                silence_path = os.path.join(tmp_dir, f"silence_{i:03d}.wav")
                _write_silence(silence_path, get_duration(chunk_path))
                output_chunks.append(silence_path)

        # NOTE: translated chunks are joined back to back; nothing here
        # re-aligns them to the source chunk boundaries.
        progress(0.88, desc="Assembling audio...")
        full_audio = os.path.join(tmp_dir, "full_dubbed_audio.wav")
        concatenate_wavs(output_chunks, full_audio)

        progress(0.93, desc="Muxing audio onto video...")
        ext = os.path.splitext(video_file)[1] or ".mp4"
        output_video = os.path.join(tmp_dir, f"dubbed_{lang_config['code']}{ext}")
        mux_audio_to_video(video_file, full_audio, output_video)

        progress(1.0, desc="Done!")

        transcript_text = "\n\n".join(
            f"**Chunk {number}:**\n{text}" for number, text in numbered_transcripts
        ) or "No transcript available."

    except gr.Error:
        # Already a user-facing error: clean up and pass it through untouched.
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise
    except Exception as e:
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise gr.Error(str(e)) from e

    # Only the finished video has to outlive this call; drop the scratch WAVs.
    _discard_intermediates(tmp_dir, keep=output_video)
    return output_video, transcript_text
|
|
|
|
| |
| |
| |
# Markdown rendered at the top of the page.
DESCRIPTION = """
# 🎬 Commentary Video Dubbing — English to Any Language

Upload an English video and get it dubbed into Arabic, German, French, Spanish, and more.
The model translates the speech and generates natural-sounding voice output in the target language.

**Supported output languages:** Arabic, Chinese, German, French, Spanish, Portuguese, Italian, Russian, Japanese, Korean

"""
|
|
# Page layout: inputs on the left, results on the right, one button that runs
# dub_video with the selected options.
with gr.Blocks(
    title="Video Dubbing — Qwen3.5-Omni",
    theme=gr.themes.Soft(
        primary_hue="amber",
        secondary_hue="orange",
        neutral_hue="stone",
    ),
) as demo:

    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1):
            video_input = gr.Video(label="Upload English Video", sources=["upload"])

            target_lang = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                # Default to the first configured language. Taking it from the
                # table means the default can never drift from the keys.
                value=next(iter(LANGUAGES), None),
                label="Target Language",
            )

            voice_select = gr.Dropdown(
                choices=VOICES,
                value="Ethan",
                label="Voice",
                info="All voices support all output languages.",
            )

            chunk_slider = gr.Slider(
                minimum=30,
                maximum=300,
                value=120,
                step=10,
                label="Chunk Duration (seconds)",
                info="Shorter chunks = more API calls but less risk of timeout.",
            )

            dub_btn = gr.Button("🎙️ Start Dubbing", variant="primary", size="lg")

        with gr.Column(scale=1):
            video_output = gr.Video(label="Dubbed Video")
            transcript_output = gr.Markdown(label="Translation Transcript")

    # Input order matches dub_video's positional parameters.
    dub_btn.click(
        fn=dub_video,
        inputs=[video_input, target_lang, voice_select, chunk_slider],
        outputs=[video_output, transcript_output],
    )

    gr.Markdown(
        "---\n"
        "**Built by:** Plotweaver "
    )


if __name__ == "__main__":
    demo.launch()
|
|