PlotweaverModel's picture
app file updated
88edbd8 verified
"""
🎬 Commentary Video Dubbing App — English to Arabic, German, and other languages
"""
import os
import base64
import shutil
import struct
import subprocess
import tempfile
import time
import gradio as gr
from openai import OpenAI
# ──────────────────────────────────────────────
# Configuration
# ──────────────────────────────────────────────
# Model identifier passed as `model=` to the chat-completions request
# issued in translate_chunk().
MODEL = "qwen3.5-omni-plus"
# Base URL handed to the OpenAI client constructed in dub_video(); the path
# indicates DashScope's OpenAI-compatible mode on the international host.
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
# Target-language registry.
#
# Each key is the label shown in the "Target Language" dropdown and is also
# the lookup key used by dub_video(). Each value holds:
#   code          - short language code, used to name the generated files
#   system_prompt - system message sent with every translation request
#   user_prompt   - text part sent alongside the audio in the user message
#
# NOTE(review): several keys below contain mis-encoded (mojibake) native
# language names. They are deliberately left byte-for-byte unchanged here
# because the UI selects its default entry by the exact Arabic key string;
# repair the keys and that default together. The prompt texts, which are
# what the model actually reads, have been restored to proper Unicode.
LANGUAGES = {
    "Arabic (Ψ§Ω„ΨΉΨ±Ψ¨ΩŠΨ© الفءحى)": {
        "code": "ar",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Modern Standard Arabic (العربية الفصحى).\n"
            "3. Respond ONLY with the Arabic translation spoken aloud — no English, no commentary,\n"
            " no meta-text, no transliteration. Speak entirely in Arabic.\n"
            "4. Match the tone, emotion, and pacing of the original speaker as closely as possible.\n"
            "5. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "6. Translate idioms and cultural references into their Arabic equivalents.\n"
            "7. Use clear, professional Arabic pronunciation suitable for a broad Arab audience."
        ),
        "user_prompt": "Translate this English speech into Arabic. Respond only with the spoken Arabic translation. Use Modern Standard Arabic (العربية الفصحى).",
    },
    "German (Deutsch)": {
        "code": "de",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent German.\n"
            "3. Respond ONLY with the German translation spoken aloud — no English, no commentary,\n"
            " no meta-text. Match the tone, emotion, and pacing of the original speaker as closely\n"
            " as possible.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their German equivalents rather than\n"
            " translating literally."
        ),
        "user_prompt": "Translate this English speech into German. Respond only with the spoken German translation.",
    },
    "French (FranΓ§ais)": {
        "code": "fr",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent French.\n"
            "3. Respond ONLY with the French translation spoken aloud — no English, no commentary,\n"
            " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their French equivalents."
        ),
        "user_prompt": "Translate this English speech into French. Respond only with the spoken French translation.",
    },
    "Spanish (EspaΓ±ol)": {
        "code": "es",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Spanish.\n"
            "3. Respond ONLY with the Spanish translation spoken aloud — no English, no commentary,\n"
            " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their Spanish equivalents."
        ),
        "user_prompt": "Translate this English speech into Spanish. Respond only with the spoken Spanish translation.",
    },
    "Russian (Русский)": {
        "code": "ru",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Russian.\n"
            "3. Respond ONLY with the Russian translation spoken aloud — no English, no commentary,\n"
            " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their Russian equivalents."
        ),
        "user_prompt": "Translate this English speech into Russian. Respond only with the spoken Russian translation.",
    },
    "Japanese (ζ—₯本θͺž)": {
        "code": "ja",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Japanese.\n"
            "3. Respond ONLY with the Japanese translation spoken aloud — no English, no commentary,\n"
            " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their Japanese equivalents."
        ),
        "user_prompt": "Translate this English speech into Japanese. Respond only with the spoken Japanese translation.",
    },
    "Korean (ν•œκ΅­μ–΄)": {
        "code": "ko",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Korean.\n"
            "3. Respond ONLY with the Korean translation spoken aloud — no English, no commentary,\n"
            " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their Korean equivalents."
        ),
        "user_prompt": "Translate this English speech into Korean. Respond only with the spoken Korean translation.",
    },
    "Portuguese (PortuguΓͺs)": {
        "code": "pt",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Portuguese.\n"
            "3. Respond ONLY with the Portuguese translation spoken aloud — no English, no commentary,\n"
            " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their Portuguese equivalents."
        ),
        "user_prompt": "Translate this English speech into Portuguese. Respond only with the spoken Portuguese translation.",
    },
    "Italian (Italiano)": {
        "code": "it",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Italian.\n"
            "3. Respond ONLY with the Italian translation spoken aloud — no English, no commentary,\n"
            " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their Italian equivalents."
        ),
        "user_prompt": "Translate this English speech into Italian. Respond only with the spoken Italian translation.",
    },
    "Chinese (δΈ­ζ–‡)": {
        "code": "zh",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Mandarin Chinese.\n"
            "3. Respond ONLY with the Chinese translation spoken aloud — no English, no commentary,\n"
            " no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their Chinese equivalents."
        ),
        "user_prompt": "Translate this English speech into Mandarin Chinese. Respond only with the spoken Chinese translation.",
    },
}
# Voice names offered in the "Voice" dropdown; the selected name is forwarded
# unchanged as the `voice` field of the audio-output request.
VOICES = [
    "Cherry", "Serena", "Ethan", "Chelsie",
    "Momo", "Vivian", "Moon", "Maia",
    "Kai", "Nofish", "Bella", "Jennifer",
    "Ryan", "Katerina", "Aiden", "Eldric Sage",
    "Mia", "Mochi", "Bellona", "Vincent",
    "Bunny", "Neil", "Elias", "Arthur",
    "Seren", "Bodega", "Sonrisa", "Alek",
    "Dolce", "Sohee", "Ono Anna", "Lenn",
    "Emilien", "Andre",
]
# ──────────────────────────────────────────────
# Audio helpers
# ──────────────────────────────────────────────
def get_duration(filepath: str) -> float:
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        filepath: Path to the audio or video file to inspect.

    Returns:
        The container-level ``format=duration`` value parsed as a float.

    Raises:
        ValueError: If ffprobe does not print a numeric duration (missing or
            unreadable file, no duration available, ...). The message carries
            the tail of ffprobe's stderr so the cause is visible to the user.
    """
    # "-v error" (rather than "quiet") keeps stdout clean while still letting
    # ffprobe explain a failure on stderr.
    result = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", filepath],
        capture_output=True, text=True,
    )
    raw = result.stdout.strip()
    try:
        return float(raw)
    except ValueError:
        detail = result.stderr.strip()[-500:] or f"unexpected ffprobe output {raw!r}"
        raise ValueError(
            f"Could not determine duration of {filepath!r}: {detail}"
        ) from None
def extract_audio_chunk(video_path, output_wav, start_sec, duration_sec):
    """Write one time slice of a video's audio to ``output_wav``.

    ffmpeg seeks to ``start_sec``, reads ``duration_sec`` seconds, drops the
    video stream and encodes the audio as 16-bit PCM, 16 kHz, mono. An
    existing output file is overwritten. ``subprocess.CalledProcessError`` is
    raised when ffmpeg exits with a non-zero status.
    """
    window_args = ["-ss", str(start_sec), "-t", str(duration_sec)]
    encode_args = ["-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1"]
    command = ["ffmpeg", "-y", *window_args, "-i", video_path, *encode_args, output_wav]
    subprocess.run(command, capture_output=True, check=True)
def wav_to_base64(wav_path):
    """Return the full contents of the file at ``wav_path`` as base64 text."""
    with open(wav_path, "rb") as wav_file:
        raw_bytes = wav_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def base64_to_wav(b64_data, output_path, sample_rate=24000, num_channels=1,
                  bits_per_sample=16):
    """Decode base64 PCM samples and save them as a WAV file.

    The decoded bytes are treated as headerless PCM audio; a 44-byte
    RIFF/WAVE header (format tag 1, i.e. uncompressed PCM) describing them is
    written in front of the samples.

    Args:
        b64_data: Base64 text (or bytes) containing the raw PCM samples.
        output_path: Destination file; overwritten if it exists.
        sample_rate: Samples per second recorded in the header.
        num_channels: Channel count recorded in the header.
        bits_per_sample: Sample width recorded in the header.

    The defaults (24 kHz, mono, 16-bit) reproduce the previously hard-coded
    format, so existing two-argument calls behave exactly as before.
    """
    audio_bytes = base64.b64decode(b64_data)
    byte_rate = sample_rate * num_channels * bits_per_sample // 8
    block_align = num_channels * bits_per_sample // 8
    data_size = len(audio_bytes)
    # One little-endian pack for the whole header: RIFF chunk descriptor,
    # 16-byte "fmt " sub-chunk, then the "data" sub-chunk header.
    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF", 36 + data_size, b"WAVE",
        b"fmt ", 16, 1, num_channels, sample_rate,
        byte_rate, block_align, bits_per_sample,
        b"data", data_size,
    )
    with open(output_path, "wb") as f:
        f.write(header)
        f.write(audio_bytes)
def concatenate_wavs(wav_files, output_path):
    """Join WAV files end-to-end into ``output_path``.

    A single input is simply copied. Several inputs are joined with ffmpeg's
    concat demuxer using stream copy (no re-encoding), so the inputs are
    expected to share one audio format.

    Args:
        wav_files: Ordered sequence of WAV file paths.
        output_path: Destination file; overwritten if it exists.

    Raises:
        ValueError: If ``wav_files`` is empty.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if not wav_files:
        raise ValueError("concatenate_wavs() needs at least one input file.")
    if len(wav_files) == 1:
        shutil.copy2(wav_files[0], output_path)
        return
    list_file = output_path + ".txt"
    try:
        with open(list_file, "w", encoding="utf-8") as f:
            for wav in wav_files:
                # Inside a single-quoted concat entry a literal quote must be
                # written as '\'' or ffmpeg mis-parses the path.
                escaped = str(wav).replace("'", "'\\''")
                f.write(f"file '{escaped}'\n")
        subprocess.run(
            ["ffmpeg", "-y", "-f", "concat", "-safe", "0",
             "-i", list_file, "-c", "copy", output_path],
            capture_output=True, check=True,
        )
    finally:
        # Remove the temporary list even when ffmpeg fails.
        if os.path.exists(list_file):
            os.remove(list_file)
def mux_audio_to_video(original_video, new_audio, output_video):
    """Combine the video stream of one file with the audio of another.

    The first video stream of ``original_video`` is copied without
    re-encoding and paired with the first audio stream of ``new_audio``;
    ``-shortest`` ends the output when the shorter of the two runs out.
    Raises ``RuntimeError`` carrying the last 500 characters of ffmpeg's
    stderr when ffmpeg exits with a non-zero status.
    """
    command = [
        "ffmpeg", "-y",
        "-i", original_video,
        "-i", new_audio,
        "-c:v", "copy",
        "-map", "0:v:0",
        "-map", "1:a:0",
        "-shortest",
        output_video,
    ]
    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode == 0:
        return
    raise RuntimeError(f"FFmpeg mux failed:\n{completed.stderr[-500:]}")
# ──────────────────────────────────────────────
# Translation
# ──────────────────────────────────────────────
def translate_chunk(client, wav_path, voice, lang_config, chunk_index):
    """Send one audio chunk to the model and save the spoken translation.

    The WAV file is base64-encoded and submitted in a streaming
    chat-completions request that asks for both text and audio output. Text
    deltas are collected into a transcript and audio deltas into one base64
    payload, which is written next to the input as ``<stem>_<code>.wav``.

    Args:
        client: OpenAI-compatible client exposing ``chat.completions.create``.
        wav_path: Path of the source-language audio chunk.
        voice: Voice name forwarded in the ``audio`` request option.
        lang_config: Entry with ``code``, ``system_prompt`` and
            ``user_prompt`` keys describing the target language.
        chunk_index: Position of the chunk; currently unused and kept only so
            existing callers keep working.

    Returns:
        ``(output_wav_path, transcript)`` when audio was received, otherwise
        ``(None, transcript)``.
    """
    audio_b64 = wav_to_base64(wav_path)
    # splitext touches only the real extension; str.replace(".wav", ...) would
    # rewrite every ".wav" in the path and, for another extension, would leave
    # the path unchanged so the output overwrote the input.
    stem, _ = os.path.splitext(wav_path)
    output_wav = f"{stem}_{lang_config['code']}.wav"
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": lang_config["system_prompt"]},
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_audio",
                        "input_audio": {
                            "data": f"data:audio/wav;base64,{audio_b64}",
                            "format": "wav",
                        },
                    },
                    {"type": "text", "text": lang_config["user_prompt"]},
                ],
            },
        ],
        modalities=["text", "audio"],
        audio={"voice": voice, "format": "wav"},
        stream=True,
        stream_options={"include_usage": True},
    )
    audio_chunks = []
    transcript_parts = []
    for event in completion:
        # Events without choices carry nothing to collect.
        if not event.choices:
            continue
        delta = event.choices[0].delta
        if getattr(delta, "content", None):
            transcript_parts.append(delta.content)
        audio = getattr(delta, "audio", None)
        if not audio:
            continue
        # The audio delta may be a plain dict or an attribute-style object.
        if isinstance(audio, dict):
            if "data" in audio:
                audio_chunks.append(audio["data"])
        elif getattr(audio, "data", None):
            audio_chunks.append(audio.data)
    transcript = "".join(transcript_parts)
    if audio_chunks:
        full_audio_b64 = "".join(audio_chunks)
        base64_to_wav(full_audio_b64, output_wav)
        return output_wav, transcript
    return None, transcript
# ──────────────────────────────────────────────
# Main pipeline (called by Gradio)
# ──────────────────────────────────────────────
def dub_video(video_file, target_language, voice, chunk_seconds, progress=gr.Progress()):
    """Dub an uploaded video into the selected language (Gradio callback).

    Pipeline: measure the video, cut its audio into ``chunk_seconds``-long
    WAV chunks, translate each chunk with translate_chunk(), substitute
    silence of equal length for chunks that return no audio, concatenate the
    results and mux them onto the original video stream.

    Args:
        video_file: Path of the uploaded video, or None if nothing was uploaded.
        target_language: A key of ``LANGUAGES``.
        voice: Voice name forwarded to the model.
        chunk_seconds: Length of each audio chunk in seconds.
        progress: Gradio progress tracker.

    Returns:
        ``(output_video_path, transcript_markdown)``.

    Raises:
        gr.Error: For missing input, missing API key, unknown language, videos
            longer than one hour, or any failure inside the pipeline.
    """
    if video_file is None:
        raise gr.Error("Please upload a video file.")
    api_key = os.environ.get("DASHSCOPE_API_KEY", "")
    if not api_key:
        raise gr.Error(
            "DASHSCOPE_API_KEY not set. Add it as a Space Secret "
            "(Settings → Secrets → New Secret)."
        )
    lang_config = LANGUAGES.get(target_language)
    if lang_config is None:
        raise gr.Error(f"Unsupported target language: {target_language!r}")
    client = OpenAI(api_key=api_key, base_url=BASE_URL)
    tmp_dir = tempfile.mkdtemp(prefix="dub_")
    try:
        # -- Duration --
        progress(0.05, desc="Analyzing video...")
        total_duration = get_duration(video_file)
        if total_duration > 3600:
            raise gr.Error("Video is longer than 1 hour. Please use a shorter clip.")
        # -- Split --
        progress(0.1, desc="Extracting audio chunks...")
        # Ceiling division, with at least one chunk even for a zero duration.
        num_chunks = max(
            1,
            int(total_duration // chunk_seconds)
            + (1 if total_duration % chunk_seconds > 0 else 0),
        )
        input_chunks = []
        for i in range(num_chunks):
            start = i * chunk_seconds
            duration = min(chunk_seconds, total_duration - start)
            chunk_path = os.path.join(tmp_dir, f"chunk_{i:03d}.wav")
            extract_audio_chunk(video_file, chunk_path, start, duration)
            input_chunks.append(chunk_path)
        # -- Translate --
        output_chunks = []
        # (chunk number, text) pairs so labels stay tied to the real chunk
        # even when some chunks come back without a transcript.
        numbered_transcripts = []
        for i, chunk_path in enumerate(input_chunks):
            frac = 0.15 + 0.7 * (i / num_chunks)
            progress(frac, desc=f"Translating chunk {i+1}/{num_chunks}...")
            result_path, transcript = translate_chunk(
                client, chunk_path, voice, lang_config, i
            )
            if transcript:
                numbered_transcripts.append((i + 1, transcript))
            if result_path:
                output_chunks.append(result_path)
            else:
                # Silence fallback: keep the timeline length when the model
                # returned no audio for this chunk.
                duration = get_duration(chunk_path)
                silence_path = os.path.join(tmp_dir, f"silence_{i:03d}.wav")
                subprocess.run(
                    ["ffmpeg", "-y", "-f", "lavfi",
                     "-i", "anullsrc=r=24000:cl=mono",
                     "-t", str(duration), "-acodec", "pcm_s16le", silence_path],
                    capture_output=True, check=True,
                )
                output_chunks.append(silence_path)
        # -- Concatenate --
        progress(0.88, desc="Assembling audio...")
        full_audio = os.path.join(tmp_dir, "full_dubbed_audio.wav")
        concatenate_wavs(output_chunks, full_audio)
        # -- Mux --
        progress(0.93, desc="Muxing audio onto video...")
        ext = os.path.splitext(video_file)[1] or ".mp4"
        output_video = os.path.join(tmp_dir, f"dubbed_{lang_config['code']}{ext}")
        mux_audio_to_video(video_file, full_audio, output_video)
        # Only the muxed video is returned, so drop the intermediate WAVs
        # instead of leaving them on disk after every successful run.
        for leftover in (*input_chunks, *output_chunks, full_audio):
            try:
                os.remove(leftover)
            except OSError:
                # Best-effort cleanup; a stray temp file must not fail the job.
                pass
        progress(1.0, desc="Done!")
        transcript_text = "\n\n".join(
            f"**Chunk {number}:**\n{text}" for number, text in numbered_transcripts
        ) or "No transcript available."
        return output_video, transcript_text
    except Exception as e:
        # Clean up on error
        shutil.rmtree(tmp_dir, ignore_errors=True)
        if isinstance(e, gr.Error):
            # Already user-facing; re-raise untouched.
            raise
        raise gr.Error(str(e)) from e
# ──────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────
# Markdown rendered at the top of the page. The heading's dash was
# mis-encoded (mojibake) and is restored to a real em dash.
DESCRIPTION = """
# 🎬 Commentary Video Dubbing — English to Any Language
Upload an English video and get it dubbed into Arabic, German, French, Spanish, and more.
The model translates the speech and generates natural-sounding voice output in the target language.
**Supported output languages:** Arabic, Chinese, German, French, Spanish, Portuguese, Italian, Russian, Japanese, Korean
"""
# Default dropdown entry: the Arabic option, located by its label prefix so
# the UI does not depend on the exact (non-ASCII) spelling of the registry
# key; falls back to the first registered language.
_DEFAULT_LANGUAGE = next(
    (name for name in LANGUAGES if name.startswith("Arabic")),
    next(iter(LANGUAGES)),
)

# Page layout: inputs on the left, results on the right.
# NOTE(review): the nesting below is reconstructed from the order of the
# components - confirm the button is meant to sit inside the left column.
with gr.Blocks(
    title="Video Dubbing — Qwen3.5-Omni",
    theme=gr.themes.Soft(
        primary_hue="amber",
        secondary_hue="orange",
        neutral_hue="stone",
    ),
) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        with gr.Column(scale=1):
            video_input = gr.Video(label="Upload English Video", sources=["upload"])
            target_lang = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value=_DEFAULT_LANGUAGE,
                label="Target Language",
            )
            voice_select = gr.Dropdown(
                choices=VOICES,
                value="Ethan",
                label="Voice",
                info="All voices support all output languages.",
            )
            chunk_slider = gr.Slider(
                minimum=30,
                maximum=300,
                value=120,
                step=10,
                label="Chunk Duration (seconds)",
                info="Shorter chunks = more API calls but less risk of timeout.",
            )
            dub_btn = gr.Button("🎙️ Start Dubbing", variant="primary", size="lg")
        with gr.Column(scale=1):
            video_output = gr.Video(label="Dubbed Video")
            transcript_output = gr.Markdown(label="Translation Transcript")
    # Input order must match dub_video's positional parameters.
    dub_btn.click(
        fn=dub_video,
        inputs=[video_input, target_lang, voice_select, chunk_slider],
        outputs=[video_output, transcript_output],
    )
    gr.Markdown(
        "---\n"
        "**Built by:** Plotweaver "
    )

# Start the Gradio server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()