File size: 20,876 Bytes
2148cac
88edbd8
2148cac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88edbd8
2148cac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc9f65b
 
2148cac
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
"""
🎬 Commentary Video Dubbing App β€” English to Arabic / German

"""

import os
import base64
import shutil
import struct
import subprocess
import tempfile
import time

import gradio as gr
from openai import OpenAI

# ──────────────────────────────────────────────
# Configuration
# ──────────────────────────────────────────────
# Model identifier sent with every chat-completions request in translate_chunk().
MODEL = "qwen3.5-omni-plus"
# Base URL handed to the OpenAI client in dub_video(); the host and path point
# at DashScope's "compatible-mode" endpoint.
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"

def _standard_lang_config(code, language, spoken_as=None):
    """Build the config entry for a language that uses the stock prompt wording.

    ``language`` is the name used in the "translate into ..." sentences;
    ``spoken_as`` is the name used in the remaining sentences and defaults to
    ``language`` (they only differ for "Mandarin Chinese" / "Chinese").
    """
    spoken_as = spoken_as or language
    system_prompt = (
        "You are a professional video dubbing translator. You will receive audio in English.\n"
        "Your task:\n"
        "1. Listen carefully to the English speech.\n"
        f"2. Translate it into natural, fluent {language}.\n"
        f"3. Respond ONLY with the {spoken_as} translation spoken aloud β€” no English, no commentary,\n"
        "   no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
        "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
        f"5. Translate idioms and cultural references into their {spoken_as} equivalents."
    )
    user_prompt = (
        f"Translate this English speech into {language}. "
        f"Respond only with the spoken {spoken_as} translation."
    )
    return {"code": code, "system_prompt": system_prompt, "user_prompt": user_prompt}


# Target languages, keyed by the label shown in the UI dropdown (insertion
# order is the dropdown order). Each entry carries:
#   code          - short tag used in output file names
#   system_prompt - system message sent with every chunk
#   user_prompt   - text part sent alongside the audio
# Arabic and German have hand-written prompts; the rest share one template.
# NOTE(review): the non-ASCII text in these keys and prompts looks mis-encoded
# (mojibake). It is reproduced verbatim because the UI default refers to the
# Arabic key by its exact text - repair both places together.
LANGUAGES = {
    "Arabic (Ψ§Ω„ΨΉΨ±Ψ¨ΩŠΨ© الفءحى)": {
        "code": "ar",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Modern Standard Arabic (Ψ§Ω„ΨΉΨ±Ψ¨ΩŠΨ© الفءحى).\n"
            "3. Respond ONLY with the Arabic translation spoken aloud β€” no English, no commentary,\n"
            "   no meta-text, no transliteration. Speak entirely in Arabic.\n"
            "4. Match the tone, emotion, and pacing of the original speaker as closely as possible.\n"
            "5. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "6. Translate idioms and cultural references into their Arabic equivalents.\n"
            "7. Use clear, professional Arabic pronunciation suitable for a broad Arab audience."
        ),
        "user_prompt": "Translate this English speech into Arabic. Respond only with the spoken Arabic translation. Use Modern Standard Arabic (Ψ§Ω„ΨΉΨ±Ψ¨ΩŠΨ© الفءحى).",
    },
    "German (Deutsch)": {
        "code": "de",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent German.\n"
            "3. Respond ONLY with the German translation spoken aloud β€” no English, no commentary,\n"
            "   no meta-text. Match the tone, emotion, and pacing of the original speaker as closely\n"
            "   as possible.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their German equivalents rather than\n"
            "   translating literally."
        ),
        "user_prompt": "Translate this English speech into German. Respond only with the spoken German translation.",
    },
    "French (FranΓ§ais)": _standard_lang_config("fr", "French"),
    "Spanish (EspaΓ±ol)": _standard_lang_config("es", "Spanish"),
    "Russian (Русский)": _standard_lang_config("ru", "Russian"),
    "Japanese (ζ—₯本θͺž)": _standard_lang_config("ja", "Japanese"),
    "Korean (ν•œκ΅­μ–΄)": _standard_lang_config("ko", "Korean"),
    "Portuguese (PortuguΓͺs)": _standard_lang_config("pt", "Portuguese"),
    "Italian (Italiano)": _standard_lang_config("it", "Italian"),
    "Chinese (δΈ­ζ–‡)": _standard_lang_config("zh", "Mandarin Chinese", spoken_as="Chinese"),
}

# Voice names offered in the UI "Voice" dropdown; the selected one is passed
# as the `voice` field of the audio options in translate_chunk().
VOICES = [
    "Cherry", "Serena", "Ethan", "Chelsie", "Momo", "Vivian", "Moon", "Maia",
    "Kai", "Nofish", "Bella", "Jennifer", "Ryan", "Katerina", "Aiden",
    "Eldric Sage", "Mia", "Mochi", "Bellona", "Vincent", "Bunny", "Neil",
    "Elias", "Arthur", "Seren", "Bodega", "Sonrisa", "Alek", "Dolce",
    "Sohee", "Ono Anna", "Lenn", "Emilien", "Andre",
]

# ──────────────────────────────────────────────
# Audio helpers
# ──────────────────────────────────────────────
def get_duration(filepath: str) -> float:
    """Return the duration of the media file *filepath* in seconds.

    The value is read from ffprobe's ``format=duration`` entry.

    Raises:
        RuntimeError: if ffprobe exits with an error, prints nothing, or
            prints something that is not a number (for example ``N/A``).
    """
    result = subprocess.run(
        # "-v error" (instead of "quiet") keeps ffprobe's diagnostics on
        # stderr so a failure can be reported with its actual cause.
        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", filepath],
        capture_output=True, text=True,
    )
    raw = result.stdout.strip()
    if result.returncode != 0 or not raw:
        detail = result.stderr.strip()[-500:] or "no duration reported"
        raise RuntimeError(f"ffprobe failed for {filepath!r}: {detail}")
    try:
        return float(raw)
    except ValueError as err:
        raise RuntimeError(
            f"ffprobe returned a non-numeric duration for {filepath!r}: {raw!r}"
        ) from err


def extract_audio_chunk(video_path, output_wav, start_sec, duration_sec):
    """Write one time slice of the input's audio to *output_wav* using ffmpeg.

    The slice starts at *start_sec* and lasts *duration_sec*; video is dropped
    and the audio is converted to 16-bit PCM, 16 kHz, mono. An existing
    *output_wav* is overwritten.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    time_window = ["-ss", str(start_sec), "-t", str(duration_sec)]
    audio_format = ["-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1"]
    command = ["ffmpeg", "-y", *time_window, "-i", video_path, *audio_format, output_wav]
    subprocess.run(command, capture_output=True, check=True)


def wav_to_base64(wav_path):
    """Read the file at *wav_path* and return its bytes as a base64 text string."""
    with open(wav_path, "rb") as wav_file:
        raw_bytes = wav_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def base64_to_wav(b64_data, output_path):
    """Decode base64 audio and write it to *output_path* as a WAV file.

    The decoded bytes are written unchanged behind a 44-byte RIFF/WAVE header
    that declares them as PCM, 24 kHz, mono, 16 bits per sample.

    Args:
        b64_data: base64 text (``str`` or ``bytes``). It may be a single
            encoding or several independently padded encodings joined
            together; all of them are decoded.
        output_path: destination file, overwritten if it exists.
    """
    import re  # local import: nothing else in this module needs it

    raw = b64_data.encode("ascii") if isinstance(b64_data, str) else bytes(b64_data)
    # b64decode() stops at the first completed "=" padding and silently drops
    # whatever follows, so joined chunks would be truncated to the first padded
    # one. Decode every "data + padding" run on its own and stitch the results.
    audio_bytes = b"".join(
        base64.b64decode(segment) for segment in re.findall(rb"[^=]+=*", raw)
    )

    sample_rate = 24000
    num_channels = 1
    bits_per_sample = 16
    byte_rate = sample_rate * num_channels * bits_per_sample // 8
    block_align = num_channels * bits_per_sample // 8
    data_size = len(audio_bytes)

    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF", 36 + data_size, b"WAVE",   # RIFF size excludes the first 8 bytes
        b"fmt ", 16, 1,                      # 16-byte fmt chunk, format tag 1 = PCM
        num_channels, sample_rate, byte_rate, block_align, bits_per_sample,
        b"data", data_size,
    )
    with open(output_path, "wb") as f:
        f.write(header)
        f.write(audio_bytes)


def concatenate_wavs(wav_files, output_path):
    """Join the WAV files in *wav_files*, in order, into *output_path*.

    A single input is simply copied. Several inputs are joined with ffmpeg's
    concat demuxer using stream copy, so they are expected to share one audio
    format.

    Raises:
        ValueError: if *wav_files* is empty.
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    if not wav_files:
        raise ValueError("concatenate_wavs() needs at least one input file")
    if len(wav_files) == 1:
        shutil.copy2(wav_files[0], output_path)
        return

    list_file = output_path + ".txt"
    try:
        with open(list_file, "w", encoding="utf-8") as f:
            for wav in wav_files:
                # Relative names in a concat list are resolved against the
                # list file's folder, so write absolute paths; a literal '
                # inside the quoted name has to be written as '\''.
                quoted = os.path.abspath(wav).replace("'", "'\\''")
                f.write(f"file '{quoted}'\n")
        subprocess.run(
            ["ffmpeg", "-y", "-f", "concat", "-safe", "0",
             "-i", list_file, "-c", "copy", output_path],
            capture_output=True, check=True,
        )
    finally:
        # Remove the list file even when ffmpeg fails.
        if os.path.exists(list_file):
            os.remove(list_file)


def mux_audio_to_video(original_video, new_audio, output_video):
    """Combine the video stream of *original_video* with *new_audio*.

    ffmpeg copies the first video stream of the first input without
    re-encoding, takes the first audio stream of the second input, and stops
    at the shorter of the two (``-shortest``). *output_video* is overwritten.

    Raises:
        RuntimeError: if ffmpeg exits with a non-zero status; the message
            carries the last 500 characters of its stderr.
    """
    command = [
        "ffmpeg", "-y",
        "-i", original_video,
        "-i", new_audio,
        "-c:v", "copy",
        "-map", "0:v:0",
        "-map", "1:a:0",
        "-shortest",
        output_video,
    ]
    proc = subprocess.run(command, capture_output=True, text=True)
    if proc.returncode == 0:
        return
    raise RuntimeError(f"FFmpeg mux failed:\n{proc.stderr[-500:]}")


# ──────────────────────────────────────────────
# Translation
# ──────────────────────────────────────────────
def translate_chunk(client, wav_path, voice, lang_config, chunk_index):
    """Send one audio chunk to the model and save the spoken translation.

    Args:
        client: OpenAI-style client exposing ``chat.completions.create``.
        wav_path: path of the source-language WAV chunk.
        voice: voice name passed in the request's audio options.
        lang_config: entry with ``code``, ``system_prompt`` and ``user_prompt``.
        chunk_index: position of the chunk; currently unused, kept for callers.

    Returns:
        ``(output_wav, transcript)``. ``output_wav`` is the path of the written
        translation audio, or ``None`` when the stream carried no audio;
        ``transcript`` is the concatenated streamed text (possibly empty).
    """
    audio_b64 = wav_to_base64(wav_path)
    # Swap only the final extension. str.replace() would also rewrite ".wav"
    # inside directory names and, for a path without ".wav", would return the
    # input path itself so the translation overwrote its own source.
    stem, _ = os.path.splitext(wav_path)
    output_wav = f"{stem}_{lang_config['code']}.wav"

    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": lang_config["system_prompt"]},
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_audio",
                        "input_audio": {
                            "data": f"data:audio/wav;base64,{audio_b64}",
                            "format": "wav",
                        },
                    },
                    {"type": "text", "text": lang_config["user_prompt"]},
                ],
            },
        ],
        modalities=["text", "audio"],
        audio={"voice": voice, "format": "wav"},
        stream=True,
        stream_options={"include_usage": True},
    )

    audio_chunks = []
    transcript_parts = []

    for event in completion:
        # Events without choices (e.g. the trailing usage record) carry no delta.
        if not event.choices:
            continue
        delta = event.choices[0].delta

        text_piece = getattr(delta, "content", None)
        if text_piece:
            transcript_parts.append(text_piece)

        audio = getattr(delta, "audio", None)
        if not audio:
            continue
        # The audio delta may arrive as a plain dict or as an object.
        if isinstance(audio, dict):
            piece = audio.get("data")
        else:
            piece = getattr(audio, "data", None)
        if piece:  # skip missing/empty payloads so the join below cannot fail
            audio_chunks.append(piece)

    transcript = "".join(transcript_parts)

    if not audio_chunks:
        return None, transcript

    # If the pieces are individually padded base64, decoding their plain
    # concatenation stops at the first "=" and loses the rest. Decode piece by
    # piece and re-encode once; if the pieces turn out to be fragments of one
    # continuous encoding, per-piece decoding raises and the original join is used.
    try:
        pcm = b"".join(base64.b64decode(piece) for piece in audio_chunks)
        full_audio_b64 = base64.b64encode(pcm).decode("ascii")
    except ValueError:  # binascii.Error is a ValueError subclass
        full_audio_b64 = "".join(audio_chunks)

    base64_to_wav(full_audio_b64, output_wav)
    return output_wav, transcript


# ──────────────────────────────────────────────
# Main pipeline (called by Gradio)
# ──────────────────────────────────────────────
def dub_video(video_file, target_language, voice, chunk_seconds, progress=gr.Progress()):
    """Dub *video_file* into *target_language* and return the result.

    Steps: measure the duration, cut the audio into chunks of *chunk_seconds*,
    translate each chunk with translate_chunk() (a silent chunk of the same
    length stands in when no audio comes back), join the translated chunks and
    mux them onto the original video.

    Args:
        video_file: path of the uploaded video, or ``None``.
        target_language: a key of ``LANGUAGES``.
        voice: voice name forwarded to the model.
        chunk_seconds: chunk length in seconds; must be positive.
        progress: Gradio progress reporter.

    Returns:
        ``(output_video_path, transcript_markdown)``.

    Raises:
        gr.Error: for invalid input, a missing API key, or any failure in the
            pipeline (the original exception is chained as the cause).
    """
    if video_file is None:
        raise gr.Error("Please upload a video file.")

    api_key = os.environ.get("DASHSCOPE_API_KEY", "")
    if not api_key:
        raise gr.Error(
            "DASHSCOPE_API_KEY not set. Add it as a Space Secret "
            "(Settings \u2192 Secrets \u2192 New Secret)."
        )

    lang_config = LANGUAGES.get(target_language)
    if lang_config is None:
        raise gr.Error(f"Unsupported target language: {target_language!r}")
    if not chunk_seconds or chunk_seconds <= 0:
        raise gr.Error("Chunk duration must be greater than zero.")

    client = OpenAI(api_key=api_key, base_url=BASE_URL)
    tmp_dir = tempfile.mkdtemp(prefix="dub_")

    try:
        # Duration
        progress(0.05, desc="Analyzing video...")
        total_duration = get_duration(video_file)

        if total_duration > 3600:
            raise gr.Error("Video is longer than 1 hour. Please use a shorter clip.")

        # Split
        progress(0.1, desc="Extracting audio chunks...")
        num_chunks = max(
            1,
            int(total_duration // chunk_seconds)
            + (1 if total_duration % chunk_seconds > 0 else 0),
        )

        input_chunks = []
        for i in range(num_chunks):
            start = i * chunk_seconds
            duration = min(chunk_seconds, total_duration - start)
            chunk_path = os.path.join(tmp_dir, f"chunk_{i:03d}.wav")
            extract_audio_chunk(video_file, chunk_path, start, duration)
            input_chunks.append(chunk_path)

        # Translate
        output_chunks = []
        # (1-based chunk number, text): the number is stored with the text so
        # labels stay correct when some chunks produce no transcript.
        transcripts = []

        for i, chunk_path in enumerate(input_chunks):
            frac = 0.15 + 0.7 * (i / num_chunks)
            progress(frac, desc=f"Translating chunk {i+1}/{num_chunks}...")

            result_path, transcript = translate_chunk(
                client, chunk_path, voice, lang_config, i
            )
            if transcript:
                transcripts.append((i + 1, transcript))

            if result_path:
                output_chunks.append(result_path)
            else:
                # Silence fallback: same length as the source chunk, same
                # format (24 kHz mono 16-bit PCM) as the translated chunks.
                duration = get_duration(chunk_path)
                silence_path = os.path.join(tmp_dir, f"silence_{i:03d}.wav")
                subprocess.run(
                    ["ffmpeg", "-y", "-f", "lavfi",
                     "-i", "anullsrc=r=24000:cl=mono",
                     "-t", str(duration), "-acodec", "pcm_s16le", silence_path],
                    capture_output=True, check=True,
                )
                output_chunks.append(silence_path)

        # Concatenate
        progress(0.88, desc="Assembling audio...")
        full_audio = os.path.join(tmp_dir, "full_dubbed_audio.wav")
        concatenate_wavs(output_chunks, full_audio)

        # Mux
        progress(0.93, desc="Muxing audio onto video...")
        ext = os.path.splitext(video_file)[1] or ".mp4"
        output_video = os.path.join(tmp_dir, f"dubbed_{lang_config['code']}{ext}")
        mux_audio_to_video(video_file, full_audio, output_video)

        # Only the final video is needed from here on; drop the intermediate
        # chunk/audio files (best effort) so they do not pile up per request.
        for name in os.listdir(tmp_dir):
            leftover = os.path.join(tmp_dir, name)
            if leftover == output_video:
                continue
            try:
                os.remove(leftover)
            except OSError:
                pass  # cleanup must never fail a finished dub

        progress(1.0, desc="Done!")

        transcript_text = "\n\n".join(
            f"**Chunk {number}:**\n{text}" for number, text in transcripts
        ) or "No transcript available."

        return output_video, transcript_text

    except gr.Error:
        # Already a user-facing error: clean up and pass it through unchanged.
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise
    except Exception as e:
        shutil.rmtree(tmp_dir, ignore_errors=True)
        message = str(e)
        # CalledProcessError's text only names the command; append the tail of
        # the tool's stderr so the actual reason is visible.
        stderr = getattr(e, "stderr", None)
        if stderr:
            if isinstance(stderr, bytes):
                stderr = stderr.decode("utf-8", "replace")
            message = f"{message}\n{stderr[-500:]}"
        raise gr.Error(message) from e


# ──────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────
DESCRIPTION = """
# 🎬 Commentary Video Dubbing β€” English to Any Language

Upload an English video and get it dubbed into Arabic, German, French, Spanish, and more.
The model translates the speech and generates natural-sounding voice output in the target language.

**Supported output languages:** Arabic, Chinese, German, French, Spanish, Portuguese, Italian, Russian, Japanese, Korean

"""

# Page layout: inputs (video, language, voice, chunk length, start button) in
# the left column, outputs (dubbed video, transcript) in the right column.
with gr.Blocks(
    title="Video Dubbing — Qwen3.5-Omni",
    theme=gr.themes.Soft(
        primary_hue="amber",
        secondary_hue="orange",
        neutral_hue="stone",
    ),
) as demo:

    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1):
            video_input = gr.Video(label="Upload English Video", sources=["upload"])

            target_lang = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                # Default to the first configured language (the Arabic entry)
                # rather than repeating its label as a literal, so the default
                # can never drift out of sync with the LANGUAGES keys.
                value=next(iter(LANGUAGES)),
                label="Target Language",
            )

            voice_select = gr.Dropdown(
                choices=VOICES,
                value="Ethan",
                label="Voice",
                info="All voices support all output languages.",
            )

            chunk_slider = gr.Slider(
                minimum=30,
                maximum=300,
                value=120,
                step=10,
                label="Chunk Duration (seconds)",
                info="Shorter chunks = more API calls but less risk of timeout.",
            )

            dub_btn = gr.Button("🎙️ Start Dubbing", variant="primary", size="lg")

        with gr.Column(scale=1):
            video_output = gr.Video(label="Dubbed Video")
            transcript_output = gr.Markdown(label="Translation Transcript")

    # Input order must match dub_video's positional parameters.
    dub_btn.click(
        fn=dub_video,
        inputs=[video_input, target_lang, voice_select, chunk_slider],
        outputs=[video_output, transcript_output],
    )

    gr.Markdown(
        "---\n"
        "**Built by:** Plotweaver "
    )

# Start the Gradio app only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    demo.launch()