# Hugging Face Space: PlotweaverModel / ENG-AR-Commentary-video-dubbing
# (Page capture metadata — Space status: Sleeping; file size: 20,876 bytes)

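"""
🎬 Commentary Video Dubbing App — English to Arabic, German, and other languages

Gradio app that splits a video's audio track into chunks, sends each chunk to a
speech-to-speech translation model, and muxes the translated audio back onto
the original video.
"""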

import os
import base64
import shutil
import struct
import subprocess
import tempfile
import time

import gradio as gr
from openai import OpenAI

# ──────────────────────────────────────────────
# Configuration
# ──────────────────────────────────────────────
# Model identifier passed as `model=` to the chat-completions request in
# translate_chunk().
MODEL = "qwen3.5-omni-plus"
# Endpoint handed to the OpenAI client as `base_url` in dub_video().
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"

# Target-language registry.
#
# Keys are the display labels offered in the "Target Language" dropdown and are
# looked up verbatim in dub_video(). Each value is a dict with:
#   "code"          - short language tag used when naming output files
#   "system_prompt" - system message sent with every translation request
#   "user_prompt"   - text part sent alongside the audio in the user message
LANGUAGES = {
    "Arabic (العربية الفصحى)": {
        "code": "ar",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Modern Standard Arabic (العربية الفصحى).\n"
            "3. Respond ONLY with the Arabic translation spoken aloud — no English, no commentary,\n"
            "   no meta-text, no transliteration. Speak entirely in Arabic.\n"
            "4. Match the tone, emotion, and pacing of the original speaker as closely as possible.\n"
            "5. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "6. Translate idioms and cultural references into their Arabic equivalents.\n"
            "7. Use clear, professional Arabic pronunciation suitable for a broad Arab audience."
        ),
        "user_prompt": "Translate this English speech into Arabic. Respond only with the spoken Arabic translation. Use Modern Standard Arabic (العربية الفصحى).",
    },
    "German (Deutsch)": {
        "code": "de",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent German.\n"
            "3. Respond ONLY with the German translation spoken aloud — no English, no commentary,\n"
            "   no meta-text. Match the tone, emotion, and pacing of the original speaker as closely\n"
            "   as possible.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their German equivalents rather than\n"
            "   translating literally."
        ),
        "user_prompt": "Translate this English speech into German. Respond only with the spoken German translation.",
    },
    "French (Français)": {
        "code": "fr",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent French.\n"
            "3. Respond ONLY with the French translation spoken aloud — no English, no commentary,\n"
            "   no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their French equivalents."
        ),
        "user_prompt": "Translate this English speech into French. Respond only with the spoken French translation.",
    },
    "Spanish (Español)": {
        "code": "es",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Spanish.\n"
            "3. Respond ONLY with the Spanish translation spoken aloud — no English, no commentary,\n"
            "   no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their Spanish equivalents."
        ),
        "user_prompt": "Translate this English speech into Spanish. Respond only with the spoken Spanish translation.",
    },
    "Russian (Русский)": {
        "code": "ru",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Russian.\n"
            "3. Respond ONLY with the Russian translation spoken aloud — no English, no commentary,\n"
            "   no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their Russian equivalents."
        ),
        "user_prompt": "Translate this English speech into Russian. Respond only with the spoken Russian translation.",
    },
    "Japanese (日本語)": {
        "code": "ja",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Japanese.\n"
            "3. Respond ONLY with the Japanese translation spoken aloud — no English, no commentary,\n"
            "   no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their Japanese equivalents."
        ),
        "user_prompt": "Translate this English speech into Japanese. Respond only with the spoken Japanese translation.",
    },
    "Korean (한국어)": {
        "code": "ko",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Korean.\n"
            "3. Respond ONLY with the Korean translation spoken aloud — no English, no commentary,\n"
            "   no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their Korean equivalents."
        ),
        "user_prompt": "Translate this English speech into Korean. Respond only with the spoken Korean translation.",
    },
    "Portuguese (Português)": {
        "code": "pt",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Portuguese.\n"
            "3. Respond ONLY with the Portuguese translation spoken aloud — no English, no commentary,\n"
            "   no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their Portuguese equivalents."
        ),
        "user_prompt": "Translate this English speech into Portuguese. Respond only with the spoken Portuguese translation.",
    },
    "Italian (Italiano)": {
        "code": "it",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Italian.\n"
            "3. Respond ONLY with the Italian translation spoken aloud — no English, no commentary,\n"
            "   no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their Italian equivalents."
        ),
        "user_prompt": "Translate this English speech into Italian. Respond only with the spoken Italian translation.",
    },
    "Chinese (中文)": {
        "code": "zh",
        "system_prompt": (
            "You are a professional video dubbing translator. You will receive audio in English.\n"
            "Your task:\n"
            "1. Listen carefully to the English speech.\n"
            "2. Translate it into natural, fluent Mandarin Chinese.\n"
            "3. Respond ONLY with the Chinese translation spoken aloud — no English, no commentary,\n"
            "   no meta-text. Match the tone, emotion, and pacing of the original speaker.\n"
            "4. If there are pauses or silence in the original audio, maintain similar pacing.\n"
            "5. Translate idioms and cultural references into their Chinese equivalents."
        ),
        "user_prompt": "Translate this English speech into Mandarin Chinese. Respond only with the spoken Chinese translation.",
    },
}

# Voice names offered in the "Voice" dropdown; the selected name is forwarded
# unchanged as audio={"voice": ...} in translate_chunk().
VOICES = [
    "Cherry", "Serena", "Ethan", "Chelsie", "Momo", "Vivian", "Moon", "Maia",
    "Kai", "Nofish", "Bella", "Jennifer", "Ryan", "Katerina", "Aiden",
    "Eldric Sage", "Mia", "Mochi", "Bellona", "Vincent", "Bunny", "Neil",
    "Elias", "Arthur", "Seren", "Bodega", "Sonrisa", "Alek", "Dolce",
    "Sohee", "Ono Anna", "Lenn", "Emilien", "Andre",
]

# ──────────────────────────────────────────────
# Audio helpers
# ──────────────────────────────────────────────
def get_duration(filepath: str) -> float:
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        filepath: Path to the audio or video file to probe.

    Returns:
        The container-level duration (``format=duration``) as a float.

    Raises:
        RuntimeError: If ffprobe cannot be launched, exits with a non-zero
            status, or prints something that cannot be parsed as a number
            (for example an empty string).
    """
    command = [
        "ffprobe", "-v", "quiet", "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1", filepath,
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except OSError as exc:
        # Typically the ffprobe binary is missing from PATH.
        raise RuntimeError(f"Could not run ffprobe: {exc}") from exc

    # "-v quiet" silences ffprobe's own diagnostics, so the exit code and the
    # file name are the only details available for the error message.
    if result.returncode != 0:
        raise RuntimeError(
            f"ffprobe failed (exit code {result.returncode}) for {filepath!r}"
        )

    raw_duration = result.stdout.strip()
    try:
        return float(raw_duration)
    except ValueError as exc:
        raise RuntimeError(
            f"ffprobe reported no usable duration for {filepath!r}: {raw_duration!r}"
        ) from exc


def extract_audio_chunk(video_path, output_wav, start_sec, duration_sec):
    """Cut one audio segment out of a media file and save it as a WAV file.

    The segment is written as 16-bit PCM, 16 kHz, mono, with any video stream
    dropped (``-vn``). An existing ``output_wav`` is overwritten (``-y``).

    Args:
        video_path: Source media file.
        output_wav: Destination WAV path.
        start_sec: Offset of the segment from the start of the input, in seconds.
        duration_sec: Length of the segment, in seconds.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    window_args = ["-ss", str(start_sec), "-t", str(duration_sec)]
    output_args = ["-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1"]
    command = ["ffmpeg", "-y", *window_args, "-i", video_path, *output_args, output_wav]
    subprocess.run(command, capture_output=True, check=True)


def wav_to_base64(wav_path):
    """Read a file and return its entire contents as a base64 text string.

    Args:
        wav_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file's bytes, as ``str``.
    """
    with open(wav_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def base64_to_wav(b64_data, output_path, *, sample_rate=24000, num_channels=1,
                  bits_per_sample=16):
    """Decode base64 audio samples and write them as a PCM WAV file.

    The decoded bytes are written verbatim as the ``data`` chunk, preceded by
    a 44-byte RIFF/WAVE header with format tag 1 (PCM). The payload is
    therefore treated as headerless sample data.

    Args:
        b64_data: Base64 text (or bytes) holding the raw samples.
        output_path: Destination file; overwritten if it exists.
        sample_rate: Samples per second written to the header. Default 24000.
        num_channels: Channel count written to the header. Default 1.
        bits_per_sample: Bit depth written to the header. Default 16.
    """
    audio_bytes = base64.b64decode(b64_data)
    data_size = len(audio_bytes)
    byte_rate = sample_rate * num_channels * bits_per_sample // 8
    block_align = num_channels * bits_per_sample // 8

    # Little-endian header without padding: RIFF chunk, "fmt " sub-chunk
    # (16 bytes long), then the "data" sub-chunk header.
    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF", 36 + data_size, b"WAVE",
        b"fmt ", 16, 1, num_channels, sample_rate,
        byte_rate, block_align, bits_per_sample,
        b"data", data_size,
    )
    with open(output_path, "wb") as f:
        f.write(header)
        f.write(audio_bytes)


def concatenate_wavs(wav_files, output_path):
    """Join WAV files end to end into ``output_path``.

    A single input is simply copied. Several inputs are joined with ffmpeg's
    concat demuxer using stream copy (``-c copy``), so no re-encoding is
    requested.

    Args:
        wav_files: Sequence of input file paths, in playback order.
        output_path: Destination file; overwritten if it exists.

    Raises:
        ValueError: If ``wav_files`` is empty.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if not wav_files:
        raise ValueError("concatenate_wavs() needs at least one input file.")

    if len(wav_files) == 1:
        shutil.copy2(wav_files[0], output_path)
        return

    list_file = output_path + ".txt"
    try:
        with open(list_file, "w", encoding="utf-8") as f:
            for wav in wav_files:
                # Absolute paths keep the entry independent of where the list
                # file lives, matching how the single-file branch resolves
                # paths against the current working directory.
                absolute = os.path.abspath(wav)
                # Inside single quotes ffmpeg has no escape character, so a
                # literal quote is written by closing the quoted run, adding
                # an escaped quote, and reopening: '\''
                escaped = absolute.replace("'", "'\\''")
                f.write(f"file '{escaped}'\n")
        subprocess.run(
            ["ffmpeg", "-y", "-f", "concat", "-safe", "0",
             "-i", list_file, "-c", "copy", output_path],
            capture_output=True, check=True,
        )
    finally:
        # Remove the temporary list even when ffmpeg fails or cannot start.
        if os.path.exists(list_file):
            os.remove(list_file)


def mux_audio_to_video(original_video, new_audio, output_video):
    """Combine the video stream of one file with the audio stream of another.

    The first video stream of ``original_video`` is copied without re-encoding
    (``-c:v copy``) and paired with the first audio stream of ``new_audio``.
    ``-shortest`` is passed, and an existing ``output_video`` is overwritten.

    Args:
        original_video: File supplying the video stream.
        new_audio: File supplying the audio stream.
        output_video: Destination path.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status; the message
            carries the last 500 characters of ffmpeg's stderr.
    """
    command = ["ffmpeg", "-y"]
    command += ["-i", original_video, "-i", new_audio]
    command += ["-c:v", "copy", "-map", "0:v:0", "-map", "1:a:0"]
    command += ["-shortest", output_video]

    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode == 0:
        return
    raise RuntimeError(f"FFmpeg mux failed:\n{completed.stderr[-500:]}")


# ──────────────────────────────────────────────
# Translation
# ──────────────────────────────────────────────
def translate_chunk(client, wav_path, voice, lang_config, chunk_index):
    """Send one audio chunk to the model and save the spoken translation.

    The chunk is base64-encoded, sent as ``input_audio`` together with the
    language's system and user prompts, and the streamed response is collected
    into a transcript string and (when audio was returned) a WAV file written
    next to the input as ``<name>_<code><ext>``.

    Args:
        client: Client object exposing ``chat.completions.create``.
        wav_path: Path of the source-language WAV chunk.
        voice: Voice name forwarded in the request's ``audio`` settings.
        lang_config: Entry from ``LANGUAGES`` ("code", "system_prompt",
            "user_prompt").
        chunk_index: Position of the chunk; currently unused and kept so
            existing callers do not need to change.

    Returns:
        ``(output_wav_path, transcript)`` when the stream contained audio,
        otherwise ``(None, transcript)``. ``transcript`` may be empty.
    """
    audio_b64 = wav_to_base64(wav_path)

    # Derive the output name from the extension only. A plain
    # str.replace(".wav", ...) would also rewrite ".wav" inside directory
    # names and would leave the path unchanged (overwriting the input) when
    # the extension is spelled differently.
    stem, ext = os.path.splitext(wav_path)
    output_wav = f"{stem}_{lang_config['code']}{ext or '.wav'}"

    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": lang_config["system_prompt"]},
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_audio",
                        "input_audio": {
                            "data": f"data:audio/wav;base64,{audio_b64}",
                            "format": "wav",
                        },
                    },
                    {"type": "text", "text": lang_config["user_prompt"]},
                ],
            },
        ],
        modalities=["text", "audio"],
        audio={"voice": voice, "format": "wav"},
        stream=True,
        stream_options={"include_usage": True},
    )

    audio_chunks = []
    transcript_parts = []

    for event in completion:
        # Events without choices carry no delta to read.
        if not event.choices:
            continue
        delta = event.choices[0].delta

        text_piece = getattr(delta, "content", None)
        if text_piece:
            transcript_parts.append(text_piece)

        audio_delta = getattr(delta, "audio", None)
        if not audio_delta:
            continue
        # The audio delta is handled both as a plain dict and as an object
        # with a ``data`` attribute.
        if isinstance(audio_delta, dict):
            audio_piece = audio_delta.get("data")
        else:
            audio_piece = getattr(audio_delta, "data", None)
        # Skip missing/None/empty payloads so the join below only sees text.
        if audio_piece:
            audio_chunks.append(audio_piece)

    transcript = "".join(transcript_parts)

    if audio_chunks:
        # NOTE(review): the pieces are joined as text before decoding, which
        # assumes they form one continuous base64 stream — confirm against the
        # provider's streaming format.
        full_audio_b64 = "".join(audio_chunks)
        base64_to_wav(full_audio_b64, output_wav)
        return output_wav, transcript
    return None, transcript


# ──────────────────────────────────────────────
# Main pipeline (called by Gradio)
# ──────────────────────────────────────────────
def dub_video(video_file, target_language, voice, chunk_seconds, progress=gr.Progress()):
    """Dub an uploaded video into the selected language (Gradio click handler).

    Steps: probe the duration, cut the audio into fixed-length WAV chunks,
    translate each chunk with ``translate_chunk``, substitute silence for any
    chunk that came back without audio, concatenate the results, and mux them
    onto the original video stream.

    Args:
        video_file: Path of the uploaded video, or ``None`` if nothing was
            uploaded.
        target_language: A key of ``LANGUAGES``.
        voice: Voice name forwarded to the model.
        chunk_seconds: Length of each audio chunk in seconds.
        progress: Gradio progress tracker.

    Returns:
        ``(output_video_path, transcript_markdown)``.

    Raises:
        gr.Error: For missing input, missing API key, unknown language,
            non-positive chunk length, videos longer than one hour, or any
            failure in the pipeline (original exception attached as cause).
    """
    if video_file is None:
        raise gr.Error("Please upload a video file.")

    api_key = os.environ.get("DASHSCOPE_API_KEY", "")
    if not api_key:
        raise gr.Error(
            "DASHSCOPE_API_KEY not set. Add it as a Space Secret "
            "(Settings → Secrets → New Secret)."
        )

    lang_config = LANGUAGES.get(target_language)
    if lang_config is None:
        raise gr.Error(f"Unsupported target language: {target_language!r}")

    # Guards the division/modulo below against zero, negative or missing values.
    if not chunk_seconds or chunk_seconds <= 0:
        raise gr.Error("Chunk duration must be a positive number of seconds.")

    client = OpenAI(api_key=api_key, base_url=BASE_URL)
    tmp_dir = tempfile.mkdtemp(prefix="dub_")

    try:
        # ── Duration ──
        progress(0.05, desc="Analyzing video...")
        total_duration = get_duration(video_file)

        if total_duration > 3600:
            raise gr.Error("Video is longer than 1 hour. Please use a shorter clip.")

        # ── Split ──
        progress(0.1, desc="Extracting audio chunks...")
        # Ceiling division, with at least one chunk.
        num_chunks = max(
            1,
            int(total_duration // chunk_seconds)
            + (1 if total_duration % chunk_seconds > 0 else 0),
        )

        input_chunks = []
        for i in range(num_chunks):
            start = i * chunk_seconds
            duration = min(chunk_seconds, total_duration - start)
            chunk_path = os.path.join(tmp_dir, f"chunk_{i:03d}.wav")
            extract_audio_chunk(video_file, chunk_path, start, duration)
            input_chunks.append(chunk_path)

        # ── Translate ──
        # NOTE(review): translated chunks are concatenated as-is; nothing here
        # forces each one to match its source chunk's duration, so audio/video
        # alignment may drift — confirm this is acceptable.
        output_chunks = []
        # (chunk number, transcript) pairs, so labels stay tied to the real
        # chunk even when an earlier chunk produced no transcript.
        numbered_transcripts = []

        for i, chunk_path in enumerate(input_chunks):
            frac = 0.15 + 0.7 * (i / num_chunks)
            progress(frac, desc=f"Translating chunk {i+1}/{num_chunks}...")

            result_path, transcript = translate_chunk(
                client, chunk_path, voice, lang_config, i
            )
            if transcript:
                numbered_transcripts.append((i + 1, transcript))

            if result_path:
                output_chunks.append(result_path)
            else:
                # Silence fallback: same length as the source chunk, in the
                # same 24 kHz mono 16-bit layout base64_to_wav writes by default.
                duration = get_duration(chunk_path)
                silence_path = os.path.join(tmp_dir, f"silence_{i:03d}.wav")
                subprocess.run(
                    ["ffmpeg", "-y", "-f", "lavfi",
                     "-i", "anullsrc=r=24000:cl=mono",
                     "-t", str(duration), "-acodec", "pcm_s16le", silence_path],
                    capture_output=True, check=True,
                )
                output_chunks.append(silence_path)

        # ── Concatenate ──
        progress(0.88, desc="Assembling audio...")
        full_audio = os.path.join(tmp_dir, "full_dubbed_audio.wav")
        concatenate_wavs(output_chunks, full_audio)

        # ── Mux ──
        progress(0.93, desc="Muxing audio onto video...")
        ext = os.path.splitext(video_file)[1] or ".mp4"
        output_video = os.path.join(tmp_dir, f"dubbed_{lang_config['code']}{ext}")
        mux_audio_to_video(video_file, full_audio, output_video)

        # Best-effort removal of intermediates; only the muxed video is still
        # needed, and a failed delete must not fail the whole job.
        for leftover in (*input_chunks, *output_chunks, full_audio):
            try:
                os.remove(leftover)
            except OSError:
                pass

        progress(1.0, desc="Done!")

        transcript_text = "\n\n".join(
            f"**Chunk {number}:**\n{text}" for number, text in numbered_transcripts
        ) or "No transcript available."

        return output_video, transcript_text

    except gr.Error:
        # Already a user-facing error: clean up and re-raise it unchanged
        # rather than wrapping it a second time.
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise
    except Exception as e:
        # Clean up on error and surface the failure in the UI, keeping the
        # original exception as the cause for the server log.
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise gr.Error(str(e)) from e


# ──────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────
# Markdown rendered at the top of the page via gr.Markdown(DESCRIPTION).
DESCRIPTION = """
# 🎬 Commentary Video Dubbing — English to Any Language

Upload an English video and get it dubbed into Arabic, German, French, Spanish, and more.
The model translates the speech and generates natural-sounding voice output in the target language.

**Supported output languages:** Arabic, Chinese, German, French, Spanish, Portuguese, Italian, Russian, Japanese, Korean

"""

# Page layout: a two-column row (inputs on the left, results on the right)
# under the description, followed by a footer line.
with gr.Blocks(
    title="Video Dubbing — Qwen3.5-Omni",
    theme=gr.themes.Soft(
        primary_hue="amber",
        secondary_hue="orange",
        neutral_hue="stone",
    ),
) as demo:

    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # Left column: inputs passed to dub_video().
        with gr.Column(scale=1):
            video_input = gr.Video(label="Upload English Video", sources=["upload"])

            # Choices are the LANGUAGES keys, so the selected value can be
            # used directly as a lookup key in dub_video().
            target_lang = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value="Arabic (العربية الفصحى)",
                label="Target Language",
            )

            voice_select = gr.Dropdown(
                choices=VOICES,
                value="Ethan",
                label="Voice",
                info="All voices support all output languages.",
            )

            # Chunk length in seconds; becomes dub_video()'s chunk_seconds.
            chunk_slider = gr.Slider(
                minimum=30,
                maximum=300,
                value=120,
                step=10,
                label="Chunk Duration (seconds)",
                info="Shorter chunks = more API calls but less risk of timeout.",
            )

            dub_btn = gr.Button("🎙️ Start Dubbing", variant="primary", size="lg")

        # Right column: the two values returned by dub_video().
        with gr.Column(scale=1):
            video_output = gr.Video(label="Dubbed Video")
            transcript_output = gr.Markdown(label="Translation Transcript")

    # Input order matches dub_video's positional parameters; output order
    # matches its (video path, transcript markdown) return tuple.
    dub_btn.click(
        fn=dub_video,
        inputs=[video_input, target_lang, voice_select, chunk_slider],
        outputs=[video_output, transcript_output],
    )

    gr.Markdown(
        "---\n"
        "**Built by:** Plotweaver "
            )

# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()