XtewaldX committed on
Commit
a6ee9c3
Β·
verified Β·
1 Parent(s): bdb4d21

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +212 -206
app.py CHANGED
@@ -2,225 +2,231 @@ import os
2
  import uuid
3
  import asyncio
4
  import subprocess
5
- import json
6
- from zipfile import ZipFile
7
- import stat
8
  import gradio as gr
9
- import ffmpeg
10
- import cv2
11
  import edge_tts
12
- from googletrans import Translator
13
- from huggingface_hub import HfApi
14
- import moviepy.editor as mp
15
- import spaces
16
-
17
- # Constants and initialization
18
- HF_TOKEN = os.environ.get("HF_TOKEN")
19
- REPO_ID = "artificialguybr/video-dubbing"
20
- MAX_VIDEO_DURATION = 60 # seconds
21
-
22
- api = HfApi(token=HF_TOKEN)
23
-
24
- # Extract and set permissions for ffmpeg
25
- ZipFile("ffmpeg.zip").extractall()
26
- st = os.stat('ffmpeg')
27
- os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
28
-
29
- language_mapping = {
30
- 'English': ('en', 'en-US-EricNeural'),
31
- 'Spanish': ('es', 'es-ES-AlvaroNeural'),
32
- 'French': ('fr', 'fr-FR-HenriNeural'),
33
- 'German': ('de', 'de-DE-ConradNeural'),
34
- 'Italian': ('it', 'it-IT-DiegoNeural'),
35
- 'Portuguese': ('pt', 'pt-PT-DuarteNeural'),
36
- 'Polish': ('pl', 'pl-PL-MarekNeural'),
37
- 'Turkish': ('tr', 'tr-TR-AhmetNeural'),
38
- 'Russian': ('ru', 'ru-RU-DmitryNeural'),
39
- 'Dutch': ('nl', 'nl-NL-MaartenNeural'),
40
- 'Czech': ('cs', 'cs-CZ-AntoninNeural'),
41
- 'Arabic': ('ar', 'ar-SA-HamedNeural'),
42
- 'Chinese (Simplified)': ('zh-CN', 'zh-CN-YunxiNeural'),
43
- 'Japanese': ('ja', 'ja-JP-KeitaNeural'),
44
- 'Korean': ('ko', 'ko-KR-InJoonNeural'),
45
- 'Hindi': ('hi', 'hi-IN-MadhurNeural'),
46
- 'Swedish': ('sv', 'sv-SE-MattiasNeural'),
47
- 'Danish': ('da', 'da-DK-JeppeNeural'),
48
- 'Finnish': ('fi', 'fi-FI-HarriNeural'),
49
- 'Greek': ('el', 'el-GR-NestorasNeural')
50
  }
51
 
52
- print("Starting the program...")
53
-
54
- def generate_unique_filename(extension):
55
- return f"{uuid.uuid4()}{extension}"
56
-
57
- def cleanup_files(*files):
58
- for file in files:
59
- if file and os.path.exists(file):
60
- os.remove(file)
61
- print(f"Removed file: {file}")
62
-
63
- @spaces.GPU(duration=90)
64
- def transcribe_audio(file_path):
65
- print(f"Starting transcription of file: {file_path}")
66
- temp_audio = None
67
-
68
- if file_path.endswith(('.mp4', '.avi', '.mov', '.flv')):
69
- print("Video file detected. Extracting audio...")
70
- try:
71
- video = mp.VideoFileClip(file_path)
72
- temp_audio = generate_unique_filename(".wav")
73
- video.audio.write_audiofile(temp_audio)
74
- file_path = temp_audio
75
- except Exception as e:
76
- print(f"Error extracting audio from video: {e}")
77
- raise
78
-
79
- output_file = generate_unique_filename(".json")
80
- command = [
81
- "insanely-fast-whisper",
82
- "--file-name", file_path,
83
- "--device-id", "0",
84
- "--model-name", "openai/whisper-large-v3",
85
- "--task", "transcribe",
86
- "--timestamp", "chunk",
87
- "--transcript-path", output_file
88
- ]
89
-
90
- try:
91
- result = subprocess.run(command, check=True, capture_output=True, text=True)
92
- print(f"Transcription output: {result.stdout}")
93
- except subprocess.CalledProcessError as e:
94
- print(f"Error running insanely-fast-whisper: {e}")
95
- raise
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  try:
98
- with open(output_file, "r") as f:
99
- transcription = json.load(f)
100
- except json.JSONDecodeError as e:
101
- print(f"Error decoding JSON: {e}")
102
- raise
103
-
104
- result = transcription.get("text", " ".join([chunk["text"] for chunk in transcription.get("chunks", [])]))
105
-
106
- cleanup_files(output_file, temp_audio)
107
-
108
- return result
109
-
110
- async def text_to_speech(text, voice, output_file):
111
- communicate = edge_tts.Communicate(text, voice)
112
- await communicate.save(output_file)
113
-
114
- @spaces.GPU
115
- def process_video(video, target_language, use_wav2lip):
116
- try:
117
- if target_language is None:
118
- raise ValueError("Please select a Target Language for Dubbing.")
119
-
120
- run_uuid = uuid.uuid4().hex[:6]
121
- output_filename = f"{run_uuid}_resized_video.mp4"
122
- ffmpeg.input(video).output(output_filename, vf='scale=-2:720').run()
123
-
124
- video_path = output_filename
125
-
126
- if not os.path.exists(video_path):
127
- raise FileNotFoundError(f"Error: {video_path} does not exist.")
128
-
129
- video_info = ffmpeg.probe(video_path)
130
- video_duration = float(video_info['streams'][0]['duration'])
131
-
132
- if video_duration > MAX_VIDEO_DURATION:
133
- cleanup_files(video_path)
134
- raise ValueError(f"Video duration exceeds {MAX_VIDEO_DURATION} seconds. Please upload a shorter video.")
135
-
136
- ffmpeg.input(video_path).output(f"{run_uuid}_output_audio.wav", acodec='pcm_s24le', ar=48000, map='a').run()
137
-
138
- subprocess.run(f"ffmpeg -y -i {run_uuid}_output_audio.wav -af lowpass=3000,highpass=100 {run_uuid}_output_audio_final.wav", shell=True, check=True)
139
-
140
- whisper_text = transcribe_audio(f"{run_uuid}_output_audio_final.wav")
141
- print(f"Transcription successful: {whisper_text}")
142
-
143
- target_language_code, voice = language_mapping[target_language]
144
- translator = Translator()
145
- translated_text = translator.translate(whisper_text, dest=target_language_code).text
146
- print(f"Translated text: {translated_text}")
147
-
148
- asyncio.run(text_to_speech(translated_text, voice, f"{run_uuid}_output_synth.wav"))
149
-
150
- if use_wav2lip:
151
- try:
152
- subprocess.run(f"python Wav2Lip/inference.py --checkpoint_path 'Wav2Lip/checkpoints/wav2lip_gan.pth' --face '{video_path}' --audio '{run_uuid}_output_synth.wav' --pads 0 15 0 0 --resize_factor 1 --nosmooth --outfile '{run_uuid}_output_video.mp4'", shell=True, check=True)
153
- except subprocess.CalledProcessError as e:
154
- print(f"Wav2Lip error: {str(e)}")
155
- gr.Warning("Wav2lip encountered an error. Falling back to simple audio replacement.")
156
- subprocess.run(f"ffmpeg -i {video_path} -i {run_uuid}_output_synth.wav -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {run_uuid}_output_video.mp4", shell=True, check=True)
157
- else:
158
- subprocess.run(f"ffmpeg -i {video_path} -i {run_uuid}_output_synth.wav -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {run_uuid}_output_video.mp4", shell=True, check=True)
159
-
160
- output_video_path = f"{run_uuid}_output_video.mp4"
161
- if not os.path.exists(output_video_path):
162
- raise FileNotFoundError(f"Error: {output_video_path} was not generated.")
163
 
164
- cleanup_files(
165
- f"{run_uuid}_resized_video.mp4",
166
- f"{run_uuid}_output_audio.wav",
167
- f"{run_uuid}_output_audio_final.wav",
168
- f"{run_uuid}_output_synth.wav"
 
 
 
 
169
  )
170
 
171
- return output_video_path, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
  except Exception as e:
174
- print(f"Error in process_video: {str(e)}")
175
- return None, f"Error: {str(e)}"
176
-
177
- # Gradio interface setup
178
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
179
- gr.Markdown("# AI Video Dubbing")
180
- gr.Markdown("This tool uses AI to dub videos into different languages. Upload a video, choose a target language, and get a dubbed version!")
181
-
 
182
  with gr.Row():
183
- with gr.Column(scale=2):
184
- video_input = gr.Video(label="Upload Video")
185
- target_language = gr.Dropdown(
186
- choices=list(language_mapping.keys()),
187
- label="Target Language for Dubbing",
188
- value="Spanish"
 
 
 
189
  )
190
- use_wav2lip = gr.Checkbox(
191
- label="Use Wav2Lip for lip sync",
192
- value=False,
193
- info="Enable this if the video has close-up faces. May not work for all videos."
 
 
 
 
194
  )
195
- submit_button = gr.Button("Process Video", variant="primary")
196
-
197
- with gr.Column(scale=2):
198
- output_video = gr.Video(label="Processed Video")
199
- error_message = gr.Textbox(label="Status/Error Message")
200
-
201
- submit_button.click(
202
- process_video,
203
- inputs=[video_input, target_language, use_wav2lip],
204
- outputs=[output_video, error_message]
205
- )
206
-
207
- gr.Markdown("""
208
- ## Notes:
209
- - Video limit is 1 minute. The tool will dub all speakers using a single voice.
210
- - Processing may take up to 5 minutes.
211
- - This is an alpha version using open-source models.
212
- - Quality vs. speed trade-off was made for scalability and hardware limitations.
213
- - For videos longer than 1 minute, please duplicate this Space and adjust the limit in the code.
214
- """)
215
-
216
- gr.Markdown("""
217
- ---
218
- Developed by [@artificialguybr](https://twitter.com/artificialguybr) using open-source tools.
219
- Special thanks to Hugging Face for GPU support and [@yeswondwer](https://twitter.com/@yeswondwerr) for the original code.
220
-
221
- Try our [Video Transcription and Translation](https://huggingface.co/spaces/artificialguybr/VIDEO-TRANSLATION-TRANSCRIPTION) tool!
222
- """)
223
-
224
- print("Launching Gradio interface...")
225
  demo.queue()
226
  demo.launch()
 
2
  import uuid
3
  import asyncio
4
  import subprocess
5
+ import shutil
6
+ import nest_asyncio
 
7
  import gradio as gr
 
 
8
  import edge_tts
9
+ from deep_translator import GoogleTranslator
10
+ from faster_whisper import WhisperModel
11
+
12
# Allow asyncio to run inside Gradio's existing event loop
# (nest_asyncio patches the loop so run_until_complete can nest).
nest_asyncio.apply()

# Load Whisper model once at startup so every request reuses it.
# small = good balance between speed and accuracy on CPU
# int8 = quantized for lower memory usage
model = WhisperModel("small", device="cpu", compute_type="int8")

# Supported target languages
# Format: "Display Name": ("translation_code", "edge_tts_voice_name")
# The translation code is fed to GoogleTranslator(target=...); the voice
# name must be a valid Microsoft Edge TTS neural voice.
# NOTE(review): voice names should be checked against `edge-tts --list-voices`
# before adding new entries — an unknown voice fails only at synthesis time.
languages = {
    "English": ("en", "en-US-EricNeural"),
    "Spanish": ("es", "es-ES-AlvaroNeural"),
    "French": ("fr", "fr-FR-HenriNeural"),
    "German": ("de", "de-DE-ConradNeural"),
    "Italian": ("it", "it-IT-DiegoNeural"),
    "Russian": ("ru", "ru-RU-DmitryNeural"),
}
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
def transcribe(audio):
    """
    Transcribe an audio file to text with the module-level faster-whisper model.

    Collects every recognized segment and returns them as one
    space-separated string (leading/trailing whitespace removed).
    """
    segments, _ = model.transcribe(audio)
    pieces = [segment.text for segment in segments]
    return " ".join(pieces).strip()
42
+
43
+
44
async def tts_async(text, voice, out):
    """
    Synthesize *text* with the given Microsoft Edge TTS neural voice.

    The generated audio is written to the file path *out*.
    """
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(out)
51
+
52
+
53
def run_tts(text, voice, out):
    """
    Run the async TTS coroutine synchronously.

    Gradio executes event handlers in worker threads; on Python >= 3.10
    asyncio.get_event_loop() raises RuntimeError in a thread that has no
    current event loop (and is deprecated for implicit loop creation), so
    we fall back to creating and installing a fresh loop. nest_asyncio
    (applied at import time) keeps run_until_complete usable even when a
    loop is already running.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # No current event loop in this (non-main) thread — make one.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    loop.run_until_complete(tts_async(text, voice, out))
60
+
61
+
62
def process(video, language, use_lipsync):
    """
    Main video dubbing pipeline.

    Steps:
      1. Resize the video to 480p (faster transcription and Wav2Lip).
      2. Extract a mono 16 kHz WAV audio track (Whisper's input format).
      3. Transcribe the audio with faster-whisper.
      4. Translate the transcript to the target language (source auto-detected).
      5. Synthesize translated speech with Edge TTS.
      6. Combine:
         - lip sync enabled: run Wav2Lip to animate mouth movements,
           falling back to a plain audio swap if Wav2Lip fails;
         - lip sync disabled: directly replace the audio track.

    Args:
        video: path of the uploaded video (gr.Video passes a filepath string).
        language: display name of the target language (a key of `languages`).
        use_lipsync: whether to run Wav2Lip.

    Returns:
        (output_video_path, status_message); output_video_path is None on error.
    """
    try:
        # Isolated per-job temp directory; the short UUID avoids path
        # collisions between concurrent users. The final video lives here,
        # so the directory is intentionally not deleted at the end.
        uid = uuid.uuid4().hex[:6]
        work_dir = f"/tmp/{uid}"
        os.makedirs(work_dir, exist_ok=True)

        # Copy the upload into our work directory so all intermediates
        # share one location.
        input_video = os.path.join(work_dir, "input.mp4")
        shutil.copy(video, input_video)

        # Step 1: resize to 480p. scale=-2:480 keeps the aspect ratio
        # (width rounded to an even number, as required by most encoders).
        resized = os.path.join(work_dir, "video.mp4")
        subprocess.run(
            ["ffmpeg", "-y", "-i", input_video, "-vf", "scale=-2:480", resized],
            check=True,
        )

        # Step 2: extract audio. -vn = no video, -ac 1 = mono,
        # -ar 16000 = 16 kHz — the format Whisper expects.
        audio = os.path.join(work_dir, "audio.wav")
        subprocess.run(
            ["ffmpeg", "-y", "-i", resized, "-vn", "-ac", "1", "-ar", "16000", audio],
            check=True,
        )

        # Step 3: speech-to-text.
        text = transcribe(audio)
        if not text:
            return None, "❌ Transcription failed or audio is silent."

        # Step 4: translate. source="auto" lets Google detect the original
        # language of the transcript.
        lang, voice = languages[language]
        translated = GoogleTranslator(source="auto", target=lang).translate(text)
        if not translated:
            return None, "❌ Translation failed."

        # Step 5: synthesize the dubbed speech (free Microsoft neural voices).
        speech = os.path.join(work_dir, "tts.wav")
        run_tts(translated, voice, speech)

        # Final output path.
        output = os.path.join(work_dir, "lipsync.mp4")

        # Step 6a: lip sync mode — Wav2Lip takes a face video + audio and
        # emits a lip-synced video. Any failure falls back to a plain swap.
        if use_lipsync:
            result = subprocess.run(
                [
                    "python", "Wav2Lip/inference.py",
                    "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
                    "--face", resized,      # input face video
                    "--audio", speech,      # new TTS audio
                    "--outfile", output,    # output lip-synced video
                ],
                capture_output=True,
                text=True,
            )

            if result.returncode != 0:
                # Surface Wav2Lip's output for debugging, then fall back.
                print(f"WAV2LIP STDERR: {result.stderr}")
                print(f"WAV2LIP STDOUT: {result.stdout}")
                _mux_audio(resized, speech, output)
                return output, f"⚠️ Wav2Lip failed, used audio replacement instead.\n{result.stderr}"

            return output, "✅ Done with lip sync!"

        # Step 6b: no lip sync — just replace the audio track.
        _mux_audio(resized, speech, output)
        return output, "✅ Done! (audio replacement, no lip sync)"

    except Exception as e:
        # Top-level boundary: report any unexpected failure in the status
        # box instead of crashing the Gradio handler.
        return None, f"❌ Error: {str(e)}"


def _mux_audio(video_path, audio_path, output_path):
    """
    Replace a video's audio track with *audio_path*, keeping the video stream.

    -c:v copy keeps the original video stream unchanged, -c:a aac encodes the
    new audio, -map 0:v:0 / -map 1:a:0 pick video from the first input and
    audio from the second. Argument-list form (shell=False) is used so file
    paths are never interpreted by a shell.
    """
    subprocess.run(
        [
            "ffmpeg", "-y",
            "-i", video_path,
            "-i", audio_path,
            "-c:v", "copy",
            "-c:a", "aac",
            "-map", "0:v:0",
            "-map", "1:a:0",
            output_path,
        ],
        check=True,
    )
190
+
191
+
192
# ---------------------------------------------------------------------------
# Gradio UI: upload + options on the left, result + status on the right.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🎬 AI Video Dubbing + Lip Sync")

    with gr.Row():
        with gr.Column():
            # Video upload widget — shows a preview before processing.
            video_input = gr.Video(label="Upload Video")

            # Target language selector, defaulting to Spanish.
            language_choice = gr.Dropdown(
                list(languages.keys()),
                value="Spanish",
                label="Target Language",
            )

            # Wav2Lip toggle — off by default (faster, works on any video);
            # only worth enabling for close-up face shots.
            lipsync_toggle = gr.Checkbox(
                label="Enable Lip Sync (Wav2Lip)",
                value=False,
                info="Enable if video has close-up face. Slower processing.",
            )

            process_button = gr.Button("β–Ά Process", variant="primary")

        with gr.Column():
            # Output video player and status/error message box.
            result_video = gr.Video(label="Result")
            status_message = gr.Textbox(label="Status", lines=3)

    # Wire the button to the dubbing pipeline.
    process_button.click(
        process,
        inputs=[video_input, language_choice, lipsync_toggle],
        outputs=[result_video, status_message],
    )

demo.queue()
demo.launch()