# video-dubbing / app.py
# Source: Hugging Face Space by XtewaldX (commit a6ee9c3, verified)
import os
import uuid
import asyncio
import subprocess
import shutil
import nest_asyncio
import gradio as gr
import edge_tts
from deep_translator import GoogleTranslator
from faster_whisper import WhisperModel
# Allow asyncio to run inside Gradio's existing event loop
nest_asyncio.apply()
# Load the Whisper speech-to-text model once at module import so every
# request reuses the same instance.
#   "small" = good balance between speed and accuracy on CPU
#   int8    = quantized weights for lower memory usage
model = WhisperModel("small", device="cpu", compute_type="int8")
# Supported target languages for dubbing.
# Format: "Display Name": ("translation_code", "edge_tts_voice_name")
#   translation_code     -> ISO-639-1 code passed to GoogleTranslator
#   edge_tts_voice_name  -> Microsoft Edge TTS neural voice identifier
languages = {
    "English": ("en", "en-US-EricNeural"),
    "Spanish": ("es", "es-ES-AlvaroNeural"),
    "French": ("fr", "fr-FR-HenriNeural"),
    "German": ("de", "de-DE-ConradNeural"),
    "Italian": ("it", "it-IT-DiegoNeural"),
    "Russian": ("ru", "ru-RU-DmitryNeural"),
}
def transcribe(audio):
    """
    Transcribe an audio file to text using faster-whisper.

    Parameters:
        audio: path to an audio file (16 kHz mono WAV is what the
            pipeline feeds in, matching Whisper's preferred format).

    Returns:
        All transcribed segments joined into a single space-separated
        string, stripped of surrounding whitespace. Empty string when
        transcription yields no segments (e.g. silent audio).
    """
    # model.transcribe returns a lazy segment generator plus metadata;
    # str.join over a generator avoids the quadratic cost of repeated
    # string concatenation and produces the same stripped result.
    segments, _ = model.transcribe(audio)
    return " ".join(s.text for s in segments).strip()
async def tts_async(text, voice, out):
    """
    Synthesize speech for *text* with the given Edge TTS neural voice
    and save the audio to the file path *out*.
    """
    # Communicate streams audio from Microsoft's service; save() writes
    # the whole result to disk.
    await edge_tts.Communicate(text, voice).save(out)
def run_tts(text, voice, out):
    """
    Drive the async TTS coroutine to completion synchronously.

    nest_asyncio.apply() at import time makes run_until_complete safe
    even though Gradio already has an event loop running.
    """
    asyncio.get_event_loop().run_until_complete(tts_async(text, voice, out))
def _replace_audio(video_path, audio_path, output_path):
    """
    Mux *audio_path* onto *video_path*, writing *output_path*.

    The video stream is copied untouched; the new audio is encoded as
    AAC. Raises subprocess.CalledProcessError if ffmpeg fails.
    """
    # List-form argv with shell=False: paths are passed verbatim, so no
    # shell-quoting/injection issues with unusual file names.
    subprocess.run(
        [
            "ffmpeg", "-y",
            "-i", video_path,
            "-i", audio_path,
            "-c:v", "copy",    # keep original video stream unchanged
            "-c:a", "aac",     # encode new audio as AAC
            "-map", "0:v:0",   # take video from first input
            "-map", "1:a:0",   # take audio from second input (TTS)
            output_path,
        ],
        check=True,
    )


def process(video, language, use_lipsync):
    """
    Main video dubbing pipeline:
    Step 1 - Resize: scale video to 480p for faster processing
    Step 2 - Extract audio: pull mono 16kHz WAV from video (Whisper format)
    Step 3 - Transcribe: convert audio to text using Whisper
    Step 4 - Translate: translate text to target language using Google Translate
    Step 5 - TTS: generate new speech audio using Edge TTS
    Step 6 - Combine:
        - If lip sync enabled: run Wav2Lip to animate mouth movements
        - If Wav2Lip fails: fallback to simple audio replacement
        - If lip sync disabled: directly replace audio track with TTS audio

    Parameters:
        video: file path of the uploaded video (gr.Video yields a str).
        language: display name, must be a key of the `languages` dict.
        use_lipsync: bool, whether to attempt Wav2Lip mouth animation.

    Returns:
        (output_video_path_or_None, status_message)
    """
    try:
        # gr.Video returns the file path directly as a string
        video_path = video
        # Isolated temp directory per job; short UUID avoids path
        # collisions between concurrent users.
        uid = uuid.uuid4().hex[:6]
        work_dir = f"/tmp/{uid}"
        os.makedirs(work_dir, exist_ok=True)
        # Copy the upload into our work directory so all intermediates
        # live (and can be cleaned up) together.
        input_video = os.path.join(work_dir, "input.mp4")
        shutil.copy(video_path, input_video)

        # -------------------------------------------------------------------
        # Step 1: Resize video to 480p
        # -vf scale=-2:480 keeps aspect ratio, height = 480px
        # Smaller resolution = faster Whisper transcription and Wav2Lip
        # -------------------------------------------------------------------
        resized = os.path.join(work_dir, "video.mp4")
        subprocess.run(
            ["ffmpeg", "-y", "-i", input_video, "-vf", "scale=-2:480", resized],
            check=True,
        )

        # -------------------------------------------------------------------
        # Step 2: Extract audio track from resized video
        # -vn = no video, -ac 1 = mono, -ar 16000 = 16kHz sample rate
        # 16kHz mono WAV is the required input format for Whisper
        # -------------------------------------------------------------------
        audio = os.path.join(work_dir, "audio.wav")
        subprocess.run(
            ["ffmpeg", "-y", "-i", resized, "-vn", "-ac", "1", "-ar", "16000", audio],
            check=True,
        )

        # -------------------------------------------------------------------
        # Step 3: Transcribe audio to text using Whisper
        # -------------------------------------------------------------------
        text = transcribe(audio)
        if not text:
            return None, "❌ Transcription failed or audio is silent."

        # -------------------------------------------------------------------
        # Step 4: Translate transcribed text to the target language
        # source="auto" = the translator auto-detects the original language
        # -------------------------------------------------------------------
        lang, voice = languages[language]
        translated = GoogleTranslator(source="auto", target=lang).translate(text)
        if not translated:
            return None, "❌ Translation failed."

        # -------------------------------------------------------------------
        # Step 5: Generate TTS speech from translated text
        # Edge TTS uses Microsoft neural voices (free, no API key needed)
        # -------------------------------------------------------------------
        speech = os.path.join(work_dir, "tts.wav")
        run_tts(translated, voice, speech)

        # Output file path for final video
        output = os.path.join(work_dir, "lipsync.mp4")

        # -------------------------------------------------------------------
        # Step 6a: Lip sync mode — run Wav2Lip to animate mouth movements
        # Wav2Lip requires: face video + audio -> outputs lip-synced video
        # -------------------------------------------------------------------
        if use_lipsync:
            result = subprocess.run(
                [
                    "python", "Wav2Lip/inference.py",
                    "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
                    "--face", resized,    # input face video
                    "--audio", speech,    # new TTS audio
                    "--outfile", output,  # output lip-synced video
                ],
                capture_output=True,
                text=True,
            )
            # If Wav2Lip failed for any reason, fall back to simple audio swap
            if result.returncode != 0:
                print(f"WAV2LIP STDERR: {result.stderr}")
                print(f"WAV2LIP STDOUT: {result.stdout}")
                _replace_audio(resized, speech, output)
                return output, f"⚠️ Wav2Lip failed, used audio replacement instead.\n{result.stderr}"
            return output, "✅ Done with lip sync!"

        # -------------------------------------------------------------------
        # Step 6b: No lip sync — just replace the audio track
        # -------------------------------------------------------------------
        else:
            _replace_audio(resized, speech, output)
            return output, "✅ Done! (audio replacement, no lip sync)"
    except Exception as e:
        # Top-level boundary: surface any unexpected error in the UI
        # status box instead of crashing the Gradio worker.
        return None, f"❌ Error: {str(e)}"
# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🎬 AI Video Dubbing + Lip Sync")
    with gr.Row():
        with gr.Column():
            # Upload widget; previews the file before processing starts.
            video_in = gr.Video(label="Upload Video")
            # Dropdown of supported dubbing targets, Spanish preselected.
            target_lang = gr.Dropdown(
                list(languages.keys()),
                value="Spanish",
                label="Target Language",
            )
            # Wav2Lip toggle — off by default since plain audio
            # replacement is faster and works on any footage; lip sync
            # only helps when the video shows a close-up face.
            lipsync_toggle = gr.Checkbox(
                label="Enable Lip Sync (Wav2Lip)",
                value=False,
                info="Enable if video has close-up face. Slower processing.",
            )
            submit_btn = gr.Button("▶ Process", variant="primary")
        with gr.Column():
            # Result player plus a text box for progress/error messages.
            result_video = gr.Video(label="Result")
            status_box = gr.Textbox(label="Status", lines=3)
    # Route the button click through the dubbing pipeline.
    submit_btn.click(
        process,
        inputs=[video_in, target_lang, lipsync_toggle],
        outputs=[result_video, status_box],
    )

demo.queue()
demo.launch()