# video-dubbing / app.py
# Source: Hugging Face Space by XtewaldX (commit a6ee9c3, verified)
import os
import uuid
import asyncio
import subprocess
import shutil
import nest_asyncio
import gradio as gr
import edge_tts
from deep_translator import GoogleTranslator
from faster_whisper import WhisperModel
# Allow asyncio to run inside Gradio's existing event loop
nest_asyncio.apply()
# Load the Whisper speech-to-text model once at module import so every
# request reuses the same instance.
#   "small" = good balance between speed and accuracy on CPU
#   int8    = quantized weights for lower memory usage
model = WhisperModel("small", device="cpu", compute_type="int8")
# Supported target languages for dubbing.
# Format: "Display Name": ("translation_code", "edge_tts_voice_name")
#   translation_code     -> ISO-639-1 code passed to GoogleTranslator
#   edge_tts_voice_name  -> Microsoft Edge TTS neural voice identifier
languages = {
    "English": ("en", "en-US-EricNeural"),
    "Spanish": ("es", "es-ES-AlvaroNeural"),
    "French": ("fr", "fr-FR-HenriNeural"),
    "German": ("de", "de-DE-ConradNeural"),
    "Italian": ("it", "it-IT-DiegoNeural"),
    "Russian": ("ru", "ru-RU-DmitryNeural"),
}
def transcribe(audio):
    """
    Transcribe an audio file to text using faster-whisper.

    Parameters:
        audio: path to an audio file (16 kHz mono WAV is what the
            pipeline feeds in, matching Whisper's preferred format).

    Returns:
        All transcribed segments joined into a single space-separated
        string, stripped of surrounding whitespace. Empty string when
        transcription yields no segments (e.g. silent audio).
    """
    # model.transcribe returns a lazy segment generator plus metadata;
    # str.join over a generator avoids the quadratic cost of repeated
    # string concatenation and produces the same stripped result.
    segments, _ = model.transcribe(audio)
    return " ".join(s.text for s in segments).strip()
async def tts_async(text, voice, out):
    """
    Synthesize speech for *text* with the given Edge TTS neural voice
    and save the audio to the file path *out*.
    """
    # Communicate streams audio from Microsoft's service; save() writes
    # the whole result to disk.
    await edge_tts.Communicate(text, voice).save(out)
def run_tts(text, voice, out):
    """
    Drive the async TTS coroutine to completion synchronously.

    nest_asyncio.apply() at import time makes run_until_complete safe
    even though Gradio already has an event loop running.
    """
    asyncio.get_event_loop().run_until_complete(tts_async(text, voice, out))
def _replace_audio(video_path, audio_path, output_path):
    """
    Mux *audio_path* onto *video_path*, writing *output_path*.

    The video stream is copied untouched; the new audio is encoded as
    AAC. Raises subprocess.CalledProcessError if ffmpeg fails.
    """
    # List-form argv with shell=False: paths are passed verbatim, so no
    # shell-quoting/injection issues with unusual file names.
    subprocess.run(
        [
            "ffmpeg", "-y",
            "-i", video_path,
            "-i", audio_path,
            "-c:v", "copy",    # keep original video stream unchanged
            "-c:a", "aac",     # encode new audio as AAC
            "-map", "0:v:0",   # take video from first input
            "-map", "1:a:0",   # take audio from second input (TTS)
            output_path,
        ],
        check=True,
    )


def process(video, language, use_lipsync):
    """
    Main video dubbing pipeline:
    Step 1 - Resize: scale video to 480p for faster processing
    Step 2 - Extract audio: pull mono 16kHz WAV from video (Whisper format)
    Step 3 - Transcribe: convert audio to text using Whisper
    Step 4 - Translate: translate text to target language using Google Translate
    Step 5 - TTS: generate new speech audio using Edge TTS
    Step 6 - Combine:
        - If lip sync enabled: run Wav2Lip to animate mouth movements
        - If Wav2Lip fails: fallback to simple audio replacement
        - If lip sync disabled: directly replace audio track with TTS audio

    Parameters:
        video: file path of the uploaded video (gr.Video yields a str).
        language: display name, must be a key of the `languages` dict.
        use_lipsync: bool, whether to attempt Wav2Lip mouth animation.

    Returns:
        (output_video_path_or_None, status_message)
    """
    try:
        # gr.Video returns the file path directly as a string
        video_path = video
        # Isolated temp directory per job; short UUID avoids path
        # collisions between concurrent users.
        uid = uuid.uuid4().hex[:6]
        work_dir = f"/tmp/{uid}"
        os.makedirs(work_dir, exist_ok=True)
        # Copy the upload into our work directory so all intermediates
        # live (and can be cleaned up) together.
        input_video = os.path.join(work_dir, "input.mp4")
        shutil.copy(video_path, input_video)

        # -------------------------------------------------------------------
        # Step 1: Resize video to 480p
        # -vf scale=-2:480 keeps aspect ratio, height = 480px
        # Smaller resolution = faster Whisper transcription and Wav2Lip
        # -------------------------------------------------------------------
        resized = os.path.join(work_dir, "video.mp4")
        subprocess.run(
            ["ffmpeg", "-y", "-i", input_video, "-vf", "scale=-2:480", resized],
            check=True,
        )

        # -------------------------------------------------------------------
        # Step 2: Extract audio track from resized video
        # -vn = no video, -ac 1 = mono, -ar 16000 = 16kHz sample rate
        # 16kHz mono WAV is the required input format for Whisper
        # -------------------------------------------------------------------
        audio = os.path.join(work_dir, "audio.wav")
        subprocess.run(
            ["ffmpeg", "-y", "-i", resized, "-vn", "-ac", "1", "-ar", "16000", audio],
            check=True,
        )

        # -------------------------------------------------------------------
        # Step 3: Transcribe audio to text using Whisper
        # -------------------------------------------------------------------
        text = transcribe(audio)
        if not text:
            return None, "❌ Transcription failed or audio is silent."

        # -------------------------------------------------------------------
        # Step 4: Translate transcribed text to the target language
        # source="auto" = the translator auto-detects the original language
        # -------------------------------------------------------------------
        lang, voice = languages[language]
        translated = GoogleTranslator(source="auto", target=lang).translate(text)
        if not translated:
            return None, "❌ Translation failed."

        # -------------------------------------------------------------------
        # Step 5: Generate TTS speech from translated text
        # Edge TTS uses Microsoft neural voices (free, no API key needed)
        # -------------------------------------------------------------------
        speech = os.path.join(work_dir, "tts.wav")
        run_tts(translated, voice, speech)

        # Output file path for final video
        output = os.path.join(work_dir, "lipsync.mp4")

        # -------------------------------------------------------------------
        # Step 6a: Lip sync mode — run Wav2Lip to animate mouth movements
        # Wav2Lip requires: face video + audio -> outputs lip-synced video
        # -------------------------------------------------------------------
        if use_lipsync:
            result = subprocess.run(
                [
                    "python", "Wav2Lip/inference.py",
                    "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
                    "--face", resized,    # input face video
                    "--audio", speech,    # new TTS audio
                    "--outfile", output,  # output lip-synced video
                ],
                capture_output=True,
                text=True,
            )
            # If Wav2Lip failed for any reason, fall back to simple audio swap
            if result.returncode != 0:
                print(f"WAV2LIP STDERR: {result.stderr}")
                print(f"WAV2LIP STDOUT: {result.stdout}")
                _replace_audio(resized, speech, output)
                return output, f"⚠️ Wav2Lip failed, used audio replacement instead.\n{result.stderr}"
            return output, "✅ Done with lip sync!"

        # -------------------------------------------------------------------
        # Step 6b: No lip sync — just replace the audio track
        # -------------------------------------------------------------------
        else:
            _replace_audio(resized, speech, output)
            return output, "✅ Done! (audio replacement, no lip sync)"
    except Exception as e:
        # Top-level boundary: surface any unexpected error in the UI
        # status box instead of crashing the Gradio worker.
        return None, f"❌ Error: {str(e)}"
# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🎬 AI Video Dubbing + Lip Sync")
    with gr.Row():
        with gr.Column():
            # Upload widget; previews the file before processing starts.
            video_in = gr.Video(label="Upload Video")
            # Dropdown of supported dubbing targets, Spanish preselected.
            target_lang = gr.Dropdown(
                list(languages.keys()),
                value="Spanish",
                label="Target Language",
            )
            # Wav2Lip toggle — off by default since plain audio
            # replacement is faster and works on any footage; lip sync
            # only helps when the video shows a close-up face.
            lipsync_toggle = gr.Checkbox(
                label="Enable Lip Sync (Wav2Lip)",
                value=False,
                info="Enable if video has close-up face. Slower processing.",
            )
            submit_btn = gr.Button("▶ Process", variant="primary")
        with gr.Column():
            # Result player plus a text box for progress/error messages.
            result_video = gr.Video(label="Result")
            status_box = gr.Textbox(label="Status", lines=3)
    # Route the button click through the dubbing pipeline.
    submit_btn.click(
        process,
        inputs=[video_in, target_lang, lipsync_toggle],
        outputs=[result_video, status_box],
    )

demo.queue()
demo.launch()