# AI video dubbing demo (Hugging Face Spaces app)
| import os | |
| import uuid | |
| import asyncio | |
| import subprocess | |
| import shutil | |
| import nest_asyncio | |
| import gradio as gr | |
| import edge_tts | |
| from deep_translator import GoogleTranslator | |
| from faster_whisper import WhisperModel | |
# Patch asyncio so run_until_complete() can be called from inside
# Gradio's already-running event loop (needed by run_tts below).
nest_asyncio.apply()

# Whisper speech-to-text model, loaded once at import time.
# "small" balances CPU speed against accuracy; int8 quantization
# keeps memory usage low.
model = WhisperModel("small", device="cpu", compute_type="int8")

# Target languages offered in the UI.
# Display name -> (Google Translate language code, Edge TTS voice id).
# Dict order determines the order shown in the dropdown.
languages = {
    "English": ("en", "en-US-EricNeural"),
    "Spanish": ("es", "es-ES-AlvaroNeural"),
    "French": ("fr", "fr-FR-HenriNeural"),
    "German": ("de", "de-DE-ConradNeural"),
    "Italian": ("it", "it-IT-DiegoNeural"),
    "Russian": ("ru", "ru-RU-DmitryNeural"),
}
def transcribe(audio):
    """Transcribe an audio file to text using faster-whisper.

    Parameters
    ----------
    audio : str
        Path to an audio file (the pipeline feeds it 16 kHz mono WAV).

    Returns
    -------
    str
        All transcribed segments joined with spaces; "" when the model
        produces no segments (e.g. silent audio).
    """
    # model.transcribe returns (segments generator, info); only the text
    # of each segment is needed here.
    segments, _ = model.transcribe(audio)
    # str.join builds the transcript in one pass instead of the
    # quadratic `text += s.text + " "` loop; .strip() drops the leading
    # space Whisper puts on the first segment, matching the old output.
    return " ".join(s.text for s in segments).strip()
async def tts_async(text, voice, out):
    """Generate speech from *text* with a Microsoft Edge TTS voice.

    The synthesized audio is saved to the file path *out*.
    """
    await edge_tts.Communicate(text, voice).save(out)
def run_tts(text, voice, out):
    """Run the async TTS coroutine to completion, synchronously.

    Gradio handlers already execute inside an asyncio event loop, so a
    plain asyncio.run() would fail; nest_asyncio.apply() (done at import
    time) makes the re-entrant run_until_complete below legal.
    """
    asyncio.get_event_loop().run_until_complete(tts_async(text, voice, out))
def _replace_audio(video_in, audio_in, video_out):
    """Mux *audio_in* onto *video_in*, writing *video_out*.

    -c:v copy keeps the original video stream untouched; the new audio
    is encoded as AAC. -map 0:v:0 / -map 1:a:0 pick the video from the
    first input and the audio from the second. List-form argv with
    shell=False (unlike the other ffmpeg calls' former shell strings)
    is robust to spaces/metacharacters in paths.
    """
    subprocess.run(
        [
            "ffmpeg", "-y",
            "-i", video_in,
            "-i", audio_in,
            "-c:v", "copy",
            "-c:a", "aac",
            "-map", "0:v:0",  # video from first input
            "-map", "1:a:0",  # audio from second input (TTS)
            video_out,
        ],
        check=True,
    )


def process(video, language, use_lipsync):
    """Dub *video* into *language*, optionally lip-syncing the result.

    Pipeline:
      1. Resize video to 480p (faster Whisper transcription and Wav2Lip).
      2. Extract a 16 kHz mono WAV (the input format Whisper expects).
      3. Transcribe the audio with Whisper.
      4. Translate the transcript via Google Translate (auto-detect source).
      5. Synthesize translated speech with Edge TTS.
      6. Combine: run Wav2Lip when requested (falling back to a plain
         audio swap on failure), otherwise just replace the audio track.

    Parameters
    ----------
    video : str
        Path of the uploaded video (gr.Video passes the path directly).
    language : str
        A key of the module-level `languages` table (e.g. "Spanish").
    use_lipsync : bool
        If True, attempt Wav2Lip mouth animation.

    Returns
    -------
    tuple
        (output video path or None on failure, status message).
    """
    try:
        # gr.Video returns the file path directly as a string.
        video_path = video

        # Isolated temp directory per job; a short UUID avoids path
        # collisions between concurrent users.
        uid = uuid.uuid4().hex[:6]
        work_dir = f"/tmp/{uid}"
        os.makedirs(work_dir, exist_ok=True)

        # Copy the upload into our work directory.
        input_video = os.path.join(work_dir, "input.mp4")
        shutil.copy(video_path, input_video)

        # Step 1: downscale to 480p. scale=-2:480 keeps the aspect ratio
        # and rounds width to an even value (required by most codecs).
        resized = os.path.join(work_dir, "video.mp4")
        subprocess.run(
            ["ffmpeg", "-y", "-i", input_video, "-vf", "scale=-2:480", resized],
            check=True,
        )

        # Step 2: extract audio. -vn = no video, -ac 1 = mono,
        # -ar 16000 = 16 kHz — the format Whisper expects.
        audio = os.path.join(work_dir, "audio.wav")
        subprocess.run(
            ["ffmpeg", "-y", "-i", resized, "-vn", "-ac", "1", "-ar", "16000", audio],
            check=True,
        )

        # Step 3: speech -> text.
        text = transcribe(audio)
        if not text:
            return None, "❌ Transcription failed or audio is silent."

        # Step 4: translate. source="auto" lets Google detect the
        # original language.
        lang, voice = languages[language]
        translated = GoogleTranslator(source="auto", target=lang).translate(text)
        if not translated:
            return None, "❌ Translation failed."

        # Step 5: text -> speech with the matching Edge TTS neural voice
        # (free Microsoft service, no API key needed).
        speech = os.path.join(work_dir, "tts.wav")
        run_tts(translated, voice, speech)

        # Final output path (same name in both modes).
        output = os.path.join(work_dir, "lipsync.mp4")

        # Step 6a: lip sync mode — Wav2Lip animates mouth movements from
        # the face video + new audio.
        if use_lipsync:
            result = subprocess.run(
                [
                    "python", "Wav2Lip/inference.py",
                    "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
                    "--face", resized,    # input face video
                    "--audio", speech,    # new TTS audio
                    "--outfile", output,  # lip-synced result
                ],
                capture_output=True,
                text=True,
            )
            # On any Wav2Lip failure, log its output and fall back to a
            # simple audio swap so the user still gets a result.
            if result.returncode != 0:
                print(f"WAV2LIP STDERR: {result.stderr}")
                print(f"WAV2LIP STDOUT: {result.stdout}")
                _replace_audio(resized, speech, output)
                return output, f"⚠️ Wav2Lip failed, used audio replacement instead.\n{result.stderr}"
            return output, "✅ Done with lip sync!"

        # Step 6b: no lip sync — just replace the audio track.
        _replace_audio(resized, speech, output)
        return output, "✅ Done! (audio replacement, no lip sync)"
    except Exception as e:
        # Top-level boundary: surface any unexpected error in the status box.
        return None, f"❌ Error: {str(e)}"
# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🎬 AI Video Dubbing + Lip Sync")
    with gr.Row():
        with gr.Column():
            # Video upload widget — shows a preview before processing.
            video = gr.Video(label="Upload Video")
            # Target language selector (keys of the `languages` table).
            lang = gr.Dropdown(
                list(languages.keys()),
                value="Spanish",
                label="Target Language",
            )
            # Wav2Lip is slow and needs close-up face shots, so lip sync
            # is opt-in and disabled by default.
            use_lipsync = gr.Checkbox(
                label="Enable Lip Sync (Wav2Lip)",
                value=False,
                info="Enable if video has close-up face. Slower processing.",
            )
            # Submit button.
            run = gr.Button("▶ Process", variant="primary")
        with gr.Column():
            # Output video player.
            out = gr.Video(label="Result")
            # Status/error message box.
            status = gr.Textbox(label="Status", lines=3)
    # Wire the button to the dubbing pipeline.
    run.click(process, inputs=[video, lang, use_lipsync], outputs=[out, status])

# queue() serializes long-running jobs so concurrent users don't collide.
demo.queue()
demo.launch()