import spaces
import gradio as gr
import torch
import torchaudio
import os
import gc
import sys
import shutil
import tempfile
import subprocess
import threading
import logging

import dubbing
from i18n import T, EXAMPLES, ALL_EXAMPLES_FLAT, TTS_LANGUAGES, MAX_TEXT_LEN, MAX_AUDIO_SEC
from styles import THEME, CSS

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    stream=sys.stderr,
)
log = logging.getLogger(__name__)

# ── Config ────────────────────────────────────────────────────────────────────
ECHOMIMIC_MODEL = os.environ.get("ECHOMIMIC_MODEL", "BadToBest/EchoMimicV3")
CHATTERBOX_MODEL = os.environ.get("CHATTERBOX_MODEL", "ResembleAI/chatterbox")
MAX_DUB_TEXT_LEN = 1500  # ~60s of typical speech at 150 wpm ≈ 900 chars; 1500 is safe headroom

# Display label → (width, height) for the output video.
ASPECT_PRESETS = {
    "▮ 9:16 · 576×1024": (576, 1024),
    "◻ 1:1 · 512×512": (512, 512),
    "▬ 16:9 · 1024×576": (1024, 576),
}
DEFAULT_STEPS = 20
DEFAULT_CFG = 3.5
DEFAULT_FPS = 25

# ── Runtime repo installs (avoid PyPI conflicts) ──────────────────────────────
_ECHOMIMIC_REPO = "https://github.com/antgroup/echomimic_v3.git"
_ECHOMIMIC_DIR = "/tmp/echomimic_v3"
_CHATTERBOX_REPO = "https://github.com/resemble-ai/chatterbox.git"
_CHATTERBOX_DIR = "/tmp/chatterbox"
_clone_lock = threading.Lock()


def _clone_repo(repo_url: str, dest: str, label: str):
    """Thread-safe shallow clone.

    Uses .git presence to detect complete clones: a dest dir without .git is
    treated as a partial/failed clone and wiped before retrying.  On success
    (or when already cloned) the repo dir is prepended to sys.path so its
    packages are importable.
    """
    with _clone_lock:
        if not os.path.exists(os.path.join(dest, ".git")):
            if os.path.exists(dest):
                shutil.rmtree(dest)
            log.info("Cloning %s…", label)
            subprocess.run(
                ["git", "clone", "--depth=1", repo_url, dest],
                check=True,
                timeout=180,
            )
            log.info("%s cloned", label)
    if dest not in sys.path:
        sys.path.insert(0, dest)


def _ensure_echomimic_repo():
    _clone_repo(_ECHOMIMIC_REPO, _ECHOMIMIC_DIR, "EchoMimic V3")


def _ensure_chatterbox_repo():
    _clone_repo(_CHATTERBOX_REPO, _CHATTERBOX_DIR, "Chatterbox TTS")


# ── Model singletons ──────────────────────────────────────────────────────────
_tts_model = None
_echo_pipe = None
_echo_mode = None  # which load path succeeded: "local" or "diffusers"


def _load_tts():
    """Lazily load the Chatterbox TTS model (once), on CPU.

    Loading on CPU keeps the GPU free outside @spaces.GPU calls; _run_tts
    moves the model to CUDA for generation and back afterwards.
    """
    global _tts_model
    if _tts_model is None:
        _ensure_chatterbox_repo()
        from chatterbox.tts import ChatterboxTTS

        log.info("Loading Chatterbox TTS…")
        _tts_model = ChatterboxTTS.from_pretrained(device="cpu")
        log.info("Chatterbox TTS ready")
    return _tts_model


def _load_echomimic():
    """Lazily load EchoMimic V3, trying the upstream repo first.

    Returns (pipeline, mode) where mode records which load path worked.
    Falls back to a diffusers trust_remote_code load; raises RuntimeError
    if both paths fail.
    """
    global _echo_pipe, _echo_mode
    if _echo_pipe is not None:
        return _echo_pipe, _echo_mode
    try:
        _ensure_echomimic_repo()
        from echomimic_v3.pipelines.pipeline_echomimic_v3 import EchoMimicV3Pipeline

        log.info("Loading EchoMimic V3 (local)…")
        _echo_pipe = EchoMimicV3Pipeline.from_pretrained(ECHOMIMIC_MODEL, torch_dtype=torch.float16)
        _echo_mode = "local"
        log.info("EchoMimic V3 ready (local)")
        return _echo_pipe, _echo_mode
    except Exception as e:
        log.warning("EchoMimic V3 local import failed: %s", e)
    try:
        from diffusers import DiffusionPipeline

        log.info("Loading EchoMimic V3 via diffusers…")
        _echo_pipe = DiffusionPipeline.from_pretrained(
            ECHOMIMIC_MODEL,
            torch_dtype=torch.float16,
            trust_remote_code=True,
        )
        # BUGFIX: was "local", misreporting the backend that actually loaded.
        _echo_mode = "diffusers"
        log.info("EchoMimic V3 ready (diffusers)")
        return _echo_pipe, _echo_mode
    except Exception as e:
        log.warning("EchoMimic V3 diffusers load failed: %s", e)
    raise RuntimeError("EchoMimic V3 could not be loaded. Check requirements and model availability.")


# ── Video utilities ───────────────────────────────────────────────────────────
def _coerce_frames(frames):
    """Normalise pipeline output to a list of (H, W, 3) uint8 numpy arrays.

    Accepts PIL images (have .save), torch tensors (have .cpu), or anything
    numpy can wrap; grayscale frames are expanded to RGB and RGBA frames
    have their alpha channel dropped.
    """
    import numpy as np

    result = []
    for frame in frames:
        if hasattr(frame, "save"):  # PIL.Image
            arr = np.array(frame.convert("RGB"))
        elif hasattr(frame, "cpu"):  # torch.Tensor
            arr = frame.cpu().float().numpy()
            # CHW → HWC when the leading dim looks like a channel count.
            if arr.ndim == 3 and arr.shape[0] in (1, 3, 4):
                arr = arr.transpose(1, 2, 0)
            # Floats in [0, 1] are rescaled to [0, 255] before the uint8 cast.
            if arr.dtype.kind == 'f' and arr.max() <= 1.0:
                arr = (arr * 255).clip(0, 255)
            arr = arr.astype(np.uint8)
        else:
            arr = np.array(frame)
        if arr.ndim == 2:
            import cv2

            arr = cv2.cvtColor(arr, cv2.COLOR_GRAY2RGB)
        elif arr.ndim == 3 and arr.shape[2] == 4:
            arr = arr[:, :, :3]
        result.append(arr)
    return result


def _mux_video(frames, audio_path: str, fps: int = DEFAULT_FPS) -> str:
    """Combine frames (PIL/tensor/ndarray) + audio into an MP4 file.

    Frames are written as PNGs into a temp dir and muxed with ffmpeg
    (h264 + aac, -shortest so video/audio track lengths match).  Returns
    the output path; cleans up the partial file on failure.
    """
    import cv2

    coerced = _coerce_frames(frames)
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
        out_path = f.name
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            for i, arr in enumerate(coerced):
                cv2.imwrite(os.path.join(tmpdir, f"{i:06d}.png"), cv2.cvtColor(arr, cv2.COLOR_RGB2BGR))
            cmd = [
                "ffmpeg", "-y", "-loglevel", "error",
                "-framerate", str(fps),
                "-i", os.path.join(tmpdir, "%06d.png"),
                "-i", audio_path,
                "-c:v", "libx264", "-preset", "fast", "-crf", "22",
                "-c:a", "aac", "-b:a", "128k",
                "-shortest", "-pix_fmt", "yuv420p",
                out_path,
            ]
            subprocess.run(cmd, check=True, timeout=120)
    except Exception:
        if os.path.exists(out_path):
            try:
                os.unlink(out_path)
            except OSError:
                pass
        raise
    return out_path


# ── TTS ───────────────────────────────────────────────────────────────────────
def _run_tts(text: str, voice_ref: str | None, emotion: float, language: str = "English") -> str:
    """Generate speech WAV.

    Returns temp file path.  Moves the TTS model to CUDA for the call and
    back to CPU in the finally block so GPU memory is released between
    requests.  NOTE(review): `language` is only logged here, not forwarded
    to model.generate — confirm whether the loaded Chatterbox build accepts
    a language argument.
    """
    model = _load_tts()
    log.info("TTS: language=%s text_len=%d emotion=%.2f", language, len(text), emotion)
    model.to("cuda")
    out_path = None
    try:
        wav = model.generate(
            text=text.strip(),
            audio_prompt_path=voice_ref if voice_ref else None,
            exaggeration=float(emotion),
        )
        # torchaudio.save requires 2-D tensor [channels, samples]
        if wav.ndim == 1:
            wav = wav.unsqueeze(0)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            out_path = f.name
        torchaudio.save(out_path, wav, model.sr)
        return out_path
    except Exception:
        if out_path and os.path.exists(out_path):
            try:
                os.unlink(out_path)
            except OSError:
                pass
        raise
    finally:
        model.to("cpu")
        torch.cuda.empty_cache()


# ── EchoMimic ─────────────────────────────────────────────────────────────────
def _run_echomimic(portrait_img, audio_path: str, width: int, height: int,
                   num_steps: int, guidance_scale: float) -> str:
    """Generate talking-head video. Returns MP4 file path.

    Handles the several output shapes the pipeline may produce: an object
    with .frames, one with .videos (tensor or list), or a ready file path.
    """
    pipe, _ = _load_echomimic()
    pipe.to("cuda")
    try:
        output = pipe(
            ref_image=portrait_img,
            audio_path=audio_path,
            width=width,
            height=height,
            num_inference_steps=num_steps,
            guidance_scale=guidance_scale,
            fps=DEFAULT_FPS,
        )
        if hasattr(output, "frames"):
            return _mux_video(output.frames[0], audio_path)
        if hasattr(output, "videos"):
            vid = output.videos[0]
            if hasattr(vid, "unbind"):  # torch tensor → list of frame tensors
                return _mux_video(list(vid.unbind(0)), audio_path)
            return _mux_video(vid, audio_path)
        if isinstance(output, str):  # pipeline already wrote a file
            return output
        raise ValueError(f"Unexpected pipeline output type: {type(output)}")
    finally:
        pipe.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()


# ── Phase 1: Generate video endpoint ─────────────────────────────────────────
@spaces.GPU(duration=120)
def generate(portrait_img, input_mode: str, text: str, tts_language: str, voice_ref, audio_file,
             aspect_ratio: str, emotion: float, num_steps: int, guidance_scale: float, lang: str,
             progress=gr.Progress(track_tqdm=True)):
    """Full generate pipeline: (optional TTS) → EchoMimic → MP4 path.

    Raises gr.Error with localized messages for user-facing validation
    failures; unexpected exceptions are logged and re-raised as a generic
    gr.Error.  The intermediate TTS wav (if any) is always cleaned up.
    """
    t = T.get(lang, T["🇺🇸 English"])
    if portrait_img is None:
        raise gr.Error(t["err_no_portrait"])
    width, height = ASPECT_PRESETS.get(aspect_ratio, (512, 512))
    _tts_tmp: str | None = None
    try:
        if input_mode == "text":
            if not text or not text.strip():
                raise gr.Error(t["err_no_text"])
            if len(text) > MAX_TEXT_LEN:
                raise gr.Error(t["err_text_long"])
            # A stale/removed reference file silently falls back to no cloning.
            if voice_ref and not os.path.exists(voice_ref):
                voice_ref = None
            _tts_tmp = _run_tts(text, voice_ref, emotion, language=tts_language)
            audio_path = _tts_tmp
        else:
            if audio_file is None:
                raise gr.Error(t["err_no_audio"])
            info = torchaudio.info(audio_file)
            if (info.num_frames / info.sample_rate) > MAX_AUDIO_SEC:
                raise gr.Error(t["err_audio_long"])
            audio_path = audio_file
        return _run_echomimic(portrait_img, audio_path, width, height, int(num_steps), float(guidance_scale))
    except torch.cuda.OutOfMemoryError:
        raise gr.Error(t["err_oom"])
    except gr.Error as e:
        log.warning("Generation gr.Error: %s", e)
        raise
    except Exception as e:
        log.error("Generation failed: %s", e, exc_info=True)
        raise gr.Error("Generation failed. Please try different settings or try again.")
    finally:
        if _tts_tmp and os.path.exists(_tts_tmp):
            try:
                os.unlink(_tts_tmp)
            except Exception:
                pass
        torch.cuda.empty_cache()
        gc.collect()


# ── Phase 2: Dubbing endpoint ─────────────────────────────────────────────────
@spaces.GPU(duration=120)
def dub_video(video_input, target_lang: str, voice_ref, emotion: float, lang: str,
              progress=gr.Progress(track_tqdm=True)):
    """Dub a video: extract audio → transcribe → translate → TTS → remux.

    Returns (output_path, transcript_text, translated_text, status).  All
    intermediate files are unlinked in the finally block.
    """
    t = T.get(lang, T["🇺🇸 English"])
    temp_files: list[str] = []
    try:
        if video_input is None:
            raise gr.Error(t["err_no_video"])
        duration = dubbing.get_video_duration(video_input)
        if duration > dubbing.MAX_DUB_AUDIO_SEC:
            raise gr.Error(t["err_video_long"])

        progress(0.10, desc="Extracting audio…")
        audio_path = dubbing.extract_audio(video_input)
        temp_files.append(audio_path)

        progress(0.25, desc="Transcribing…")
        transcript = dubbing.transcribe(audio_path)
        dubbing._unload_whisper()  # free VRAM before translation/TTS
        source_display = transcript.language_display

        if source_display != target_lang:
            progress(0.45, desc="Translating…")
            try:
                translated_text = dubbing.translate(transcript.text, source_display, target_lang)
            except Exception as exc:
                log.error("Translation failed: %s", exc, exc_info=True)
                raise gr.Error(t["err_translate"])
        else:
            translated_text = transcript.text
        if len(translated_text) > MAX_DUB_TEXT_LEN:
            raise gr.Error(t["err_dub_text_long"])

        progress(0.60, desc="Synthesizing speech…")
        if voice_ref and not os.path.exists(voice_ref):
            voice_ref = None
        dubbed_audio = _run_tts(translated_text, voice_ref, emotion, language=target_lang)
        temp_files.append(dubbed_audio)

        progress(0.85, desc="Combining video…")
        output_path = dubbing.mux_dubbed_video(video_input, dubbed_audio)
        status = f"✓ {source_display} → {target_lang} | {duration:.1f}s"
        return output_path, transcript.text, translated_text, status
    except torch.cuda.OutOfMemoryError:
        raise gr.Error(t["err_oom"])
    except gr.Error as e:
        log.warning("Dubbing gr.Error: %s", e)
        raise
    except Exception as e:
        log.error("Dubbing failed: %s", e, exc_info=True)
        raise gr.Error("Dubbing failed. Please try a shorter video or different settings.")
    finally:
        for fp in temp_files:
            if fp and os.path.exists(fp):
                try:
                    os.unlink(fp)
                except Exception:
                    pass
        torch.cuda.empty_cache()
        gc.collect()


# ── Language switcher ─────────────────────────────────────────────────────────
def switch_language(lang: str):
    """Return gr.update()s that relabel the whole UI for the chosen locale."""
    t = T.get(lang, T["🇺🇸 English"])
    mode_choices = [(t["mode_text"], "text"), (t["mode_audio"], "audio")]
    # 26 outputs — must match _lang_out list order below
    return (
        # Phase 1 (16)
        gr.update(label=t["portrait_label"]),
        gr.update(label=t["input_mode_label"], choices=mode_choices, value="text"),
        gr.update(label=t["text_label"], placeholder=t["text_ph"]),
        gr.update(label=t["tts_lang_label"]),
        gr.update(label=t["voice_ref_label"]),
        gr.update(label=t["emotion_label"], info=t["emotion_info"]),
        gr.update(label=t["audio_label"]),
        gr.update(label=t["aspect_label"]),
        gr.update(label=t["advanced"]),
        gr.update(label=t["steps_label"], info=t["steps_info"]),
        gr.update(label=t["guidance_label"], info=t["guidance_info"]),
        gr.update(value=t["generate"]),
        gr.update(value=t["examples_header"]),
        gr.update(visible=True),   # text_group
        gr.update(visible=False),  # audio_group
        gr.update(label=t["output_label"]),
        # Phase 2 (10)
        gr.update(label=t["dub_video_label"]),
        gr.update(label=t["dub_target_label"]),
        gr.update(label=t["dub_voice_label"]),
        gr.update(label=t["dub_emotion_label"]),
        gr.update(value=t["dub_btn"]),
        gr.update(label=t["dub_output_label"]),
        gr.update(label=t["dub_transcript"]),
        gr.update(label=t["dub_translation"]),
        gr.update(label=t["dub_status"]),
        gr.update(label=t["dub_details"]),
    )


def _toggle_input_mode(mode: str, _lang: str):
    """Show the text group or the audio group depending on the input mode."""
    is_text = (mode == "text")
    return gr.update(visible=is_text), gr.update(visible=not is_text)


# ── Interface ─────────────────────────────────────────────────────────────────
# BUGFIX: theme/css belong on the Blocks constructor, not launch() — launch()
# does not accept them.
with gr.Blocks(title="AnimaStudio 🎬", theme=THEME, css=CSS) as demo:
    gr.HTML("""

🎬 AnimaStudio

AI Talking Head Video Creator & Video Dubbing Studio

🎭 Lip Sync 🗣️ 23 TTS Languages 🎙️ Voice Cloning 🎙️ Video Dubbing ⚡ EchoMimic V3 🌐 EN · PT-BR · ES · AR 🤖 MCP Server
""")
    lang_selector = gr.Radio(
        choices=list(T.keys()),
        value="🇺🇸 English",
        label=None,
        container=False,
        elem_id="lang-selector",
    )

    with gr.Tabs():
        # ══ Tab 1: Create Video ════════════════════════════════════════════════
        with gr.Tab("🎬 Create Video", id="tab-create"):
            with gr.Row(equal_height=False):
                with gr.Column(scale=1, min_width=360):
                    portrait = gr.Image(
                        label="Portrait Photo · front-facing face",
                        type="pil",
                        sources=["upload", "webcam"],
                    )
                    input_mode = gr.Radio(
                        choices=[(T["🇺🇸 English"]["mode_text"], "text"),
                                 (T["🇺🇸 English"]["mode_audio"], "audio")],
                        value="text",
                        label="Audio Input",
                    )
                    with gr.Group(visible=True) as text_group:
                        text_input = gr.Textbox(
                            label="Text",
                            placeholder="Type what you want the avatar to say...",
                            lines=4,
                            max_lines=10,
                        )
                        tts_language = gr.Dropdown(choices=TTS_LANGUAGES, value="English", label="Speech Language")
                        # NOTE(review): flattened source made the Row nesting
                        # ambiguous — assumed voice_ref and emotion share it.
                        with gr.Row():
                            voice_ref = gr.Audio(
                                label="Voice Reference (optional — clone voice style)",
                                type="filepath",
                                sources=["upload", "microphone"],
                                format="wav",
                            )
                            emotion = gr.Slider(0.0, 1.0, value=0.5, step=0.05,
                                                label="Emotion Intensity",
                                                info="0 = neutral · 1 = very expressive")
                    with gr.Group(visible=False) as audio_group:
                        audio_upload = gr.Audio(
                            label="Audio File · WAV/MP3/FLAC · max 30 s",
                            type="filepath",
                            sources=["upload", "microphone"],
                            format="wav",
                        )
                    aspect_ratio = gr.Dropdown(choices=list(ASPECT_PRESETS.keys()),
                                               value="◻ 1:1 · 512×512", label="Format")
                    with gr.Accordion("⚙️ Advanced Settings", open=False) as adv_acc:
                        num_steps = gr.Slider(5, 50, value=DEFAULT_STEPS, step=1,
                                              label="Inference Steps",
                                              info="More steps = higher quality, slower")
                        guidance_scale = gr.Slider(1.0, 10.0, value=DEFAULT_CFG, step=0.5,
                                                   label="Guidance Scale",
                                                   info="Higher = follows audio more strictly")
                    gen_btn = gr.Button("🎬 Generate Video", variant="primary", elem_id="gen-btn", size="lg")
                    examples_header = gr.Markdown("### 💡 Try These Examples")
                    gr.Examples(examples=ALL_EXAMPLES_FLAT,
                                inputs=[text_input, tts_language, emotion], label=None)
                with gr.Column(scale=1, min_width=440):
                    output_video = gr.Video(label="Generated Video", format="mp4", autoplay=True,
                                            height=640, elem_id="output-video", buttons=["download"])

        # ══ Tab 2: Dub Video ═══════════════════════════════════════════════════
        with gr.Tab("🎙️ Dub Video", id="tab-dub"):
            with gr.Row(equal_height=False):
                with gr.Column(scale=1, min_width=360):
                    dub_video_input = gr.Video(label="Input Video · max 60 seconds", sources=["upload"])
                    dub_target_lang = gr.Dropdown(choices=TTS_LANGUAGES, value="English", label="Target Language")
                    dub_voice_ref = gr.Audio(label="Voice Reference (optional — clone voice style)",
                                             type="filepath", sources=["upload", "microphone"], format="wav")
                    dub_emotion = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="Emotion Intensity")
                    dub_btn = gr.Button("🎙️ Dub Video", variant="primary", elem_id="dub-btn", size="lg")
                    gr.HTML("""
How it works: Whisper transcribes → NLLB-200 translates → Chatterbox TTS synthesizes → audio replaces original track.
""")
                with gr.Column(scale=1, min_width=440):
                    dub_output_video = gr.Video(label="Dubbed Video", format="mp4", autoplay=True,
                                                height=480, elem_id="dub-output-video", buttons=["download"])
                    with gr.Accordion("Details", open=False) as dub_details_acc:
                        dub_transcript_box = gr.Textbox(label="Detected Transcript", interactive=False, lines=4)
                        dub_translation_box = gr.Textbox(label="Translation", interactive=False, lines=4)
                        dub_status_box = gr.Textbox(label="Status", interactive=False, lines=2)

    gr.HTML(""" """)

    # ── Events ────────────────────────────────────────────────────────────────
    gen_btn.click(
        generate,
        inputs=[portrait, input_mode, text_input, tts_language, voice_ref, audio_upload,
                aspect_ratio, emotion, num_steps, guidance_scale, lang_selector],
        outputs=output_video,
    )
    input_mode.change(_toggle_input_mode, inputs=[input_mode, lang_selector],
                      outputs=[text_group, audio_group])
    dub_btn.click(
        dub_video,
        inputs=[dub_video_input, dub_target_lang, dub_voice_ref, dub_emotion, lang_selector],
        outputs=[dub_output_video, dub_transcript_box, dub_translation_box, dub_status_box],
    )

    # Language switcher — 26 outputs, must match switch_language() return tuple order
    _lang_out = [
        # Phase 1 (16)
        portrait, input_mode, text_input, tts_language, voice_ref, emotion,
        audio_upload, aspect_ratio, adv_acc, num_steps, guidance_scale,
        gen_btn, examples_header, text_group, audio_group, output_video,
        # Phase 2 (10)
        dub_video_input, dub_target_lang, dub_voice_ref, dub_emotion, dub_btn,
        dub_output_video, dub_transcript_box, dub_translation_box, dub_status_box, dub_details_acc,
    ]
    lang_selector.change(switch_language, inputs=lang_selector, outputs=_lang_out)


if __name__ == "__main__":
    demo.queue(max_size=10, default_concurrency_limit=1)
    demo.launch(mcp_server=True, ssr_mode=False)