# AnimaStudio / app.py
# lulavc
# fix: wav shape, float dtype check, extract_audio cleanup, NaN duration, HF token for InferenceClient
# ae3213a
import spaces
import gradio as gr
import torch
import torchaudio
import os
import gc
import sys
import shutil
import tempfile
import subprocess
import threading
import logging
import dubbing
from i18n import T, EXAMPLES, ALL_EXAMPLES_FLAT, TTS_LANGUAGES, MAX_TEXT_LEN, MAX_AUDIO_SEC
from styles import THEME, CSS
# Route log records at INFO level and above to stderr using a
# "timestamp level logger-name: message" layout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    stream=sys.stderr,
)
# Module-level logger used by every function in this file.
log = logging.getLogger(__name__)
# ── Config ────────────────────────────────────────────────────────────────────
# Model identifiers; each can be overridden by the environment variable of the
# same name. NOTE(review): CHATTERBOX_MODEL is not referenced by the code
# visible in this file — confirm whether it is still needed.
ECHOMIMIC_MODEL = os.environ.get("ECHOMIMIC_MODEL", "BadToBest/EchoMimicV3")
CHATTERBOX_MODEL = os.environ.get("CHATTERBOX_MODEL", "ResembleAI/chatterbox")
MAX_DUB_TEXT_LEN = 1500 # ~60s of typical speech at 150 wpm ≈ 900 chars; 1500 is safe headroom
# Maps each "Format" dropdown label to its (width, height) in pixels.
ASPECT_PRESETS = {
    "โ–ฎ 9:16 ยท 576ร—1024": (576, 1024),
    "โ—ป 1:1 ยท 512ร—512": (512, 512),
    "โ–ฌ 16:9 ยท 1024ร—576": (1024, 576),
}
# Initial values for the "Inference Steps" / "Guidance Scale" sliders and the
# frame rate used when generating and muxing video.
DEFAULT_STEPS = 20
DEFAULT_CFG = 3.5
DEFAULT_FPS = 25
# ── Runtime repo installs (avoid PyPI conflicts) ──────────────────────────────
# Git URLs of the upstream projects and the local directories they are cloned
# into; _clone_repo() puts those directories on sys.path.
_ECHOMIMIC_REPO = "https://github.com/antgroup/echomimic_v3.git"
_ECHOMIMIC_DIR = "/tmp/echomimic_v3"
_CHATTERBOX_REPO = "https://github.com/resemble-ai/chatterbox.git"
_CHATTERBOX_DIR = "/tmp/chatterbox"
# Serialises clone attempts so two callers never clone into the same directory
# at the same time.
_clone_lock = threading.Lock()


def _clone_repo(repo_url: str, dest: str, label: str):
    """Thread-safe shallow clone. Uses .git presence to detect complete clones.

    If ``dest/.git`` is missing, anything already at *dest* is removed and the
    repository is cloned with ``--depth=1``. In every case *dest* ends up at
    the front of ``sys.path`` (added once).

    Args:
        repo_url: Git URL to clone.
        dest: Directory that receives the working tree.
        label: Human-readable name used in log messages.

    Raises:
        subprocess.CalledProcessError: ``git clone`` exited with an error.
        subprocess.TimeoutExpired: the clone did not finish within 180 s.
        OSError: the ``git`` executable could not be started.
    """
    with _clone_lock:
        if not os.path.exists(os.path.join(dest, ".git")):
            if os.path.exists(dest):
                shutil.rmtree(dest)
            log.info("Cloning %sโ€ฆ", label)
            try:
                subprocess.run(
                    ["git", "clone", "--depth=1", repo_url, dest],
                    check=True, timeout=180,
                )
            except BaseException:
                # A failed or killed clone can leave a half-populated tree that
                # already contains .git; drop it so the next call retries
                # instead of treating the broken checkout as complete.
                shutil.rmtree(dest, ignore_errors=True)
                raise
            log.info("%s cloned", label)
        if dest not in sys.path:
            sys.path.insert(0, dest)
def _ensure_echomimic_repo():
    """Clone the EchoMimic V3 sources if needed and expose them on ``sys.path``."""
    _clone_repo(
        repo_url=_ECHOMIMIC_REPO,
        dest=_ECHOMIMIC_DIR,
        label="EchoMimic V3",
    )
def _ensure_chatterbox_repo():
    """Clone the Chatterbox TTS sources if needed and expose them on ``sys.path``."""
    _clone_repo(
        repo_url=_CHATTERBOX_REPO,
        dest=_CHATTERBOX_DIR,
        label="Chatterbox TTS",
    )
# ── Model singletons ──────────────────────────────────────────────────────────
# Filled in on first use by _load_tts() / _load_echomimic() and returned as-is
# on later calls.
_tts_model = None
_echo_pipe = None
_echo_mode = None
def _load_tts():
    """Return the shared Chatterbox TTS model, creating it on first use.

    The first call makes the Chatterbox sources importable, instantiates
    ``ChatterboxTTS`` with ``device="cpu"`` and caches it in ``_tts_model``;
    every later call returns the cached object.
    """
    global _tts_model
    if _tts_model is not None:
        return _tts_model

    _ensure_chatterbox_repo()
    # Imported lazily: the package only becomes importable after the clone.
    from chatterbox.tts import ChatterboxTTS

    log.info("Loading Chatterbox TTSโ€ฆ")
    _tts_model = ChatterboxTTS.from_pretrained(device="cpu")
    log.info("Chatterbox TTS ready")
    return _tts_model
def _load_echomimic():
    """Return ``(pipeline, mode)`` for EchoMimic V3, loading it on first use.

    Two loading strategies are tried in order:

    1. ``EchoMimicV3Pipeline`` imported from the cloned upstream repository;
    2. ``diffusers.DiffusionPipeline`` with ``trust_remote_code=True``.

    The first one that succeeds is cached in ``_echo_pipe`` / ``_echo_mode``
    and returned by every later call. Both strategies record the mode
    string ``"local"``.

    Raises:
        RuntimeError: neither strategy worked. The error raised by the last
            attempt is attached as ``__cause__``.
    """
    global _echo_pipe, _echo_mode
    if _echo_pipe is not None:
        return _echo_pipe, _echo_mode

    try:
        _ensure_echomimic_repo()
        from echomimic_v3.pipelines.pipeline_echomimic_v3 import EchoMimicV3Pipeline
        log.info("Loading EchoMimic V3 (local)โ€ฆ")
        _echo_pipe = EchoMimicV3Pipeline.from_pretrained(ECHOMIMIC_MODEL, torch_dtype=torch.float16)
        _echo_mode = "local"
        log.info("EchoMimic V3 ready (local)")
        return _echo_pipe, _echo_mode
    except Exception as e:
        log.warning("EchoMimic V3 local import failed: %s", e)

    try:
        from diffusers import DiffusionPipeline
        log.info("Loading EchoMimic V3 via diffusersโ€ฆ")
        # NOTE(review): trust_remote_code=True lets the model repository named
        # by ECHOMIMIC_MODEL supply code that is executed here.
        _echo_pipe = DiffusionPipeline.from_pretrained(
            ECHOMIMIC_MODEL, torch_dtype=torch.float16, trust_remote_code=True,
        )
        _echo_mode = "local"
        log.info("EchoMimic V3 ready (diffusers)")
        return _echo_pipe, _echo_mode
    except Exception as e:
        log.warning("EchoMimic V3 diffusers load failed: %s", e)
        # Keep a reference: ``e`` is unbound once the except block ends.
        last_error = e

    raise RuntimeError(
        "EchoMimic V3 could not be loaded. Check requirements and model availability."
    ) from last_error
# ── Video utilities ───────────────────────────────────────────────────────────
def _coerce_frames(frames):
    """Normalise pipeline output to a list of (H, W, 3) uint8 numpy arrays.

    Accepted frame types:

    * objects with a ``save`` attribute (treated as PIL images) are
      converted to RGB;
    * objects with a ``cpu`` attribute (treated as torch tensors) are
      detached and moved to the CPU; a 3-D tensor whose first axis has size
      1, 3 or 4 is treated as channel-first and transposed to (H, W, C);
    * anything else is passed through ``numpy.array``.

    Data that is not already uint8 is clipped to 0..255 and cast. Floating
    point data whose maximum is <= 1.0 is scaled by 255 first. Grayscale
    frames, shaped (H, W) or (H, W, 1), are replicated to three channels and
    the fourth channel of 4-channel frames is dropped.

    Args:
        frames: Iterable of frames in any of the forms above.

    Returns:
        A new list holding one array per input frame, in input order.
    """
    import numpy as np

    result = []
    for frame in frames:
        if hasattr(frame, "save"):
            arr = np.array(frame.convert("RGB"))
        elif hasattr(frame, "cpu"):
            tensor = frame.detach().cpu()
            # Only upcast real floating-point tensors. Casting integer
            # tensors to float would make a dark 0/1-valued uint8 frame look
            # like normalised data and get scaled by 255 below.
            if tensor.is_floating_point():
                tensor = tensor.float()  # covers dtypes numpy cannot hold, e.g. bfloat16
            arr = tensor.numpy()
            if arr.ndim == 3 and arr.shape[0] in (1, 3, 4):
                arr = arr.transpose(1, 2, 0)
        else:
            arr = np.array(frame)

        if arr.dtype != np.uint8:
            if arr.dtype.kind == "f" and arr.size and arr.max() <= 1.0:
                arr = arr * 255
            # Clip before casting so out-of-range values saturate rather
            # than wrap around.
            arr = np.clip(arr, 0, 255).astype(np.uint8)

        if arr.ndim == 2:
            arr = np.repeat(arr[:, :, None], 3, axis=2)
        elif arr.ndim == 3 and arr.shape[2] == 1:
            arr = np.repeat(arr, 3, axis=2)
        elif arr.ndim == 3 and arr.shape[2] == 4:
            arr = arr[:, :, :3]
        # Transposed / sliced arrays are views; hand out a dense copy instead.
        result.append(np.ascontiguousarray(arr))
    return result
def _mux_video(frames, audio_path: str, fps: int = DEFAULT_FPS) -> str:
    """Combine frames (PIL/tensor/ndarray) + audio into an MP4 file.

    Frames are normalised with ``_coerce_frames``, written as numbered PNG
    files into a temporary directory and encoded by ffmpeg together with
    *audio_path* (libx264 video, AAC audio, ``-shortest``).

    Args:
        frames: Iterable of frames accepted by ``_coerce_frames``.
        audio_path: Audio file to use as the sound track.
        fps: Frame rate of the image sequence.

    Returns:
        Path of the created ``.mp4``. The file is created with
        ``delete=False``, so the caller is responsible for it.

    Raises:
        ValueError: *frames* is empty.
        OSError: a frame image could not be written.
        subprocess.CalledProcessError: ffmpeg exited with an error.
        subprocess.TimeoutExpired: ffmpeg ran longer than 120 s.
    """
    import cv2

    coerced = _coerce_frames(frames)
    if not coerced:
        # Fail early with a clear message instead of letting ffmpeg complain
        # about a missing input sequence.
        raise ValueError("No frames to encode")

    # Only the name is needed here; ffmpeg overwrites the file (-y).
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
        out_path = f.name
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            for i, arr in enumerate(coerced):
                frame_path = os.path.join(tmpdir, f"{i:06d}.png")
                # imwrite signals failure through its return value rather
                # than by raising.
                if not cv2.imwrite(frame_path, cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)):
                    raise OSError(f"Could not write frame {i} to {frame_path}")
            cmd = [
                "ffmpeg", "-y", "-loglevel", "error",
                "-framerate", str(fps),
                "-i", os.path.join(tmpdir, "%06d.png"),
                "-i", audio_path,
                "-c:v", "libx264", "-preset", "fast", "-crf", "22",
                "-c:a", "aac", "-b:a", "128k",
                "-shortest", "-pix_fmt", "yuv420p",
                out_path,
            ]
            subprocess.run(cmd, check=True, timeout=120)
    except Exception:
        # Do not leave an empty or partial output file behind.
        try:
            os.unlink(out_path)
        except OSError:
            pass
        raise
    return out_path
# ── TTS ───────────────────────────────────────────────────────────────────────
def _run_tts(text: str, voice_ref: str | None, emotion: float, language: str = "English") -> str:
    """Generate speech WAV. Returns temp file path.

    The shared TTS model is moved to the GPU for the call and moved back to
    the CPU afterwards, whether or not synthesis succeeded.

    Args:
        text: Text to speak; surrounding whitespace is stripped.
        voice_ref: Optional path of a reference recording, forwarded as
            ``audio_prompt_path``. Falsy values become ``None``.
        emotion: Forwarded to the model as ``exaggeration``.
        language: Only written to the log here; it is not forwarded to
            ``model.generate``.

    Returns:
        Path of a ``.wav`` file created with ``delete=False``; the caller
        must remove it.
    """
    model = _load_tts()
    log.info("TTS: language=%s text_len=%d emotion=%.2f", language, len(text), emotion)
    out_path = None
    try:
        # Inside the try so the finally below still restores the model when
        # the transfer itself fails part-way.
        model.to("cuda")
        wav = model.generate(
            text=text.strip(),
            audio_prompt_path=voice_ref or None,
            exaggeration=float(emotion),
        )
        # torchaudio.save requires 2-D tensor [channels, samples]
        if wav.ndim == 1:
            wav = wav.unsqueeze(0)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            out_path = f.name
        # Write from a detached CPU tensor so the writer never receives a
        # CUDA or autograd-tracked tensor.
        torchaudio.save(out_path, wav.detach().cpu(), model.sr)
        return out_path
    except Exception:
        if out_path is not None:
            try:
                os.unlink(out_path)
            except OSError:
                pass
        raise
    finally:
        model.to("cpu")
        torch.cuda.empty_cache()
# ── EchoMimic ─────────────────────────────────────────────────────────────────
def _run_echomimic(portrait_img, audio_path: str, width: int, height: int,
                   num_steps: int, guidance_scale: float) -> str:
    """Generate talking-head video. Returns MP4 file path.

    The pipeline is moved to the GPU for the call and back to the CPU
    afterwards, whether or not generation succeeded.

    Three result shapes are handled, checked in this order:

    * an object with ``frames``: ``frames[0]`` is muxed with the audio;
    * an object with ``videos``: ``videos[0]`` is muxed, split along axis 0
      first when it has ``unbind`` (assumes axis 0 is time — TODO confirm);
    * a plain string, returned unchanged as the video path.

    Args:
        portrait_img: Reference image, passed as ``ref_image``.
        audio_path: Driving audio; also used as the sound track.
        width: Output width passed to the pipeline.
        height: Output height passed to the pipeline.
        num_steps: Passed as ``num_inference_steps``.
        guidance_scale: Passed as ``guidance_scale``.

    Raises:
        ValueError: the pipeline returned none of the shapes above.
    """
    pipe, _ = _load_echomimic()
    try:
        # Inside the try so the finally below still restores the pipeline
        # when the transfer itself fails part-way.
        pipe.to("cuda")
        output = pipe(
            ref_image=portrait_img,
            audio_path=audio_path,
            width=width,
            height=height,
            num_inference_steps=num_steps,
            guidance_scale=guidance_scale,
            fps=DEFAULT_FPS,
        )
        if hasattr(output, "frames"):
            return _mux_video(output.frames[0], audio_path)
        if hasattr(output, "videos"):
            vid = output.videos[0]
            if hasattr(vid, "unbind"):
                return _mux_video(list(vid.unbind(0)), audio_path)
            return _mux_video(vid, audio_path)
        if isinstance(output, str):
            return output
        raise ValueError(f"Unexpected pipeline output type: {type(output)}")
    finally:
        pipe.to("cpu")
        # Collect garbage first so tensors freed by it can be released from
        # the CUDA cache by the call that follows.
        gc.collect()
        torch.cuda.empty_cache()
# ── Phase 1: Generate video endpoint ──────────────────────────────────────────
@spaces.GPU(duration=120)
def generate(portrait_img, input_mode: str, text: str, tts_language: str,
             voice_ref, audio_file, aspect_ratio: str, emotion: float,
             num_steps: int, guidance_scale: float, lang: str,
             progress=gr.Progress(track_tqdm=True)):
    """Create a talking-head video from a portrait plus text or audio.

    In ``"text"`` mode the text is validated, synthesised with ``_run_tts``
    and the resulting temporary WAV drives the animation. In any other mode
    the uploaded *audio_file* is used after its duration has been checked
    against ``MAX_AUDIO_SEC``.

    Args:
        portrait_img: Portrait image; required.
        input_mode: ``"text"`` selects TTS, anything else the uploaded audio.
        text: Text to speak (text mode).
        tts_language: Language forwarded to ``_run_tts``.
        voice_ref: Optional voice reference path; ignored when the file does
            not exist.
        audio_file: Path of the uploaded audio (audio mode).
        aspect_ratio: Key into ``ASPECT_PRESETS``; unknown keys give 512x512.
        emotion: Emotion intensity forwarded to ``_run_tts``.
        num_steps: Inference steps (cast to ``int``).
        guidance_scale: Guidance scale (cast to ``float``).
        lang: UI language key selecting the message table in ``T``; unknown
            keys fall back to the English table.
        progress: Gradio progress tracker; not referenced in the body.

    Returns:
        Path of the generated MP4.

    Raises:
        gr.Error: on invalid input, GPU out-of-memory or any other failure.
    """
    t = T.get(lang, T["๐Ÿ‡บ๐Ÿ‡ธ English"])
    if portrait_img is None:
        raise gr.Error(t["err_no_portrait"])
    width, height = ASPECT_PRESETS.get(aspect_ratio, (512, 512))
    _tts_tmp: str | None = None
    try:
        if input_mode == "text":
            if not text or not text.strip():
                raise gr.Error(t["err_no_text"])
            if len(text) > MAX_TEXT_LEN:
                raise gr.Error(t["err_text_long"])
            if voice_ref and not os.path.exists(voice_ref):
                voice_ref = None
            _tts_tmp = _run_tts(text, voice_ref, emotion, language=tts_language)
            audio_path = _tts_tmp
        else:
            if audio_file is None:
                raise gr.Error(t["err_no_audio"])
            info = torchaudio.info(audio_file)
            if (info.num_frames / info.sample_rate) > MAX_AUDIO_SEC:
                raise gr.Error(t["err_audio_long"])
            audio_path = audio_file
        return _run_echomimic(portrait_img, audio_path, width, height, int(num_steps), float(guidance_scale))
    except torch.cuda.OutOfMemoryError as e:
        raise gr.Error(t["err_oom"]) from e
    except gr.Error as e:
        log.warning("Generation gr.Error: %s", e)
        raise
    except Exception as e:
        log.error("Generation failed: %s", e, exc_info=True)
        raise gr.Error("Generation failed. Please try different settings or try again.") from e
    finally:
        # Best-effort removal of the synthesised WAV; only filesystem errors
        # are ignored, as in the other cleanup paths of this file.
        if _tts_tmp:
            try:
                os.unlink(_tts_tmp)
            except OSError:
                pass
        # Collect garbage first so tensors freed by it can be released from
        # the CUDA cache by the call that follows.
        gc.collect()
        torch.cuda.empty_cache()
# ── Phase 2: Dubbing endpoint ─────────────────────────────────────────────────
@spaces.GPU(duration=120)
def dub_video(video_input, target_lang: str, voice_ref, emotion: float, lang: str,
              progress=gr.Progress(track_tqdm=True)):
    """Replace the speech in a video with synthesised speech in *target_lang*.

    Steps: check the duration, extract the audio, transcribe it, translate
    the transcript when the detected language differs from *target_lang*,
    synthesise the result with ``_run_tts`` and mux it back onto the video.
    The extracted audio and the synthesised WAV are deleted before returning.

    Args:
        video_input: Path of the uploaded video; required.
        target_lang: Display name of the language to dub into.
        voice_ref: Optional voice reference path; ignored when the file does
            not exist.
        emotion: Emotion intensity forwarded to ``_run_tts``.
        lang: UI language key selecting the message table in ``T``; unknown
            keys fall back to the English table.
        progress: Gradio progress tracker, updated before each step.

    Returns:
        ``(output_path, transcript_text, translated_text, status)``.

    Raises:
        gr.Error: on invalid input, translation failure, GPU out-of-memory
            or any other failure.
    """
    t = T.get(lang, T["๐Ÿ‡บ๐Ÿ‡ธ English"])
    temp_files: list[str] = []
    try:
        if video_input is None:
            raise gr.Error(t["err_no_video"])
        duration = dubbing.get_video_duration(video_input)
        if duration > dubbing.MAX_DUB_AUDIO_SEC:
            raise gr.Error(t["err_video_long"])
        progress(0.10, desc="Extracting audioโ€ฆ")
        audio_path = dubbing.extract_audio(video_input)
        temp_files.append(audio_path)
        progress(0.25, desc="Transcribingโ€ฆ")
        transcript = dubbing.transcribe(audio_path)
        dubbing._unload_whisper()
        source_display = transcript.language_display
        if source_display != target_lang:
            progress(0.45, desc="Translatingโ€ฆ")
            try:
                translated_text = dubbing.translate(transcript.text, source_display, target_lang)
            except Exception as exc:
                log.error("Translation failed: %s", exc, exc_info=True)
                raise gr.Error(t["err_translate"]) from exc
        else:
            # Already in the target language: speak the transcript as-is.
            translated_text = transcript.text
        if len(translated_text) > MAX_DUB_TEXT_LEN:
            raise gr.Error(t["err_dub_text_long"])
        progress(0.60, desc="Synthesizing speechโ€ฆ")
        if voice_ref and not os.path.exists(voice_ref):
            voice_ref = None
        dubbed_audio = _run_tts(translated_text, voice_ref, emotion, language=target_lang)
        temp_files.append(dubbed_audio)
        progress(0.85, desc="Combining videoโ€ฆ")
        output_path = dubbing.mux_dubbed_video(video_input, dubbed_audio)
        status = f"โœ“ {source_display} โ†’ {target_lang} | {duration:.1f}s"
        return output_path, transcript.text, translated_text, status
    except torch.cuda.OutOfMemoryError as e:
        raise gr.Error(t["err_oom"]) from e
    except gr.Error as e:
        log.warning("Dubbing gr.Error: %s", e)
        raise
    except Exception as e:
        log.error("Dubbing failed: %s", e, exc_info=True)
        raise gr.Error("Dubbing failed. Please try a shorter video or different settings.") from e
    finally:
        # Best-effort removal of intermediate files; only filesystem errors
        # are ignored, as in the other cleanup paths of this file.
        for fp in temp_files:
            if fp:
                try:
                    os.unlink(fp)
                except OSError:
                    pass
        # Collect garbage first so tensors freed by it can be released from
        # the CUDA cache by the call that follows.
        gc.collect()
        torch.cuda.empty_cache()
# ── Language switcher ─────────────────────────────────────────────────────────
def switch_language(lang: str):
    """Build the component updates that re-label the UI for *lang*.

    Unknown language keys fall back to the English table. Besides changing
    labels, the input mode is reset to ``"text"``, so the text group is
    shown and the audio group hidden.

    Returns:
        A tuple of 26 ``gr.update`` objects: 16 for the "create" tab followed
        by 10 for the "dub" tab, in the order of the ``_lang_out`` list.
    """
    t = T.get(lang, T["๐Ÿ‡บ๐Ÿ‡ธ English"])
    upd = gr.update
    input_modes = [(t["mode_text"], "text"), (t["mode_audio"], "audio")]

    # 26 outputs — must match _lang_out list order below
    # Phase 1 (16)
    create_tab = (
        upd(label=t["portrait_label"]),
        upd(label=t["input_mode_label"], choices=input_modes, value="text"),
        upd(label=t["text_label"], placeholder=t["text_ph"]),
        upd(label=t["tts_lang_label"]),
        upd(label=t["voice_ref_label"]),
        upd(label=t["emotion_label"], info=t["emotion_info"]),
        upd(label=t["audio_label"]),
        upd(label=t["aspect_label"]),
        upd(label=t["advanced"]),
        upd(label=t["steps_label"], info=t["steps_info"]),
        upd(label=t["guidance_label"], info=t["guidance_info"]),
        upd(value=t["generate"]),
        upd(value=t["examples_header"]),
        upd(visible=True),   # text_group
        upd(visible=False),  # audio_group
        upd(label=t["output_label"]),
    )
    # Phase 2 (10)
    dub_tab = (
        upd(label=t["dub_video_label"]),
        upd(label=t["dub_target_label"]),
        upd(label=t["dub_voice_label"]),
        upd(label=t["dub_emotion_label"]),
        upd(value=t["dub_btn"]),
        upd(label=t["dub_output_label"]),
        upd(label=t["dub_transcript"]),
        upd(label=t["dub_translation"]),
        upd(label=t["dub_status"]),
        upd(label=t["dub_details"]),
    )
    return create_tab + dub_tab
def _toggle_input_mode(mode: str, _lang: str):
    """Return visibility updates for the (text group, audio group) pair.

    The text group is visible only when *mode* is ``"text"``; the audio
    group is visible in every other mode. *_lang* is accepted but unused.
    """
    show_text = mode == "text"
    text_group_update = gr.update(visible=show_text)
    audio_group_update = gr.update(visible=not show_text)
    return text_group_update, audio_group_update
# ── Interface ─────────────────────────────────────────────────────────────────
# Builds the whole UI: a header, the UI-language selector, a "create video"
# tab, a "dub video" tab and a footer, followed by the event wiring.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from the order of the calls — confirm it against the intended layout.
with gr.Blocks(title="AnimaStudio ๐ŸŽฌ") as demo:
    # Static page header with feature badges.
    gr.HTML("""
<div class="as-header">
<h1>๐ŸŽฌ AnimaStudio</h1>
<p class="tagline">AI Talking Head Video Creator &amp; Video Dubbing Studio</p>
<div class="badges">
<span class="badge badge-purple">๐ŸŽญ Lip Sync</span>
<span class="badge badge-pink">๐Ÿ—ฃ๏ธ 23 TTS Languages</span>
<span class="badge badge-cyan">๐ŸŽ™๏ธ Voice Cloning</span>
<span class="badge badge-teal">๐ŸŽ™๏ธ Video Dubbing</span>
<span class="badge">โšก EchoMimic V3</span>
<span class="badge badge-gold">๐ŸŒ EN ยท PT-BR ยท ES ยท AR</span>
<span class="badge">๐Ÿค– MCP Server</span>
</div>
</div>
""")
    # UI language selector; its choices are the keys of the translation table T.
    lang_selector = gr.Radio(
        choices=list(T.keys()),
        value="๐Ÿ‡บ๐Ÿ‡ธ English",
        label=None,
        container=False,
        elem_id="lang-selector",
    )
    with gr.Tabs():
        # ══ Tab 1: Create Video ══════════════════════════════════════════════
        with gr.Tab("๐ŸŽฌ Create Video", id="tab-create"):
            with gr.Row(equal_height=False):
                # Left column: all inputs for generate().
                with gr.Column(scale=1, min_width=360):
                    portrait = gr.Image(
                        label="Portrait Photo ยท front-facing face",
                        type="pil",
                        sources=["upload", "webcam"],
                    )
                    # Chooses between typing text (TTS) and supplying audio.
                    input_mode = gr.Radio(
                        choices=[(T["๐Ÿ‡บ๐Ÿ‡ธ English"]["mode_text"], "text"),
                                 (T["๐Ÿ‡บ๐Ÿ‡ธ English"]["mode_audio"], "audio")],
                        value="text",
                        label="Audio Input",
                    )
                    # Shown in "text" mode (see _toggle_input_mode).
                    with gr.Group(visible=True) as text_group:
                        text_input = gr.Textbox(
                            label="Text",
                            placeholder="Type what you want the avatar to say...",
                            lines=4, max_lines=10,
                        )
                        tts_language = gr.Dropdown(choices=TTS_LANGUAGES, value="English", label="Speech Language")
                        with gr.Row():
                            voice_ref = gr.Audio(
                                label="Voice Reference (optional โ€” clone voice style)",
                                type="filepath", sources=["upload", "microphone"],
                                format="wav",
                            )
                            emotion = gr.Slider(0.0, 1.0, value=0.5, step=0.05,
                                                label="Emotion Intensity", info="0 = neutral ยท 1 = very expressive")
                    # Shown in "audio" mode (see _toggle_input_mode).
                    with gr.Group(visible=False) as audio_group:
                        audio_upload = gr.Audio(
                            label="Audio File ยท WAV/MP3/FLAC ยท max 30 s",
                            type="filepath", sources=["upload", "microphone"],
                            format="wav",
                        )
                    # Choices are the keys of ASPECT_PRESETS.
                    aspect_ratio = gr.Dropdown(choices=list(ASPECT_PRESETS.keys()),
                                               value="โ—ป 1:1 ยท 512ร—512", label="Format")
                    with gr.Accordion("โš™๏ธ Advanced Settings", open=False) as adv_acc:
                        num_steps = gr.Slider(5, 50, value=DEFAULT_STEPS, step=1,
                                              label="Inference Steps", info="More steps = higher quality, slower")
                        guidance_scale = gr.Slider(1.0, 10.0, value=DEFAULT_CFG, step=0.5,
                                                   label="Guidance Scale", info="Higher = follows audio more strictly")
                    gen_btn = gr.Button("๐ŸŽฌ Generate Video", variant="primary", elem_id="gen-btn", size="lg")
                    examples_header = gr.Markdown("### ๐Ÿ’ก Try These Examples")
                    # Each example fills the text, speech language and emotion inputs.
                    gr.Examples(examples=ALL_EXAMPLES_FLAT, inputs=[text_input, tts_language, emotion], label=None)
                # Right column: the generated video.
                with gr.Column(scale=1, min_width=440):
                    output_video = gr.Video(label="Generated Video", format="mp4", autoplay=True,
                                            height=640, elem_id="output-video", buttons=["download"])
        # ══ Tab 2: Dub Video ═════════════════════════════════════════════════
        with gr.Tab("๐ŸŽ™๏ธ Dub Video", id="tab-dub"):
            with gr.Row(equal_height=False):
                # Left column: all inputs for dub_video().
                with gr.Column(scale=1, min_width=360):
                    dub_video_input = gr.Video(label="Input Video ยท max 60 seconds",
                                               sources=["upload"])
                    dub_target_lang = gr.Dropdown(choices=TTS_LANGUAGES, value="English", label="Target Language")
                    dub_voice_ref = gr.Audio(label="Voice Reference (optional โ€” clone voice style)",
                                             type="filepath", sources=["upload", "microphone"],
                                             format="wav")
                    dub_emotion = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="Emotion Intensity")
                    dub_btn = gr.Button("๐ŸŽ™๏ธ Dub Video", variant="primary", elem_id="dub-btn", size="lg")
                    # Short explanation of the dubbing pipeline.
                    gr.HTML("""
<div style="color:#94a3b8;font-size:0.82rem;margin-top:0.5rem;padding:0.75rem;
background:rgba(6,182,212,0.05);border-radius:0.5rem;
border:1px solid rgba(6,182,212,0.15);">
<strong>How it works:</strong> Whisper transcribes โ†’ NLLB-200 translates โ†’
Chatterbox TTS synthesizes โ†’ audio replaces original track.
</div>
""")
                # Right column: dubbed video plus the text outputs of dub_video().
                with gr.Column(scale=1, min_width=440):
                    dub_output_video = gr.Video(label="Dubbed Video", format="mp4", autoplay=True,
                                                height=480, elem_id="dub-output-video", buttons=["download"])
                    with gr.Accordion("Details", open=False) as dub_details_acc:
                        dub_transcript_box = gr.Textbox(label="Detected Transcript", interactive=False, lines=4)
                        dub_translation_box = gr.Textbox(label="Translation", interactive=False, lines=4)
                        dub_status_box = gr.Textbox(label="Status", interactive=False, lines=2)
    # Static footer with model credits.
    gr.HTML("""
<div class="as-footer">
<strong>Models:</strong>
<a href="https://huggingface.co/BadToBest/EchoMimicV3" target="_blank">EchoMimic V3</a>
(Apache 2.0) &nbsp;ยท&nbsp;
<a href="https://huggingface.co/ResembleAI/chatterbox" target="_blank">Chatterbox TTS</a>
(MIT) &nbsp;ยท&nbsp;
<a href="https://huggingface.co/openai/whisper-large-v3-turbo" target="_blank">Whisper Turbo</a>
(MIT) &nbsp;ยท&nbsp;
<a href="https://huggingface.co/facebook/nllb-200-distilled-600M" target="_blank">NLLB-200</a>
(CC-BY-NC) &nbsp;ยท&nbsp;
<strong>Space by:</strong>
<a href="https://huggingface.co/lulavc" target="_blank">lulavc</a>
&nbsp;ยท&nbsp; ZeroGPU &nbsp;ยท&nbsp; A10G
</div>
""")
    # ── Events ────────────────────────────────────────────────────────────────
    # The inputs list follows the positional parameter order of generate().
    gen_btn.click(
        generate,
        inputs=[portrait, input_mode, text_input, tts_language,
                voice_ref, audio_upload, aspect_ratio, emotion,
                num_steps, guidance_scale, lang_selector],
        outputs=output_video,
    )
    # Swap the text / audio input groups when the mode changes.
    input_mode.change(_toggle_input_mode, inputs=[input_mode, lang_selector],
                      outputs=[text_group, audio_group])
    # The inputs list follows the positional parameter order of dub_video();
    # the outputs match its 4-tuple return value.
    dub_btn.click(
        dub_video,
        inputs=[dub_video_input, dub_target_lang, dub_voice_ref, dub_emotion, lang_selector],
        outputs=[dub_output_video, dub_transcript_box, dub_translation_box, dub_status_box],
    )
    # Language switcher — 26 outputs, must match switch_language() return tuple order
    _lang_out = [
        # Phase 1 (16)
        portrait, input_mode, text_input, tts_language,
        voice_ref, emotion, audio_upload, aspect_ratio,
        adv_acc, num_steps, guidance_scale, gen_btn, examples_header,
        text_group, audio_group, output_video,
        # Phase 2 (10)
        dub_video_input, dub_target_lang, dub_voice_ref,
        dub_emotion, dub_btn, dub_output_video,
        dub_transcript_box, dub_translation_box,
        dub_status_box, dub_details_acc,
    ]
    lang_selector.change(switch_language, inputs=lang_selector, outputs=_lang_out)
# Script entry point: configure the request queue, then start the app.
if __name__ == "__main__":
    # max_size / default_concurrency_limit are passed as 10 and 1.
    demo.queue(max_size=10, default_concurrency_limit=1)
    # Theme and CSS come from the local `styles` module.
    demo.launch(theme=THEME, css=CSS, mcp_server=True, ssr_mode=False)