Spaces:
Running on Zero
Running on Zero
lulavc
fix: wav shape, float dtype check, extract_audio cleanup, NaN duration, HF token for InferenceClient
ae3213a | import spaces | |
| import gradio as gr | |
| import torch | |
| import torchaudio | |
| import os | |
| import gc | |
| import sys | |
| import shutil | |
| import tempfile | |
| import subprocess | |
| import threading | |
| import logging | |
| import dubbing | |
| from i18n import T, EXAMPLES, ALL_EXAMPLES_FLAT, TTS_LANGUAGES, MAX_TEXT_LEN, MAX_AUDIO_SEC | |
| from styles import THEME, CSS | |
# Route INFO-and-above log records to stderr with a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    stream=sys.stderr,
)
# Module-level logger used by the functions in this file.
log = logging.getLogger(__name__)
# ── Config ────────────────────────────────────────────────────────────────────
# Model identifiers; each can be overridden through an environment variable.
ECHOMIMIC_MODEL = os.environ.get("ECHOMIMIC_MODEL", "BadToBest/EchoMimicV3")
CHATTERBOX_MODEL = os.environ.get("CHATTERBOX_MODEL", "ResembleAI/chatterbox")
MAX_DUB_TEXT_LEN = 1500  # ~60s of typical speech at 150 wpm ≈ 900 chars; 1500 is safe headroom
# Maps the aspect-ratio dropdown label to an output (width, height) in pixels.
ASPECT_PRESETS = {
    "โฎ 9:16 ยท 576ร1024": (576, 1024),
    "โป 1:1 ยท 512ร512": (512, 512),
    "โฌ 16:9 ยท 1024ร576": (1024, 576),
}
# Defaults for the advanced sliders and for the frame rate used when muxing.
DEFAULT_STEPS = 20
DEFAULT_CFG = 3.5
DEFAULT_FPS = 25
# ── Runtime repo installs (avoid PyPI conflicts) ──────────────────────────────
# Git URL and local checkout directory for each repository cloned at runtime.
_ECHOMIMIC_REPO = "https://github.com/antgroup/echomimic_v3.git"
_ECHOMIMIC_DIR = "/tmp/echomimic_v3"
_CHATTERBOX_REPO = "https://github.com/resemble-ai/chatterbox.git"
_CHATTERBOX_DIR = "/tmp/chatterbox"
| _clone_lock = threading.Lock() | |
| def _clone_repo(repo_url: str, dest: str, label: str): | |
| """Thread-safe shallow clone. Uses .git presence to detect complete clones.""" | |
| with _clone_lock: | |
| if not os.path.exists(os.path.join(dest, ".git")): | |
| if os.path.exists(dest): | |
| shutil.rmtree(dest) | |
| log.info("Cloning %sโฆ", label) | |
| subprocess.run( | |
| ["git", "clone", "--depth=1", repo_url, dest], | |
| check=True, timeout=180, | |
| ) | |
| log.info("%s cloned", label) | |
| if dest not in sys.path: | |
| sys.path.insert(0, dest) | |
def _ensure_echomimic_repo():
    """Make sure the EchoMimic V3 checkout exists and is on ``sys.path``."""
    _clone_repo(repo_url=_ECHOMIMIC_REPO, dest=_ECHOMIMIC_DIR, label="EchoMimic V3")
def _ensure_chatterbox_repo():
    """Make sure the Chatterbox TTS checkout exists and is on ``sys.path``."""
    _clone_repo(repo_url=_CHATTERBOX_REPO, dest=_CHATTERBOX_DIR, label="Chatterbox TTS")
# ── Model singletons ──────────────────────────────────────────────────────────
# Lazily populated by _load_tts() / _load_echomimic(); None until first use.
_tts_model = None
_echo_pipe = None
_echo_mode = None
def _load_tts():
    """Return the cached Chatterbox TTS model, loading it on CPU on first call."""
    global _tts_model
    if _tts_model is not None:
        return _tts_model

    # The package only becomes importable once its repo is on sys.path.
    _ensure_chatterbox_repo()
    from chatterbox.tts import ChatterboxTTS

    log.info("Loading Chatterbox TTSโฆ")
    _tts_model = ChatterboxTTS.from_pretrained(device="cpu")
    log.info("Chatterbox TTS ready")
    return _tts_model
def _load_echomimic():
    """Return ``(pipeline, mode)`` for EchoMimic V3, loading it on first call.

    Two strategies are tried in order: the pipeline class from the cloned
    EchoMimic repository, then a generic ``diffusers.DiffusionPipeline`` with
    ``trust_remote_code=True``. The result is cached in module globals.

    Returns:
        Tuple of the loaded pipeline and the mode string (``"local"`` for
        both strategies).

    Raises:
        RuntimeError: if both strategies fail. The last underlying error is
            attached as ``__cause__`` so the real reason is not lost.
    """
    global _echo_pipe, _echo_mode
    if _echo_pipe is not None:
        return _echo_pipe, _echo_mode

    last_error = None

    # Strategy 1: pipeline class shipped in the cloned repository.
    try:
        _ensure_echomimic_repo()
        from echomimic_v3.pipelines.pipeline_echomimic_v3 import EchoMimicV3Pipeline
        log.info("Loading EchoMimic V3 (local)โฆ")
        _echo_pipe = EchoMimicV3Pipeline.from_pretrained(ECHOMIMIC_MODEL, torch_dtype=torch.float16)
        _echo_mode = "local"
        log.info("EchoMimic V3 ready (local)")
        return _echo_pipe, _echo_mode
    except Exception as e:
        last_error = e
        log.warning("EchoMimic V3 local import failed: %s", e)

    # Strategy 2: let diffusers resolve the pipeline from the model repo.
    try:
        from diffusers import DiffusionPipeline
        log.info("Loading EchoMimic V3 via diffusersโฆ")
        _echo_pipe = DiffusionPipeline.from_pretrained(
            ECHOMIMIC_MODEL, torch_dtype=torch.float16, trust_remote_code=True,
        )
        _echo_mode = "local"
        log.info("EchoMimic V3 ready (diffusers)")
        return _echo_pipe, _echo_mode
    except Exception as e:
        last_error = e
        log.warning("EchoMimic V3 diffusers load failed: %s", e)

    raise RuntimeError(
        "EchoMimic V3 could not be loaded. Check requirements and model availability."
    ) from last_error
# ── Video utilities ───────────────────────────────────────────────────────────
| def _coerce_frames(frames): | |
| """Normalise pipeline output to a list of (H, W, 3) uint8 numpy arrays.""" | |
| import numpy as np | |
| result = [] | |
| for frame in frames: | |
| if hasattr(frame, "save"): | |
| arr = np.array(frame.convert("RGB")) | |
| elif hasattr(frame, "cpu"): | |
| arr = frame.cpu().float().numpy() | |
| if arr.ndim == 3 and arr.shape[0] in (1, 3, 4): | |
| arr = arr.transpose(1, 2, 0) | |
| if arr.dtype.kind == 'f' and arr.max() <= 1.0: | |
| arr = (arr * 255).clip(0, 255) | |
| arr = arr.astype(np.uint8) | |
| else: | |
| arr = np.array(frame) | |
| if arr.ndim == 2: | |
| import cv2 | |
| arr = cv2.cvtColor(arr, cv2.COLOR_GRAY2RGB) | |
| elif arr.ndim == 3 and arr.shape[2] == 4: | |
| arr = arr[:, :, :3] | |
| result.append(arr) | |
| return result | |
def _mux_video(frames, audio_path: str, fps: int = DEFAULT_FPS) -> str:
    """Combine frames (PIL/tensor/ndarray) + audio into an MP4 file.

    Frames are normalised with ``_coerce_frames``, written as numbered PNGs
    into a temporary directory, and encoded together with *audio_path* by
    ffmpeg (libx264 video, AAC audio, ``-shortest``).

    Args:
        frames: Iterable of frames accepted by ``_coerce_frames``.
        audio_path: Path of the audio track to mux in.
        fps: Frame rate of the image sequence.

    Returns:
        Path of the MP4 file. The caller owns the file.

    Raises:
        ValueError: if *frames* is empty (previously this surfaced as an
            opaque ffmpeg failure).
        subprocess.CalledProcessError / subprocess.TimeoutExpired: if ffmpeg
            fails or runs longer than 120 seconds.
    """
    import cv2

    coerced = _coerce_frames(frames)
    if not coerced:
        raise ValueError("No frames to encode")

    # Reserve the output name up front; ffmpeg overwrites it because of -y.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
        out_path = f.name
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            for i, arr in enumerate(coerced):
                # OpenCV writes BGR, the coerced frames are RGB.
                cv2.imwrite(os.path.join(tmpdir, f"{i:06d}.png"), cv2.cvtColor(arr, cv2.COLOR_RGB2BGR))
            cmd = [
                "ffmpeg", "-y", "-loglevel", "error",
                "-framerate", str(fps),
                "-i", os.path.join(tmpdir, "%06d.png"),
                "-i", audio_path,
                "-c:v", "libx264", "-preset", "fast", "-crf", "22",
                "-c:a", "aac", "-b:a", "128k",
                "-shortest", "-pix_fmt", "yuv420p",
                out_path,
            ]
            subprocess.run(cmd, check=True, timeout=120)
    except Exception:
        # Do not leave a half-written MP4 behind.
        if os.path.exists(out_path):
            try:
                os.unlink(out_path)
            except OSError:
                pass
        raise
    return out_path
# ── TTS ───────────────────────────────────────────────────────────────────────
def _run_tts(text: str, voice_ref: str | None, emotion: float, language: str = "English") -> str:
    """Generate speech WAV. Returns temp file path.

    The cached TTS model is moved to CUDA for the call and always moved back
    to CPU afterwards, including when the move to CUDA itself fails.

    Args:
        text: Text to speak; surrounding whitespace is stripped.
        voice_ref: Optional path of a reference clip passed as ``audio_prompt_path``.
        emotion: Passed to the model as ``exaggeration``.
        language: Only logged here; it is not forwarded to ``model.generate``.
            NOTE(review): confirm whether the loaded model should receive it.

    Returns:
        Path of a temporary ``.wav`` file. The caller must delete it.

    NOTE(review): no ``@spaces.GPU`` decorator is visible in this file -
    confirm CUDA is actually available where this runs on ZeroGPU.
    """
    model = _load_tts()
    log.info("TTS: language=%s text_len=%d emotion=%.2f", language, len(text), emotion)
    out_path = None
    try:
        # Inside the try so the finally below restores CPU placement even if
        # the move fails part-way (e.g. out of memory).
        model.to("cuda")
        wav = model.generate(
            text=text.strip(),
            audio_prompt_path=voice_ref if voice_ref else None,
            exaggeration=float(emotion),
        )
        # torchaudio.save requires a 2-D CPU tensor [channels, samples].
        wav = wav.detach().cpu()
        if wav.ndim == 1:
            wav = wav.unsqueeze(0)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            out_path = f.name
        torchaudio.save(out_path, wav, model.sr)
        return out_path
    except Exception:
        # Remove a partially written file before propagating the error.
        if out_path and os.path.exists(out_path):
            try:
                os.unlink(out_path)
            except OSError:
                pass
        raise
    finally:
        model.to("cpu")
        torch.cuda.empty_cache()
# ── EchoMimic ─────────────────────────────────────────────────────────────────
def _run_echomimic(portrait_img, audio_path: str, width: int, height: int,
                   num_steps: int, guidance_scale: float) -> str:
    """Generate talking-head video. Returns MP4 file path.

    The pipeline is moved to CUDA for the call and always moved back to CPU
    afterwards, including when the move to CUDA itself fails.

    Three output shapes are handled, in this order:
      * ``output.frames`` - the first entry is muxed with the audio;
      * ``output.videos`` - the first entry is muxed, split along axis 0
        first when it supports ``unbind``;
      * a plain ``str`` - returned unchanged as the video path.

    Raises:
        ValueError: if the pipeline output matches none of the shapes above.

    NOTE(review): no ``@spaces.GPU`` decorator is visible in this file -
    confirm CUDA is actually available where this runs on ZeroGPU.
    """
    pipe, _ = _load_echomimic()
    try:
        # Inside the try so the finally below restores CPU placement even if
        # the move fails part-way.
        pipe.to("cuda")
        output = pipe(
            ref_image=portrait_img,
            audio_path=audio_path,
            width=width,
            height=height,
            num_inference_steps=num_steps,
            guidance_scale=guidance_scale,
            fps=DEFAULT_FPS,
        )
        if hasattr(output, "frames"):
            return _mux_video(output.frames[0], audio_path)
        if hasattr(output, "videos"):
            vid = output.videos[0]
            if hasattr(vid, "unbind"):
                return _mux_video(list(vid.unbind(0)), audio_path)
            return _mux_video(vid, audio_path)
        if isinstance(output, str):
            return output
        raise ValueError(f"Unexpected pipeline output type: {type(output)}")
    finally:
        pipe.to("cpu")
        # Collect first so tensors freed by the GC can be released from the
        # CUDA cache in the same pass.
        gc.collect()
        torch.cuda.empty_cache()
# ── Phase 1: Generate video endpoint ──────────────────────────────────────────
def generate(portrait_img, input_mode: str, text: str, tts_language: str,
             voice_ref, audio_file, aspect_ratio: str, emotion: float,
             num_steps: int, guidance_scale: float, lang: str,
             progress=gr.Progress(track_tqdm=True)):
    """Gradio handler: build a talking-head video from a portrait plus audio.

    In ``"text"`` mode the audio is synthesised with ``_run_tts``; in any
    other mode the uploaded *audio_file* is used after a length check.
    Validation failures are raised as ``gr.Error`` with a message from the
    translation table selected by *lang*. Returns the MP4 path produced by
    ``_run_echomimic``.
    """
    t = T.get(lang, T["๐บ๐ธ English"])
    if portrait_img is None:
        raise gr.Error(t["err_no_portrait"])

    width, height = ASPECT_PRESETS.get(aspect_ratio, (512, 512))
    _tts_tmp: str | None = None
    try:
        if input_mode != "text":
            # Uploaded / recorded audio: enforce the duration limit.
            if audio_file is None:
                raise gr.Error(t["err_no_audio"])
            info = torchaudio.info(audio_file)
            seconds = info.num_frames / info.sample_rate
            if seconds > MAX_AUDIO_SEC:
                raise gr.Error(t["err_audio_long"])
            audio_path = audio_file
        else:
            # Text input: validate, then synthesise speech into a temp file.
            if not text or not text.strip():
                raise gr.Error(t["err_no_text"])
            if len(text) > MAX_TEXT_LEN:
                raise gr.Error(t["err_text_long"])
            if voice_ref and not os.path.exists(voice_ref):
                voice_ref = None
            _tts_tmp = _run_tts(text, voice_ref, emotion, language=tts_language)
            audio_path = _tts_tmp

        steps = int(num_steps)
        cfg = float(guidance_scale)
        return _run_echomimic(portrait_img, audio_path, width, height, steps, cfg)
    except torch.cuda.OutOfMemoryError:
        raise gr.Error(t["err_oom"])
    except gr.Error as e:
        # User-facing errors pass through untouched, but are logged.
        log.warning("Generation gr.Error: %s", e)
        raise
    except Exception as e:
        log.error("Generation failed: %s", e, exc_info=True)
        raise gr.Error("Generation failed. Please try different settings or try again.")
    finally:
        # The synthesised WAV is only an intermediate; remove it best-effort.
        if _tts_tmp and os.path.exists(_tts_tmp):
            try:
                os.unlink(_tts_tmp)
            except Exception:
                pass
        torch.cuda.empty_cache()
        gc.collect()
# ── Phase 2: Dubbing endpoint ─────────────────────────────────────────────────
def dub_video(video_input, target_lang: str, voice_ref, emotion: float, lang: str,
              progress=gr.Progress(track_tqdm=True)):
    """Gradio handler: re-voice a video in *target_lang*.

    Steps: check duration, extract audio, transcribe, translate (skipped when
    the detected language already equals *target_lang*), synthesise speech
    with ``_run_tts``, and mux the new audio onto the video.

    Returns:
        ``(output_path, transcript_text, translated_text, status)``.

    Raises:
        gr.Error: for validation failures (message taken from the translation
            table selected by *lang*) and, with a generic message, for any
            unexpected failure.
    """
    t = T.get(lang, T["๐บ๐ธ English"])
    temp_files: list[str] = []
    try:
        if video_input is None:
            raise gr.Error(t["err_no_video"])
        duration = dubbing.get_video_duration(video_input)
        if duration > dubbing.MAX_DUB_AUDIO_SEC:
            raise gr.Error(t["err_video_long"])

        progress(0.10, desc="Extracting audioโฆ")
        audio_path = dubbing.extract_audio(video_input)
        temp_files.append(audio_path)

        progress(0.25, desc="Transcribingโฆ")
        try:
            transcript = dubbing.transcribe(audio_path)
        finally:
            # Unload even when transcription raises, so the model does not
            # stay resident for the rest of the process.
            dubbing._unload_whisper()

        source_display = transcript.language_display
        if source_display != target_lang:
            progress(0.45, desc="Translatingโฆ")
            try:
                translated_text = dubbing.translate(transcript.text, source_display, target_lang)
            except Exception as exc:
                log.error("Translation failed: %s", exc, exc_info=True)
                raise gr.Error(t["err_translate"]) from exc
        else:
            translated_text = transcript.text
        if len(translated_text) > MAX_DUB_TEXT_LEN:
            raise gr.Error(t["err_dub_text_long"])

        progress(0.60, desc="Synthesizing speechโฆ")
        if voice_ref and not os.path.exists(voice_ref):
            voice_ref = None
        dubbed_audio = _run_tts(translated_text, voice_ref, emotion, language=target_lang)
        temp_files.append(dubbed_audio)

        progress(0.85, desc="Combining videoโฆ")
        output_path = dubbing.mux_dubbed_video(video_input, dubbed_audio)
        status = f"โ {source_display} โ {target_lang} | {duration:.1f}s"
        return output_path, transcript.text, translated_text, status
    except torch.cuda.OutOfMemoryError:
        raise gr.Error(t["err_oom"])
    except gr.Error as e:
        # User-facing errors pass through untouched, but are logged.
        log.warning("Dubbing gr.Error: %s", e)
        raise
    except Exception as e:
        log.error("Dubbing failed: %s", e, exc_info=True)
        raise gr.Error("Dubbing failed. Please try a shorter video or different settings.")
    finally:
        # Intermediate audio files are removed best-effort; only filesystem
        # errors are expected here, so nothing broader is swallowed.
        for fp in temp_files:
            if fp and os.path.exists(fp):
                try:
                    os.unlink(fp)
                except OSError:
                    pass
        torch.cuda.empty_cache()
        gc.collect()
# ── Language switcher ─────────────────────────────────────────────────────────
def switch_language(lang: str):
    """Return the 26 component updates that re-label the UI for *lang*.

    The tuple order must match the ``_lang_out`` list wired to
    ``lang_selector.change``. Besides re-labelling, the input mode is reset
    to ``"text"`` and the text group is shown while the audio group is hidden.
    """
    t = T.get(lang, T["๐บ๐ธ English"])
    mode_choices = [(t["mode_text"], "text"), (t["mode_audio"], "audio")]

    # Phase 1 (16)
    create_tab = [
        gr.update(label=t["portrait_label"]),
        gr.update(label=t["input_mode_label"], choices=mode_choices, value="text"),
        gr.update(label=t["text_label"], placeholder=t["text_ph"]),
        gr.update(label=t["tts_lang_label"]),
        gr.update(label=t["voice_ref_label"]),
        gr.update(label=t["emotion_label"], info=t["emotion_info"]),
        gr.update(label=t["audio_label"]),
        gr.update(label=t["aspect_label"]),
        gr.update(label=t["advanced"]),
        gr.update(label=t["steps_label"], info=t["steps_info"]),
        gr.update(label=t["guidance_label"], info=t["guidance_info"]),
        gr.update(value=t["generate"]),
        gr.update(value=t["examples_header"]),
        gr.update(visible=True),   # text_group
        gr.update(visible=False),  # audio_group
        gr.update(label=t["output_label"]),
    ]

    # Phase 2 (10)
    dub_tab = [
        gr.update(label=t["dub_video_label"]),
        gr.update(label=t["dub_target_label"]),
        gr.update(label=t["dub_voice_label"]),
        gr.update(label=t["dub_emotion_label"]),
        gr.update(value=t["dub_btn"]),
        gr.update(label=t["dub_output_label"]),
        gr.update(label=t["dub_transcript"]),
        gr.update(label=t["dub_translation"]),
        gr.update(label=t["dub_status"]),
        gr.update(label=t["dub_details"]),
    ]

    return tuple(create_tab + dub_tab)
def _toggle_input_mode(mode: str, _lang: str):
    """Show the text group for ``"text"`` mode, otherwise the audio group.

    ``_lang`` is accepted because the event passes it, but it is not used.
    Returns the updates for (text_group, audio_group), in that order.
    """
    show_text = mode == "text"
    text_update = gr.update(visible=show_text)
    audio_update = gr.update(visible=not show_text)
    return text_update, audio_update
# ── Interface ─────────────────────────────────────────────────────────────────
# Builds the whole Gradio UI: header, language selector, the "Create Video" and
# "Dub Video" tabs, a footer, and the event wiring.
# NOTE(review): the source this was recovered from had lost its indentation, so
# the container nesting below is reconstructed - confirm against the live layout.
with gr.Blocks(title="AnimaStudio ๐ฌ") as demo:
    gr.HTML("""
    <div class="as-header">
    <h1>๐ฌ AnimaStudio</h1>
    <p class="tagline">AI Talking Head Video Creator & Video Dubbing Studio</p>
    <div class="badges">
    <span class="badge badge-purple">๐ญ Lip Sync</span>
    <span class="badge badge-pink">๐ฃ๏ธ 23 TTS Languages</span>
    <span class="badge badge-cyan">๐๏ธ Voice Cloning</span>
    <span class="badge badge-teal">๐๏ธ Video Dubbing</span>
    <span class="badge">โก EchoMimic V3</span>
    <span class="badge badge-gold">๐ EN ยท PT-BR ยท ES ยท AR</span>
    <span class="badge">๐ค MCP Server</span>
    </div>
    </div>
    """)
    # UI-language selector; its value is passed to every handler as `lang`.
    lang_selector = gr.Radio(
        choices=list(T.keys()),
        value="๐บ๐ธ English",
        label=None,
        container=False,
        elem_id="lang-selector",
    )
    with gr.Tabs():
        # ── Tab 1: Create Video ───────────────────────────────────────────────
        with gr.Tab("๐ฌ Create Video", id="tab-create"):
            with gr.Row(equal_height=False):
                # Left column: inputs.
                with gr.Column(scale=1, min_width=360):
                    portrait = gr.Image(
                        label="Portrait Photo ยท front-facing face",
                        type="pil",
                        sources=["upload", "webcam"],
                    )
                    input_mode = gr.Radio(
                        choices=[(T["๐บ๐ธ English"]["mode_text"], "text"),
                                 (T["๐บ๐ธ English"]["mode_audio"], "audio")],
                        value="text",
                        label="Audio Input",
                    )
                    # Shown in "text" mode (see _toggle_input_mode).
                    with gr.Group(visible=True) as text_group:
                        text_input = gr.Textbox(
                            label="Text",
                            placeholder="Type what you want the avatar to say...",
                            lines=4, max_lines=10,
                        )
                        tts_language = gr.Dropdown(choices=TTS_LANGUAGES, value="English", label="Speech Language")
                        with gr.Row():
                            voice_ref = gr.Audio(
                                label="Voice Reference (optional โ clone voice style)",
                                type="filepath", sources=["upload", "microphone"],
                                format="wav",
                            )
                            emotion = gr.Slider(0.0, 1.0, value=0.5, step=0.05,
                                                label="Emotion Intensity", info="0 = neutral ยท 1 = very expressive")
                    # Shown in "audio" mode (see _toggle_input_mode).
                    with gr.Group(visible=False) as audio_group:
                        audio_upload = gr.Audio(
                            label="Audio File ยท WAV/MP3/FLAC ยท max 30 s",
                            type="filepath", sources=["upload", "microphone"],
                            format="wav",
                        )
                    aspect_ratio = gr.Dropdown(choices=list(ASPECT_PRESETS.keys()),
                                               value="โป 1:1 ยท 512ร512", label="Format")
                    with gr.Accordion("โ๏ธ Advanced Settings", open=False) as adv_acc:
                        num_steps = gr.Slider(5, 50, value=DEFAULT_STEPS, step=1,
                                              label="Inference Steps", info="More steps = higher quality, slower")
                        guidance_scale = gr.Slider(1.0, 10.0, value=DEFAULT_CFG, step=0.5,
                                                   label="Guidance Scale", info="Higher = follows audio more strictly")
                    gen_btn = gr.Button("๐ฌ Generate Video", variant="primary", elem_id="gen-btn", size="lg")
                    examples_header = gr.Markdown("### ๐ก Try These Examples")
                    gr.Examples(examples=ALL_EXAMPLES_FLAT, inputs=[text_input, tts_language, emotion], label=None)
                # Right column: result.
                with gr.Column(scale=1, min_width=440):
                    output_video = gr.Video(label="Generated Video", format="mp4", autoplay=True,
                                            height=640, elem_id="output-video", buttons=["download"])
        # ── Tab 2: Dub Video ──────────────────────────────────────────────────
        with gr.Tab("๐๏ธ Dub Video", id="tab-dub"):
            with gr.Row(equal_height=False):
                # Left column: inputs.
                with gr.Column(scale=1, min_width=360):
                    dub_video_input = gr.Video(label="Input Video ยท max 60 seconds",
                                               sources=["upload"])
                    dub_target_lang = gr.Dropdown(choices=TTS_LANGUAGES, value="English", label="Target Language")
                    dub_voice_ref = gr.Audio(label="Voice Reference (optional โ clone voice style)",
                                             type="filepath", sources=["upload", "microphone"],
                                             format="wav")
                    dub_emotion = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="Emotion Intensity")
                    dub_btn = gr.Button("๐๏ธ Dub Video", variant="primary", elem_id="dub-btn", size="lg")
                    gr.HTML("""
                    <div style="color:#94a3b8;font-size:0.82rem;margin-top:0.5rem;padding:0.75rem;
                    background:rgba(6,182,212,0.05);border-radius:0.5rem;
                    border:1px solid rgba(6,182,212,0.15);">
                    <strong>How it works:</strong> Whisper transcribes โ NLLB-200 translates โ
                    Chatterbox TTS synthesizes โ audio replaces original track.
                    </div>
                    """)
                # Right column: result and details.
                with gr.Column(scale=1, min_width=440):
                    dub_output_video = gr.Video(label="Dubbed Video", format="mp4", autoplay=True,
                                                height=480, elem_id="dub-output-video", buttons=["download"])
                    with gr.Accordion("Details", open=False) as dub_details_acc:
                        dub_transcript_box = gr.Textbox(label="Detected Transcript", interactive=False, lines=4)
                        dub_translation_box = gr.Textbox(label="Translation", interactive=False, lines=4)
                        dub_status_box = gr.Textbox(label="Status", interactive=False, lines=2)
    gr.HTML("""
    <div class="as-footer">
    <strong>Models:</strong>
    <a href="https://huggingface.co/BadToBest/EchoMimicV3" target="_blank">EchoMimic V3</a>
    (Apache 2.0) ยท
    <a href="https://huggingface.co/ResembleAI/chatterbox" target="_blank">Chatterbox TTS</a>
    (MIT) ยท
    <a href="https://huggingface.co/openai/whisper-large-v3-turbo" target="_blank">Whisper Turbo</a>
    (MIT) ยท
    <a href="https://huggingface.co/facebook/nllb-200-distilled-600M" target="_blank">NLLB-200</a>
    (CC-BY-NC) ยท
    <strong>Space by:</strong>
    <a href="https://huggingface.co/lulavc" target="_blank">lulavc</a>
    ยท ZeroGPU ยท A10G
    </div>
    """)
    # ── Events ────────────────────────────────────────────────────────────────
    # Input order must match the parameter order of generate().
    gen_btn.click(
        generate,
        inputs=[portrait, input_mode, text_input, tts_language,
                voice_ref, audio_upload, aspect_ratio, emotion,
                num_steps, guidance_scale, lang_selector],
        outputs=output_video,
    )
    input_mode.change(_toggle_input_mode, inputs=[input_mode, lang_selector],
                      outputs=[text_group, audio_group])
    # Outputs match the 4-tuple returned by dub_video().
    dub_btn.click(
        dub_video,
        inputs=[dub_video_input, dub_target_lang, dub_voice_ref, dub_emotion, lang_selector],
        outputs=[dub_output_video, dub_transcript_box, dub_translation_box, dub_status_box],
    )
    # Language switcher - 26 outputs, must match switch_language() return tuple order
    _lang_out = [
        # Phase 1 (16)
        portrait, input_mode, text_input, tts_language,
        voice_ref, emotion, audio_upload, aspect_ratio,
        adv_acc, num_steps, guidance_scale, gen_btn, examples_header,
        text_group, audio_group, output_video,
        # Phase 2 (10)
        dub_video_input, dub_target_lang, dub_voice_ref,
        dub_emotion, dub_btn, dub_output_video,
        dub_transcript_box, dub_translation_box,
        dub_status_box, dub_details_acc,
    ]
    lang_selector.change(switch_language, inputs=lang_selector, outputs=_lang_out)
# Script entry point: enable the request queue (at most 10 waiting, one job at
# a time by default), then start the app with the project theme/CSS and the
# MCP server enabled.
if __name__ == "__main__":
    demo.queue(max_size=10, default_concurrency_limit=1)
    demo.launch(theme=THEME, css=CSS, mcp_server=True, ssr_mode=False)