# Hugging Face Space: lulavc/AnimaStudio · Running on Zero · File size: 24,992 Bytes

import spaces
import gradio as gr
import torch
import torchaudio
import os
import gc
import sys
import shutil
import tempfile
import subprocess
import threading
import logging

import dubbing
from i18n import T, EXAMPLES, ALL_EXAMPLES_FLAT, TTS_LANGUAGES, MAX_TEXT_LEN, MAX_AUDIO_SEC
from styles import THEME, CSS

# Send timestamped INFO-and-above records to stderr.
logging.basicConfig(
    stream=sys.stderr,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    level=logging.INFO,
)
# Module-level logger used by every function in this file.
log = logging.getLogger(__name__)

# ── Config ────────────────────────────────────────────────────────────────────
# Model identifiers; each can be overridden through an environment variable
# of the same name.
ECHOMIMIC_MODEL = os.getenv("ECHOMIMIC_MODEL", "BadToBest/EchoMimicV3")
CHATTERBOX_MODEL = os.getenv("CHATTERBOX_MODEL", "ResembleAI/chatterbox")

# Character cap for text handed to TTS while dubbing.
# ~60s of typical speech at 150 wpm ≈ 900 chars; 1500 is safe headroom
MAX_DUB_TEXT_LEN = 1500

# Dropdown label -> (width, height) in pixels.
ASPECT_PRESETS = {
    "▮ 9:16 · 576×1024": (576, 1024),
    "◻ 1:1  · 512×512": (512, 512),
    "▬ 16:9 · 1024×576": (1024, 576),
}

# Defaults for the advanced sliders and for the muxed video frame rate.
DEFAULT_STEPS = 20
DEFAULT_CFG = 3.5
DEFAULT_FPS = 25

# ── Runtime repo installs (avoid PyPI conflicts) ──────────────────────────────
# Upstream repositories and the local directories they are cloned into.
_ECHOMIMIC_REPO = "https://github.com/antgroup/echomimic_v3.git"
_ECHOMIMIC_DIR = "/tmp/echomimic_v3"
_CHATTERBOX_REPO = "https://github.com/resemble-ai/chatterbox.git"
_CHATTERBOX_DIR = "/tmp/chatterbox"
# Serialises clone attempts so two callers never clone into the same directory.
_clone_lock = threading.Lock()


def _clone_repo(repo_url: str, dest: str, label: str):
    """Shallow-clone ``repo_url`` into ``dest`` once and put ``dest`` on ``sys.path``.

    The whole operation runs while holding ``_clone_lock``. A checkout counts
    as present when ``dest/.git`` exists; anything else found at ``dest`` is
    removed and cloned again.

    Args:
        repo_url: Git URL to clone.
        dest: Directory that receives the checkout.
        label: Human-readable name used in the log messages.

    Raises:
        subprocess.CalledProcessError: ``git clone`` exited with a non-zero status.
        subprocess.TimeoutExpired: the clone did not finish within 180 seconds.
        OSError: ``git`` could not be started or ``dest`` could not be removed.
    """
    with _clone_lock:
        if not os.path.exists(os.path.join(dest, ".git")):
            if os.path.exists(dest):
                shutil.rmtree(dest)
            log.info("Cloning %s…", label)
            try:
                subprocess.run(
                    ["git", "clone", "--depth=1", repo_url, dest],
                    check=True, timeout=180,
                )
            except BaseException:
                # A clone that is killed by the timeout (or otherwise fails) can
                # leave a partial tree that already contains ".git"; remove it so
                # the next call does not mistake it for a complete checkout.
                shutil.rmtree(dest, ignore_errors=True)
                raise
            log.info("%s cloned", label)
        if dest not in sys.path:
            sys.path.insert(0, dest)


def _ensure_echomimic_repo():
    """Make the EchoMimic V3 checkout available by delegating to ``_clone_repo``."""
    _clone_repo(repo_url=_ECHOMIMIC_REPO, dest=_ECHOMIMIC_DIR, label="EchoMimic V3")


def _ensure_chatterbox_repo():
    """Make the Chatterbox TTS checkout available by delegating to ``_clone_repo``."""
    _clone_repo(repo_url=_CHATTERBOX_REPO, dest=_CHATTERBOX_DIR, label="Chatterbox TTS")


# ── Model singletons ──────────────────────────────────────────────────────────
# Lazily created by _load_tts(); stays None until the first TTS request.
_tts_model = None
# Lazily created by _load_echomimic(); stays None until the first video request.
_echo_pipe = None
# Set together with _echo_pipe by _load_echomimic() and returned alongside it.
_echo_mode = None


def _load_tts():
    """Return the shared Chatterbox TTS model, building it on the first call.

    The first call makes sure the Chatterbox repository is cloned, imports
    ``ChatterboxTTS`` from it and instantiates the model with ``device="cpu"``.
    Later calls return the cached instance stored in ``_tts_model``.
    """
    global _tts_model

    # Fast path: already initialised.
    if _tts_model is not None:
        return _tts_model

    _ensure_chatterbox_repo()
    # Imported late because the package only becomes importable after the clone.
    from chatterbox.tts import ChatterboxTTS

    log.info("Loading Chatterbox TTS…")
    _tts_model = ChatterboxTTS.from_pretrained(device="cpu")
    log.info("Chatterbox TTS ready")
    return _tts_model


def _load_echomimic():
    """Return ``(pipeline, mode)`` for EchoMimic V3, loading it on first use.

    Two strategies are tried in order:

    1. clone the upstream repository and build ``EchoMimicV3Pipeline`` from it;
    2. fall back to ``diffusers.DiffusionPipeline`` with ``trust_remote_code=True``.

    The first strategy that succeeds is cached in ``_echo_pipe`` / ``_echo_mode``
    and reused by later calls. Each failure is logged as a warning.

    Raises:
        RuntimeError: both strategies failed. The exception from the last
            attempt is attached as ``__cause__`` so the root cause is not lost.
    """
    global _echo_pipe, _echo_mode
    if _echo_pipe is not None:
        return _echo_pipe, _echo_mode

    # Remember the most recent failure; the name bound by "except ... as e" is
    # deleted when its handler ends, so it has to be copied out explicitly.
    last_error = None

    try:
        _ensure_echomimic_repo()
        from echomimic_v3.pipelines.pipeline_echomimic_v3 import EchoMimicV3Pipeline
        log.info("Loading EchoMimic V3 (local)…")
        _echo_pipe = EchoMimicV3Pipeline.from_pretrained(ECHOMIMIC_MODEL, torch_dtype=torch.float16)
        _echo_mode = "local"
        log.info("EchoMimic V3 ready (local)")
        return _echo_pipe, _echo_mode
    except Exception as e:
        last_error = e
        log.warning("EchoMimic V3 local import failed: %s", e)

    try:
        from diffusers import DiffusionPipeline
        log.info("Loading EchoMimic V3 via diffusers…")
        _echo_pipe = DiffusionPipeline.from_pretrained(
            ECHOMIMIC_MODEL, torch_dtype=torch.float16, trust_remote_code=True,
        )
        # NOTE(review): the mode is "local" on this path as well, although the
        # log line says "diffusers" — confirm whether a distinct value is wanted.
        _echo_mode = "local"
        log.info("EchoMimic V3 ready (diffusers)")
        return _echo_pipe, _echo_mode
    except Exception as e:
        last_error = e
        log.warning("EchoMimic V3 diffusers load failed: %s", e)

    raise RuntimeError(
        "EchoMimic V3 could not be loaded. Check requirements and model availability."
    ) from last_error


# ── Video utilities ───────────────────────────────────────────────────────────
def _coerce_frames(frames):
    """Normalise pipeline output to a list of (H, W, 3) uint8 numpy arrays.

    Frames are recognised by duck typing:

    * objects with a ``save`` attribute (PIL-style images) are converted with
      ``convert("RGB")``;
    * objects with a ``cpu`` attribute (torch-style tensors) are moved to the
      CPU, cast to float and, when the first axis has 1, 3 or 4 entries,
      transposed from channel-first to channel-last;
    * anything else is passed through ``np.array``.

    Every frame then goes through the same conversion, whatever its origin:
    float data whose maximum is <= 1.0 is scaled by 255, values are clipped to
    0..255 and cast to uint8. Grey frames (2-D, or 3-D with one channel) are
    expanded to three identical channels and a fourth (alpha) channel is
    dropped.

    Args:
        frames: Iterable of images, tensors or array-likes.

    Returns:
        A list with one C-contiguous uint8 array per input frame.
    """
    import numpy as np

    def _to_uint8(arr):
        """Scale (if normalised), clip and cast ``arr`` to uint8."""
        if arr.dtype == np.uint8:
            return arr
        # "arr.size and" keeps max() from raising on an empty array.
        if arr.dtype.kind == "f" and arr.size and arr.max() <= 1.0:
            arr = arr * 255
        # Clip before the cast so out-of-range values saturate instead of wrapping.
        return np.clip(arr, 0, 255).astype(np.uint8)

    result = []
    for frame in frames:
        if hasattr(frame, "save"):
            arr = np.array(frame.convert("RGB"))
        elif hasattr(frame, "cpu"):
            # detach() lets tensors that still track gradients be converted.
            tensor = frame.detach() if hasattr(frame, "detach") else frame
            arr = tensor.cpu().float().numpy()
            if arr.ndim == 3 and arr.shape[0] in (1, 3, 4):
                arr = arr.transpose(1, 2, 0)
        else:
            arr = np.array(frame)

        arr = _to_uint8(arr)

        if arr.ndim == 2:
            arr = np.stack([arr] * 3, axis=-1)
        elif arr.ndim == 3 and arr.shape[2] == 1:
            arr = np.repeat(arr, 3, axis=2)
        elif arr.ndim == 3 and arr.shape[2] == 4:
            arr = arr[:, :, :3]

        # Transposes and slices above produce views; hand out a plain buffer.
        result.append(np.ascontiguousarray(arr))
    return result


def _mux_video(frames, audio_path: str, fps: int = DEFAULT_FPS) -> str:
    """Encode ``frames`` together with ``audio_path`` into a temporary MP4.

    The frames are normalised by ``_coerce_frames``, written as numbered PNGs
    into a scratch directory and then handed to ``ffmpeg`` (H.264 video, AAC
    audio, ``-shortest``). The scratch directory is removed afterwards.

    Returns:
        Path of the MP4 file. If anything fails the file is deleted again
        before the exception propagates.
    """
    import cv2

    rgb_frames = _coerce_frames(frames)

    # Reserve the output path first; ffmpeg overwrites it because of "-y".
    fd, out_path = tempfile.mkstemp(suffix=".mp4")
    os.close(fd)

    try:
        with tempfile.TemporaryDirectory() as frame_dir:
            for index, rgb in enumerate(rgb_frames):
                png_path = os.path.join(frame_dir, f"{index:06d}.png")
                # OpenCV writes BGR, the normalised frames are RGB.
                cv2.imwrite(png_path, cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR))

            input_args = [
                "-framerate", str(fps),
                "-i", os.path.join(frame_dir, "%06d.png"),
                "-i", audio_path,
            ]
            video_args = ["-c:v", "libx264", "-preset", "fast", "-crf", "22"]
            audio_args = ["-c:a", "aac", "-b:a", "128k"]
            ffmpeg_cmd = [
                "ffmpeg", "-y", "-loglevel", "error",
                *input_args,
                *video_args,
                *audio_args,
                "-shortest", "-pix_fmt", "yuv420p",
                out_path,
            ]
            subprocess.run(ffmpeg_cmd, check=True, timeout=120)
    except Exception:
        # Do not leave a half-written (or empty) MP4 behind.
        if os.path.exists(out_path):
            try:
                os.unlink(out_path)
            except OSError:
                pass
        raise

    return out_path


# ── TTS ───────────────────────────────────────────────────────────────────────
def _run_tts(text: str, voice_ref: str | None, emotion: float, language: str = "English") -> str:
    """Synthesise ``text`` into a temporary WAV file and return its path.

    The shared TTS model is moved to the GPU for the duration of the call and
    always moved back to the CPU afterwards, after which the CUDA cache is
    released.

    Args:
        text: Text to speak; surrounding whitespace is stripped.
        voice_ref: Optional path of a reference recording passed to the model
            as ``audio_prompt_path``; falsy values mean "no reference".
        emotion: Passed to the model as ``exaggeration``.
        language: Only written to the log; it is not forwarded to
            ``model.generate``.

    Returns:
        Path of the WAV file. The caller is responsible for deleting it.

    Raises:
        Exception: anything raised while loading, generating or saving. A WAV
            file that was already created is removed before re-raising.
    """
    model = _load_tts()
    log.info("TTS: language=%s text_len=%d emotion=%.2f", language, len(text), emotion)
    out_path = None
    try:
        # Inside the try block so that a failed or partial move to the GPU still
        # reaches the "finally" clause that restores the CPU placement.
        # NOTE(review): assumes the model object exposes ``.to()`` — confirm.
        model.to("cuda")
        wav = model.generate(
            text=text.strip(),
            audio_prompt_path=voice_ref if voice_ref else None,
            exaggeration=float(emotion),
        )
        # Saving goes through host memory; this is a no-op if already on the CPU.
        wav = wav.detach().cpu()
        # torchaudio.save requires 2-D tensor [channels, samples]
        if wav.ndim == 1:
            wav = wav.unsqueeze(0)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            out_path = f.name
        torchaudio.save(out_path, wav, model.sr)
        return out_path
    except Exception:
        if out_path and os.path.exists(out_path):
            try:
                os.unlink(out_path)
            except OSError:
                pass
        raise
    finally:
        model.to("cpu")
        torch.cuda.empty_cache()


# ── EchoMimic ─────────────────────────────────────────────────────────────────
def _run_echomimic(portrait_img, audio_path: str, width: int, height: int,
                   num_steps: int, guidance_scale: float) -> str:
    """Generate a talking-head video and return the path of the MP4 file.

    The pipeline is moved to the GPU for the call and always moved back to the
    CPU afterwards; the CUDA cache is then released and a garbage collection
    is triggered.

    The pipeline result is interpreted as follows:

    * ``output.frames`` present – the first entry is muxed with the audio;
    * ``output.videos`` present – the first entry is muxed, split along its
      first axis when it offers ``unbind``;
    * a plain string – returned unchanged.

    Args:
        portrait_img: Reference image passed to the pipeline as ``ref_image``.
        audio_path: Driving audio; also used as the audio track of the result.
        width: Output width in pixels.
        height: Output height in pixels.
        num_steps: Passed as ``num_inference_steps``.
        guidance_scale: Passed as ``guidance_scale``.

    Raises:
        ValueError: the pipeline returned something none of the cases cover.
    """
    pipe, _ = _load_echomimic()
    try:
        # Inside the try block so that a failed or partial move to the GPU still
        # reaches the "finally" clause that restores the CPU placement.
        pipe.to("cuda")
        output = pipe(
            ref_image=portrait_img,
            audio_path=audio_path,
            width=width,
            height=height,
            num_inference_steps=num_steps,
            guidance_scale=guidance_scale,
            fps=DEFAULT_FPS,
        )
        if hasattr(output, "frames"):
            return _mux_video(output.frames[0], audio_path)
        if hasattr(output, "videos"):
            vid = output.videos[0]
            if hasattr(vid, "unbind"):
                return _mux_video(list(vid.unbind(0)), audio_path)
            return _mux_video(vid, audio_path)
        if isinstance(output, str):
            return output
        raise ValueError(f"Unexpected pipeline output type: {type(output)}")
    finally:
        pipe.to("cpu")
        torch.cuda.empty_cache()
        gc.collect()


# ── Phase 1: Generate video endpoint ─────────────────────────────────────────
@spaces.GPU(duration=120)
def generate(portrait_img, input_mode: str, text: str, tts_language: str,
             voice_ref, audio_file, aspect_ratio: str, emotion: float,
             num_steps: int, guidance_scale: float, lang: str,
             progress=gr.Progress(track_tqdm=True)):
    """Create a talking-head video from a portrait plus text or audio.

    In "text" mode the text is validated and synthesised with ``_run_tts``;
    in any other mode the uploaded audio file is validated and used directly.
    The audio then drives ``_run_echomimic``.

    Returns:
        Path of the generated MP4 file.

    Raises:
        gr.Error: for invalid input (translated message taken from ``T``),
            for CUDA out-of-memory, and — with a generic English message —
            for any other failure.
    """
    strings = T.get(lang, T["🇺🇸 English"])
    if portrait_img is None:
        raise gr.Error(strings["err_no_portrait"])

    # Unknown presets fall back to a square 512×512 frame.
    width, height = ASPECT_PRESETS.get(aspect_ratio, (512, 512))
    tts_wav: str | None = None

    try:
        if input_mode != "text":
            if audio_file is None:
                raise gr.Error(strings["err_no_audio"])
            meta = torchaudio.info(audio_file)
            seconds = meta.num_frames / meta.sample_rate
            if seconds > MAX_AUDIO_SEC:
                raise gr.Error(strings["err_audio_long"])
            audio_path = audio_file
        else:
            if not (text and text.strip()):
                raise gr.Error(strings["err_no_text"])
            if len(text) > MAX_TEXT_LEN:
                raise gr.Error(strings["err_text_long"])
            # A reference path that no longer exists is treated as "none given".
            if voice_ref and not os.path.exists(voice_ref):
                voice_ref = None
            tts_wav = _run_tts(text, voice_ref, emotion, language=tts_language)
            audio_path = tts_wav

        steps = int(num_steps)
        scale = float(guidance_scale)
        return _run_echomimic(portrait_img, audio_path, width, height, steps, scale)

    except torch.cuda.OutOfMemoryError:
        raise gr.Error(strings["err_oom"])
    except gr.Error as err:
        # Already user-facing: log it and let it through unchanged.
        log.warning("Generation gr.Error: %s", err)
        raise
    except Exception as err:
        log.error("Generation failed: %s", err, exc_info=True)
        raise gr.Error("Generation failed. Please try different settings or try again.")
    finally:
        # The synthesised WAV is only an intermediate; remove it on a best-effort basis.
        if tts_wav and os.path.exists(tts_wav):
            try:
                os.unlink(tts_wav)
            except Exception:
                pass
        torch.cuda.empty_cache()
        gc.collect()


# ── Phase 2: Dubbing endpoint ─────────────────────────────────────────────────
@spaces.GPU(duration=120)
def dub_video(video_input, target_lang: str, voice_ref, emotion: float, lang: str,
              progress=gr.Progress(track_tqdm=True)):
    """Replace the speech of a video with synthesised speech in ``target_lang``.

    Steps, each reported through ``progress``: extract the audio track,
    transcribe it, translate the transcript when the detected language differs
    from ``target_lang``, synthesise the result with ``_run_tts`` and mux the
    new audio into the video.

    Returns:
        ``(output_path, original_transcript, dubbed_text, status_line)``.

    Raises:
        gr.Error: for invalid input (translated message taken from ``T``),
            for translation failure, for CUDA out-of-memory, and — with a
            generic English message — for any other failure.
    """
    strings = T.get(lang, T["🇺🇸 English"])
    # Intermediate files that are removed again in the "finally" clause.
    scratch: list[str] = []

    try:
        if video_input is None:
            raise gr.Error(strings["err_no_video"])

        duration = dubbing.get_video_duration(video_input)
        if duration > dubbing.MAX_DUB_AUDIO_SEC:
            raise gr.Error(strings["err_video_long"])

        progress(0.10, desc="Extracting audio…")
        source_audio = dubbing.extract_audio(video_input)
        scratch.append(source_audio)

        progress(0.25, desc="Transcribing…")
        transcript = dubbing.transcribe(source_audio)
        dubbing._unload_whisper()

        source_lang = transcript.language_display
        if source_lang == target_lang:
            # Same language: speak the transcript as it is.
            spoken_text = transcript.text
        else:
            progress(0.45, desc="Translating…")
            try:
                spoken_text = dubbing.translate(transcript.text, source_lang, target_lang)
            except Exception as exc:
                log.error("Translation failed: %s", exc, exc_info=True)
                raise gr.Error(strings["err_translate"])

        if len(spoken_text) > MAX_DUB_TEXT_LEN:
            raise gr.Error(strings["err_dub_text_long"])

        progress(0.60, desc="Synthesizing speech…")
        # A reference path that no longer exists is treated as "none given".
        if voice_ref and not os.path.exists(voice_ref):
            voice_ref = None
        dubbed_audio = _run_tts(spoken_text, voice_ref, emotion, language=target_lang)
        scratch.append(dubbed_audio)

        progress(0.85, desc="Combining video…")
        output_path = dubbing.mux_dubbed_video(video_input, dubbed_audio)

        status = f"✓ {source_lang} → {target_lang} | {duration:.1f}s"
        return output_path, transcript.text, spoken_text, status

    except torch.cuda.OutOfMemoryError:
        raise gr.Error(strings["err_oom"])
    except gr.Error as err:
        # Already user-facing: log it and let it through unchanged.
        log.warning("Dubbing gr.Error: %s", err)
        raise
    except Exception as err:
        log.error("Dubbing failed: %s", err, exc_info=True)
        raise gr.Error("Dubbing failed. Please try a shorter video or different settings.")
    finally:
        # Best-effort removal of the intermediate audio files.
        for leftover in scratch:
            if leftover and os.path.exists(leftover):
                try:
                    os.unlink(leftover)
                except Exception:
                    pass
        torch.cuda.empty_cache()
        gc.collect()


# ── Language switcher ─────────────────────────────────────────────────────────
def switch_language(lang: str):
    """Build the component updates that re-label the UI for ``lang``.

    Unknown languages fall back to the English string table. Besides the
    labels, the input mode is reset to "text", the text group is shown and the
    audio group is hidden.

    Returns:
        A tuple of 26 ``gr.update`` objects: 16 for the "create" tab followed
        by 10 for the "dub" tab. The order must match the ``_lang_out`` list
        that is wired to the language selector.
    """
    t = T.get(lang, T["🇺🇸 English"])
    mode_choices = [(t["mode_text"], "text"), (t["mode_audio"], "audio")]

    def labelled(label_key, **extra_keys):
        """Update ``label`` plus any further properties, all looked up in ``t``."""
        props = {"label": t[label_key]}
        for prop, key in extra_keys.items():
            props[prop] = t[key]
        return gr.update(**props)

    def valued(key):
        """Update ``value`` with the translated string stored under ``key``."""
        return gr.update(value=t[key])

    def shown(flag):
        """Update only the visibility."""
        return gr.update(visible=flag)

    # Phase 1 (16)
    create_tab = [
        labelled("portrait_label"),
        gr.update(label=t["input_mode_label"], choices=mode_choices, value="text"),
        labelled("text_label", placeholder="text_ph"),
        labelled("tts_lang_label"),
        labelled("voice_ref_label"),
        labelled("emotion_label", info="emotion_info"),
        labelled("audio_label"),
        labelled("aspect_label"),
        labelled("advanced"),
        labelled("steps_label", info="steps_info"),
        labelled("guidance_label", info="guidance_info"),
        valued("generate"),
        valued("examples_header"),
        shown(True),    # text_group
        shown(False),   # audio_group
        labelled("output_label"),
    ]

    # Phase 2 (10)
    dub_tab = [
        labelled("dub_video_label"),
        labelled("dub_target_label"),
        labelled("dub_voice_label"),
        labelled("dub_emotion_label"),
        valued("dub_btn"),
        labelled("dub_output_label"),
        labelled("dub_transcript"),
        labelled("dub_translation"),
        labelled("dub_status"),
        labelled("dub_details"),
    ]

    return tuple(create_tab + dub_tab)


def _toggle_input_mode(mode: str, _lang: str):
    """Return visibility updates for the text group and the audio group.

    The text group is visible exactly when ``mode`` is "text"; the audio group
    gets the opposite visibility. ``_lang`` is accepted but not used.
    """
    show_text = mode == "text"
    show_audio = not show_text
    return gr.update(visible=show_text), gr.update(visible=show_audio)


# ── Interface ─────────────────────────────────────────────────────────────────
# Build the whole UI: header, UI-language selector, the "create" and "dub"
# tabs, the footer, and finally the event wiring between components and the
# endpoint functions defined above.
with gr.Blocks(title="AnimaStudio 🎬") as demo:

    gr.HTML("""
    <div class="as-header">
        <h1>🎬 AnimaStudio</h1>
        <p class="tagline">AI Talking Head Video Creator &amp; Video Dubbing Studio</p>
        <div class="badges">
            <span class="badge badge-purple">🎭 Lip Sync</span>
            <span class="badge badge-pink">🗣️ 23 TTS Languages</span>
            <span class="badge badge-cyan">🎙️ Voice Cloning</span>
            <span class="badge badge-teal">🎙️ Video Dubbing</span>
            <span class="badge">⚡ EchoMimic V3</span>
            <span class="badge badge-gold">🌐 EN · PT-BR · ES · AR</span>
            <span class="badge">🤖 MCP Server</span>
        </div>
    </div>
    """)

    # UI language: one radio choice per key of the translation table T.
    lang_selector = gr.Radio(
        choices=list(T.keys()),
        value="🇺🇸 English",
        label=None,
        container=False,
        elem_id="lang-selector",
    )

    with gr.Tabs():

        # ══ Tab 1: Create Video ════════════════════════════════════════════════
        # The labels below are the initial English texts; switch_language()
        # replaces them when the language selector changes.
        with gr.Tab("🎬 Create Video", id="tab-create"):
            with gr.Row(equal_height=False):
                with gr.Column(scale=1, min_width=360):
                    portrait = gr.Image(
                        label="Portrait Photo · front-facing face",
                        type="pil",
                        sources=["upload", "webcam"],
                    )
                    # Chooses between typed text (TTS) and an uploaded recording.
                    input_mode = gr.Radio(
                        choices=[(T["🇺🇸 English"]["mode_text"], "text"),
                                 (T["🇺🇸 English"]["mode_audio"], "audio")],
                        value="text",
                        label="Audio Input",
                    )
                    # Only one of text_group / audio_group is visible at a time;
                    # _toggle_input_mode flips them when input_mode changes.
                    with gr.Group(visible=True) as text_group:
                        text_input = gr.Textbox(
                            label="Text",
                            placeholder="Type what you want the avatar to say...",
                            lines=4, max_lines=10,
                        )
                        tts_language = gr.Dropdown(choices=TTS_LANGUAGES, value="English", label="Speech Language")
                        with gr.Row():
                            voice_ref = gr.Audio(
                                label="Voice Reference (optional — clone voice style)",
                                type="filepath", sources=["upload", "microphone"],
                                format="wav",
                            )
                        emotion = gr.Slider(0.0, 1.0, value=0.5, step=0.05,
                                            label="Emotion Intensity", info="0 = neutral · 1 = very expressive")
                    with gr.Group(visible=False) as audio_group:
                        audio_upload = gr.Audio(
                            label="Audio File · WAV/MP3/FLAC · max 30 s",
                            type="filepath", sources=["upload", "microphone"],
                            format="wav",
                        )
                    # Choices are the keys of ASPECT_PRESETS; generate() maps the
                    # selected key back to (width, height).
                    aspect_ratio = gr.Dropdown(choices=list(ASPECT_PRESETS.keys()),
                                               value="◻ 1:1  · 512×512", label="Format")
                    with gr.Accordion("⚙️ Advanced Settings", open=False) as adv_acc:
                        num_steps = gr.Slider(5, 50, value=DEFAULT_STEPS, step=1,
                                              label="Inference Steps", info="More steps = higher quality, slower")
                        guidance_scale = gr.Slider(1.0, 10.0, value=DEFAULT_CFG, step=0.5,
                                                   label="Guidance Scale", info="Higher = follows audio more strictly")
                    gen_btn = gr.Button("🎬  Generate Video", variant="primary", elem_id="gen-btn", size="lg")
                    examples_header = gr.Markdown("### 💡 Try These Examples")
                    # Each example fills the text, speech language and emotion inputs.
                    gr.Examples(examples=ALL_EXAMPLES_FLAT, inputs=[text_input, tts_language, emotion], label=None)

                with gr.Column(scale=1, min_width=440):
                    output_video = gr.Video(label="Generated Video", format="mp4", autoplay=True,
                                            height=640, elem_id="output-video", buttons=["download"])

        # ══ Tab 2: Dub Video ═══════════════════════════════════════════════════
        with gr.Tab("🎙️ Dub Video", id="tab-dub"):
            with gr.Row(equal_height=False):
                with gr.Column(scale=1, min_width=360):
                    dub_video_input = gr.Video(label="Input Video · max 60 seconds",
                                               sources=["upload"])
                    dub_target_lang = gr.Dropdown(choices=TTS_LANGUAGES, value="English", label="Target Language")
                    dub_voice_ref = gr.Audio(label="Voice Reference (optional — clone voice style)",
                                             type="filepath", sources=["upload", "microphone"],
                                             format="wav")
                    dub_emotion = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="Emotion Intensity")
                    dub_btn = gr.Button("🎙️  Dub Video", variant="primary", elem_id="dub-btn", size="lg")
                    gr.HTML("""
                    <div style="color:#94a3b8;font-size:0.82rem;margin-top:0.5rem;padding:0.75rem;
                                background:rgba(6,182,212,0.05);border-radius:0.5rem;
                                border:1px solid rgba(6,182,212,0.15);">
                        <strong>How it works:</strong> Whisper transcribes → NLLB-200 translates →
                        Chatterbox TTS synthesizes → audio replaces original track.
                    </div>
                    """)

                with gr.Column(scale=1, min_width=440):
                    dub_output_video = gr.Video(label="Dubbed Video", format="mp4", autoplay=True,
                                                height=480, elem_id="dub-output-video", buttons=["download"])
                    # Read-only boxes filled from the values dub_video() returns.
                    with gr.Accordion("Details", open=False) as dub_details_acc:
                        dub_transcript_box = gr.Textbox(label="Detected Transcript", interactive=False, lines=4)
                        dub_translation_box = gr.Textbox(label="Translation", interactive=False, lines=4)
                        dub_status_box = gr.Textbox(label="Status", interactive=False, lines=2)

    gr.HTML("""
    <div class="as-footer">
        <strong>Models:</strong>
        <a href="https://huggingface.co/BadToBest/EchoMimicV3" target="_blank">EchoMimic V3</a>
        (Apache 2.0) &nbsp;·&nbsp;
        <a href="https://huggingface.co/ResembleAI/chatterbox" target="_blank">Chatterbox TTS</a>
        (MIT) &nbsp;·&nbsp;
        <a href="https://huggingface.co/openai/whisper-large-v3-turbo" target="_blank">Whisper Turbo</a>
        (MIT) &nbsp;·&nbsp;
        <a href="https://huggingface.co/facebook/nllb-200-distilled-600M" target="_blank">NLLB-200</a>
        (CC-BY-NC) &nbsp;·&nbsp;
        <strong>Space by:</strong>
        <a href="https://huggingface.co/lulavc" target="_blank">lulavc</a>
        &nbsp;·&nbsp; ZeroGPU &nbsp;·&nbsp; A10G
    </div>
    """)

    # ── Events ────────────────────────────────────────────────────────────────
    # The inputs list is passed positionally, so its order has to follow the
    # parameter order of generate().
    gen_btn.click(
        generate,
        inputs=[portrait, input_mode, text_input, tts_language,
                voice_ref, audio_upload, aspect_ratio, emotion,
                num_steps, guidance_scale, lang_selector],
        outputs=output_video,
    )

    # The language selector is passed along but _toggle_input_mode ignores it.
    input_mode.change(_toggle_input_mode, inputs=[input_mode, lang_selector],
                      outputs=[text_group, audio_group])

    # Inputs follow the parameter order of dub_video(); the four outputs receive
    # its return tuple (video path, transcript, translation, status).
    dub_btn.click(
        dub_video,
        inputs=[dub_video_input, dub_target_lang, dub_voice_ref, dub_emotion, lang_selector],
        outputs=[dub_output_video, dub_transcript_box, dub_translation_box, dub_status_box],
    )

    # Language switcher — 26 outputs, must match switch_language() return tuple order
    _lang_out = [
        # Phase 1 (16)
        portrait, input_mode, text_input, tts_language,
        voice_ref, emotion, audio_upload, aspect_ratio,
        adv_acc, num_steps, guidance_scale, gen_btn, examples_header,
        text_group, audio_group, output_video,
        # Phase 2 (10)
        dub_video_input, dub_target_lang, dub_voice_ref,
        dub_emotion, dub_btn, dub_output_video,
        dub_transcript_box, dub_translation_box,
        dub_status_box, dub_details_acc,
    ]
    lang_selector.change(switch_language, inputs=lang_selector, outputs=_lang_out)


if __name__ == "__main__":
    # Queue: at most 10 waiting requests, one running at a time by default.
    queue_options = dict(max_size=10, default_concurrency_limit=1)
    # Launch with the project theme and CSS, the MCP server on and SSR off.
    launch_options = dict(theme=THEME, css=CSS, mcp_server=True, ssr_mode=False)

    demo.queue(**queue_options)
    demo.launch(**launch_options)