#!/usr/bin/env python3 """Avatar Generator — text-prompt-driven talking avatar. Pipeline (single Gradio Blocks app): 1. User provides a unified Dramabox-style prompt + optional voice reference + an avatar reference image. 2. Dramabox (LTX-2.3 audio branch + IC-LoRA, in-process, warm-loaded) turns the prompt into a watermarked WAV inside a @spaces.GPU window. 3. The WAV + image are sent to the deployed `victor/LongCat-Video-Avatar-1.5` Space via `gradio_client`, which returns the final lip-synced MP4. That step uses *its* GPU quota, not ours — keeping this Space's per-call GPU window tight. """ import logging import os import random import re import shutil import subprocess import sys import tempfile import time _MAX_SEED = 2**31 - 1 import gradio as gr from gradio_client import Client, handle_file import spaces sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")) from inference_server import TTSServer # noqa: E402 from model_downloader import get_all_paths # noqa: E402 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") logging.info("Fetching DramaBox checkpoints from HuggingFace (cached after first run)...") PATHS = get_all_paths() logging.info("Loading DramaBox warm server (Gemma + DiT + VAE + Decoder)...") tts = TTSServer( checkpoint=PATHS["transformer"], full_checkpoint=PATHS["audio_components"], gemma_root=PATHS["gemma_root"], device="cuda", dtype=os.environ.get("LTX_DTYPE", "bf16"), compile_model=False, bnb_4bit=True, ) logging.info("TTSServer ready.") # Pre-warm Perth watermarker on the tts instance. inference_server.py loads # it lazily on the first watermark call ("loaded PerthNet (Implicit) at step # 250,000" — ~9 s of disk + setup), which ate the entire GPU window on the # first request. Pre-loading at boot pushes that cost off the hot path. 
try:
    import perth

    tts._perth = perth.PerthImplicitWatermarker()
except Exception as exc:
    # Best-effort: a missing or broken watermarker must not stop the Space
    # from booting; the lazy path simply pays the load cost later.
    logging.warning(f"Perth pre-warm skipped ({exc}); first request will pay the load cost.")
else:
    logging.info("Perth watermarker pre-warmed.")


# ── Remote video pipeline ────────────────────────────────────────────────────
# LongCat-Video-Avatar is not loaded locally: its weights are ~20 GB and
# keeping both pipelines in one ZeroGPU process is fragile. The public Space
# is called through gradio_client instead. HF_TOKEN is forwarded so quota and
# queue priority are attributed to the caller, not anonymous traffic.
LONGCAT_SPACE = os.environ.get("LONGCAT_SPACE", "victor/LongCat-Video-Avatar-1.5")
_VIDEO_CLIENT: Client | None = None


def _build_video_client() -> Client | None:
    """Try to open a gradio_client connection to the LongCat Space.

    Returns:
        A connected ``Client``, or ``None`` when construction raised (the
        failure is logged as a warning and never propagated).
    """
    hf_token = os.environ.get("HF_TOKEN")
    logging.info(f"Connecting to {LONGCAT_SPACE} via gradio_client...")
    try:
        # gradio_client>=1.0 renamed `hf_token` -> `token`. Passing None when
        # no token is set is fine — gradio_client treats it as anonymous.
        connection = Client(LONGCAT_SPACE, token=hf_token)
    except Exception as exc:
        # A briefly unreachable LongCat Space must not take this Space down
        # at boot; the caller retries lazily on the first request.
        logging.warning(f"Could not pre-warm video client at boot ({exc}); will retry on first request.")
        return None
    return connection


def _video_client() -> Client:
    """Return the shared LongCat client, connecting on demand if needed.

    The client pre-warmed at module init is reused when available (saving
    ~0.5–2 s of TLS + /info handshake on the first user request); otherwise
    one more connection attempt is made and cached in ``_VIDEO_CLIENT``.

    Raises:
        gr.Error: if the connection attempt fails again.
    """
    global _VIDEO_CLIENT
    if _VIDEO_CLIENT is not None:
        return _VIDEO_CLIENT

    _VIDEO_CLIENT = _build_video_client()
    if _VIDEO_CLIENT is None:
        # Still unreachable — surface a clean error to the user.
        raise gr.Error(f"Couldn't connect to {LONGCAT_SPACE}. Try again in a moment.")
    return _VIDEO_CLIENT


# Eager pre-warm so the first user request doesn't pay the gradio_client
# handshake against the LongCat Space.
_VIDEO_CLIENT = _build_video_client()


# ── Optional portrait generator (FLUX.2-klein-4B) ────────────────────────────
# Lazy: most users will arrive with their own photo and never touch this tab.
# Pre-warming a third remote client at boot would just slow startup for a
# feature only a fraction of users hit. /infer accepts an empty input_images
# list for pure text→image, or a single image dict for prompt+image editing.
FLUX_SPACE = os.environ.get("FLUX_SPACE", "black-forest-labs/FLUX.2-klein-4B")
_FLUX_CLIENT: Client | None = None


# ── Small private helpers ────────────────────────────────────────────────────
def _new_temp_path(suffix: str, prefix: str) -> str:
    """Reserve a unique temp file and return its path.

    Uses ``tempfile.mkstemp`` (the file is created atomically) instead of the
    deprecated, race-prone ``tempfile.mktemp``. The descriptor is closed right
    away because callers only need the path for a tool that writes to it.
    """
    fd, path = tempfile.mkstemp(suffix=suffix, prefix=prefix)
    os.close(fd)
    return path


def _remove_quietly(path: str | None) -> None:
    """Best-effort removal of a temp file; never raises."""
    if not path:
        return
    try:
        os.remove(path)
    except OSError as e:
        # Cleanup must never mask the real outcome of a request.
        logging.debug(f"[cleanup] could not remove {path} ({e})")


def _first_path(result, keys: tuple[str, ...]) -> str | None:
    """Extract a local file path from a gradio_client result.

    Accepts the shapes handled by this file: a bare path, a dict holding the
    path under one of ``keys`` (optionally nested one level as
    ``{"path": ...}``), or a list/tuple whose first element is one of those.

    Returns:
        The path as ``str``, or ``None`` when nothing usable was found.
    """
    if isinstance(result, (list, tuple)):
        result = result[0] if result else None
    if isinstance(result, dict):
        candidates = []
        for key in keys:
            value = result.get(key)
            if isinstance(value, dict):
                value = value.get("path")
            candidates.append(value)
    else:
        candidates = [result]
    for value in candidates:
        if isinstance(value, (str, os.PathLike)) and os.fspath(value):
            return os.fspath(value)
    return None


def _flux_client() -> Client:
    """Return the shared FLUX client, connecting on first use.

    Raises:
        gr.Error: if the Space cannot be reached (original exception chained).
    """
    global _FLUX_CLIENT
    if _FLUX_CLIENT is None:
        token = os.environ.get("HF_TOKEN")
        logging.info(f"Connecting to {FLUX_SPACE} via gradio_client...")
        try:
            _FLUX_CLIENT = Client(FLUX_SPACE, token=token)
        except Exception as e:
            raise gr.Error(f"Couldn't connect to {FLUX_SPACE}: {e}") from e
    return _FLUX_CLIENT


def generate_portrait(
    flux_prompt: str,
    flux_edit_image: str | None,
    progress=gr.Progress(track_tqdm=True),
):
    """Call FLUX.2-klein-4B /infer with the Distilled 4-step preset.

    With ``flux_edit_image`` set (and present on disk) this edits the existing
    portrait; without, it generates from scratch. Returns the generated image
    path *and* a gr.Tabs update so the Generate tab folds back to the Upload
    tab on completion (so the user sees the new portrait in the same component
    they'd upload one to). Pattern: multimodalart/wan-2-2-first-last-frame.

    Raises:
        gr.Error: on an empty prompt, a failed connection, or a response that
            carries no image path.
    """
    if not flux_prompt or not flux_prompt.strip():
        raise gr.Error("Please describe the portrait you want.")

    progress(0.05, desc="Connecting to FLUX.2-klein-4B…")
    client = _flux_client()

    images_arg = (
        [{"image": handle_file(flux_edit_image)}]
        if flux_edit_image and os.path.exists(flux_edit_image)
        else []
    )
    mode_desc = "Editing portrait" if images_arg else "Generating portrait"
    progress(0.2, desc=f"{mode_desc} (FLUX.2-klein-4B, 4 steps)…")

    t0 = time.time()
    result = client.predict(
        prompt=flux_prompt,
        input_images=images_arg,
        mode_choice="Distilled (4 steps)",
        seed=0,
        randomize_seed=True,
        width=1024,
        height=1024,
        num_inference_steps=4,
        guidance_scale=1.0,
        prompt_upsampling=False,
        api_name="/infer",
    )
    logging.info(f"[flux] {time.time() - t0:.2f}s -> {result}")

    # result is (image_dict, seed); image_dict has `path` (local cached copy
    # downloaded by gradio_client) and `url`.
    image_path = _first_path(result, ("path",))
    if not image_path:
        # Previously a missing path silently cleared the portrait component.
        raise gr.Error(f"{FLUX_SPACE} returned no image. Try again in a moment.")

    progress(1.0, desc="Done")
    return image_path, gr.Tabs(selected="portrait_upload")


_DEFAULT_VIDEO_PROMPT = "A person speaks expressively, looking at the camera."
# Straight quote plus the typographic pair that phones/word processors insert.
_DIALOGUE_QUOTE_RE = re.compile('["\u201c\u201d]')


def _video_prompt_from_script(script: str) -> str:
    """Derive a clean visual prompt for LongCat from the unified Dramabox script.

    Dramabox prompts wrap dialogue in quotes, e.g.
    `A shadowy villain speaks coldly, "You have entered my domain."` — the
    quoted text is what gets *spoken*, the lead-in is the *speaker
    description*. LongCat's prompt should describe the *visual*, so we keep
    the speaker description and drop the dialogue. Both straight and curly
    double quotes mark the start of dialogue. Falls back to a neutral caption
    if the script is empty or starts with a quote; a script with no quote at
    all is used whole.
    """
    if not script or not script.strip():
        return _DEFAULT_VIDEO_PROMPT
    # Take everything up to the first quote, dropping the punctuation that
    # introduced the dialogue (", " / ": " / "; ").
    head = _DIALOGUE_QUOTE_RE.split(script, maxsplit=1)[0].strip().rstrip(" ,:;")
    if not head:
        return _DEFAULT_VIDEO_PROMPT
    # Anchor it to a portrait shot so LongCat doesn't reframe the avatar.
    if "camera" not in head.lower():
        head += ", speaking to the camera"
    return head


# ── GPU window sizing (TTS step only — video runs on the remote Space) ──────
_GPU_BASE_S = 10
_GPU_PER_SENTENCE_S = 1
_GPU_CAP_S = 110


def _count_sentences(prompt: str) -> int:
    """Return the number of sentences in ``prompt`` (at least 1).

    Prefers the project's quote-aware splitter; if it cannot be imported or
    fails, falls back to counting ``.``, ``!`` and ``?`` characters.
    """
    if not prompt or not prompt.strip():
        return 1
    try:
        from text_chunker import split_sentences_outside_quotes

        n = len(split_sentences_outside_quotes(prompt))
    except Exception:
        # Deliberate best-effort: any splitter problem degrades to a rough count.
        n = sum(1 for ch in prompt if ch in ".!?")
    return max(1, n)


def _tts_gpu_duration(
    prompt: str,
    voice_ref: str | None,
    cfg: float,
    stg: float,
    steps: int,
    duration: float,
    seed: int,
    resolution: str,
    progress=None,
) -> int:
    """Size the @spaces.GPU window (seconds) for one ``_run_tts`` call.

    The signature mirrors ``_run_tts`` because it is passed as the
    ``duration=`` callable of ``spaces.GPU``; only ``steps`` and ``duration``
    are used. The result is clamped to ``[_GPU_BASE_S, _GPU_CAP_S]``.
    """
    # Denoise time scales with audio length × steps. Observed: ~0.012 s of
    # GPU per (sec of audio × step) at default settings; 0.05 here gives ~4×
    # safety margin. Base covers Gemma encode + VAE decode + watermark + save.
    needed = _GPU_BASE_S + float(duration) * int(steps) * 0.05
    return max(_GPU_BASE_S, min(int(round(needed)) + 2, _GPU_CAP_S))


@spaces.GPU(duration=_tts_gpu_duration)
def _run_tts(
    prompt: str,
    voice_ref: str | None,
    cfg: float,
    stg: float,
    steps: int,
    duration: float,
    seed: int,
    resolution: str,
    progress=gr.Progress(),
) -> str:
    """TTS step. Returns path to a watermarked .wav file of length `duration`.

    ``resolution`` is unused here; it is accepted so the signature matches
    ``_tts_gpu_duration``. The caller owns the returned temp file.

    Raises:
        gr.Error: if ``prompt`` is empty.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Prompt is empty.")

    progress(0.05, desc="Generating speech with Dramabox…")
    out_wav = _new_temp_path(suffix=".wav", prefix="avgen_tts_")
    t0 = time.time()
    try:
        tts.generate_to_file(
            prompt=prompt,
            output=out_wav,
            voice_ref=voice_ref if voice_ref and os.path.exists(voice_ref) else None,
            cfg_scale=float(cfg),
            stg_scale=float(stg),
            steps=int(steps),
            duration_multiplier=1.1,
            seed=int(seed),
            gen_duration=float(duration),
            ref_duration=10.0,
            denoise_ref=False,
        )
    except Exception:
        # Don't leave the reserved (empty) temp file behind on failure.
        _remove_quietly(out_wav)
        raise
    logging.info(f"[tts] {time.time() - t0:.2f}s -> {out_wav} (steps={int(steps)}, dur={float(duration):.1f}s)")
    return out_wav


_LONGCAT_VIDEO_SECONDS = 5.0  # LongCat /generate hardcodes NUM_FRAMES=125 @ 25fps


def _trim_video(src_mp4: str, duration: float) -> str:
    """Trim ``src_mp4`` to ``duration`` seconds.

    Re-encodes (instead of `-c copy`) so the cut is sample-accurate regardless
    of keyframe layout — LongCat's mp4 is ~5 s so the re-encode is sub-second.

    Returns:
        Path of the trimmed clip, or ``src_mp4`` unchanged when the requested
        duration is already (within 50 ms of) the full length or when ffmpeg
        fails.
    """
    if duration >= _LONGCAT_VIDEO_SECONDS - 0.05:
        return src_mp4  # already full length

    out = _new_temp_path(suffix=".mp4", prefix="avgen_trim_")
    cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-i", src_mp4,
        "-t", f"{duration:.3f}",
        "-c:v", "libx264", "-preset", "veryfast", "-crf", "20",
        "-c:a", "aac", "-b:a", "128k",
        out,
    ]
    try:
        subprocess.run(cmd, check=True)
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # Best-effort: a full-length clip is better than no clip at all.
        logging.warning(f"[trim] ffmpeg trim failed ({e}); returning untrimmed clip")
        _remove_quietly(out)
        return src_mp4
    return out


def generate_avatar(
    image_path: str,
    voice_ref: str | None,
    prompt: str,
    cfg: float,
    stg: float,
    steps: int,
    duration: float,
    seed: int,
    randomize_seed: bool,
    resolution: str,
    progress=gr.Progress(),
):
    """Full pipeline: script → Dramabox WAV → LongCat talking-head MP4.

    Returns:
        ``(video_path, seed)`` — the seed actually used is echoed back so the
        UI can display it after randomization.

    Raises:
        gr.Error: on a missing portrait, voice clip or script, when the
            LongCat Space is unreachable, or when it returns no video.
    """
    if not image_path:
        raise gr.Error("Please upload a reference portrait.")
    if not voice_ref:
        raise gr.Error("Please record or upload a voice clip (10+ seconds) to clone.")
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a script.")

    # A cleared Seed field arrives as None; treat it like "randomize" instead
    # of crashing on int(None) further down.
    if randomize_seed or seed is None:
        seed = random.randint(0, _MAX_SEED)
        logging.info(f"[seed] randomized -> {seed}")
    seed = int(seed)

    wav_path = _run_tts(prompt, voice_ref, cfg, stg, steps, duration, seed, resolution, progress)

    try:
        progress(0.55, desc="Generating talking-head video on LongCat-Video-Avatar…")
        client = _video_client()
        video_prompt = _video_prompt_from_script(prompt)
        logging.info(f"[video] prompt={video_prompt!r} resolution={resolution} seed={seed}")
        t0 = time.time()
        # Param order matches victor/LongCat-Video-Avatar-1.5 `generate(image_path,
        # audio_path, prompt, resolution, seed, vocal_mode, acceleration)`.
        # vocal_mode is forced to the fast path because our TTS output is already
        # clean studio audio — no need for vocal isolation. acceleration is the
        # 8-step DBCache faster preset which runs ~2× faster than exact 8-step at
        # negligible quality cost.
        result = client.predict(
            handle_file(image_path),
            handle_file(wav_path),
            video_prompt,
            resolution,
            seed,
            "Clean speech (fast)",
            "DBCache faster",
            api_name="/generate",
        )
    finally:
        # The WAV is only an intermediate upload; drop it whether or not the
        # remote call succeeded so temp files don't pile up per request.
        _remove_quietly(wav_path)
    logging.info(f"[video] {time.time() - t0:.2f}s -> {result}")

    video_path = _first_path(result, ("video", "path"))
    if not video_path:
        # Previously an unrecognised dict was passed on as the "path".
        raise gr.Error(f"{LONGCAT_SPACE} returned no video. Try again in a moment.")

    if duration < _LONGCAT_VIDEO_SECONDS:
        progress(0.95, desc=f"Trimming to {duration:.1f}s…")
        video_path = _trim_video(video_path, float(duration))

    progress(1.0, desc="Done")
    return video_path, seed


# ── UI ──────────────────────────────────────────────────────────────────────
_ASSETS = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")
_AVATARS_DIR = os.path.join(_ASSETS, "avatars")


def _a(name: str) -> str:
    """Return the path of a bundled avatar image inside ``assets/avatars``."""
    return os.path.join(_AVATARS_DIR, name)


# Examples fill portrait + script only. Voice is the user's own, advanced
# settings keep their defaults.
EXAMPLES = [
    [
        _a("orc_warrior.png"),
        'A shadowy warlord speaks with cold menace, "You have entered my domain, mortal." '
        'He chuckles darkly, "Such arrogance will be your undoing." ',
    ],
    [
        _a("photoreal_person.png"),
        # NOTE(review): the dialogue quote below is never closed — the script
        # looks truncated. Left as-is; confirm the intended ending.
        'A radio host clears his throat, "Excuse me, pardon that." '
        'He settles into a warm, professional tone, "Good evening everyone, ',
    ],
    [
        _a("character.png"),
        'A playful character already mid-giggle, "Hehehe, oh my gosh you should see your face!" ',
    ],
]

# Theme inspired by victor/ace-step-jam: dark slate palette, Hanken Grotesk,
# tight radius, subtle frosted surfaces. ace-step-jam itself is a custom
# HTML/CSS frontend, so this is an approximation translated to a Gradio Blocks
# theme + minimal CSS — same vibe, way less surface area.
# Every colour is set for both the light and the `_dark` variant so the app
# looks the same in either mode.
THEME = gr.themes.Soft(
    primary_hue=gr.themes.colors.slate,
    secondary_hue=gr.themes.colors.slate,
    neutral_hue=gr.themes.colors.slate,
    radius_size=gr.themes.sizes.radius_sm,
    text_size=gr.themes.sizes.text_md,
    font=[gr.themes.GoogleFont("Hanken Grotesk"), "system-ui", "sans-serif"],
).set(
    body_background_fill="oklch(0.13 0.006 260)",
    body_background_fill_dark="oklch(0.13 0.006 260)",
    body_text_color="rgba(255, 255, 255, 0.87)",
    body_text_color_dark="rgba(255, 255, 255, 0.87)",
    background_fill_primary="rgba(255, 255, 255, 0.04)",
    background_fill_primary_dark="rgba(255, 255, 255, 0.04)",
    background_fill_secondary="rgba(255, 255, 255, 0.06)",
    background_fill_secondary_dark="rgba(255, 255, 255, 0.06)",
    border_color_primary="rgba(255, 255, 255, 0.08)",
    border_color_primary_dark="rgba(255, 255, 255, 0.08)",
    block_background_fill="rgba(255, 255, 255, 0.04)",
    block_background_fill_dark="rgba(255, 255, 255, 0.04)",
    block_border_color="rgba(255, 255, 255, 0.08)",
    block_border_color_dark="rgba(255, 255, 255, 0.08)",
    block_label_background_fill="transparent",
    block_label_background_fill_dark="transparent",
    block_title_text_color="rgba(255, 255, 255, 0.87)",
    block_title_text_color_dark="rgba(255, 255, 255, 0.87)",
    input_background_fill="rgba(255, 255, 255, 0.04)",
    input_background_fill_dark="rgba(255, 255, 255, 0.04)",
    input_border_color="rgba(255, 255, 255, 0.08)",
    input_border_color_dark="rgba(255, 255, 255, 0.08)",
    button_primary_background_fill="oklch(0.90 0.005 260)",
    button_primary_background_fill_dark="oklch(0.90 0.005 260)",
    button_primary_background_fill_hover="oklch(0.95 0.005 260)",
    button_primary_background_fill_hover_dark="oklch(0.95 0.005 260)",
    button_primary_text_color="oklch(0.13 0.006 260)",
    button_primary_text_color_dark="oklch(0.13 0.006 260)",
)

CUSTOM_CSS = """
main, .gradio-container, .fillable:not(.fill_width) {
    width: min(100%, 1180px) !important;
    max-width: 1180px !important;
    margin-left: auto !important;
    margin-right: auto !important;
}
.gradio-container { font-feature-settings: "ss01", "cv11"; }
#hero h1 { font-weight: 600; letter-spacing: -0.02em; margin-bottom: 0.25em; }
#hero p { color: rgba(255, 255, 255, 0.55); margin-top: 0; }
.gr-button-primary { letter-spacing: -0.01em; font-weight: 600; }
"""

# NOTE(review): the nesting of the layout below was reconstructed from a
# whitespace-collapsed source — confirm which container each component
# belongs to (in particular `voice_in`, the TTS sliders and `gr.Examples`).
with gr.Blocks(title="Avatar Generator", theme=THEME, css=CUSTOM_CSS) as demo:
    gr.Markdown(
        """
        # Avatar Generator

        SOTA Avatar generation with synthetic speech using [Dramabox](https://huggingface.co/ResembleAI/Dramabox) and [LongCat-Video-Avatar 1.5](https://huggingface.co/meituan-longcat/LongCat-Video-Avatar-1.5).
        Upload/generate a portrait, clone your voice (or upload one), write a script — get a lip-synced talking-head✨
        """,
        elem_id="hero",
    )

    with gr.Row():
        with gr.Column(scale=1):
            with gr.Row():
                with gr.Tabs() as portrait_tabs:
                    with gr.TabItem("Upload", id="portrait_upload"):
                        image_in = gr.Image(
                            label="Reference portrait",
                            type="filepath",
                            height=260,
                            sources=["upload", "clipboard"],
                        )
                    with gr.TabItem("Generate / edit", id="portrait_generate"):
                        flux_prompt = gr.Textbox(
                            info="Describe the portrait (or the edit, if you attach one below)",
                            placeholder=(
                                "e.g. A photorealistic portrait of an elderly fisherman with "
                                "weathered skin and a wool sweater, neutral studio backdrop"
                            ),
                            lines=1,
                        )
                        flux_edit_image = gr.Image(
                            label="optional: input image",
                            type="filepath",
                            height=160,
                            sources=["upload", "clipboard"],
                        )
                        flux_go = gr.Button("Generate portrait", variant="secondary")
                voice_in = gr.Audio(
                    label="Avatar voice",
                    type="filepath",
                    sources=["upload", "microphone"],
                )

            prompt = gr.Textbox(
                label="Script",
                value=(
                    'A confident announcer speaks proudly, "And now, the moment '
                    'you have all been waiting for." He chuckles knowingly, '
                    '"Heheh, trust me, this one is going to blow you away."'
                ),
                lines=4,
            )

            with gr.Accordion("Advanced", open=False):
                # LongCat's /generate API hardcodes 5 s of video output. We
                # can shorten by pacing TTS to the requested length + trimming
                # the returned mp4, but we can't go longer from a single call.
                duration_in = gr.Slider(
                    1.0, 5.0, value=5.0, step=0.5,
                    label="Output duration (seconds, max 5)",
                )
                with gr.Row():
                    resolution = gr.Radio(["480p", "720p"], value="480p", label="Resolution")
                with gr.Row():
                    seed = gr.Number(value=42, precision=0, label="Seed")
                    randomize_seed = gr.Checkbox(value=True, label="Randomize seed")
                # Default 22 trades ~25% of the TTS step for negligible quality
                # cost on typical short prompts; bump back toward 30 for the
                # cleanest output, drop toward 14 for fastest iteration.
                steps_in = gr.Slider(10, 40, value=22, step=1, label="TTS steps (Euler)")
                cfg = gr.Slider(1.0, 5.0, value=2.5, step=0.1, label="TTS CFG scale")
                stg = gr.Slider(0.0, 3.0, value=1.5, step=0.1, label="TTS STG scale")

            go = gr.Button("Generate avatar", variant="primary")

        with gr.Column(scale=1):
            video_out = gr.Video(label="Output", autoplay=True, height=420)

    # Click-to-fill examples only (no fn / caching): they populate the
    # portrait and script inputs and nothing else.
    gr.Examples(
        examples=EXAMPLES,
        inputs=[image_in, prompt],
        outputs=None,
        fn=None,
        cache_examples=False,
        examples_per_page=4,
        label="Script + portrait examples (then add your own voice above)",
    )

    flux_go.click(
        generate_portrait,
        inputs=[flux_prompt, flux_edit_image],
        outputs=[image_in, portrait_tabs],
        show_progress="full",
    )

    go.click(
        generate_avatar,
        inputs=[image_in, voice_in, prompt, cfg, stg, steps_in, duration_in,
                seed, randomize_seed, resolution],
        outputs=[video_out, seed],
    )


if __name__ == "__main__":
    demo.queue(max_size=8).launch(show_error=True)