Spaces:
Running on Zero
Running on Zero
| #!/usr/bin/env python3 | |
| """Avatar Generator — text-prompt-driven talking avatar. | |
| Pipeline (single Gradio Blocks app): | |
| 1. User provides a unified Dramabox-style prompt + optional voice reference | |
| + an avatar reference image. | |
| 2. Dramabox (LTX-2.3 audio branch + IC-LoRA, in-process, warm-loaded) turns | |
| the prompt into a watermarked WAV inside a @spaces.GPU window. | |
| 3. The WAV + image are sent to the deployed | |
| `victor/LongCat-Video-Avatar-1.5` Space via `gradio_client`, which | |
| returns the final lip-synced MP4. That step uses *its* GPU quota, not | |
| ours — keeping this Space's per-call GPU window tight. | |
| """ | |
| import logging | |
| import os | |
| import random | |
| import re | |
| import shutil | |
| import subprocess | |
| import sys | |
| import tempfile | |
| import time | |
| _MAX_SEED = 2**31 - 1 | |
| import gradio as gr | |
| from gradio_client import Client, handle_file | |
| import spaces | |
| sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")) | |
| from inference_server import TTSServer # noqa: E402 | |
| from model_downloader import get_all_paths # noqa: E402 | |
# ── Boot-time model load ─────────────────────────────────────────────────────
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

# Resolve checkpoint locations first; the server constructor below reads three
# entries ("transformer", "audio_components", "gemma_root") from this mapping.
logging.info("Fetching DramaBox checkpoints from HuggingFace (cached after first run)...")
PATHS = get_all_paths()

logging.info("Loading DramaBox warm server (Gemma + DiT + VAE + Decoder)...")
tts = TTSServer(
    device="cuda",
    dtype=os.environ.get("LTX_DTYPE", "bf16"),  # overridable via the LTX_DTYPE env var
    gemma_root=PATHS["gemma_root"],
    checkpoint=PATHS["transformer"],
    full_checkpoint=PATHS["audio_components"],
    bnb_4bit=True,
    compile_model=False,
)
logging.info("TTSServer ready.")

# Pre-warm the Perth watermarker on the tts instance. inference_server.py
# loads it lazily on the first watermark call ("loaded PerthNet (Implicit) at
# step 250,000" — ~9 s of disk + setup), which ate the entire GPU window on
# the first request. Loading it here moves that cost to boot time. Any failure
# is logged and ignored, in which case the first request pays the load cost.
try:
    import perth

    tts._perth = perth.PerthImplicitWatermarker()
except Exception as exc:
    logging.warning("Perth pre-warm skipped (%s); first request will pay the load cost.", exc)
else:
    logging.info("Perth watermarker pre-warmed.")
# ── Remote video pipeline ────────────────────────────────────────────────────
# We don't load LongCat-Video-Avatar locally: its weights are ~20 GB and
# loading both pipelines in one ZeroGPU process is fragile. Instead we proxy
# to the public Space via gradio_client. HF_TOKEN is forwarded so quota and
# queue priority are attributed to the caller, not anonymous traffic.
LONGCAT_SPACE = os.environ.get("LONGCAT_SPACE", "victor/LongCat-Video-Avatar-1.5")

# Process-wide cached client; stays None until a connection attempt succeeds.
_VIDEO_CLIENT: Client | None = None


def _build_video_client() -> Client | None:
    """Try to open a gradio_client connection to ``LONGCAT_SPACE``.

    Returns the connected ``Client``, or ``None`` when construction raises.
    The failure is logged as a warning rather than propagated, so a briefly
    unreachable LongCat Space does not take this Space down at boot; callers
    are expected to retry later.
    """
    logging.info(f"Connecting to {LONGCAT_SPACE} via gradio_client...")
    try:
        # gradio_client>=1.0 renamed `hf_token` -> `token`. Passing None when
        # no token is set is fine — gradio_client treats it as anonymous.
        connection = Client(LONGCAT_SPACE, token=os.environ.get("HF_TOKEN"))
    except Exception as err:
        logging.warning(f"Could not pre-warm video client at boot ({err}); will retry on first request.")
        return None
    return connection
def _video_client() -> Client:
    """Return the shared LongCat client, connecting on demand if necessary.

    If the boot-time pre-warm succeeded the cached client is returned as-is.
    Otherwise one more connection attempt is made; when that also fails a
    ``gr.Error`` is raised so the user sees a clean message. The pre-warm at
    module init saves ~0.5–2 s of TLS + /info handshake on the very first
    user request.
    """
    global _VIDEO_CLIENT
    if _VIDEO_CLIENT is not None:
        return _VIDEO_CLIENT

    _VIDEO_CLIENT = _build_video_client()
    if _VIDEO_CLIENT is None:
        # The retry failed as well — report it instead of returning None.
        raise gr.Error(f"Couldn't connect to {LONGCAT_SPACE}. Try again in a moment.")
    return _VIDEO_CLIENT


# Eager pre-warm so the first user request doesn't pay the gradio_client
# handshake against the LongCat Space.
_VIDEO_CLIENT = _build_video_client()
# ── Optional portrait generator (FLUX.2-klein-4B) ────────────────────────────
# Lazy: most users will arrive with their own photo and never touch this tab.
# Pre-warming a third remote client at boot would just slow startup for a
# feature only a fraction of users hit. /infer accepts an empty input_images
# list for pure text→image, or a single image dict for prompt+image editing.
FLUX_SPACE = os.environ.get("FLUX_SPACE", "black-forest-labs/FLUX.2-klein-4B")

# Process-wide cached client; created on first use by _flux_client().
_FLUX_CLIENT: Client | None = None


def _flux_client() -> Client:
    """Return the shared FLUX client, connecting on first use.

    The connection is cached in ``_FLUX_CLIENT`` after the first successful
    call. HF_TOKEN (if set) is forwarded to gradio_client.

    Raises:
        gr.Error: if the Space cannot be reached. The underlying exception is
            chained as ``__cause__`` so the server-side traceback keeps the
            real reason while the user still sees a short message.
    """
    global _FLUX_CLIENT
    if _FLUX_CLIENT is None:
        token = os.environ.get("HF_TOKEN")
        logging.info(f"Connecting to {FLUX_SPACE} via gradio_client...")
        try:
            _FLUX_CLIENT = Client(FLUX_SPACE, token=token)
        except Exception as e:
            raise gr.Error(f"Couldn't connect to {FLUX_SPACE}: {e}") from e
    return _FLUX_CLIENT
def generate_portrait(
    flux_prompt: str,
    flux_edit_image: str | None,
    progress=gr.Progress(track_tqdm=True),
):
    """Create or edit a portrait through the FLUX.2-klein-4B ``/infer`` endpoint.

    Uses the Distilled 4-step preset at 1024×1024 with a randomized seed.
    When ``flux_edit_image`` points at an existing file it is sent along and
    the prompt acts as an edit instruction; otherwise the image is generated
    from the prompt alone.

    Returns a 2-tuple: the generated image path, and a ``gr.Tabs`` update that
    selects the Upload tab so the new portrait shows up in the same component
    a user would upload one to. Pattern: multimodalart/wan-2-2-first-last-frame.

    Raises:
        gr.Error: if the prompt is empty or whitespace-only.
    """
    if not (flux_prompt and flux_prompt.strip()):
        raise gr.Error("Please describe the portrait you want.")

    progress(0.05, desc="Connecting to FLUX.2-klein-4B…")
    flux = _flux_client()

    # Edit mode requires a path that actually exists on disk; anything else
    # silently degrades to pure text→image.
    if flux_edit_image and os.path.exists(flux_edit_image):
        source_images = [{"image": handle_file(flux_edit_image)}]
        mode_desc = "Editing portrait"
    else:
        source_images = []
        mode_desc = "Generating portrait"
    progress(0.2, desc=f"{mode_desc} (FLUX.2-klein-4B, 4 steps)…")

    started = time.time()
    result = flux.predict(
        prompt=flux_prompt,
        input_images=source_images,
        mode_choice="Distilled (4 steps)",
        seed=0,
        randomize_seed=True,
        width=1024,
        height=1024,
        num_inference_steps=4,
        guidance_scale=1.0,
        prompt_upsampling=False,
        api_name="/infer",
    )
    logging.info(f"[flux] {time.time() - started:.2f}s -> {result}")

    # result is (image_dict, seed); image_dict has `path` (local cached copy
    # downloaded by gradio_client) and `url`. Tolerate a bare value as well.
    payload = result[0] if isinstance(result, (list, tuple)) else result
    if isinstance(payload, dict):
        image_path = payload.get("path")
    else:
        image_path = payload

    progress(1.0, desc="Done")
    return image_path, gr.Tabs(selected="portrait_upload")
def _video_prompt_from_script(script: str) -> str:
    """Derive a clean visual prompt for LongCat from the unified Dramabox script.

    Dramabox prompts wrap dialogue in quotes, e.g.
    `A shadowy villain speaks coldly, "You have entered my domain."` — the
    quoted text is what gets *spoken*, the lead-in is the *speaker
    description*. LongCat's prompt should describe the *visual*, so only the
    text before the first quote is kept. Both straight (") and curly (“ ”)
    double quotes count as the start of dialogue, so scripts pasted from a
    word processor are handled too.

    Punctuation that merely introduces the dialogue (``,`` ``;`` ``:``) is
    stripped from the lead-in. If the lead-in does not already mention the
    camera, ", speaking to the camera" is appended (after also dropping a
    trailing ``.``/``!``/``?``) to anchor the shot.

    Returns a neutral caption when the script is empty or has no lead-in
    before its first quote. A script with no quotes at all is used in full as
    the description.
    """
    fallback = "A person speaks expressively, looking at the camera."
    if not script or not script.strip():
        return fallback

    # Everything up to the first double quote of any flavour.
    lead_in = re.split('["\u201c\u201d]', script, maxsplit=1)[0]
    lead_in = lead_in.strip().rstrip(" \t,;:")
    if not lead_in:
        return fallback

    if "camera" in lead_in.lower():
        return lead_in

    # Anchor it to a portrait shot so LongCat doesn't reframe the avatar.
    # Drop sentence-final punctuation first so we never emit "sighs., speaking".
    stem = lead_in.rstrip(" \t,;:.!?")
    if not stem:
        return fallback
    return stem + ", speaking to the camera"
| # ── GPU window sizing (TTS step only — video runs on the remote Space) ────── | |
| _GPU_BASE_S = 10 | |
| _GPU_PER_SENTENCE_S = 1 | |
| _GPU_CAP_S = 110 | |
| def _count_sentences(prompt: str) -> int: | |
| if not prompt or not prompt.strip(): | |
| return 1 | |
| try: | |
| from text_chunker import split_sentences_outside_quotes | |
| n = len(split_sentences_outside_quotes(prompt)) | |
| except Exception: | |
| n = sum(1 for ch in prompt if ch in ".!?") | |
| return max(1, n) | |
| def _tts_gpu_duration( | |
| prompt: str, | |
| voice_ref: str | None, | |
| cfg: float, | |
| stg: float, | |
| steps: int, | |
| duration: float, | |
| seed: int, | |
| resolution: str, | |
| progress=None, | |
| ) -> int: | |
| # Denoise time scales with audio length × steps. Observed: ~0.012 s of | |
| # GPU per (sec of audio × step) at default settings; 0.05 here gives ~4× | |
| # safety margin. Base covers Gemma encode + VAE decode + watermark + save. | |
| needed = _GPU_BASE_S + float(duration) * int(steps) * 0.05 | |
| return max(_GPU_BASE_S, min(int(round(needed)) + 2, _GPU_CAP_S)) | |
# The ZeroGPU window is sized per call by _tts_gpu_duration, which takes the
# same arguments as this function.
@spaces.GPU(duration=_tts_gpu_duration)
def _run_tts(
    prompt: str,
    voice_ref: str | None,
    cfg: float,
    stg: float,
    steps: int,
    duration: float,
    seed: int,
    resolution: str,
    progress=gr.Progress(),
) -> str:
    """TTS step. Returns path to a watermarked .wav file of length `duration`.

    ``voice_ref`` is only forwarded when it names an existing file; otherwise
    the server is called without a reference clip. ``resolution`` is accepted
    for signature parity with ``_tts_gpu_duration`` and is not used here.

    The caller owns the returned file and is responsible for deleting it.

    Raises:
        gr.Error: if the prompt is empty or whitespace-only.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Prompt is empty.")
    progress(0.05, desc="Generating speech with Dramabox…")

    # mkstemp creates the file atomically (tempfile.mktemp is deprecated
    # because the name can be claimed by someone else before we write to it).
    # We only need the path, so the descriptor is closed straight away.
    # NOTE(review): assumes generate_to_file overwrites an existing empty file.
    fd, out_wav = tempfile.mkstemp(suffix=".wav", prefix="avgen_tts_")
    os.close(fd)

    t0 = time.time()
    try:
        tts.generate_to_file(
            prompt=prompt,
            output=out_wav,
            voice_ref=voice_ref if voice_ref and os.path.exists(voice_ref) else None,
            cfg_scale=float(cfg),
            stg_scale=float(stg),
            steps=int(steps),
            duration_multiplier=1.1,
            seed=int(seed),
            gen_duration=float(duration),
            ref_duration=10.0,
            denoise_ref=False,
        )
    except BaseException:
        # Don't leave an empty/partial wav behind when generation fails.
        try:
            os.remove(out_wav)
        except OSError:
            pass
        raise
    logging.info(f"[tts] {time.time() - t0:.2f}s -> {out_wav} (steps={int(steps)}, dur={float(duration):.1f}s)")
    return out_wav
_LONGCAT_VIDEO_SECONDS = 5.0  # LongCat /generate hardcodes NUM_FRAMES=125 @ 25fps


def _trim_video(src_mp4: str, duration: float) -> str:
    """Trim ``src_mp4`` to ``duration`` seconds and return the resulting path.

    Re-encodes (instead of `-c copy`) so the cut is sample-accurate regardless
    of keyframe layout — LongCat's mp4 is ~5 s so the re-encode is sub-second.

    Returns ``src_mp4`` unchanged when ``duration`` is within 0.05 s of the
    full clip length, or when ffmpeg fails for any reason (best effort: an
    untrimmed clip is better than no clip). On failure the partially written
    output file is removed.
    """
    if duration >= _LONGCAT_VIDEO_SECONDS - 0.05:
        return src_mp4  # already full length

    # mkstemp reserves the name atomically (tempfile.mktemp is deprecated and
    # race-prone). ffmpeg reopens the path itself and `-y` lets it overwrite
    # the empty placeholder, so the descriptor can be closed immediately.
    fd, out = tempfile.mkstemp(suffix=".mp4", prefix="avgen_trim_")
    os.close(fd)

    cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-i", src_mp4,
        "-t", f"{duration:.3f}",
        "-c:v", "libx264", "-preset", "veryfast", "-crf", "20",
        "-c:a", "aac", "-b:a", "128k",
        out,
    ]
    try:
        subprocess.run(cmd, check=True)
    except Exception as e:
        # Deliberately broad: a missing ffmpeg binary, a non-zero exit and a
        # bad argument should all degrade to "return the untrimmed clip".
        logging.warning(f"[trim] ffmpeg trim failed ({e}); returning untrimmed clip")
        try:
            os.remove(out)
        except OSError:
            pass
        return src_mp4
    return out
def generate_avatar(
    image_path: str,
    voice_ref: str | None,
    prompt: str,
    cfg: float,
    stg: float,
    steps: int,
    duration: float,
    seed: int,
    randomize_seed: bool,
    resolution: str,
    progress=gr.Progress(),
):
    """Run the full pipeline: script → speech (local) → talking-head video (remote).

    Steps:
      1. Validate that a portrait, a voice clip and a non-empty script exist.
      2. Pick the seed (random when ``randomize_seed`` is set or no seed was
         supplied).
      3. Synthesize the WAV with ``_run_tts``.
      4. Send portrait + WAV to the LongCat Space's ``/generate`` endpoint.
      5. Trim the returned clip when ``duration`` is shorter than LongCat's
         fixed output length.

    Returns ``(video_path, seed)`` so the UI can display the seed that was
    actually used.

    Raises:
        gr.Error: for missing inputs, or when the LongCat client cannot be
            created.
    """
    if not image_path:
        raise gr.Error("Please upload a reference portrait.")
    if not voice_ref:
        raise gr.Error("Please record or upload a voice clip (10+ seconds) to clone.")
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a script.")

    # Also randomize when no seed value arrived at all (e.g. an emptied Seed
    # field) — int(None) further down would otherwise raise a TypeError.
    if randomize_seed or seed is None:
        seed = random.randint(0, _MAX_SEED)
        logging.info(f"[seed] randomized -> {seed}")

    wav_path = _run_tts(prompt, voice_ref, cfg, stg, steps, duration, seed, resolution, progress)
    try:
        progress(0.55, desc="Generating talking-head video on LongCat-Video-Avatar…")
        client = _video_client()
        video_prompt = _video_prompt_from_script(prompt)
        logging.info(f"[video] prompt={video_prompt!r} resolution={resolution} seed={seed}")
        t0 = time.time()
        # Param order matches victor/LongCat-Video-Avatar-1.5 `generate(image_path,
        # audio_path, prompt, resolution, seed, vocal_mode, acceleration)`.
        # vocal_mode is forced to the fast path because our TTS output is already
        # clean studio audio — no need for vocal isolation. acceleration is the
        # 8-step DBCache faster preset which runs ~2× faster than exact 8-step at
        # negligible quality cost.
        result = client.predict(
            handle_file(image_path),
            handle_file(wav_path),
            video_prompt,
            resolution,
            int(seed),
            "Clean speech (fast)",
            "DBCache faster",
            api_name="/generate",
        )
        logging.info(f"[video] {time.time() - t0:.2f}s -> {result}")
    finally:
        # The WAV is only an intermediate artifact for the remote call; once
        # predict() has returned (or failed) nothing reads it again. Removing
        # it keeps a long-running Space from filling its temp directory.
        try:
            os.remove(wav_path)
        except OSError as e:
            logging.debug(f"[tts] could not remove temp wav {wav_path} ({e})")

    # The endpoint may hand back a dict describing the video or a bare path.
    if isinstance(result, dict):
        video_path = result.get("video") or result.get("path") or result
    else:
        video_path = result

    if duration < _LONGCAT_VIDEO_SECONDS:
        progress(0.95, desc=f"Trimming to {duration:.1f}s…")
        video_path = _trim_video(video_path, float(duration))
    progress(1.0, desc="Done")
    return video_path, seed
# ── UI ──────────────────────────────────────────────────────────────────────
_ASSETS = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")
_AVATARS_DIR = os.path.join(_ASSETS, "avatars")


def _a(name: str) -> str:
    """Return the path of the bundled avatar image ``name`` under assets/avatars."""
    return os.path.join(_AVATARS_DIR, name)


# Examples fill portrait + script only. Voice is the user's own, advanced
# settings keep their defaults. Each row is [portrait path, script]; every
# script keeps its dialogue inside balanced straight quotes, which is the
# convention the rest of this file relies on to separate speech from the
# speaker description.
EXAMPLES = [
    [
        _a("orc_warrior.png"),
        'A shadowy warlord speaks with cold menace, "You have entered my domain, mortal." '
        'He chuckles darkly, "Such arrogance will be your undoing."',
    ],
    [
        _a("photoreal_person.png"),
        'A radio host clears his throat, "Excuse me, pardon that." '
        'He settles into a warm, professional tone, "Good evening everyone."',
    ],
    [
        _a("character.png"),
        'A playful character already mid-giggle, "Hehehe, oh my gosh you should see your face!"',
    ],
]
# Theme inspired by victor/ace-step-jam: dark slate palette, Hanken Grotesk,
# tight radius, subtle frosted surfaces. ace-step-jam itself is a custom
# HTML/CSS frontend, so this is an approximation translated to a Gradio Blocks
# theme + minimal CSS — same vibe, way less surface area.
def _mirror_dark(tokens: dict[str, str]) -> dict[str, str]:
    """Return ``tokens`` plus a ``<name>_dark`` copy of every entry.

    The app looks the same in light and dark mode, so each theme variable and
    its ``_dark`` counterpart receive the same value.
    """
    mirrored = dict(tokens)
    mirrored.update({f"{name}_dark": value for name, value in tokens.items()})
    return mirrored


THEME = gr.themes.Soft(
    font=[gr.themes.GoogleFont("Hanken Grotesk"), "system-ui", "sans-serif"],
    text_size=gr.themes.sizes.text_md,
    radius_size=gr.themes.sizes.radius_sm,
    neutral_hue=gr.themes.colors.slate,
    secondary_hue=gr.themes.colors.slate,
    primary_hue=gr.themes.colors.slate,
).set(
    **_mirror_dark(
        {
            # Page
            "body_background_fill": "oklch(0.13 0.006 260)",
            "body_text_color": "rgba(255, 255, 255, 0.87)",
            # Surfaces and borders
            "background_fill_primary": "rgba(255, 255, 255, 0.04)",
            "background_fill_secondary": "rgba(255, 255, 255, 0.06)",
            "border_color_primary": "rgba(255, 255, 255, 0.08)",
            # Blocks
            "block_background_fill": "rgba(255, 255, 255, 0.04)",
            "block_border_color": "rgba(255, 255, 255, 0.08)",
            "block_label_background_fill": "transparent",
            "block_title_text_color": "rgba(255, 255, 255, 0.87)",
            # Inputs
            "input_background_fill": "rgba(255, 255, 255, 0.04)",
            "input_border_color": "rgba(255, 255, 255, 0.08)",
            # Primary button: light fill with dark text
            "button_primary_background_fill": "oklch(0.90 0.005 260)",
            "button_primary_background_fill_hover": "oklch(0.95 0.005 260)",
            "button_primary_text_color": "oklch(0.13 0.006 260)",
        }
    )
)
# Extra stylesheet passed to gr.Blocks(css=...) alongside THEME. It caps the
# page at 1180px and centres it, enables the "ss01"/"cv11" font features, and
# tunes the #hero heading/paragraph and primary-button typography.
CUSTOM_CSS = """
main, .gradio-container, .fillable:not(.fill_width) {
width: min(100%, 1180px) !important;
max-width: 1180px !important;
margin-left: auto !important;
margin-right: auto !important;
}
.gradio-container { font-feature-settings: "ss01", "cv11"; }
#hero h1 {
font-weight: 600;
letter-spacing: -0.02em;
margin-bottom: 0.25em;
}
#hero p { color: rgba(255, 255, 255, 0.55); margin-top: 0; }
.gr-button-primary {
letter-spacing: -0.01em;
font-weight: 600;
}
"""
# Single-page app: inputs in the left column, the resulting video on the
# right, examples underneath, then the two click handlers.
# NOTE(review): the nesting below was reconstructed from statement order —
# confirm the placement of voice_in and gr.Examples against the deployed layout.
with gr.Blocks(title="Avatar Generator", theme=THEME, css=CUSTOM_CSS) as demo:
    # elem_id="hero" is the hook for the #hero rules in CUSTOM_CSS.
    gr.Markdown(
        """
# Avatar Generator
SOTA Avatar generation with synthetic speech using [Dramabox](https://huggingface.co/ResembleAI/Dramabox) and [LongCat-Video-Avatar 1.5](https://huggingface.co/meituan-longcat/LongCat-Video-Avatar-1.5).
Upload/generate a portrait, clone your voice (or upload one), write a script — get a lip-synced talking-head✨
""",
        elem_id="hero",
    )
    with gr.Row():
        with gr.Column(scale=1):
            with gr.Row():
                # The tab ids are referenced by generate_portrait, which
                # selects "portrait_upload" once a portrait has been generated.
                with gr.Tabs() as portrait_tabs:
                    with gr.TabItem("Upload", id="portrait_upload"):
                        image_in = gr.Image(
                            label="Reference portrait",
                            type="filepath",
                            height=260,
                            sources=["upload", "clipboard"],
                        )
                    with gr.TabItem("Generate / edit", id="portrait_generate"):
                        flux_prompt = gr.Textbox(
                            info="Describe the portrait (or the edit, if you attach one below)",
                            placeholder=(
                                "e.g. A photorealistic portrait of an elderly fisherman with "
                                "weathered skin and a wool sweater, neutral studio backdrop"
                            ),
                            lines=1,
                        )
                        flux_edit_image = gr.Image(
                            label="optional: input image",
                            type="filepath",
                            height=160,
                            sources=["upload", "clipboard"],
                        )
                        flux_go = gr.Button("Generate portrait", variant="secondary")
                voice_in = gr.Audio(
                    label="Avatar voice",
                    type="filepath",
                    sources=["upload", "microphone"],
                )
            prompt = gr.Textbox(
                label="Script",
                value=(
                    'A confident announcer speaks proudly, "And now, the moment '
                    'you have all been waiting for." He chuckles knowingly, '
                    '"Heheh, trust me, this one is going to blow you away."'
                ),
                lines=4,
            )
            with gr.Accordion("Advanced", open=False):
                # LongCat's /generate API hardcodes 5 s of video output. We
                # can shorten by pacing TTS to the requested length + trimming
                # the returned mp4, but we can't go longer from a single call.
                duration_in = gr.Slider(
                    1.0, 5.0, value=5.0, step=0.5,
                    label="Output duration (seconds, max 5)",
                )
                with gr.Row():
                    resolution = gr.Radio(["480p", "720p"], value="480p", label="Resolution")
                with gr.Row():
                    seed = gr.Number(value=42, precision=0, label="Seed")
                    randomize_seed = gr.Checkbox(value=True, label="Randomize seed")
                # Default 22 trades ~25% of the TTS step for negligible quality
                # cost on typical short prompts; bump back toward 30 for the
                # cleanest output, drop toward 14 for fastest iteration.
                steps_in = gr.Slider(10, 40, value=22, step=1, label="TTS steps (Euler)")
                cfg = gr.Slider(1.0, 5.0, value=2.5, step=0.1, label="TTS CFG scale")
                stg = gr.Slider(0.0, 3.0, value=1.5, step=0.1, label="TTS STG scale")
            go = gr.Button("Generate avatar", variant="primary")
        with gr.Column(scale=1):
            video_out = gr.Video(label="Output", autoplay=True, height=420)
    # Examples only populate the portrait and the script (fn=None, nothing is
    # executed or cached); the user still has to supply a voice clip.
    gr.Examples(
        examples=EXAMPLES,
        inputs=[image_in, prompt],
        outputs=None,
        fn=None,
        cache_examples=False,
        examples_per_page=4,
        label="Script + portrait examples (then add your own voice above)",
    )
    # generate_portrait returns (image path, Tabs update), matching the two
    # outputs listed here.
    flux_go.click(
        generate_portrait,
        inputs=[flux_prompt, flux_edit_image],
        outputs=[image_in, portrait_tabs],
        show_progress="full",
    )
    # Input order must match generate_avatar's positional parameters. `seed`
    # is also an output so the seed that was actually used is written back.
    go.click(
        generate_avatar,
        inputs=[image_in, voice_in, prompt, cfg, stg, steps_in, duration_in, seed, randomize_seed, resolution],
        outputs=[video_out, seed],
    )

if __name__ == "__main__":
    # Queue holds at most 8 pending requests; show_error surfaces exceptions in the UI.
    demo.queue(max_size=8).launch(show_error=True)