IgorCSIS
Stage 2 phase 2: add x2t_image, image_edit, video_edit with shared _run_lance_task helper
f32613e | # Copyright 2026 Igor Lima. Licensed under Apache 2.0 (see LICENSE_LANCE for the | |
| # upstream Lance license; this file is original work by Igor Lima). | |
| # | |
| # nifty-lab: ZeroGPU adapter for ByteDance's Lance unified multimodal model. | |
| # | |
| # WHAT THIS IS | |
| # ------------ | |
| # A small Gradio app that runs on Hugging Face Spaces with the ZeroGPU runtime. | |
| # All of the heavy model-loading and inference logic lives in | |
| # `lance_gradio_t2v_v2t.py`, which is a verbatim copy of ByteDance's reference | |
| # Gradio script. This file wraps their `LanceT2VV2TPipeline` class for | |
| # ZeroGPU's on-demand GPU lifecycle. | |
| # | |
| # WHY WE DIDN'T JUST RUN THEIR SCRIPT AS-IS | |
| # ----------------------------------------- | |
| # Their script assumes a long-running dedicated GPU. They wrap inference in a | |
| # `PipelinePool` that holds the GPU across requests via threads. ZeroGPU works | |
| # on a per-request claim model: you request a GPU when you call a function | |
| # decorated with `@spaces.GPU`, and the GPU is released when the function | |
| # returns. No persistent pool. So we instantiate ONE pipeline, decorate the | |
| # inference entry points, and trust ZeroGPU to schedule the GPU for us. | |
| # | |
| # STAGE 1 SCOPE | |
| # ------------- | |
| # Only the two tasks their reference script supports out of the box: | |
| # 1. text-to-video (TASK_T2V) | |
| # 2. video understanding / Q&A (TASK_X2T_VIDEO) | |
| # | |
| # Stage 2 will add t2i, image_edit, video_edit, and x2t_image as new tabs. | |
| # Those tasks need a separate model variant (Lance_3B for image work) and | |
| # additional plumbing in the pipeline, so we ship stage 1 first to prove the | |
| # ZeroGPU pattern works end-to-end before expanding. | |
| from __future__ import annotations | |
| # ----------------------------------------------------------------------------- | |
| # CUDA runtime preload. MUST run before any transformers import. | |
| # ----------------------------------------------------------------------------- | |
| # Why this exists: recent transformers versions eagerly import flash_attn | |
| # inside modeling_utils.py, which dynamically links against libcudart.so.12 | |
| # at module load. On ZeroGPU, the GPU (and thus its CUDA libs) is only | |
| # attached during @spaces.GPU calls, so libcudart isn't on the linker | |
| # path at boot. Importing torch first doesn't help because torch only | |
| # preloads CUDA when it detects a live GPU device. | |
| # | |
| # We work around it by hunting for libcudart.so.12 in the pip-installed | |
| # nvidia and torch wheels and ctypes-loading it with RTLD_GLOBAL so the | |
| # symbols are visible to anyone who dlopens later. | |
| import ctypes | |
| import glob | |
| import os | |
| import sys | |
| def _preload_cuda_runtime() -> None: | |
| """Find libcudart.so.12 in pip-installed nvidia/torch wheels and preload it.""" | |
| candidates: list[str] = [] | |
| # nvidia-cuda-runtime-cu12 pip package | |
| candidates += glob.glob("/usr/local/lib/python*/site-packages/nvidia/cuda_runtime/lib/libcudart.so.12*") | |
| # Sometimes torch's bundled CUDA libs are loadable too, as a fallback | |
| candidates += glob.glob("/usr/local/lib/python*/site-packages/torch/lib/libcudart.so.12*") | |
| for path in candidates: | |
| try: | |
| ctypes.CDLL(path, mode=ctypes.RTLD_GLOBAL) | |
| print(f"[boot] preloaded {path}", flush=True) | |
| return | |
| except OSError as exc: | |
| print(f"[boot] failed to preload {path}: {exc}", flush=True) | |
| print("[boot] WARNING: no libcudart.so.12 found to preload, flash_attn import will fail", flush=True) | |
| _preload_cuda_runtime() | |
| # Now the rest of the imports are safe. | |
| from pathlib import Path | |
| from typing import Optional | |
| import torch # noqa: F401 imported early so its CUDA env init runs before lance code | |
| import gradio as gr | |
| import spaces # Hugging Face ZeroGPU runtime decorator | |
| from huggingface_hub import snapshot_download | |
| # ============================================================================= | |
| # Model weight download | |
| # ============================================================================= | |
| # We pull Lance_3B_Video at module-load time (not on first request) for two | |
| # reasons: | |
| # 1. It only needs CPU + network, which we have at Space startup. ZeroGPU | |
| # does NOT give us a GPU at module load. | |
| # 2. The download is ~6GB and would otherwise consume most of the 300s GPU | |
| # budget on the first user request. | |
| # | |
| # After the first boot, Hugging Face Spaces caches the files in the Space's | |
| # persistent storage, so subsequent boots skip re-downloading. | |
REPO_ID = "bytedance-research/Lance"
WEIGHTS_ROOT = Path("downloads")


def fetch_weights() -> None:
    """Pull the weights Lance needs at runtime from the Hugging Face Hub.

    Everything is fetched into WEIGHTS_ROOT with a single snapshot_download
    call restricted by allow_patterns:

    - Lance_3B_Video/   The main multimodal model checkpoint (multi-GB).
    - Qwen2.5-VL-ViT/   The vision transformer Lance imports from Qwen2.5-VL.
                        Lance's code hardcodes this path and expects
                        `config.json` + `vit.safetensors` inside it.
    - Wan2.2*           The Wan2.2 video VAE checkpoint at the repo root.
    - config.json       The small top-level config, included defensively.

    If a pattern matches nothing in the repo, snapshot_download skips it
    silently. This function therefore logs whether the ViT directory and the
    VAE checkpoint actually landed, and leaves the hard failure to
    initialize(), which will name the missing artifact.

    Interrupted downloads are resumed by huggingface_hub on its own, so the
    deprecated `resume_download` flag is intentionally not passed.
    """
    print(f"[boot] downloading {REPO_ID} (Lance_3B_Video + Qwen2.5-VL-ViT + Wan2.2 VAE)...", flush=True)
    snapshot_download(
        repo_id=REPO_ID,
        local_dir=str(WEIGHTS_ROOT),
        allow_patterns=[
            "Lance_3B_Video/*",
            "Lance_3B_Video/**/*",
            "Qwen2.5-VL-ViT/*",
            "Qwen2.5-VL-ViT/**/*",
            # Wan2.2 video VAE checkpoint, used by WanVideoVAE during pipeline
            # initialize(). Lives at the top of the repo (no subdir).
            "Wan2.2*",
            # Top-level config.json (901 bytes). Tiny, defensively included
            # in case Lance's code reads it during init.
            "config.json",
        ],
    )

    # Sanity check: log what showed up. If a required artifact is missing,
    # the next runtime error will tell us exactly what to add to allow_patterns.
    vit_dir = WEIGHTS_ROOT / "Qwen2.5-VL-ViT"
    if vit_dir.exists():
        entry_count = sum(1 for _ in vit_dir.iterdir())
        print(f"[boot] Qwen2.5-VL-ViT/ landed with {entry_count} files", flush=True)
    else:
        print("[boot] WARNING: Qwen2.5-VL-ViT/ NOT in the Lance repo, model init will fail", flush=True)

    wan_path = WEIGHTS_ROOT / "Wan2.2_VAE.pth"
    if wan_path.exists():
        size_mb = wan_path.stat().st_size / (1024 * 1024)
        print(f"[boot] Wan2.2_VAE.pth landed at {size_mb:.0f} MB", flush=True)
    else:
        print("[boot] WARNING: Wan2.2_VAE.pth NOT in the Lance repo, VAE init will fail", flush=True)
    print("[boot] weight download complete.", flush=True)


# Run download at import time. On HF Spaces this happens once when the Space
# starts up; the user sees "Starting..." in the dashboard while it runs.
fetch_weights()
| # ============================================================================= | |
| # Pipeline import (must come AFTER fetch_weights so the import-time checks | |
| # in their script find the downloaded files where they expect them) | |
| # ============================================================================= | |
| from lance_gradio_t2v_v2t import ( | |
| LanceT2VV2TPipeline, | |
| TASK_T2V, | |
| TASK_X2T_VIDEO, | |
| DEFAULT_HEIGHT, | |
| DEFAULT_WIDTH, | |
| DEFAULT_NUM_FRAMES, | |
| DEFAULT_TIMESTEPS, | |
| DEFAULT_TIMESTEP_SHIFT, | |
| DEFAULT_CFG_TEXT_SCALE, | |
| DEFAULT_RESOLUTION, | |
| DEFAULT_BASIC_SEED, | |
| VIDEO_RESOLUTION_CHOICES, | |
| ) | |
| # A single pipeline instance. ZeroGPU only ever hands us one GPU at a time, | |
| # so the PipelinePool from the upstream script (which manages multiple GPUs | |
| # via threads) isn't relevant here. We instantiate now but defer actual | |
| # model loading: `LanceT2VV2TPipeline.generate()` calls `self.initialize()` | |
| # lazily on its first invocation, so the model loads the first time a user | |
| # hits the GPU. After that, subsequent calls reuse the loaded model | |
| # (as long as the Space hasn't gone cold). | |
# Module-level singleton: every entry point below goes through this instance.
PIPELINE = LanceT2VV2TPipeline(device_id=0)
| # ============================================================================= | |
| # ZeroGPU-decorated entry points | |
| # ============================================================================= | |
# One entry point per task instead of a single function routed by `task`,
# because each task has a different duration budget. Text-to-video can run
# close to the 300s cap on 50 frames at 480p. Video understanding finishes in
# 20-60s. Declaring tighter durations lets ZeroGPU schedule short tasks more
# aggressively and means the user's daily quota covers more requests.
@spaces.GPU(duration=240)
def run_text_to_video(
    prompt: str,
    seed: int,
    resolution: str,
    num_frames: int,
    height: int,
    width: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    Generate a short video from a text prompt.

    Returns a 4-tuple matching the upstream pipeline shape:
        (video_path, text_result, status_markdown, run_logs)
    For t2v, video_path is the produced clip, text_result is empty.

    Numeric arguments arrive from Gradio widgets and are coerced to the
    int/float types the pipeline expects before being forwarded.

    Duration is set to 240s to allow proper-quality generation at 480p
    with 50 frames and 30 denoising steps. Each call costs ~3-4 minutes
    of the signed-in user's daily ZeroGPU budget (25 min/day on HF Pro),
    so a Pro user gets ~6-8 high-quality clips per day. ZeroGPU's hard
    per-call cap is 300s; we leave 60s of headroom under that.
    """
    return PIPELINE.generate(
        task=TASK_T2V,
        prompt=prompt,
        # The video/question slots belong to the understanding task only.
        input_video=None,
        question="",
        height=int(height),
        width=int(width),
        num_frames=int(num_frames),
        seed=int(seed),
        resolution=resolution,
        validation_num_timesteps=int(validation_num_timesteps),
        validation_timestep_shift=float(validation_timestep_shift),
        cfg_text_scale=float(cfg_text_scale),
    )
| def _run_lance_task( | |
| task_name: str, | |
| payload: dict, | |
| height: int, | |
| width: int, | |
| num_frames: int, | |
| seed: int, | |
| resolution: str, | |
| validation_num_timesteps: int, | |
| validation_timestep_shift: float, | |
| cfg_text_scale: float, | |
| output_kind: str, | |
| ): | |
| """ | |
| Generic Lance task runner. Builds inference args, runs the model, | |
| extracts the result. | |
| Args: | |
| task_name: one of "t2i", "t2v", "x2t_image", "x2t_video", | |
| "image_edit", "video_edit". This drives Lance's | |
| task-specific dataset routing inside ValidationDataset. | |
| payload: the dict that gets written to the prompt JSON file. Shape | |
| varies per task; build it in the task-specific wrapper | |
| using the patterns from config/examples/*_example.json. | |
| output_kind: "image", "video", or "text". Decides which kind of | |
| file we glob for at the end and how we report status. | |
| We sidestep the upstream `LanceT2VV2TPipeline.generate()` because it | |
| hardcodes t2v + x2t_video. Everything else here mirrors that method's | |
| structure: clone the base args, override the request-specific bits, | |
| call _build_request_batch, then validate_on_fixed_batch. | |
| """ | |
| import json as _json | |
| import time as _time | |
| from copy import deepcopy as _deepcopy | |
| from datetime import datetime as _datetime | |
| import torch as _torch | |
| from inference_lance import ( | |
| validate_on_fixed_batch as _validate_on_fixed_batch, | |
| save_prompt_results as _save_prompt_results, | |
| clean_memory as _clean_memory, | |
| ) | |
| from lance_gradio_t2v_v2t import ( | |
| TEXT_TEMPLATE as _TEXT_TEMPLATE, | |
| TMP_INPUT_DIR as _TMP_INPUT_DIR, | |
| RESULTS_ROOT as _RESULTS_ROOT, | |
| ensure_dirs as _ensure_dirs, | |
| extract_text_result as _extract_text_result, | |
| ) | |
| PIPELINE.initialize() | |
| _ensure_dirs() | |
| timestamp = _datetime.now().strftime("%Y%m%d_%H%M%S_%f") | |
| # Pretty-print JSON so the dataset loader's _read_jsonl line-by-line | |
| # parse FAILS and falls through to the json.load + dict-transform path | |
| # that produces the {"data": ..., "index": ...} records the samplers | |
| # expect. See validation_dataset.py ~line 84. | |
| prompt_file = _TMP_INPUT_DIR / f"{task_name}_{timestamp}.json" | |
| prompt_file.write_text(_json.dumps(payload, ensure_ascii=False, indent=2)) | |
| save_dir = _RESULTS_ROOT / f"{task_name}_{timestamp}" | |
| save_dir.mkdir(parents=True, exist_ok=True) | |
| request_model_args = _deepcopy(PIPELINE.base_model_args) | |
| request_model_args.cfg_text_scale = float(cfg_text_scale) | |
| request_data_args = _deepcopy(PIPELINE.base_data_args) | |
| request_data_args.val_dataset_config_file = str(prompt_file) | |
| request_inference_args = _deepcopy(PIPELINE.base_inference_args) | |
| request_inference_args.validation_num_timesteps = int(validation_num_timesteps) | |
| request_inference_args.validation_timestep_shift = float(validation_timestep_shift) | |
| request_inference_args.validation_data_seed = int(seed) | |
| request_inference_args.validation_noise_seed = int(seed) | |
| request_inference_args.video_height = int(height) | |
| request_inference_args.video_width = int(width) | |
| request_inference_args.num_frames = int(num_frames) | |
| request_inference_args.resolution = resolution | |
| request_inference_args.save_path_gen = str(save_dir) | |
| request_inference_args.task = task_name | |
| request_inference_args.text_template = _TEXT_TEMPLATE | |
| request_inference_args.prompt_data_dict = {} | |
| val_data_cpu = PIPELINE._build_request_batch( | |
| prompt_file=prompt_file, | |
| model_args=request_model_args, | |
| data_args=request_data_args, | |
| inference_args=request_inference_args, | |
| ) | |
| print( | |
| f"[app] {task_name} start | size={height}x{width} | " | |
| f"frames={num_frames} | steps={validation_num_timesteps}", | |
| flush=True, | |
| ) | |
| start = _time.perf_counter() | |
| with PIPELINE._generate_lock: | |
| _torch.cuda.set_device(PIPELINE.device) | |
| _validate_on_fixed_batch( | |
| fsdp_model=PIPELINE.model, | |
| vae_model=PIPELINE.vae_model, | |
| tokenizer=PIPELINE.tokenizer, | |
| val_data_cpu=val_data_cpu, | |
| training_args=request_inference_args, | |
| model_args=request_model_args, | |
| inference_args=request_inference_args, | |
| new_token_ids=PIPELINE.new_token_ids, | |
| image_token_id=PIPELINE.image_token_id, | |
| device=PIPELINE.device, | |
| save_source_video=False, | |
| save_path_gen=str(save_dir), | |
| save_path_gt="", | |
| ) | |
| _save_prompt_results( | |
| request_inference_args.prompt_data_dict, | |
| str(save_dir), | |
| PIPELINE.logger, | |
| ) | |
| _clean_memory() | |
| elapsed = _time.perf_counter() - start | |
| # Result extraction by output kind. The output 4-tuple matches the | |
| # shape the upstream pipeline returns so the frontend handlers stay | |
| # uniform: (video_path, text_result, status_md, logs). | |
| if output_kind == "image": | |
| files = sorted(save_dir.glob("*.png")) + sorted(save_dir.glob("*.jpg")) | |
| if not files: | |
| return None, "", f"Inference completed but no image in {save_dir}", "" | |
| return str(files[0]), "", f"Done in {elapsed:.1f}s", "" | |
| if output_kind == "video": | |
| files = sorted(save_dir.glob("*.mp4")) | |
| if not files: | |
| return None, "", f"Inference completed but no video in {save_dir}", "" | |
| return str(files[0]), "", f"Done in {elapsed:.1f}s", "" | |
| if output_kind == "text": | |
| text = _extract_text_result(save_dir) | |
| if not text: | |
| return None, "", f"Inference completed but no text result in {save_dir}", "" | |
| return None, text, f"Done in {elapsed:.1f}s", "" | |
| return None, "", f"Unknown output_kind: {output_kind}", "" | |
| def _image_inference( | |
| prompt: str, | |
| height: int, | |
| width: int, | |
| seed: int, | |
| validation_num_timesteps: int, | |
| validation_timestep_shift: float, | |
| cfg_text_scale: float, | |
| ): | |
| """Text-to-image. Thin wrapper around _run_lance_task.""" | |
| prompt = (prompt or "").strip() | |
| if not prompt: | |
| return None, "", "Please enter a prompt.", "" | |
| return _run_lance_task( | |
| task_name="t2i", | |
| payload={"000000.png": prompt}, | |
| height=int(height), | |
| width=int(width), | |
| num_frames=1, | |
| seed=int(seed), | |
| resolution="image_768res", | |
| validation_num_timesteps=int(validation_num_timesteps), | |
| validation_timestep_shift=float(validation_timestep_shift), | |
| cfg_text_scale=float(cfg_text_scale), | |
| output_kind="image", | |
| ) | |
@spaces.GPU(duration=90)
def run_text_to_image(
    prompt: str,
    seed: int,
    height: int,
    width: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    ZeroGPU wrapper for text-to-image. 90s budget; image gen with Lance
    at 768res / 30 steps lands in 20 to 40 seconds on H200.

    Coerces the Gradio widget values to int/float and delegates to
    _image_inference, whose (media_path, text_result, status_md, logs)
    4-tuple is returned unchanged. Note the parameter order here follows
    the UI wiring (seed before height/width), while _image_inference is
    called by keyword.
    """
    return _image_inference(
        prompt=prompt,
        height=int(height),
        width=int(width),
        seed=int(seed),
        validation_num_timesteps=int(validation_num_timesteps),
        validation_timestep_shift=float(validation_timestep_shift),
        cfg_text_scale=float(cfg_text_scale),
    )
@spaces.GPU(duration=120)
def run_video_understanding(
    input_video: str,
    question: str,
    seed: int,
):
    """
    Answer a question about an uploaded video clip.

    Returns the same 4-tuple shape. For understanding, video_path is empty
    and text_result holds the model's answer.

    Most of the pipeline.generate() arguments are unused for understanding
    tasks, but the signature requires all of them, so we pass defaults.

    NOTE(review): the 120s ZeroGPU budget is an estimate (20-60s of
    inference plus headroom for a cold model load), not a measurement;
    tune it once real timings are available.
    """
    return PIPELINE.generate(
        task=TASK_X2T_VIDEO,
        prompt="",
        input_video=input_video,
        question=question,
        height=DEFAULT_HEIGHT,
        width=DEFAULT_WIDTH,
        num_frames=DEFAULT_NUM_FRAMES,
        seed=int(seed),
        resolution=DEFAULT_RESOLUTION,
        validation_num_timesteps=DEFAULT_TIMESTEPS,
        validation_timestep_shift=DEFAULT_TIMESTEP_SHIFT,
        cfg_text_scale=DEFAULT_CFG_TEXT_SCALE,
    )
@spaces.GPU(duration=120)
def run_image_understanding(
    input_image: str,
    question: str,
    seed: int,
):
    """
    Answer a question about an uploaded image. Same shape as
    run_video_understanding but operates on a single image.

    The interleave_array payload format comes from Lance's
    config/examples/x2t_image_example.json.

    A missing image or an empty question returns a status message without
    running inference. Otherwise the request runs as a single-frame
    "x2t_image" task with the default sampler settings, and the answer comes
    back in the text slot of (media_path, text_result, status_md, logs).

    NOTE(review): the 120s ZeroGPU budget is an estimate with headroom for
    a cold model load, not a measurement; tune it once timings are known.
    """
    question = (question or "").strip()
    if not input_image:
        return None, "", "Please upload an image.", ""
    if not question:
        return None, "", "Please enter a question.", ""

    payload = {
        "0001": {
            "interleave_array": [
                input_image,
                # Instruction, user question, and an empty slot marked as
                # the target below.
                ["Look at the image carefully and answer the question.", question, ""],
            ],
            "element_dtype_array": ["image", "text"],
            "istarget_in_interleave": [0, 1],
        }
    }
    return _run_lance_task(
        task_name="x2t_image",
        payload=payload,
        height=DEFAULT_HEIGHT,
        width=DEFAULT_WIDTH,
        num_frames=1,
        seed=int(seed),
        resolution="image_768res",
        validation_num_timesteps=DEFAULT_TIMESTEPS,
        validation_timestep_shift=DEFAULT_TIMESTEP_SHIFT,
        cfg_text_scale=DEFAULT_CFG_TEXT_SCALE,
        output_kind="text",
    )
@spaces.GPU(duration=120)
def run_image_edit(
    input_image: str,
    prompt: str,
    seed: int,
    height: int,
    width: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    Edit an input image with a text instruction. Payload format from
    Lance's config/examples/image_edit_example.json: text prompt first,
    then the source image twice (Lance's pipeline expects source AND
    target slots; the second slot is what the model fills in).

    A missing image or an empty instruction returns a status message
    without running inference. Otherwise the request runs as a single-frame
    "image_edit" task at the "image_768res" preset and returns
    (media_path, text_result, status_md, logs) with the edited image path
    in the first slot.

    NOTE(review): the 120s ZeroGPU budget is an estimate with headroom for
    a cold model load, not a measurement; tune it once timings are known.
    """
    prompt = (prompt or "").strip()
    if not input_image:
        return None, "", "Please upload an image.", ""
    if not prompt:
        return None, "", "Please enter an edit instruction.", ""

    payload = {
        "0001": {
            "interleave_array": [prompt, input_image, input_image],
            "element_dtype_array": ["text", "image", "image"],
            # Only the last slot is a generation target.
            "istarget_in_interleave": [0, 0, 1],
        }
    }
    return _run_lance_task(
        task_name="image_edit",
        payload=payload,
        height=int(height),
        width=int(width),
        num_frames=1,
        seed=int(seed),
        resolution="image_768res",
        validation_num_timesteps=int(validation_num_timesteps),
        validation_timestep_shift=float(validation_timestep_shift),
        cfg_text_scale=float(cfg_text_scale),
        output_kind="image",
    )
@spaces.GPU(duration=240)
def run_video_edit(
    input_video: str,
    prompt: str,
    seed: int,
    height: int,
    width: int,
    num_frames: int,
    resolution: str,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    Edit an input video with a text instruction. Same shape as
    image_edit but with video element dtypes. 240s GPU budget matches
    t2v because the underlying inference cost is similar.

    A missing video or an empty instruction returns a status message
    without running inference. Otherwise the request runs as a
    "video_edit" task and returns (media_path, text_result, status_md, logs)
    with the edited clip path in the first slot.
    """
    prompt = (prompt or "").strip()
    if not input_video:
        return None, "", "Please upload a video.", ""
    if not prompt:
        return None, "", "Please enter an edit instruction.", ""

    payload = {
        "0001": {
            "interleave_array": [prompt, input_video, input_video],
            "element_dtype_array": ["text", "video", "video"],
            # Only the last slot is a generation target.
            "istarget_in_interleave": [0, 0, 1],
        }
    }
    return _run_lance_task(
        task_name="video_edit",
        payload=payload,
        height=int(height),
        width=int(width),
        num_frames=int(num_frames),
        seed=int(seed),
        resolution=resolution,
        validation_num_timesteps=int(validation_num_timesteps),
        validation_timestep_shift=float(validation_timestep_shift),
        cfg_text_scale=float(cfg_text_scale),
        output_kind="video",
    )
| # ============================================================================= | |
| # Gradio UI | |
| # ============================================================================= | |
# One tab per task. Layout follows the same column structure as the
# upstream demo so users familiar with their reference UI feel at home.
# Root container of the app; every tab is declared inside this context.
with gr.Blocks(title="nifty-lab") as demo:
    # Intro text. The task list mirrors the tabs declared in this file, so
    # keep the two in sync when a tab is added or removed.
    gr.Markdown(
        """
        # nifty-lab

        A multimodal playground built on ByteDance's
        [Lance](https://github.com/bytedance/Lance) model, served on
        Hugging Face ZeroGPU. By [Igor Lima](https://github.com/IgorCSIS).

        Tasks wired: text-to-video, text-to-image, video understanding,
        image Q&A, image edit, and video edit.

        First request after the Space wakes from idle takes about a minute
        to warm the model. Subsequent requests are fast.
        """
    )
    # ---- Tab: Text to Video ------------------------------------------------
    # Left column collects the generation settings, right column shows the
    # clip, a status line and the run log.
    with gr.Tab("Text to Video"):
        with gr.Row():
            with gr.Column(scale=1):
                t2v_prompt = gr.Textbox(
                    label="Prompt",
                    lines=5,
                    placeholder="Describe the video you want to generate...",
                )
                with gr.Row():
                    # Defaults at 480x848 give Lance enough pixels to make
                    # something coherent. Lower if you want faster gens.
                    t2v_height = gr.Slider(192, 1024, value=480, step=16, label="Height")
                    t2v_width = gr.Slider(192, 1024, value=848, step=16, label="Width")
                # 50 frames is ~2s at 25fps, fits comfortably in 240s GPU budget.
                t2v_num_frames = gr.Slider(
                    1, 121, value=50, step=1, label="Frames",
                    info="50 frames is roughly 2 seconds. 121 is the model max.",
                )
                t2v_resolution = gr.Dropdown(
                    label="Resolution preset",
                    choices=VIDEO_RESOLUTION_CHOICES,
                    value="video_480p",
                )
                t2v_seed = gr.Number(
                    label="Seed",
                    value=DEFAULT_BASIC_SEED,
                    precision=0,
                    info="-1 picks a fresh random seed each run.",
                )
                # Sampler knobs most users never touch; collapsed by default.
                with gr.Accordion("Advanced", open=False):
                    t2v_timesteps = gr.Slider(1, 100, value=DEFAULT_TIMESTEPS, step=1, label="Denoising steps")
                    t2v_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
                    t2v_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
                t2v_run = gr.Button("Generate Video", variant="primary")
            with gr.Column(scale=1):
                t2v_output = gr.Video(label="Result")
                t2v_status = gr.Markdown("Idle.")
                t2v_logs = gr.Textbox(label="Run log", lines=12, max_lines=30)
        # The pipeline returns a 4-tuple of (video_path, text, status, logs).
        # Text result is unused for t2v but we still receive it, so we wire
        # it to a hidden state to consume the value.
        t2v_unused_text = gr.State("")
        # The inputs list must stay in run_text_to_video's parameter order.
        t2v_run.click(
            fn=run_text_to_video,
            inputs=[
                t2v_prompt, t2v_seed, t2v_resolution, t2v_num_frames,
                t2v_height, t2v_width, t2v_timesteps, t2v_shift, t2v_cfg,
            ],
            outputs=[t2v_output, t2v_unused_text, t2v_status, t2v_logs],
        )
    # ---- Tab: Text to Image -----------------------------------------------
    # Same two-column layout as the video tab, without the frame count and
    # resolution preset controls.
    with gr.Tab("Text to Image"):
        with gr.Row():
            with gr.Column(scale=1):
                t2i_prompt = gr.Textbox(
                    label="Prompt",
                    lines=5,
                    placeholder="A red panda walking through a snowy forest at dusk...",
                )
                with gr.Row():
                    t2i_height = gr.Slider(256, 1024, value=768, step=16, label="Height")
                    t2i_width = gr.Slider(256, 1024, value=768, step=16, label="Width")
                t2i_seed = gr.Number(
                    label="Seed",
                    value=DEFAULT_BASIC_SEED,
                    precision=0,
                )
                # Sampler knobs; collapsed by default.
                with gr.Accordion("Advanced", open=False):
                    t2i_timesteps = gr.Slider(1, 100, value=30, step=1, label="Denoising steps")
                    t2i_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
                    t2i_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
                t2i_run = gr.Button("Generate Image", variant="primary")
            with gr.Column(scale=1):
                t2i_output = gr.Image(label="Result")
                t2i_status = gr.Markdown("Idle.")
                t2i_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)
        # Sink for the unused text slot of the handler's 4-tuple.
        t2i_unused_text = gr.State("")
        # The inputs list must stay in run_text_to_image's parameter order.
        t2i_run.click(
            fn=run_text_to_image,
            inputs=[
                t2i_prompt, t2i_seed, t2i_height, t2i_width,
                t2i_timesteps, t2i_shift, t2i_cfg,
            ],
            outputs=[t2i_output, t2i_unused_text, t2i_status, t2i_logs],
        )
    # ---- Tab: Video Understanding -----------------------------------------
    # Upload + question on the left, the answer on the right.
    with gr.Tab("Video Understanding"):
        with gr.Row():
            with gr.Column(scale=1):
                v2t_input = gr.Video(label="Upload a video")
                v2t_question = gr.Textbox(
                    label="Question",
                    lines=3,
                    placeholder="What is happening in this video?",
                )
                v2t_seed = gr.Number(
                    label="Seed",
                    value=DEFAULT_BASIC_SEED,
                    precision=0,
                )
                v2t_run = gr.Button("Ask", variant="primary")
            with gr.Column(scale=1):
                v2t_output = gr.Textbox(label="Answer", lines=8)
                v2t_status = gr.Markdown("Idle.")
                v2t_logs = gr.Textbox(label="Run log", lines=12, max_lines=30)
        # Video understanding returns (None_for_video, text_answer, status, logs).
        # Discard the video slot; surface the text.
        v2t_unused_video = gr.State(None)
        # The inputs list must stay in run_video_understanding's parameter order.
        v2t_run.click(
            fn=run_video_understanding,
            inputs=[v2t_input, v2t_question, v2t_seed],
            outputs=[v2t_unused_video, v2t_output, v2t_status, v2t_logs],
        )
    # ---- Tab: Image Understanding -----------------------------------------
    # Mirrors the Video Understanding tab for a single image.
    with gr.Tab("Image Q&A"):
        with gr.Row():
            with gr.Column(scale=1):
                # type="filepath" hands the handler a path string, which
                # run_image_understanding embeds directly in the payload.
                i2t_input = gr.Image(label="Upload an image", type="filepath")
                i2t_question = gr.Textbox(
                    label="Question",
                    lines=3,
                    placeholder="What is happening in this image?",
                )
                i2t_seed = gr.Number(label="Seed", value=DEFAULT_BASIC_SEED, precision=0)
                i2t_run = gr.Button("Ask", variant="primary")
            with gr.Column(scale=1):
                i2t_output = gr.Textbox(label="Answer", lines=8)
                i2t_status = gr.Markdown("Idle.")
                i2t_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)
        # Sink for the unused media slot of the handler's 4-tuple.
        i2t_unused = gr.State(None)
        # The inputs list must stay in run_image_understanding's parameter order.
        i2t_run.click(
            fn=run_image_understanding,
            inputs=[i2t_input, i2t_question, i2t_seed],
            outputs=[i2t_unused, i2t_output, i2t_status, i2t_logs],
        )
    # ---- Tab: Image Edit --------------------------------------------------
    # Source image + instruction on the left, edited image on the right.
    with gr.Tab("Image Edit"):
        with gr.Row():
            with gr.Column(scale=1):
                # type="filepath" hands the handler a path string, which
                # run_image_edit embeds directly in the payload.
                ie_input = gr.Image(label="Source image", type="filepath")
                ie_prompt = gr.Textbox(
                    label="Edit instruction",
                    lines=3,
                    placeholder="Add a pearl necklace; convert to watercolor; etc.",
                )
                with gr.Row():
                    ie_height = gr.Slider(256, 1024, value=768, step=16, label="Height")
                    ie_width = gr.Slider(256, 1024, value=768, step=16, label="Width")
                ie_seed = gr.Number(label="Seed", value=DEFAULT_BASIC_SEED, precision=0)
                # Sampler knobs; collapsed by default.
                with gr.Accordion("Advanced", open=False):
                    ie_timesteps = gr.Slider(1, 100, value=30, step=1, label="Denoising steps")
                    ie_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
                    ie_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
                ie_run = gr.Button("Edit Image", variant="primary")
            with gr.Column(scale=1):
                ie_output = gr.Image(label="Result")
                ie_status = gr.Markdown("Idle.")
                ie_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)
        # Sink for the unused text slot of the handler's 4-tuple.
        ie_unused = gr.State("")
        # The inputs list must stay in run_image_edit's parameter order.
        ie_run.click(
            fn=run_image_edit,
            inputs=[
                ie_input, ie_prompt, ie_seed,
                ie_height, ie_width,
                ie_timesteps, ie_shift, ie_cfg,
            ],
            outputs=[ie_output, ie_unused, ie_status, ie_logs],
        )
    # ---- Tab: Video Edit --------------------------------------------------
    # Same controls as Text to Video plus a source clip to edit.
    with gr.Tab("Video Edit"):
        with gr.Row():
            with gr.Column(scale=1):
                ve_input = gr.Video(label="Source video")
                ve_prompt = gr.Textbox(
                    label="Edit instruction",
                    lines=3,
                    placeholder="Change the background; restyle as anime; etc.",
                )
                with gr.Row():
                    ve_height = gr.Slider(192, 1024, value=480, step=16, label="Height")
                    ve_width = gr.Slider(192, 1024, value=848, step=16, label="Width")
                ve_num_frames = gr.Slider(1, 121, value=50, step=1, label="Frames")
                ve_resolution = gr.Dropdown(
                    label="Resolution preset",
                    choices=VIDEO_RESOLUTION_CHOICES,
                    value="video_480p",
                )
                ve_seed = gr.Number(label="Seed", value=DEFAULT_BASIC_SEED, precision=0)
                # Sampler knobs; collapsed by default.
                with gr.Accordion("Advanced", open=False):
                    ve_timesteps = gr.Slider(1, 100, value=DEFAULT_TIMESTEPS, step=1, label="Denoising steps")
                    ve_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
                    ve_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
                ve_run = gr.Button("Edit Video", variant="primary")
            with gr.Column(scale=1):
                ve_output = gr.Video(label="Result")
                ve_status = gr.Markdown("Idle.")
                ve_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)
        # Sink for the unused text slot of the handler's 4-tuple.
        ve_unused = gr.State("")
        # The inputs list must stay in run_video_edit's parameter order.
        ve_run.click(
            fn=run_video_edit,
            inputs=[
                ve_input, ve_prompt, ve_seed,
                ve_height, ve_width, ve_num_frames, ve_resolution,
                ve_timesteps, ve_shift, ve_cfg,
            ],
            outputs=[ve_output, ve_unused, ve_status, ve_logs],
        )
if __name__ == "__main__":
    # Cap the request queue at 8 pending jobs before serving.
    queued_demo = demo.queue(max_size=8)

    # ssr_mode is switched off explicitly: SSR (the Gradio 6 default) breaks
    # @gradio/client 1.8, whose requests vanish silently because it expects
    # the old non-SSR endpoint shape.
    # The theme is passed here because Gradio 6 moved it to launch().
    queued_demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,
        theme=gr.themes.Soft(),
    )