# Copyright 2026 Igor Lima. Licensed under Apache 2.0 (see LICENSE_LANCE for the # upstream Lance license; this file is original work by Igor Lima). # # nifty-lab: ZeroGPU adapter for ByteDance's Lance unified multimodal model. # # WHAT THIS IS # ------------ # A small Gradio app that runs on Hugging Face Spaces with the ZeroGPU runtime. # All of the heavy model-loading and inference logic lives in # `lance_gradio_t2v_v2t.py`, which is a verbatim copy of ByteDance's reference # Gradio script. This file wraps their `LanceT2VV2TPipeline` class for # ZeroGPU's on-demand GPU lifecycle. # # WHY WE DIDN'T JUST RUN THEIR SCRIPT AS-IS # ----------------------------------------- # Their script assumes a long-running dedicated GPU. They wrap inference in a # `PipelinePool` that holds the GPU across requests via threads. ZeroGPU works # on a per-request claim model: you request a GPU when you call a function # decorated with `@spaces.GPU`, and the GPU is released when the function # returns. No persistent pool. So we instantiate ONE pipeline, decorate the # inference entry points, and trust ZeroGPU to schedule the GPU for us. # # STAGE 1 SCOPE # ------------- # Only the two tasks their reference script supports out of the box: # 1. text-to-video (TASK_T2V) # 2. video understanding / Q&A (TASK_X2T_VIDEO) # # Stage 2 will add t2i, image_edit, video_edit, and x2t_image as new tabs. # Those tasks need a separate model variant (Lance_3B for image work) and # additional plumbing in the pipeline, so we ship stage 1 first to prove the # ZeroGPU pattern works end-to-end before expanding. from __future__ import annotations # ----------------------------------------------------------------------------- # CUDA runtime preload. MUST run before any transformers import. 
# -----------------------------------------------------------------------------
# Why this exists: recent transformers versions eagerly import flash_attn
# inside modeling_utils.py, which dynamically links against libcudart.so.12
# at module load. On ZeroGPU, the GPU (and thus its CUDA libs) is only
# attached during @spaces.GPU calls, so libcudart isn't on the linker
# path at boot. Importing torch first doesn't help because torch only
# preloads CUDA when it detects a live GPU device.
#
# We work around it by hunting for libcudart.so.12 in the pip-installed
# nvidia and torch wheels and ctypes-loading it with RTLD_GLOBAL so the
# symbols are visible to anyone who dlopens later.
import ctypes
import glob
import os
import sys

# Wheel-relative directories that can contain libcudart, in priority order.
_CUDART_WHEEL_DIRS = (
    os.path.join("nvidia", "cuda_runtime", "lib"),  # nvidia-cuda-runtime-cu12 pip package
    os.path.join("torch", "lib"),  # torch's bundled CUDA libs, as a fallback
)
_CUDART_GLOB = "libcudart.so.12*"
# Location this file originally hard-coded; kept as a last-resort search root
# in case that directory is not on sys.path.
_LEGACY_SITE_PACKAGES_GLOB = "/usr/local/lib/python*/site-packages"


def _find_cudart_candidates() -> list[str]:
    """Return libcudart.so.12 paths to try, in priority order, without duplicates.

    Every string entry of ``sys.path`` is searched first (so virtualenv and
    user-site installs are found), followed by the legacy hard-coded
    ``/usr/local/lib/python*/site-packages`` glob. All nvidia-wheel hits are
    listed before any torch-wheel hit.
    """
    # glob.escape keeps special characters in a real directory name from being
    # interpreted as wildcards; an empty sys.path entry means the current dir.
    roots = [glob.escape(entry or os.curdir) for entry in sys.path if isinstance(entry, str)]
    roots.append(_LEGACY_SITE_PACKAGES_GLOB)

    candidates: list[str] = []
    seen: set[str] = set()
    for wheel_dir in _CUDART_WHEEL_DIRS:
        for root in roots:
            pattern = os.path.join(root, wheel_dir, _CUDART_GLOB)
            # sorted() makes the try-order deterministic across filesystems.
            for path in sorted(glob.glob(pattern)):
                if path not in seen:
                    seen.add(path)
                    candidates.append(path)
    return candidates


def _preload_cuda_runtime() -> None:
    """Find libcudart.so.12 in pip-installed nvidia/torch wheels and preload it.

    The first candidate that loads wins; it is opened with ``RTLD_GLOBAL`` so
    its symbols are visible to libraries loaded afterwards. Load failures are
    logged and the next candidate is tried. If nothing loads, a warning is
    printed and the function returns normally (best effort, never raises).
    """
    for path in _find_cudart_candidates():
        try:
            ctypes.CDLL(path, mode=ctypes.RTLD_GLOBAL)
        except OSError as exc:
            print(f"[boot] failed to preload {path}: {exc}", flush=True)
        else:
            print(f"[boot] preloaded {path}", flush=True)
            return
    print("[boot] WARNING: no libcudart.so.12 found to preload, flash_attn import will fail", flush=True)


_preload_cuda_runtime()

# Now the rest of the imports are safe.
from pathlib import Path
from typing import Optional

import torch  # noqa: F401  imported early so its CUDA env init runs before lance code
import gradio as gr
import spaces  # Hugging Face ZeroGPU runtime decorator
from huggingface_hub import snapshot_download

# =============================================================================
# Model weight download
# =============================================================================
# We pull Lance_3B_Video at module-load time (not on first request) for two
# reasons:
#   1. It only needs CPU + network, which we have at Space startup. ZeroGPU
#      does NOT give us a GPU at module load.
#   2. The download is ~6GB and would otherwise consume most of the 300s GPU
#      budget on the first user request.
#
# After the first boot, Hugging Face Spaces caches the files in the Space's
# persistent storage, so subsequent boots skip re-downloading.
REPO_ID = "bytedance-research/Lance"
WEIGHTS_ROOT = Path("downloads")


def fetch_weights() -> None:
    """
    Pull the weights Lance needs at runtime from the Hugging Face Hub.

    Three artifacts are requested and checked:
      - Lance_3B_Video/    The main multimodal model checkpoint.
                           NOTE(review): size estimates in this file disagree
                           (~6GB in the module comment, ~28GB previously here);
                           confirm against the repo.
      - Qwen2.5-VL-ViT/    The vision transformer Lance imports from Qwen2.5-VL.
                           Lance's code hardcodes this path and expects
                           `config.json` + `vit.safetensors` inside it.
      - Wan2.2_VAE.pth     The video VAE checkpoint at the top of the repo.

    Everything is fetched with one snapshot_download call into WEIGHTS_ROOT.
    Patterns that match nothing in the repo are simply not downloaded, so a
    missing artifact does not raise here; the checks below only log a warning
    and the real failure surfaces later when the pipeline's initialize() runs.
    """
    print(f"[boot] downloading {REPO_ID} (Lance_3B_Video + Qwen2.5-VL-ViT + Wan2.2 VAE)...", flush=True)
    # No `resume_download=` here: huggingface_hub deprecated that argument
    # (downloads always resume when possible), so passing it only risks a
    # warning or a TypeError on newer releases.
    snapshot_download(
        repo_id=REPO_ID,
        local_dir=str(WEIGHTS_ROOT),
        allow_patterns=[
            "Lance_3B_Video/*",
            "Lance_3B_Video/**/*",
            "Qwen2.5-VL-ViT/*",
            "Qwen2.5-VL-ViT/**/*",
            # Wan2.2 video VAE checkpoint, used by WanVideoVAE during pipeline
            # initialize(). Lives at the top of the repo (no subdir).
            "Wan2.2*",
            # Top-level config.json (901 bytes). Tiny, defensively included
            # in case Lance's code reads it during init.
            "config.json",
        ],
    )

    # Sanity check: log what showed up. If a required artifact is missing,
    # the next runtime error will tell us exactly what to add to allow_patterns.
    vit_dir = WEIGHTS_ROOT / "Qwen2.5-VL-ViT"
    if vit_dir.exists():
        print(f"[boot] Qwen2.5-VL-ViT/ landed with {len(list(vit_dir.iterdir()))} files", flush=True)
    else:
        print("[boot] WARNING: Qwen2.5-VL-ViT/ NOT in the Lance repo, model init will fail", flush=True)

    wan_path = WEIGHTS_ROOT / "Wan2.2_VAE.pth"
    if wan_path.exists():
        size_mb = wan_path.stat().st_size / (1024 * 1024)
        print(f"[boot] Wan2.2_VAE.pth landed at {size_mb:.0f} MB", flush=True)
    else:
        print("[boot] WARNING: Wan2.2_VAE.pth NOT in the Lance repo, VAE init will fail", flush=True)

    print("[boot] weight download complete.", flush=True)


# Run download at import time. On HF Spaces this happens once when the Space
# starts up; the user sees "Starting..." in the dashboard while it runs.
fetch_weights()


# =============================================================================
# Pipeline import (must come AFTER fetch_weights so the import-time checks
# in their script find the downloaded files where they expect them)
# =============================================================================
from lance_gradio_t2v_v2t import (
    LanceT2VV2TPipeline,
    TASK_T2V,
    TASK_X2T_VIDEO,
    DEFAULT_HEIGHT,
    DEFAULT_WIDTH,
    DEFAULT_NUM_FRAMES,
    DEFAULT_TIMESTEPS,
    DEFAULT_TIMESTEP_SHIFT,
    DEFAULT_CFG_TEXT_SCALE,
    DEFAULT_RESOLUTION,
    DEFAULT_BASIC_SEED,
    VIDEO_RESOLUTION_CHOICES,
)

# A single pipeline instance. ZeroGPU only ever hands us one GPU at a time,
# so the PipelinePool from the upstream script (which manages multiple GPUs
# via threads) isn't relevant here. We instantiate now but defer actual
# model loading: `LanceT2VV2TPipeline.generate()` calls `self.initialize()`
# lazily on its first invocation, so the model loads the first time a user
# hits the GPU.
# After that, subsequent calls reuse the loaded model
# (as long as the Space hasn't gone cold).
PIPELINE = LanceT2VV2TPipeline(device_id=0)

# File patterns searched (in order) for each media output kind; "text" results
# are read through the upstream extract_text_result helper instead.
_MEDIA_GLOBS = {
    "image": ("*.png", "*.jpg"),
    "video": ("*.mp4",),
}
_OUTPUT_KINDS = (*_MEDIA_GLOBS, "text")


def _or_default(value, default):
    """Return `default` when `value` is None, otherwise `value` unchanged.

    Used on values coming from gr.Number fields before int()/float()
    conversion, since int(None) / float(None) raise TypeError.
    NOTE(review): assumes a cleared gr.Number is delivered as None; confirm
    against the Gradio version in use.
    """
    return default if value is None else value


def _edit_payload(prompt: str, source_path: str, dtype: str) -> dict:
    """Build the prompt-JSON payload shared by image_edit and video_edit.

    Text prompt first, then the source media twice: Lance's pipeline expects
    source AND target slots, and the last slot (flagged 1 in
    `istarget_in_interleave`) is the one the model fills in.
    """
    return {
        "0001": {
            "interleave_array": [prompt, source_path, source_path],
            "element_dtype_array": ["text", dtype, dtype],
            "istarget_in_interleave": [0, 0, 1],
        }
    }


def _collect_task_result(save_dir, output_kind: str, elapsed: float):
    """Turn the contents of `save_dir` into the (video_path, text, status, logs) 4-tuple.

    For "image"/"video" the first matching file (sorted per pattern) is
    returned in slot 0; for "text" the extracted answer is returned in slot 1.
    A missing result or an unknown kind yields a status message instead of
    raising. The logs slot is always an empty string.
    """
    done = f"Done in {elapsed:.1f}s"

    if output_kind == "text":
        from lance_gradio_t2v_v2t import extract_text_result as _extract_text_result

        text = _extract_text_result(save_dir)
        if not text:
            return None, "", f"Inference completed but no text result in {save_dir}", ""
        return None, text, done, ""

    patterns = _MEDIA_GLOBS.get(output_kind)
    if patterns is None:
        return None, "", f"Unknown output_kind: {output_kind}", ""

    files = [path for pattern in patterns for path in sorted(save_dir.glob(pattern))]
    if not files:
        return None, "", f"Inference completed but no {output_kind} in {save_dir}", ""
    return str(files[0]), "", done, ""


# =============================================================================
# ZeroGPU-decorated entry points
# =============================================================================
# Two entry points instead of one routed by `task` because each has a
# different duration budget. Text-to-video can run close to the 300s cap on
# 50 frames at 480p. Video understanding finishes in 20-60s. Declaring
# tighter durations lets ZeroGPU schedule short tasks more aggressively and
# means the user's daily quota covers more requests.


@spaces.GPU(duration=240)
def run_text_to_video(
    prompt: str,
    seed: int,
    resolution: str,
    num_frames: int,
    height: int,
    width: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    Generate a short video from a text prompt.

    Returns a 4-tuple matching the upstream pipeline shape:
        (video_path, text_result, status_markdown, run_logs)
    For t2v, video_path is the produced clip, text_result is empty. A blank
    prompt returns a status message without running the pipeline, matching
    the text-to-image path. Empty seed / shift / CFG fields fall back to the
    upstream defaults instead of crashing on int(None) / float(None).

    Duration is set to 240s to allow proper-quality generation at 480p with
    50 frames and 30 denoising steps. Each call costs ~3-4 minutes of the
    signed-in user's daily ZeroGPU budget (25 min/day on HF Pro), so a Pro
    user gets ~6-8 high-quality clips per day. ZeroGPU's hard per-call cap
    is 300s; we leave 60s of headroom under that.
    """
    if not (prompt or "").strip():
        return None, "", "Please enter a prompt.", ""
    return PIPELINE.generate(
        task=TASK_T2V,
        prompt=prompt,
        input_video=None,
        question="",
        height=int(height),
        width=int(width),
        num_frames=int(num_frames),
        seed=int(_or_default(seed, DEFAULT_BASIC_SEED)),
        resolution=resolution,
        validation_num_timesteps=int(validation_num_timesteps),
        validation_timestep_shift=float(_or_default(validation_timestep_shift, DEFAULT_TIMESTEP_SHIFT)),
        cfg_text_scale=float(_or_default(cfg_text_scale, DEFAULT_CFG_TEXT_SCALE)),
    )


def _run_lance_task(
    task_name: str,
    payload: dict,
    height: int,
    width: int,
    num_frames: int,
    seed: int,
    resolution: str,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
    output_kind: str,
):
    """
    Generic Lance task runner. Builds inference args, runs the model,
    extracts the result.

    Args:
        task_name: one of "t2i", "t2v", "x2t_image", "x2t_video",
            "image_edit", "video_edit". This drives Lance's task-specific
            dataset routing inside ValidationDataset.
        payload: the dict that gets written to the prompt JSON file. Shape
            varies per task; build it in the task-specific wrapper using the
            patterns from config/examples/*_example.json.
        output_kind: "image", "video", or "text". Decides which kind of file
            we glob for at the end and how we report status. Any other value
            is rejected up front, before the model is touched.
        height, width, num_frames, seed, resolution,
        validation_num_timesteps, validation_timestep_shift, cfg_text_scale:
            per-request overrides copied onto deep copies of the pipeline's
            base argument objects.

    Returns:
        The (video_path, text_result, status_md, logs) 4-tuple the frontend
        handlers expect. Exceptions raised by the model propagate to the
        caller; GPU memory cleanup still runs in that case.

    We sidestep the upstream `LanceT2VV2TPipeline.generate()` because it
    hardcodes t2v + x2t_video. Everything else here mirrors that method's
    structure: clone the base args, override the request-specific bits, call
    _build_request_batch, then validate_on_fixed_batch.
    """
    # Fail fast: an unsupported kind used to be reported only after the whole
    # (expensive) inference run had finished.
    if output_kind not in _OUTPUT_KINDS:
        return None, "", f"Unknown output_kind: {output_kind}", ""

    import json as _json
    import time as _time
    from copy import deepcopy as _deepcopy
    from datetime import datetime as _datetime

    import torch as _torch
    from inference_lance import (
        validate_on_fixed_batch as _validate_on_fixed_batch,
        save_prompt_results as _save_prompt_results,
        clean_memory as _clean_memory,
    )
    from lance_gradio_t2v_v2t import (
        TEXT_TEMPLATE as _TEXT_TEMPLATE,
        TMP_INPUT_DIR as _TMP_INPUT_DIR,
        RESULTS_ROOT as _RESULTS_ROOT,
        ensure_dirs as _ensure_dirs,
    )

    PIPELINE.initialize()
    _ensure_dirs()

    timestamp = _datetime.now().strftime("%Y%m%d_%H%M%S_%f")

    # Pretty-print JSON so the dataset loader's _read_jsonl line-by-line
    # parse FAILS and falls through to the json.load + dict-transform path
    # that produces the {"data": ..., "index": ...} records the samplers
    # expect. See validation_dataset.py ~line 84.
    prompt_file = _TMP_INPUT_DIR / f"{task_name}_{timestamp}.json"
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be pinned; the locale default may not be able to represent them.
    prompt_file.write_text(
        _json.dumps(payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    save_dir = _RESULTS_ROOT / f"{task_name}_{timestamp}"
    save_dir.mkdir(parents=True, exist_ok=True)

    request_model_args = _deepcopy(PIPELINE.base_model_args)
    request_model_args.cfg_text_scale = float(cfg_text_scale)

    request_data_args = _deepcopy(PIPELINE.base_data_args)
    request_data_args.val_dataset_config_file = str(prompt_file)

    request_inference_args = _deepcopy(PIPELINE.base_inference_args)
    request_inference_args.validation_num_timesteps = int(validation_num_timesteps)
    request_inference_args.validation_timestep_shift = float(validation_timestep_shift)
    request_inference_args.validation_data_seed = int(seed)
    request_inference_args.validation_noise_seed = int(seed)
    request_inference_args.video_height = int(height)
    request_inference_args.video_width = int(width)
    request_inference_args.num_frames = int(num_frames)
    request_inference_args.resolution = resolution
    request_inference_args.save_path_gen = str(save_dir)
    request_inference_args.task = task_name
    request_inference_args.text_template = _TEXT_TEMPLATE
    request_inference_args.prompt_data_dict = {}

    val_data_cpu = PIPELINE._build_request_batch(
        prompt_file=prompt_file,
        model_args=request_model_args,
        data_args=request_data_args,
        inference_args=request_inference_args,
    )

    print(
        f"[app] {task_name} start | size={height}x{width} | "
        f"frames={num_frames} | steps={validation_num_timesteps}",
        flush=True,
    )

    start = _time.perf_counter()
    with PIPELINE._generate_lock:
        try:
            _torch.cuda.set_device(PIPELINE.device)
            _validate_on_fixed_batch(
                fsdp_model=PIPELINE.model,
                vae_model=PIPELINE.vae_model,
                tokenizer=PIPELINE.tokenizer,
                val_data_cpu=val_data_cpu,
                training_args=request_inference_args,
                model_args=request_model_args,
                inference_args=request_inference_args,
                new_token_ids=PIPELINE.new_token_ids,
                image_token_id=PIPELINE.image_token_id,
                device=PIPELINE.device,
                save_source_video=False,
                save_path_gen=str(save_dir),
                save_path_gt="",
            )
            _save_prompt_results(
                request_inference_args.prompt_data_dict,
                str(save_dir),
                PIPELINE.logger,
            )
        finally:
            # Run cleanup even when inference raises, so a failed request
            # does not skip it for the next one.
            _clean_memory()
    elapsed = _time.perf_counter() - start

    # Result extraction by output kind. The output 4-tuple matches the
    # shape the upstream pipeline returns so the frontend handlers stay
    # uniform: (video_path, text_result, status_md, logs).
    return _collect_task_result(save_dir, output_kind, elapsed)


def _image_inference(
    prompt: str,
    height: int,
    width: int,
    seed: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """Text-to-image. Thin wrapper around _run_lance_task.

    Strips the prompt and returns a "Please enter a prompt." status tuple
    when it is empty; otherwise runs task "t2i" with a single-frame request
    at the "image_768res" preset and expects an image output.
    """
    prompt = (prompt or "").strip()
    if not prompt:
        return None, "", "Please enter a prompt.", ""
    return _run_lance_task(
        task_name="t2i",
        payload={"000000.png": prompt},
        height=int(height),
        width=int(width),
        num_frames=1,
        seed=int(seed),
        resolution="image_768res",
        validation_num_timesteps=int(validation_num_timesteps),
        validation_timestep_shift=float(validation_timestep_shift),
        cfg_text_scale=float(cfg_text_scale),
        output_kind="image",
    )


@spaces.GPU(duration=90)
def run_text_to_image(
    prompt: str,
    seed: int,
    height: int,
    width: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    ZeroGPU wrapper for text-to-image. 90s budget; image gen with Lance at
    768res / 30 steps lands in 20 to 40 seconds on H200.

    Empty seed / shift / CFG fields fall back to the upstream defaults.
    Returns the standard (image_path, text, status, logs) 4-tuple.
    """
    return _image_inference(
        prompt=prompt,
        height=int(height),
        width=int(width),
        seed=int(_or_default(seed, DEFAULT_BASIC_SEED)),
        validation_num_timesteps=int(validation_num_timesteps),
        validation_timestep_shift=float(_or_default(validation_timestep_shift, DEFAULT_TIMESTEP_SHIFT)),
        cfg_text_scale=float(_or_default(cfg_text_scale, DEFAULT_CFG_TEXT_SCALE)),
    )


@spaces.GPU(duration=60)
def run_video_understanding(
    input_video: str,
    question: str,
    seed: int,
):
    """
    Answer a question about an uploaded video clip.

    Returns the same 4-tuple shape. For understanding, video_path is empty
    and text_result holds the model's answer. A missing upload returns a
    status message without running the pipeline, like the other media tabs.

    Most of the pipeline.generate() arguments are unused for understanding
    tasks, but the signature requires all of them, so we pass defaults.
    """
    if not input_video:
        return None, "", "Please upload a video.", ""
    return PIPELINE.generate(
        task=TASK_X2T_VIDEO,
        prompt="",
        input_video=input_video,
        question=question,
        height=DEFAULT_HEIGHT,
        width=DEFAULT_WIDTH,
        num_frames=DEFAULT_NUM_FRAMES,
        seed=int(_or_default(seed, DEFAULT_BASIC_SEED)),
        resolution=DEFAULT_RESOLUTION,
        validation_num_timesteps=DEFAULT_TIMESTEPS,
        validation_timestep_shift=DEFAULT_TIMESTEP_SHIFT,
        cfg_text_scale=DEFAULT_CFG_TEXT_SCALE,
    )


@spaces.GPU(duration=60)
def run_image_understanding(
    input_image: str,
    question: str,
    seed: int,
):
    """
    Answer a question about an uploaded image.

    Same shape as run_video_understanding but operates on a single image.
    The interleave_array payload format comes from Lance's
    config/examples/x2t_image_example.json. Missing image or blank question
    returns a status message without running the model.
    """
    question = (question or "").strip()
    if not input_image:
        return None, "", "Please upload an image.", ""
    if not question:
        return None, "", "Please enter a question.", ""

    payload = {
        "0001": {
            "interleave_array": [
                input_image,
                ["Look at the image carefully and answer the question.", question, ""],
            ],
            "element_dtype_array": ["image", "text"],
            "istarget_in_interleave": [0, 1],
        }
    }
    return _run_lance_task(
        task_name="x2t_image",
        payload=payload,
        height=DEFAULT_HEIGHT,
        width=DEFAULT_WIDTH,
        num_frames=1,
        seed=int(_or_default(seed, DEFAULT_BASIC_SEED)),
        resolution="image_768res",
        validation_num_timesteps=DEFAULT_TIMESTEPS,
        validation_timestep_shift=DEFAULT_TIMESTEP_SHIFT,
        cfg_text_scale=DEFAULT_CFG_TEXT_SCALE,
        output_kind="text",
    )


@spaces.GPU(duration=120)
def run_image_edit(
    input_image: str,
    prompt: str,
    seed: int,
    height: int,
    width: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    Edit an input image with a text instruction.

    Payload format from Lance's config/examples/image_edit_example.json:
    text prompt first, then the source image twice (Lance's pipeline expects
    source AND target slots; the second slot is what the model fills in).
    Missing image or blank instruction returns a status message without
    running the model.
    """
    prompt = (prompt or "").strip()
    if not input_image:
        return None, "", "Please upload an image.", ""
    if not prompt:
        return None, "", "Please enter an edit instruction.", ""

    return _run_lance_task(
        task_name="image_edit",
        payload=_edit_payload(prompt, input_image, "image"),
        height=int(height),
        width=int(width),
        num_frames=1,
        seed=int(_or_default(seed, DEFAULT_BASIC_SEED)),
        resolution="image_768res",
        validation_num_timesteps=int(validation_num_timesteps),
        validation_timestep_shift=float(_or_default(validation_timestep_shift, DEFAULT_TIMESTEP_SHIFT)),
        cfg_text_scale=float(_or_default(cfg_text_scale, DEFAULT_CFG_TEXT_SCALE)),
        output_kind="image",
    )


@spaces.GPU(duration=240)
def run_video_edit(
    input_video: str,
    prompt: str,
    seed: int,
    height: int,
    width: int,
    num_frames: int,
    resolution: str,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    Edit an input video with a text instruction.

    Same shape as image_edit but with video element dtypes. 240s GPU budget
    matches t2v because the underlying inference cost is similar. Missing
    video or blank instruction returns a status message without running the
    model.
    """
    prompt = (prompt or "").strip()
    if not input_video:
        return None, "", "Please upload a video.", ""
    if not prompt:
        return None, "", "Please enter an edit instruction.", ""

    return _run_lance_task(
        task_name="video_edit",
        payload=_edit_payload(prompt, input_video, "video"),
        height=int(height),
        width=int(width),
        num_frames=int(num_frames),
        seed=int(_or_default(seed, DEFAULT_BASIC_SEED)),
        resolution=resolution,
        validation_num_timesteps=int(validation_num_timesteps),
        validation_timestep_shift=float(_or_default(validation_timestep_shift, DEFAULT_TIMESTEP_SHIFT)),
        cfg_text_scale=float(_or_default(cfg_text_scale, DEFAULT_CFG_TEXT_SCALE)),
        output_kind="video",
    )


# =============================================================================
# Gradio UI
# =============================================================================
# One tab per task.
# Layout follows the same column structure as the
# upstream demo so users familiar with their reference UI feel at home.
# Every handler returns the 4-tuple (media_path, text, status, logs); each tab
# routes the slot it does not display into a throwaway gr.State.
with gr.Blocks(title="nifty-lab") as demo:
    # The task list below must match the tabs actually defined in this block.
    gr.Markdown(
        """
        # nifty-lab

        A multimodal playground built on ByteDance's [Lance](https://github.com/bytedance/Lance) model,
        served on Hugging Face ZeroGPU. By [Igor Lima](https://github.com/IgorCSIS).

        Tasks wired: text-to-video, text-to-image, video understanding,
        image Q&A, image edit, and video edit.

        First request after the Space wakes from idle takes about a minute to
        warm the model. Subsequent requests are fast.
        """
    )

    # ---- Tab: Text to Video ------------------------------------------------
    with gr.Tab("Text to Video"):
        with gr.Row():
            with gr.Column(scale=1):
                t2v_prompt = gr.Textbox(
                    label="Prompt",
                    lines=5,
                    placeholder="Describe the video you want to generate...",
                )
                with gr.Row():
                    # Defaults at 480x848 give Lance enough pixels to make
                    # something coherent. Lower if you want faster gens.
                    t2v_height = gr.Slider(192, 1024, value=480, step=16, label="Height")
                    t2v_width = gr.Slider(192, 1024, value=848, step=16, label="Width")
                # 50 frames is ~2s at 25fps, fits comfortably in 240s GPU budget.
                t2v_num_frames = gr.Slider(
                    1,
                    121,
                    value=50,
                    step=1,
                    label="Frames",
                    info="50 frames is roughly 2 seconds. 121 is the model max.",
                )
                t2v_resolution = gr.Dropdown(
                    label="Resolution preset",
                    choices=VIDEO_RESOLUTION_CHOICES,
                    value="video_480p",
                )
                t2v_seed = gr.Number(
                    label="Seed",
                    value=DEFAULT_BASIC_SEED,
                    precision=0,
                    info="-1 picks a fresh random seed each run.",
                )
                with gr.Accordion("Advanced", open=False):
                    t2v_timesteps = gr.Slider(1, 100, value=DEFAULT_TIMESTEPS, step=1, label="Denoising steps")
                    t2v_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
                    t2v_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
                t2v_run = gr.Button("Generate Video", variant="primary")
            with gr.Column(scale=1):
                t2v_output = gr.Video(label="Result")
                t2v_status = gr.Markdown("Idle.")
                t2v_logs = gr.Textbox(label="Run log", lines=12, max_lines=30)

        # The pipeline returns a 4-tuple of (video_path, text, status, logs).
        # Text result is unused for t2v but we still receive it, so we wire
        # it to a hidden state to consume the value.
        t2v_unused_text = gr.State("")
        t2v_run.click(
            fn=run_text_to_video,
            inputs=[
                t2v_prompt,
                t2v_seed,
                t2v_resolution,
                t2v_num_frames,
                t2v_height,
                t2v_width,
                t2v_timesteps,
                t2v_shift,
                t2v_cfg,
            ],
            outputs=[t2v_output, t2v_unused_text, t2v_status, t2v_logs],
        )

    # ---- Tab: Text to Image -----------------------------------------------
    with gr.Tab("Text to Image"):
        with gr.Row():
            with gr.Column(scale=1):
                t2i_prompt = gr.Textbox(
                    label="Prompt",
                    lines=5,
                    placeholder="A red panda walking through a snowy forest at dusk...",
                )
                with gr.Row():
                    t2i_height = gr.Slider(256, 1024, value=768, step=16, label="Height")
                    t2i_width = gr.Slider(256, 1024, value=768, step=16, label="Width")
                t2i_seed = gr.Number(
                    label="Seed",
                    value=DEFAULT_BASIC_SEED,
                    precision=0,
                )
                with gr.Accordion("Advanced", open=False):
                    t2i_timesteps = gr.Slider(1, 100, value=30, step=1, label="Denoising steps")
                    t2i_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
                    t2i_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
                t2i_run = gr.Button("Generate Image", variant="primary")
            with gr.Column(scale=1):
                t2i_output = gr.Image(label="Result")
                t2i_status = gr.Markdown("Idle.")
                t2i_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)

        # Slot 1 (text) is unused for image generation.
        t2i_unused_text = gr.State("")
        t2i_run.click(
            fn=run_text_to_image,
            inputs=[
                t2i_prompt,
                t2i_seed,
                t2i_height,
                t2i_width,
                t2i_timesteps,
                t2i_shift,
                t2i_cfg,
            ],
            outputs=[t2i_output, t2i_unused_text, t2i_status, t2i_logs],
        )

    # ---- Tab: Video Understanding -----------------------------------------
    with gr.Tab("Video Understanding"):
        with gr.Row():
            with gr.Column(scale=1):
                v2t_input = gr.Video(label="Upload a video")
                v2t_question = gr.Textbox(
                    label="Question",
                    lines=3,
                    placeholder="What is happening in this video?",
                )
                v2t_seed = gr.Number(
                    label="Seed",
                    value=DEFAULT_BASIC_SEED,
                    precision=0,
                )
                v2t_run = gr.Button("Ask", variant="primary")
            with gr.Column(scale=1):
                v2t_output = gr.Textbox(label="Answer", lines=8)
                v2t_status = gr.Markdown("Idle.")
                v2t_logs = gr.Textbox(label="Run log", lines=12, max_lines=30)

        # Video understanding returns (None_for_video, text_answer, status, logs).
        # Discard the video slot; surface the text.
        v2t_unused_video = gr.State(None)
        v2t_run.click(
            fn=run_video_understanding,
            inputs=[v2t_input, v2t_question, v2t_seed],
            outputs=[v2t_unused_video, v2t_output, v2t_status, v2t_logs],
        )

    # ---- Tab: Image Understanding -----------------------------------------
    with gr.Tab("Image Q&A"):
        with gr.Row():
            with gr.Column(scale=1):
                # type="filepath" so the handler receives a path it can put
                # straight into the prompt payload.
                i2t_input = gr.Image(label="Upload an image", type="filepath")
                i2t_question = gr.Textbox(
                    label="Question",
                    lines=3,
                    placeholder="What is happening in this image?",
                )
                i2t_seed = gr.Number(label="Seed", value=DEFAULT_BASIC_SEED, precision=0)
                i2t_run = gr.Button("Ask", variant="primary")
            with gr.Column(scale=1):
                i2t_output = gr.Textbox(label="Answer", lines=8)
                i2t_status = gr.Markdown("Idle.")
                i2t_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)

        # Slot 0 (media path) is unused for Q&A.
        i2t_unused = gr.State(None)
        i2t_run.click(
            fn=run_image_understanding,
            inputs=[i2t_input, i2t_question, i2t_seed],
            outputs=[i2t_unused, i2t_output, i2t_status, i2t_logs],
        )

    # ---- Tab: Image Edit --------------------------------------------------
    with gr.Tab("Image Edit"):
        with gr.Row():
            with gr.Column(scale=1):
                ie_input = gr.Image(label="Source image", type="filepath")
                ie_prompt = gr.Textbox(
                    label="Edit instruction",
                    lines=3,
                    placeholder="Add a pearl necklace; convert to watercolor; etc.",
                )
                with gr.Row():
                    ie_height = gr.Slider(256, 1024, value=768, step=16, label="Height")
                    ie_width = gr.Slider(256, 1024, value=768, step=16, label="Width")
                ie_seed = gr.Number(label="Seed", value=DEFAULT_BASIC_SEED, precision=0)
                with gr.Accordion("Advanced", open=False):
                    ie_timesteps = gr.Slider(1, 100, value=30, step=1, label="Denoising steps")
                    ie_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
                    ie_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
                ie_run = gr.Button("Edit Image", variant="primary")
            with gr.Column(scale=1):
                ie_output = gr.Image(label="Result")
                ie_status = gr.Markdown("Idle.")
                ie_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)

        # Slot 1 (text) is unused for image edit.
        ie_unused = gr.State("")
        ie_run.click(
            fn=run_image_edit,
            inputs=[
                ie_input,
                ie_prompt,
                ie_seed,
                ie_height,
                ie_width,
                ie_timesteps,
                ie_shift,
                ie_cfg,
            ],
            outputs=[ie_output, ie_unused, ie_status, ie_logs],
        )

    # ---- Tab: Video Edit --------------------------------------------------
    with gr.Tab("Video Edit"):
        with gr.Row():
            with gr.Column(scale=1):
                ve_input = gr.Video(label="Source video")
                ve_prompt = gr.Textbox(
                    label="Edit instruction",
                    lines=3,
                    placeholder="Change the background; restyle as anime; etc.",
                )
                with gr.Row():
                    ve_height = gr.Slider(192, 1024, value=480, step=16, label="Height")
                    ve_width = gr.Slider(192, 1024, value=848, step=16, label="Width")
                ve_num_frames = gr.Slider(1, 121, value=50, step=1, label="Frames")
                ve_resolution = gr.Dropdown(
                    label="Resolution preset",
                    choices=VIDEO_RESOLUTION_CHOICES,
                    value="video_480p",
                )
                ve_seed = gr.Number(label="Seed", value=DEFAULT_BASIC_SEED, precision=0)
                with gr.Accordion("Advanced", open=False):
                    ve_timesteps = gr.Slider(1, 100, value=DEFAULT_TIMESTEPS, step=1, label="Denoising steps")
                    ve_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
                    ve_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
                ve_run = gr.Button("Edit Video", variant="primary")
            with gr.Column(scale=1):
                ve_output = gr.Video(label="Result")
                ve_status = gr.Markdown("Idle.")
                ve_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)

        # Slot 1 (text) is unused for video edit.
        ve_unused = gr.State("")
        ve_run.click(
            fn=run_video_edit,
            inputs=[
                ve_input,
                ve_prompt,
                ve_seed,
                ve_height,
                ve_width,
                ve_num_frames,
                ve_resolution,
                ve_timesteps,
                ve_shift,
                ve_cfg,
            ],
            outputs=[ve_output, ve_unused, ve_status, ve_logs],
        )


if __name__ == "__main__":
    # SSR mode (Gradio 6 default) breaks @gradio/client 1.8: requests vanish
    # silently because the client expects the old non-SSR endpoint shape.
    # Disable it explicitly.
    # Theme also moved to launch() in Gradio 6, so we pass it here.
    demo.queue(max_size=8).launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,
        theme=gr.themes.Soft(),
    )