nifty-lab / app.py
IgorCSIS
Stage 2 phase 2: add x2t_image, image_edit, video_edit with shared _run_lance_task helper
f32613e
Raw
History Blame Contribute Delete
34 kB
# Copyright 2026 Igor Lima. Licensed under Apache 2.0 (see LICENSE_LANCE for the
# upstream Lance license; this file is original work by Igor Lima).
#
# nifty-lab: ZeroGPU adapter for ByteDance's Lance unified multimodal model.
#
# WHAT THIS IS
# ------------
# A small Gradio app that runs on Hugging Face Spaces with the ZeroGPU runtime.
# All of the heavy model-loading and inference logic lives in
# `lance_gradio_t2v_v2t.py`, which is a verbatim copy of ByteDance's reference
# Gradio script. This file wraps their `LanceT2VV2TPipeline` class for
# ZeroGPU's on-demand GPU lifecycle.
#
# WHY WE DIDN'T JUST RUN THEIR SCRIPT AS-IS
# -----------------------------------------
# Their script assumes a long-running dedicated GPU. They wrap inference in a
# `PipelinePool` that holds the GPU across requests via threads. ZeroGPU works
# on a per-request claim model: you request a GPU when you call a function
# decorated with `@spaces.GPU`, and the GPU is released when the function
# returns. No persistent pool. So we instantiate ONE pipeline, decorate the
# inference entry points, and trust ZeroGPU to schedule the GPU for us.
#
# STAGE 1 SCOPE
# -------------
# Stage 1 covered only the two tasks their reference script supports out of
# the box:
# 1. text-to-video (TASK_T2V)
# 2. video understanding / Q&A (TASK_X2T_VIDEO)
#
# Stage 2 adds t2i, image_edit, video_edit, and x2t_image as new tabs, routed
# through the `_run_lance_task` helper below instead of the upstream
# `generate()` method. Those tasks were expected to need a separate model
# variant (Lance_3B for image work) and additional plumbing in the pipeline,
# which is why stage 1 shipped first to prove the ZeroGPU pattern works
# end-to-end before expanding.
from __future__ import annotations
# -----------------------------------------------------------------------------
# CUDA runtime preload. MUST run before any transformers import.
# -----------------------------------------------------------------------------
# Why this exists: recent transformers versions eagerly import flash_attn
# inside modeling_utils.py, which dynamically links against libcudart.so.12
# at module load. On ZeroGPU, the GPU (and thus its CUDA libs) is only
# attached during @spaces.GPU calls, so libcudart isn't on the linker
# path at boot. Importing torch first doesn't help because torch only
# preloads CUDA when it detects a live GPU device.
#
# We work around it by hunting for libcudart.so.12 in the pip-installed
# nvidia and torch wheels and ctypes-loading it with RTLD_GLOBAL so the
# symbols are visible to anyone who dlopens later.
import ctypes
import glob
import os
import sys
def _preload_cuda_runtime() -> None:
    """Find libcudart.so.12 in pip-installed nvidia/torch wheels and preload it.

    Candidate libraries are looked up under every directory on ``sys.path``
    (so virtualenv, user-site and non-``/usr/local`` installs are covered)
    and then under the legacy ``/usr/local/lib/python*/site-packages`` glob.
    Every ``nvidia/cuda_runtime`` hit is tried before any copy bundled in
    ``torch/lib``. The first library that loads wins; it is opened with
    ``RTLD_GLOBAL`` so its symbols are visible to later ``dlopen`` calls.

    A missing or unloadable library never raises: each failed attempt is
    logged to stdout, and a warning is printed if nothing could be loaded.
    """
    # Wheel-relative glob patterns, in priority order.
    wheel_patterns = (
        # nvidia-cuda-runtime-cu12 pip package
        "nvidia/cuda_runtime/lib/libcudart.so.12*",
        # Sometimes torch's bundled CUDA libs are loadable too, as a fallback
        "torch/lib/libcudart.so.12*",
    )
    # glob.escape keeps unusual characters in real directory names from being
    # read as wildcards. Empty entries (meaning "current directory") are
    # skipped so a library is never picked up relative to the CWD.
    search_roots = [
        glob.escape(entry) for entry in sys.path if isinstance(entry, str) and entry
    ]
    search_roots.append("/usr/local/lib/python*/site-packages")

    candidates: list[str] = []
    for pattern in wheel_patterns:
        for root in search_roots:
            # sorted() makes the try-order deterministic; glob order is not.
            for path in sorted(glob.glob(os.path.join(root, pattern))):
                if path not in candidates:
                    candidates.append(path)

    for path in candidates:
        try:
            ctypes.CDLL(path, mode=ctypes.RTLD_GLOBAL)
        except OSError as exc:
            print(f"[boot] failed to preload {path}: {exc}", flush=True)
        else:
            print(f"[boot] preloaded {path}", flush=True)
            return
    print("[boot] WARNING: no libcudart.so.12 found to preload, flash_attn import will fail", flush=True)


_preload_cuda_runtime()
# Now the rest of the imports are safe.
from pathlib import Path
from typing import Optional
import torch # noqa: F401 imported early so its CUDA env init runs before lance code
import gradio as gr
import spaces # Hugging Face ZeroGPU runtime decorator
from huggingface_hub import snapshot_download
# =============================================================================
# Model weight download
# =============================================================================
# We pull Lance_3B_Video at module-load time (not on first request) for two
# reasons:
# 1. It only needs CPU + network, which we have at Space startup. ZeroGPU
# does NOT give us a GPU at module load.
# 2. The download is ~6GB and would otherwise consume most of the 300s GPU
# budget on the first user request.
#
# After the first boot, Hugging Face Spaces caches the files in the Space's
# persistent storage, so subsequent boots skip re-downloading.
REPO_ID = "bytedance-research/Lance"
WEIGHTS_ROOT = Path("downloads")


def fetch_weights() -> None:
    """Pull the artifacts Lance needs at runtime from the Hugging Face Hub.

    A single ``snapshot_download`` call mirrors the matching files of
    ``REPO_ID`` into ``WEIGHTS_ROOT``:

    - ``Lance_3B_Video/``  The main multimodal model checkpoint (multi-GB).
    - ``Qwen2.5-VL-ViT/``  The vision transformer Lance imports from
                           Qwen2.5-VL. Lance's code hardcodes this path
                           and expects ``config.json`` + ``vit.safetensors``
                           inside it.
    - ``Wan2.2*``          The Wan2.2 video VAE checkpoint at the repo root.
    - ``config.json``      The small top-level config, included defensively.

    Patterns that match nothing are skipped silently by ``snapshot_download``,
    so after the download we log whether the ViT directory and the VAE file
    actually landed; a clearer error then surfaces when ``initialize()`` runs.

    Any exception raised by ``snapshot_download`` propagates to the caller.
    """
    print(f"[boot] downloading {REPO_ID} (Lance_3B_Video + Qwen2.5-VL-ViT + Wan2.2 VAE)...", flush=True)
    # `resume_download` is intentionally not passed: current huggingface_hub
    # releases deprecate and ignore it (downloads always resume when possible).
    snapshot_download(
        repo_id=REPO_ID,
        local_dir=str(WEIGHTS_ROOT),
        allow_patterns=[
            "Lance_3B_Video/*",
            "Lance_3B_Video/**/*",
            "Qwen2.5-VL-ViT/*",
            "Qwen2.5-VL-ViT/**/*",
            # Wan2.2 video VAE checkpoint, used by WanVideoVAE during pipeline
            # initialize(). Lives at the top of the repo (no subdir).
            "Wan2.2*",
            # Top-level config.json (901 bytes). Tiny, defensively included
            # in case Lance's code reads it during init.
            "config.json",
        ],
    )

    # Sanity check: log what showed up. If a required artifact is missing,
    # the next runtime error will tell us exactly what to add to allow_patterns.
    vit_dir = WEIGHTS_ROOT / "Qwen2.5-VL-ViT"
    if vit_dir.is_dir():
        entry_count = sum(1 for _ in vit_dir.iterdir())
        print(f"[boot] Qwen2.5-VL-ViT/ landed with {entry_count} files", flush=True)
    else:
        print("[boot] WARNING: Qwen2.5-VL-ViT/ NOT in the Lance repo, model init will fail", flush=True)

    wan_path = WEIGHTS_ROOT / "Wan2.2_VAE.pth"
    if wan_path.exists():
        size_mb = wan_path.stat().st_size / (1024 * 1024)
        print(f"[boot] Wan2.2_VAE.pth landed at {size_mb:.0f} MB", flush=True)
    else:
        print("[boot] WARNING: Wan2.2_VAE.pth NOT in the Lance repo, VAE init will fail", flush=True)

    print("[boot] weight download complete.", flush=True)


# Run download at import time. On HF Spaces this happens once when the Space
# starts up; the user sees "Starting..." in the dashboard while it runs.
fetch_weights()
# =============================================================================
# Pipeline import (must come AFTER fetch_weights so the import-time checks
# in their script find the downloaded files where they expect them)
# =============================================================================
from lance_gradio_t2v_v2t import (
LanceT2VV2TPipeline,
TASK_T2V,
TASK_X2T_VIDEO,
DEFAULT_HEIGHT,
DEFAULT_WIDTH,
DEFAULT_NUM_FRAMES,
DEFAULT_TIMESTEPS,
DEFAULT_TIMESTEP_SHIFT,
DEFAULT_CFG_TEXT_SCALE,
DEFAULT_RESOLUTION,
DEFAULT_BASIC_SEED,
VIDEO_RESOLUTION_CHOICES,
)
# A single pipeline instance. ZeroGPU only ever hands us one GPU at a time,
# so the PipelinePool from the upstream script (which manages multiple GPUs
# via threads) isn't relevant here. We instantiate now but defer actual
# model loading: `LanceT2VV2TPipeline.generate()` calls `self.initialize()`
# lazily on its first invocation, so the model loads the first time a user
# hits the GPU. After that, subsequent calls reuse the loaded model
# (as long as the Space hasn't gone cold).
#
# `_run_lance_task` bypasses `generate()`, so it calls `PIPELINE.initialize()`
# itself before touching the model. NOTE(review): that assumes `initialize()`
# is safe to call on every request — confirm against the upstream class.
PIPELINE = LanceT2VV2TPipeline(device_id=0)
# =============================================================================
# ZeroGPU-decorated entry points
# =============================================================================
# One entry point per task, rather than a single function routed by `task`,
# because each task has a different duration budget. Text-to-video can run
# close to the 300s cap on 50 frames at 480p. Video understanding finishes in
# 20-60s. Declaring tighter durations lets ZeroGPU schedule short tasks more
# aggressively and means the user's daily quota covers more requests.
@spaces.GPU(duration=240)
def run_text_to_video(
    prompt: str,
    seed: int,
    resolution: str,
    num_frames: int,
    height: int,
    width: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    Turn a text prompt into a short video clip via the upstream pipeline.

    The result is whatever ``PIPELINE.generate`` returns for ``TASK_T2V``,
    i.e. the upstream 4-tuple
        (video_path, text_result, status_markdown, run_logs)
    where video_path is the produced clip and text_result is empty.

    The 240s duration is sized for proper-quality generation at 480p with
    50 frames and 30 denoising steps. Per the author's notes, each call
    costs ~3-4 minutes of the signed-in user's daily ZeroGPU budget
    (25 min/day on HF Pro), i.e. ~6-8 high-quality clips per day, and
    leaves 60s of headroom under ZeroGPU's hard 300s per-call cap.
    """
    # Gradio may hand numeric widgets over as floats or strings, so every
    # numeric field is coerced before it reaches the pipeline.
    generation_kwargs = {
        "task": TASK_T2V,
        "prompt": prompt,
        "input_video": None,
        "question": "",
        "height": int(height),
        "width": int(width),
        "num_frames": int(num_frames),
        "seed": int(seed),
        "resolution": resolution,
        "validation_num_timesteps": int(validation_num_timesteps),
        "validation_timestep_shift": float(validation_timestep_shift),
        "cfg_text_scale": float(cfg_text_scale),
    }
    return PIPELINE.generate(**generation_kwargs)
# Output kinds `_run_lance_task` knows how to collect from the results dir.
_LANCE_OUTPUT_KINDS = ("image", "video", "text")


def _collect_lance_result(save_dir: Path, output_kind: str, elapsed: float):
    """Find the artifact a Lance run wrote to *save_dir* and build the UI reply.

    Returns the 4-tuple ``(media_path, text_result, status_md, logs)``; the
    logs slot is always the empty string.
    """
    done = f"Done in {elapsed:.1f}s"
    if output_kind == "image":
        files = sorted(save_dir.glob("*.png")) + sorted(save_dir.glob("*.jpg"))
        if not files:
            return None, "", f"Inference completed but no image in {save_dir}", ""
        return str(files[0]), "", done, ""
    if output_kind == "video":
        files = sorted(save_dir.glob("*.mp4"))
        if not files:
            return None, "", f"Inference completed but no video in {save_dir}", ""
        return str(files[0]), "", done, ""
    if output_kind == "text":
        from lance_gradio_t2v_v2t import extract_text_result as _extract_text_result

        text = _extract_text_result(save_dir)
        if not text:
            return None, "", f"Inference completed but no text result in {save_dir}", ""
        return None, text, done, ""
    return None, "", f"Unknown output_kind: {output_kind}", ""


def _run_lance_task(
    task_name: str,
    payload: dict,
    height: int,
    width: int,
    num_frames: int,
    seed: int,
    resolution: str,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
    output_kind: str,
):
    """
    Generic Lance task runner. Builds inference args, runs the model,
    extracts the result.

    Args:
        task_name: one of "t2i", "t2v", "x2t_image", "x2t_video",
            "image_edit", "video_edit". This drives Lance's
            task-specific dataset routing inside ValidationDataset.
        payload: the dict that gets written to the prompt JSON file. Shape
            varies per task; build it in the task-specific wrapper
            using the patterns from config/examples/*_example.json.
        height, width, num_frames, seed, resolution,
        validation_num_timesteps, validation_timestep_shift,
        cfg_text_scale: per-request overrides copied onto deep copies of
            the pipeline's base model / data / inference args.
        output_kind: "image", "video", or "text". Decides which kind of
            file we glob for at the end and how we report status.

    Returns:
        The 4-tuple ``(media_path, text_result, status_md, logs)`` in the
        same shape the upstream pipeline returns, so the frontend handlers
        stay uniform. ``logs`` is always "". An unsupported ``output_kind``
        is reported in ``status_md`` without running inference.

    Raises:
        Whatever the upstream pipeline / inference helpers raise; GPU
        memory cleanup still runs in that case.

    We sidestep the upstream `LanceT2VV2TPipeline.generate()` because it
    hardcodes t2v + x2t_video. Everything else here mirrors that method's
    structure: clone the base args, override the request-specific bits,
    call _build_request_batch, then validate_on_fixed_batch.
    """
    # Fail fast: a bad output_kind used to be noticed only after the whole
    # (expensive) inference run had already burned GPU budget.
    if output_kind not in _LANCE_OUTPUT_KINDS:
        return None, "", f"Unknown output_kind: {output_kind}", ""

    import json as _json
    import time as _time
    from copy import deepcopy as _deepcopy
    from datetime import datetime as _datetime

    import torch as _torch
    from inference_lance import (
        validate_on_fixed_batch as _validate_on_fixed_batch,
        save_prompt_results as _save_prompt_results,
        clean_memory as _clean_memory,
    )
    from lance_gradio_t2v_v2t import (
        TEXT_TEMPLATE as _TEXT_TEMPLATE,
        TMP_INPUT_DIR as _TMP_INPUT_DIR,
        RESULTS_ROOT as _RESULTS_ROOT,
        ensure_dirs as _ensure_dirs,
    )

    PIPELINE.initialize()
    _ensure_dirs()
    timestamp = _datetime.now().strftime("%Y%m%d_%H%M%S_%f")

    # Pretty-print JSON so the dataset loader's _read_jsonl line-by-line
    # parse FAILS and falls through to the json.load + dict-transform path
    # that produces the {"data": ..., "index": ...} records the samplers
    # expect. See validation_dataset.py ~line 84.
    prompt_file = _TMP_INPUT_DIR / f"{task_name}_{timestamp}.json"
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be pinned: the locale default may not be able to represent the prompt.
    prompt_file.write_text(
        _json.dumps(payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    save_dir = _RESULTS_ROOT / f"{task_name}_{timestamp}"
    save_dir.mkdir(parents=True, exist_ok=True)

    # Per-request copies so overrides never leak into the shared base args.
    request_model_args = _deepcopy(PIPELINE.base_model_args)
    request_model_args.cfg_text_scale = float(cfg_text_scale)

    request_data_args = _deepcopy(PIPELINE.base_data_args)
    request_data_args.val_dataset_config_file = str(prompt_file)

    request_inference_args = _deepcopy(PIPELINE.base_inference_args)
    request_inference_args.validation_num_timesteps = int(validation_num_timesteps)
    request_inference_args.validation_timestep_shift = float(validation_timestep_shift)
    request_inference_args.validation_data_seed = int(seed)
    request_inference_args.validation_noise_seed = int(seed)
    request_inference_args.video_height = int(height)
    request_inference_args.video_width = int(width)
    request_inference_args.num_frames = int(num_frames)
    request_inference_args.resolution = resolution
    request_inference_args.save_path_gen = str(save_dir)
    request_inference_args.task = task_name
    request_inference_args.text_template = _TEXT_TEMPLATE
    request_inference_args.prompt_data_dict = {}

    val_data_cpu = PIPELINE._build_request_batch(
        prompt_file=prompt_file,
        model_args=request_model_args,
        data_args=request_data_args,
        inference_args=request_inference_args,
    )
    print(
        f"[app] {task_name} start | size={height}x{width} | "
        f"frames={num_frames} | steps={validation_num_timesteps}",
        flush=True,
    )

    start = _time.perf_counter()
    with PIPELINE._generate_lock:
        _torch.cuda.set_device(PIPELINE.device)
        try:
            _validate_on_fixed_batch(
                fsdp_model=PIPELINE.model,
                vae_model=PIPELINE.vae_model,
                tokenizer=PIPELINE.tokenizer,
                val_data_cpu=val_data_cpu,
                training_args=request_inference_args,
                model_args=request_model_args,
                inference_args=request_inference_args,
                new_token_ids=PIPELINE.new_token_ids,
                image_token_id=PIPELINE.image_token_id,
                device=PIPELINE.device,
                save_source_video=False,
                save_path_gen=str(save_dir),
                save_path_gt="",
            )
            _save_prompt_results(
                request_inference_args.prompt_data_dict,
                str(save_dir),
                PIPELINE.logger,
            )
        finally:
            # Release memory even when inference raises, so a failed
            # request does not leave allocations behind for the next one.
            _clean_memory()
    elapsed = _time.perf_counter() - start

    # Result extraction by output kind. The output 4-tuple matches the
    # shape the upstream pipeline returns so the frontend handlers stay
    # uniform: (video_path, text_result, status_md, logs).
    return _collect_lance_result(save_dir, output_kind, elapsed)
def _image_inference(
    prompt: str,
    height: int,
    width: int,
    seed: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """Run text-to-image by delegating to ``_run_lance_task``.

    A missing or whitespace-only prompt short-circuits with a
    "Please enter a prompt." status and never reaches the model. Otherwise
    the prompt is submitted as a single-entry ``t2i`` payload keyed
    ``"000000.png"``, with one frame and the ``image_768res`` preset, and
    the helper's 4-tuple ``(image_path, text, status_md, logs)`` is
    returned unchanged.
    """
    cleaned_prompt = prompt.strip() if prompt else ""
    if cleaned_prompt == "":
        return None, "", "Please enter a prompt.", ""

    task_kwargs = {
        "task_name": "t2i",
        "payload": {"000000.png": cleaned_prompt},
        "height": int(height),
        "width": int(width),
        "num_frames": 1,
        "seed": int(seed),
        "resolution": "image_768res",
        "validation_num_timesteps": int(validation_num_timesteps),
        "validation_timestep_shift": float(validation_timestep_shift),
        "cfg_text_scale": float(cfg_text_scale),
        "output_kind": "image",
    }
    return _run_lance_task(**task_kwargs)
@spaces.GPU(duration=90)
def _text_to_image_gpu(
    prompt: str,
    seed: int,
    height: int,
    width: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    GPU-side half of run_text_to_image. 90s budget; per the author's notes,
    image gen with Lance at 768res / 30 steps lands in 20 to 40 seconds
    on H200.
    """
    return _image_inference(
        prompt=prompt,
        height=int(height),
        width=int(width),
        seed=int(seed),
        validation_num_timesteps=int(validation_num_timesteps),
        validation_timestep_shift=float(validation_timestep_shift),
        cfg_text_scale=float(cfg_text_scale),
    )


def run_text_to_image(
    prompt: str,
    seed: int,
    height: int,
    width: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    Text-to-image entry point wired to the "Generate Image" button.

    Returns the 4-tuple (image_path, text_result, status_markdown, run_logs)
    produced by `_image_inference`.

    The empty-prompt check runs here, outside the @spaces.GPU-decorated
    worker, so an obviously invalid request is rejected immediately instead
    of first asking ZeroGPU for a GPU slot just to return an error message.
    """
    if not (prompt or "").strip():
        return None, "", "Please enter a prompt.", ""
    return _text_to_image_gpu(
        prompt,
        seed,
        height,
        width,
        validation_num_timesteps,
        validation_timestep_shift,
        cfg_text_scale,
    )
@spaces.GPU(duration=60)
def run_video_understanding(
    input_video: str,
    question: str,
    seed: int,
):
    """
    Ask the model a question about an uploaded video clip.

    Delegates to ``PIPELINE.generate`` with ``TASK_X2T_VIDEO`` and returns
    its 4-tuple unchanged. For understanding, video_path is empty and
    text_result holds the model's answer.
    """
    # generate() requires the full generation signature even though most of
    # it is unused for understanding tasks, so those slots get the defaults.
    generation_defaults = {
        "height": DEFAULT_HEIGHT,
        "width": DEFAULT_WIDTH,
        "num_frames": DEFAULT_NUM_FRAMES,
        "resolution": DEFAULT_RESOLUTION,
        "validation_num_timesteps": DEFAULT_TIMESTEPS,
        "validation_timestep_shift": DEFAULT_TIMESTEP_SHIFT,
        "cfg_text_scale": DEFAULT_CFG_TEXT_SCALE,
    }
    return PIPELINE.generate(
        task=TASK_X2T_VIDEO,
        prompt="",
        input_video=input_video,
        question=question,
        seed=int(seed),
        **generation_defaults,
    )
@spaces.GPU(duration=60)
def _image_understanding_gpu(
    input_image: str,
    question: str,
    seed: int,
):
    """
    GPU-side half of run_image_understanding; expects validated inputs.

    The interleave_array payload format comes from Lance's
    config/examples/x2t_image_example.json.
    """
    payload = {
        "0001": {
            "interleave_array": [
                input_image,
                ["Look at the image carefully and answer the question.", question, ""],
            ],
            "element_dtype_array": ["image", "text"],
            "istarget_in_interleave": [0, 1],
        }
    }
    return _run_lance_task(
        task_name="x2t_image",
        payload=payload,
        height=DEFAULT_HEIGHT,
        width=DEFAULT_WIDTH,
        num_frames=1,
        seed=int(seed),
        resolution="image_768res",
        validation_num_timesteps=DEFAULT_TIMESTEPS,
        validation_timestep_shift=DEFAULT_TIMESTEP_SHIFT,
        cfg_text_scale=DEFAULT_CFG_TEXT_SCALE,
        output_kind="text",
    )


def run_image_understanding(
    input_image: str,
    question: str,
    seed: int,
):
    """
    Answer a question about an uploaded image. Same shape as
    run_video_understanding but operates on a single image.

    Returns the 4-tuple (None, text_answer, status_markdown, run_logs).

    Input validation runs here, outside the @spaces.GPU-decorated worker,
    so a missing image or empty question is reported immediately instead
    of first asking ZeroGPU for a GPU slot.
    """
    question = (question or "").strip()
    if not input_image:
        return None, "", "Please upload an image.", ""
    if not question:
        return None, "", "Please enter a question.", ""
    return _image_understanding_gpu(input_image, question, seed)
@spaces.GPU(duration=120)
def _image_edit_gpu(
    input_image: str,
    prompt: str,
    seed: int,
    height: int,
    width: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    GPU-side half of run_image_edit; expects validated inputs.

    Payload format from Lance's config/examples/image_edit_example.json:
    text prompt first, then the source image twice (Lance's pipeline
    expects source AND target slots; the second slot is what the model
    fills in).
    """
    payload = {
        "0001": {
            "interleave_array": [prompt, input_image, input_image],
            "element_dtype_array": ["text", "image", "image"],
            "istarget_in_interleave": [0, 0, 1],
        }
    }
    return _run_lance_task(
        task_name="image_edit",
        payload=payload,
        height=int(height),
        width=int(width),
        num_frames=1,
        seed=int(seed),
        resolution="image_768res",
        validation_num_timesteps=int(validation_num_timesteps),
        validation_timestep_shift=float(validation_timestep_shift),
        cfg_text_scale=float(cfg_text_scale),
        output_kind="image",
    )


def run_image_edit(
    input_image: str,
    prompt: str,
    seed: int,
    height: int,
    width: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    Edit an input image with a text instruction.

    Returns the 4-tuple (image_path, text_result, status_markdown, run_logs).

    Input validation runs here, outside the @spaces.GPU-decorated worker,
    so a missing image or empty instruction is reported immediately instead
    of first asking ZeroGPU for a GPU slot.
    """
    prompt = (prompt or "").strip()
    if not input_image:
        return None, "", "Please upload an image.", ""
    if not prompt:
        return None, "", "Please enter an edit instruction.", ""
    return _image_edit_gpu(
        input_image,
        prompt,
        seed,
        height,
        width,
        validation_num_timesteps,
        validation_timestep_shift,
        cfg_text_scale,
    )
@spaces.GPU(duration=240)
def _video_edit_gpu(
    input_video: str,
    prompt: str,
    seed: int,
    height: int,
    width: int,
    num_frames: int,
    resolution: str,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    GPU-side half of run_video_edit; expects validated inputs.

    Same payload shape as image_edit but with video element dtypes. The
    240s GPU budget matches t2v because, per the author's notes, the
    underlying inference cost is similar.
    """
    payload = {
        "0001": {
            "interleave_array": [prompt, input_video, input_video],
            "element_dtype_array": ["text", "video", "video"],
            "istarget_in_interleave": [0, 0, 1],
        }
    }
    return _run_lance_task(
        task_name="video_edit",
        payload=payload,
        height=int(height),
        width=int(width),
        num_frames=int(num_frames),
        seed=int(seed),
        resolution=resolution,
        validation_num_timesteps=int(validation_num_timesteps),
        validation_timestep_shift=float(validation_timestep_shift),
        cfg_text_scale=float(cfg_text_scale),
        output_kind="video",
    )


def run_video_edit(
    input_video: str,
    prompt: str,
    seed: int,
    height: int,
    width: int,
    num_frames: int,
    resolution: str,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    Edit an input video with a text instruction.

    Returns the 4-tuple (video_path, text_result, status_markdown, run_logs).

    Input validation runs here, outside the @spaces.GPU-decorated worker,
    so a missing video or empty instruction is reported immediately instead
    of first asking ZeroGPU for a 240s GPU slot.
    """
    prompt = (prompt or "").strip()
    if not input_video:
        return None, "", "Please upload a video.", ""
    if not prompt:
        return None, "", "Please enter an edit instruction.", ""
    return _video_edit_gpu(
        input_video,
        prompt,
        seed,
        height,
        width,
        num_frames,
        resolution,
        validation_num_timesteps,
        validation_timestep_shift,
        cfg_text_scale,
    )
# =============================================================================
# Gradio UI
# =============================================================================
# One tab per task. Layout follows the same column structure as the
# upstream demo so users familiar with their reference UI feel at home.
with gr.Blocks(title="nifty-lab") as demo:
    # Intro banner. The task list must stay in sync with the tabs wired
    # below: text-to-video, text-to-image, video understanding, image Q&A,
    # image edit and video edit.
    gr.Markdown(
        """
        # nifty-lab

        A multimodal playground built on ByteDance's
        [Lance](https://github.com/bytedance/Lance) model, served on
        Hugging Face ZeroGPU. By [Igor Lima](https://github.com/IgorCSIS).

        Tasks wired: text-to-video, text-to-image, video understanding,
        image Q&A, image edit, and video edit.

        First request after the Space wakes from idle takes about a minute
        to warm the model. Subsequent requests are fast.
        """
    )
# ---- Tab: Text to Video ------------------------------------------------
# Builds the text-to-video controls and wires the "Generate Video" button to
# run_text_to_video.
with gr.Tab("Text to Video"):
with gr.Row():
with gr.Column(scale=1):
t2v_prompt = gr.Textbox(
label="Prompt",
lines=5,
placeholder="Describe the video you want to generate...",
)
with gr.Row():
# Defaults at 480x848 give Lance enough pixels to make
# something coherent. Lower if you want faster gens.
t2v_height = gr.Slider(192, 1024, value=480, step=16, label="Height")
t2v_width = gr.Slider(192, 1024, value=848, step=16, label="Width")
# 50 frames is ~2s at 25fps, fits comfortably in 240s GPU budget.
t2v_num_frames = gr.Slider(
1, 121, value=50, step=1, label="Frames",
info="50 frames is roughly 2 seconds. 121 is the model max.",
)
t2v_resolution = gr.Dropdown(
label="Resolution preset",
choices=VIDEO_RESOLUTION_CHOICES,
value="video_480p",
)
t2v_seed = gr.Number(
label="Seed",
value=DEFAULT_BASIC_SEED,
precision=0,
info="-1 picks a fresh random seed each run.",
)
with gr.Accordion("Advanced", open=False):
t2v_timesteps = gr.Slider(1, 100, value=DEFAULT_TIMESTEPS, step=1, label="Denoising steps")
t2v_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
t2v_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
t2v_run = gr.Button("Generate Video", variant="primary")
with gr.Column(scale=1):
t2v_output = gr.Video(label="Result")
t2v_status = gr.Markdown("Idle.")
t2v_logs = gr.Textbox(label="Run log", lines=12, max_lines=30)
# The pipeline returns a 4-tuple of (video_path, text, status, logs).
# Text result is unused for t2v but we still receive it, so we wire
# it to a hidden state to consume the value.
t2v_unused_text = gr.State("")
# Inputs are passed positionally, so their order must match
# run_text_to_video's parameters: prompt, seed, resolution, num_frames,
# height, width, timesteps, timestep shift, cfg scale.
t2v_run.click(
fn=run_text_to_video,
inputs=[
t2v_prompt, t2v_seed, t2v_resolution, t2v_num_frames,
t2v_height, t2v_width, t2v_timesteps, t2v_shift, t2v_cfg,
],
outputs=[t2v_output, t2v_unused_text, t2v_status, t2v_logs],
)
# ---- Tab: Text to Image -----------------------------------------------
# Builds the text-to-image controls and wires the "Generate Image" button to
# run_text_to_image.
with gr.Tab("Text to Image"):
with gr.Row():
with gr.Column(scale=1):
t2i_prompt = gr.Textbox(
label="Prompt",
lines=5,
placeholder="A red panda walking through a snowy forest at dusk...",
)
with gr.Row():
t2i_height = gr.Slider(256, 1024, value=768, step=16, label="Height")
t2i_width = gr.Slider(256, 1024, value=768, step=16, label="Width")
# NOTE(review): unlike the t2v tab, this seed field carries no "-1 = random"
# hint, and _run_lance_task forwards the seed unchanged — confirm whether
# -1 is meaningful on this path.
t2i_seed = gr.Number(
label="Seed",
value=DEFAULT_BASIC_SEED,
precision=0,
)
with gr.Accordion("Advanced", open=False):
t2i_timesteps = gr.Slider(1, 100, value=30, step=1, label="Denoising steps")
t2i_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
t2i_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
t2i_run = gr.Button("Generate Image", variant="primary")
with gr.Column(scale=1):
t2i_output = gr.Image(label="Result")
t2i_status = gr.Markdown("Idle.")
# _run_lance_task always returns "" for the logs slot, so this box stays
# blank for now.
t2i_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)
# Sink for the unused text slot of the 4-tuple result.
t2i_unused_text = gr.State("")
# Inputs are passed positionally, so their order must match
# run_text_to_image's parameters: prompt, seed, height, width, timesteps,
# timestep shift, cfg scale.
t2i_run.click(
fn=run_text_to_image,
inputs=[
t2i_prompt, t2i_seed, t2i_height, t2i_width,
t2i_timesteps, t2i_shift, t2i_cfg,
],
outputs=[t2i_output, t2i_unused_text, t2i_status, t2i_logs],
)
# ---- Tab: Video Understanding -----------------------------------------
# Builds the video Q&A controls and wires the "Ask" button to
# run_video_understanding.
with gr.Tab("Video Understanding"):
with gr.Row():
with gr.Column(scale=1):
v2t_input = gr.Video(label="Upload a video")
v2t_question = gr.Textbox(
label="Question",
lines=3,
placeholder="What is happening in this video?",
)
v2t_seed = gr.Number(
label="Seed",
value=DEFAULT_BASIC_SEED,
precision=0,
)
v2t_run = gr.Button("Ask", variant="primary")
with gr.Column(scale=1):
v2t_output = gr.Textbox(label="Answer", lines=8)
v2t_status = gr.Markdown("Idle.")
v2t_logs = gr.Textbox(label="Run log", lines=12, max_lines=30)
# Video understanding returns (None_for_video, text_answer, status, logs).
# Discard the video slot; surface the text.
v2t_unused_video = gr.State(None)
# Inputs are passed positionally in run_video_understanding's parameter
# order: input_video, question, seed.
v2t_run.click(
fn=run_video_understanding,
inputs=[v2t_input, v2t_question, v2t_seed],
outputs=[v2t_unused_video, v2t_output, v2t_status, v2t_logs],
)
# ---- Tab: Image Understanding -----------------------------------------
# Builds the image Q&A controls and wires the "Ask" button to
# run_image_understanding.
with gr.Tab("Image Q&A"):
with gr.Row():
with gr.Column(scale=1):
# type="filepath" hands the handler a path string, which
# run_image_understanding embeds directly in the Lance payload.
i2t_input = gr.Image(label="Upload an image", type="filepath")
i2t_question = gr.Textbox(
label="Question",
lines=3,
placeholder="What is happening in this image?",
)
i2t_seed = gr.Number(label="Seed", value=DEFAULT_BASIC_SEED, precision=0)
i2t_run = gr.Button("Ask", variant="primary")
with gr.Column(scale=1):
i2t_output = gr.Textbox(label="Answer", lines=8)
i2t_status = gr.Markdown("Idle.")
# _run_lance_task always returns "" for the logs slot, so this box stays
# blank for now.
i2t_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)
# Sink for the unused media slot of the 4-tuple result.
i2t_unused = gr.State(None)
# Inputs are passed positionally in run_image_understanding's parameter
# order: input_image, question, seed.
i2t_run.click(
fn=run_image_understanding,
inputs=[i2t_input, i2t_question, i2t_seed],
outputs=[i2t_unused, i2t_output, i2t_status, i2t_logs],
)
# ---- Tab: Image Edit --------------------------------------------------
# Builds the image-edit controls and wires the "Edit Image" button to
# run_image_edit.
with gr.Tab("Image Edit"):
with gr.Row():
with gr.Column(scale=1):
# type="filepath" hands the handler a path string, which run_image_edit
# embeds directly in the Lance payload.
ie_input = gr.Image(label="Source image", type="filepath")
ie_prompt = gr.Textbox(
label="Edit instruction",
lines=3,
placeholder="Add a pearl necklace; convert to watercolor; etc.",
)
with gr.Row():
ie_height = gr.Slider(256, 1024, value=768, step=16, label="Height")
ie_width = gr.Slider(256, 1024, value=768, step=16, label="Width")
ie_seed = gr.Number(label="Seed", value=DEFAULT_BASIC_SEED, precision=0)
with gr.Accordion("Advanced", open=False):
ie_timesteps = gr.Slider(1, 100, value=30, step=1, label="Denoising steps")
ie_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
ie_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
ie_run = gr.Button("Edit Image", variant="primary")
with gr.Column(scale=1):
ie_output = gr.Image(label="Result")
ie_status = gr.Markdown("Idle.")
# _run_lance_task always returns "" for the logs slot, so this box stays
# blank for now.
ie_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)
# Sink for the unused text slot of the 4-tuple result.
ie_unused = gr.State("")
# Inputs are passed positionally, so their order must match run_image_edit's
# parameters: input_image, prompt, seed, height, width, timesteps,
# timestep shift, cfg scale.
ie_run.click(
fn=run_image_edit,
inputs=[
ie_input, ie_prompt, ie_seed,
ie_height, ie_width,
ie_timesteps, ie_shift, ie_cfg,
],
outputs=[ie_output, ie_unused, ie_status, ie_logs],
)
# ---- Tab: Video Edit --------------------------------------------------
# Builds the video-edit controls and wires the "Edit Video" button to
# run_video_edit.
with gr.Tab("Video Edit"):
with gr.Row():
with gr.Column(scale=1):
ve_input = gr.Video(label="Source video")
ve_prompt = gr.Textbox(
label="Edit instruction",
lines=3,
placeholder="Change the background; restyle as anime; etc.",
)
with gr.Row():
ve_height = gr.Slider(192, 1024, value=480, step=16, label="Height")
ve_width = gr.Slider(192, 1024, value=848, step=16, label="Width")
ve_num_frames = gr.Slider(1, 121, value=50, step=1, label="Frames")
ve_resolution = gr.Dropdown(
label="Resolution preset",
choices=VIDEO_RESOLUTION_CHOICES,
value="video_480p",
)
ve_seed = gr.Number(label="Seed", value=DEFAULT_BASIC_SEED, precision=0)
with gr.Accordion("Advanced", open=False):
ve_timesteps = gr.Slider(1, 100, value=DEFAULT_TIMESTEPS, step=1, label="Denoising steps")
ve_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
ve_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
ve_run = gr.Button("Edit Video", variant="primary")
with gr.Column(scale=1):
ve_output = gr.Video(label="Result")
ve_status = gr.Markdown("Idle.")
# _run_lance_task always returns "" for the logs slot, so this box stays
# blank for now.
ve_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)
# Sink for the unused text slot of the 4-tuple result.
ve_unused = gr.State("")
# Inputs are passed positionally, so their order must match run_video_edit's
# parameters: input_video, prompt, seed, height, width, num_frames,
# resolution, timesteps, timestep shift, cfg scale.
ve_run.click(
fn=run_video_edit,
inputs=[
ve_input, ve_prompt, ve_seed,
ve_height, ve_width, ve_num_frames, ve_resolution,
ve_timesteps, ve_shift, ve_cfg,
],
outputs=[ve_output, ve_unused, ve_status, ve_logs],
)
if __name__ == "__main__":
    # Cap the request queue at 8 pending jobs before starting the server.
    demo.queue(max_size=8)

    # SSR mode (Gradio 6 default) breaks @gradio/client 1.8: requests vanish
    # silently because the client expects the old non-SSR endpoint shape,
    # so it is switched off explicitly.
    # Theme also moved to launch() in Gradio 6, so it is passed here.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "ssr_mode": False,
        "theme": gr.themes.Soft(),
    }
    demo.launch(**launch_options)