Spaces:

IgorCSIS
/

nifty-lab

Sleeping

IgorCSIS

Stage 2 phase 2: add x2t_image, image_edit, video_edit with shared _run_lance_task helper

f32613e about 1 month ago

34 kB

	# Copyright 2026 Igor Lima. Licensed under Apache 2.0 (see LICENSE_LANCE for the
	# upstream Lance license; this file is original work by Igor Lima).
	#
	# nifty-lab: ZeroGPU adapter for ByteDance's Lance unified multimodal model.
	#
	# WHAT THIS IS
	# ------------
	# A small Gradio app that runs on Hugging Face Spaces with the ZeroGPU runtime.
	# All of the heavy model-loading and inference logic lives in
	# `lance_gradio_t2v_v2t.py`, which is a verbatim copy of ByteDance's reference
	# Gradio script. This file wraps their `LanceT2VV2TPipeline` class for
	# ZeroGPU's on-demand GPU lifecycle.
	#
	# WHY WE DIDN'T JUST RUN THEIR SCRIPT AS-IS
	# -----------------------------------------
	# Their script assumes a long-running dedicated GPU. They wrap inference in a
	# `PipelinePool` that holds the GPU across requests via threads. ZeroGPU works
	# on a per-request claim model: you request a GPU when you call a function
	# decorated with `@spaces.GPU`, and the GPU is released when the function
	# returns. No persistent pool. So we instantiate ONE pipeline, decorate the
	# inference entry points, and trust ZeroGPU to schedule the GPU for us.
	#
	# STAGE 1 SCOPE
	# -------------
	# Only the two tasks their reference script supports out of the box:
	# 1. text-to-video (TASK_T2V)
	# 2. video understanding / Q&A (TASK_X2T_VIDEO)
	#
	# Stage 2 (now wired in further down this file) adds t2i, image_edit,
	# video_edit, and x2t_image as new tabs. Those tasks were deferred because
	# they were expected to need a separate model variant (Lance_3B for image
	# work) and additional plumbing in the pipeline, so stage 1 shipped first to
	# prove the ZeroGPU pattern works end-to-end before expanding.

	from __future__ import annotations

	# -----------------------------------------------------------------------------
	# CUDA runtime preload. MUST run before any transformers import.
	# -----------------------------------------------------------------------------
	# Why this exists: recent transformers versions eagerly import flash_attn
	# inside modeling_utils.py, which dynamically links against libcudart.so.12
	# at module load. On ZeroGPU, the GPU (and thus its CUDA libs) is only
	# attached during @spaces.GPU calls, so libcudart isn't on the linker
	# path at boot. Importing torch first doesn't help because torch only
	# preloads CUDA when it detects a live GPU device.
	#
	# We work around it by hunting for libcudart.so.12 in the pip-installed
	# nvidia and torch wheels and ctypes-loading it with RTLD_GLOBAL so the
	# symbols are visible to anyone who dlopens later.
	import ctypes
	import glob
	import os
	import sys

	def _preload_cuda_runtime() -> None:
	    """Find libcudart.so.12 in pip-installed nvidia/torch wheels and preload it.

	    Candidates are loaded with ``RTLD_GLOBAL`` so the symbols are visible to
	    libraries that are dlopen-ed later. The function returns after the first
	    successful load. A candidate that fails to load is logged and the next
	    one is tried; if nothing loads, a warning is printed and the function
	    returns normally.
	    """
	    # Search every import root of the running interpreter instead of one
	    # hard-coded directory: the site-packages path contains the Python
	    # version ("python3.10", "python3.12", ...), so a literal
	    # ".../lib/python/site-packages" never matches on a stock install.
	    roots = [entry for entry in sys.path if entry]
	    roots += sorted(glob.glob("/usr/local/lib/python*/site-packages"))

	    # nvidia-cuda-runtime-cu12 wheel first, torch's bundled libs as fallback.
	    wheel_subdirs = (
	        os.path.join("nvidia", "cuda_runtime", "lib"),
	        os.path.join("torch", "lib"),
	    )

	    candidates: list[str] = []
	    for subdir in wheel_subdirs:
	        for root in roots:
	            # glob.escape keeps odd characters in the root from being read
	            # as wildcards; the trailing "*" also accepts fully versioned
	            # names such as libcudart.so.12.x.y.
	            pattern = os.path.join(glob.escape(root), subdir, "libcudart.so.12*")
	            candidates += sorted(glob.glob(pattern))

	    # The same file can be reachable through several roots; try it once.
	    for path in dict.fromkeys(candidates):
	        try:
	            ctypes.CDLL(path, mode=ctypes.RTLD_GLOBAL)
	        except OSError as exc:
	            print(f"[boot] failed to preload {path}: {exc}", flush=True)
	        else:
	            print(f"[boot] preloaded {path}", flush=True)
	            return
	    print("[boot] WARNING: no libcudart.so.12 found to preload, flash_attn import will fail", flush=True)

	_preload_cuda_runtime()

	# Now the rest of the imports are safe.
	from pathlib import Path
	from typing import Optional

	import torch # noqa: F401 imported early so its CUDA env init runs before lance code

	import gradio as gr
	import spaces # Hugging Face ZeroGPU runtime decorator
	from huggingface_hub import snapshot_download


	# =============================================================================
	# Model weight download
	# =============================================================================
	# We pull Lance_3B_Video at module-load time (not on first request) for two
	# reasons:
	# 1. It only needs CPU + network, which we have at Space startup. ZeroGPU
	# does NOT give us a GPU at module load.
	# 2. The download is ~6GB and would otherwise consume most of the 300s GPU
	# budget on the first user request.
	#
	# After the first boot, Hugging Face Spaces caches the files in the Space's
	# persistent storage, so subsequent boots skip re-downloading.

	# Hub repository id handed to snapshot_download() in fetch_weights().
	REPO_ID = "bytedance-research/Lance"
	# Local download target (a relative path, so it resolves against the
	# process working directory); fetch_weights() passes it as local_dir.
	WEIGHTS_ROOT = Path("downloads")

	def fetch_weights() -> None:
	    """
	    Pull the weights Lance needs at runtime from the Hugging Face Hub.

	    Everything is fetched into ``WEIGHTS_ROOT`` with one ``snapshot_download``
	    call restricted by ``allow_patterns``. Artifacts requested:

	    - Lance_3B_Video/   The main multimodal model checkpoint.
	                        NOTE(review): size estimates in this file disagree
	                        (~6GB in the module comment, ~28GB in the earlier
	                        docstring) -- confirm against the repo.
	    - Qwen2.5-VL-ViT/   The vision transformer Lance imports from
	                        Qwen2.5-VL. Per the original author, Lance's code
	                        hardcodes this path and expects `config.json` +
	                        `vit.safetensors` inside it.
	    - Wan2.2*           Wan2.2 video VAE checkpoint at the top of the repo.
	    - config.json       Top-level config, included defensively.

	    A pattern that matches nothing in the repo does not raise, so after the
	    download we log whether Qwen2.5-VL-ViT/ and Wan2.2_VAE.pth actually
	    landed. A missing artifact only produces a warning here; the failure
	    itself surfaces later when the pipeline initializes.
	    """
	    print(f"[boot] downloading {REPO_ID} (Lance_3B_Video + Qwen2.5-VL-ViT + Wan2.2 VAE)...", flush=True)
	    # No resume_download argument: huggingface_hub documents it as deprecated
	    # and ignored (downloads always resume when possible) and slated for
	    # removal in v1, so passing it only risks a warning or a TypeError.
	    snapshot_download(
	        repo_id=REPO_ID,
	        local_dir=str(WEIGHTS_ROOT),
	        allow_patterns=[
	            "Lance_3B_Video/*",
	            "Lance_3B_Video/*/",
	            "Qwen2.5-VL-ViT/*",
	            "Qwen2.5-VL-ViT/*/",
	            # Wan2.2 video VAE checkpoint, used by WanVideoVAE during pipeline
	            # initialize(). Lives at the top of the repo (no subdir).
	            "Wan2.2*",
	            # Top-level config.json (901 bytes). Tiny, defensively included
	            # in case Lance's code reads it during init.
	            "config.json",
	        ],
	    )

	    # Sanity check: log what showed up. If a required artifact is missing,
	    # the next runtime error will tell us exactly what to add to allow_patterns.
	    vit_dir = WEIGHTS_ROOT / "Qwen2.5-VL-ViT"
	    if vit_dir.exists():
	        print(f"[boot] Qwen2.5-VL-ViT/ landed with {len(list(vit_dir.iterdir()))} files", flush=True)
	    else:
	        print("[boot] WARNING: Qwen2.5-VL-ViT/ NOT in the Lance repo, model init will fail", flush=True)

	    wan_path = WEIGHTS_ROOT / "Wan2.2_VAE.pth"
	    if wan_path.exists():
	        size_mb = wan_path.stat().st_size / (1024 * 1024)
	        print(f"[boot] Wan2.2_VAE.pth landed at {size_mb:.0f} MB", flush=True)
	    else:
	        print("[boot] WARNING: Wan2.2_VAE.pth NOT in the Lance repo, VAE init will fail", flush=True)

	    print("[boot] weight download complete.", flush=True)


	# Run download at import time. On HF Spaces this happens once when the Space
	# starts up; the user sees "Starting..." in the dashboard while it runs.
	fetch_weights()


	# =============================================================================
	# Pipeline import (must come AFTER fetch_weights so the import-time checks
	# in their script find the downloaded files where they expect them)
	# =============================================================================

	from lance_gradio_t2v_v2t import (
	LanceT2VV2TPipeline,
	TASK_T2V,
	TASK_X2T_VIDEO,
	DEFAULT_HEIGHT,
	DEFAULT_WIDTH,
	DEFAULT_NUM_FRAMES,
	DEFAULT_TIMESTEPS,
	DEFAULT_TIMESTEP_SHIFT,
	DEFAULT_CFG_TEXT_SCALE,
	DEFAULT_RESOLUTION,
	DEFAULT_BASIC_SEED,
	VIDEO_RESOLUTION_CHOICES,
	)


	# A single pipeline instance. ZeroGPU only ever hands us one GPU at a time,
	# so the PipelinePool from the upstream script (which manages multiple GPUs
	# via threads) isn't relevant here. We instantiate now but defer actual
	# model loading: `LanceT2VV2TPipeline.generate()` calls `self.initialize()`
	# lazily on its first invocation, so the model loads the first time a user
	# hits the GPU. After that, subsequent calls reuse the loaded model
	# (as long as the Space hasn't gone cold).
	PIPELINE = LanceT2VV2TPipeline(device_id=0)


	# =============================================================================
	# ZeroGPU-decorated entry points
	# =============================================================================
	# One decorated entry point per task instead of a single function routed by
	# `task`, because each task has a different duration budget. Text-to-video
	# can run close to the 300s cap on 50 frames at 480p. Video understanding
	# finishes in 20-60s. Declaring tighter durations lets ZeroGPU schedule short
	# tasks more aggressively and means the user's daily quota covers more
	# requests.

	@spaces.GPU(duration=240)
	def run_text_to_video(
	    prompt: str,
	    seed: int,
	    resolution: str,
	    num_frames: int,
	    height: int,
	    width: int,
	    validation_num_timesteps: int,
	    validation_timestep_shift: float,
	    cfg_text_scale: float,
	):
	    """
	    ZeroGPU entry point for text-to-video.

	    Coerces the UI values to the numeric types the pipeline expects and
	    forwards them to ``PIPELINE.generate`` with ``task=TASK_T2V``; no input
	    video or question is supplied.

	    Returns whatever ``PIPELINE.generate`` returns -- per the original
	    author, a 4-tuple ``(video_path, text_result, status_markdown,
	    run_logs)`` where ``text_result`` is empty for this task.

	    The 240s budget is the author's choice for 480p / 50 frames / 30
	    denoising steps and leaves 60s of headroom under ZeroGPU's 300s
	    per-call cap (figures taken from the author's notes, not verified here).
	    """
	    # Collect the request first so the pipeline call itself stays a one-liner.
	    request = {
	        "task": TASK_T2V,
	        "prompt": prompt,
	        "input_video": None,
	        "question": "",
	        "height": int(height),
	        "width": int(width),
	        "num_frames": int(num_frames),
	        "seed": int(seed),
	        "resolution": resolution,
	        "validation_num_timesteps": int(validation_num_timesteps),
	        "validation_timestep_shift": float(validation_timestep_shift),
	        "cfg_text_scale": float(cfg_text_scale),
	    }
	    return PIPELINE.generate(**request)


	def _first_output_file(directory, patterns):
	    """Return the first file in *directory* matching *patterns*, or None.

	    Patterns are tried in order and matches within one pattern are sorted,
	    so the pick is deterministic.
	    """
	    for pattern in patterns:
	        matches = sorted(directory.glob(pattern))
	        if matches:
	            return matches[0]
	    return None


	def _run_lance_task(
	    task_name: str,
	    payload: dict,
	    height: int,
	    width: int,
	    num_frames: int,
	    seed: int,
	    resolution: str,
	    validation_num_timesteps: int,
	    validation_timestep_shift: float,
	    cfg_text_scale: float,
	    output_kind: str,
	):
	    """
	    Generic Lance task runner. Builds inference args, runs the model,
	    extracts the result.

	    Args:
	        task_name: one of "t2i", "t2v", "x2t_image", "x2t_video",
	            "image_edit", "video_edit". Stored on the inference args as
	            ``task``; per the original author it drives Lance's
	            task-specific dataset routing inside ValidationDataset.
	        payload: the dict that gets written to the prompt JSON file. Shape
	            varies per task; build it in the task-specific wrapper
	            using the patterns from config/examples/*_example.json.
	        height, width, num_frames, seed, resolution,
	        validation_num_timesteps, validation_timestep_shift,
	        cfg_text_scale: per-request overrides copied onto deep copies of
	            the pipeline's base model/inference args. ``seed`` is used for
	            both the data seed and the noise seed.
	        output_kind: "image", "video", or "text". Decides which kind of
	            file we look for at the end and how we report status.

	    Returns:
	        A 4-tuple ``(path_or_None, text_result, status_md, logs)`` matching
	        the shape the UI handlers expect. ``logs`` is always "" here. An
	        unknown ``output_kind`` yields a status message and runs nothing.

	    We sidestep the upstream `LanceT2VV2TPipeline.generate()` because it
	    hardcodes t2v + x2t_video. Everything else here mirrors that method's
	    structure: clone the base args, override the request-specific bits,
	    call _build_request_batch, then validate_on_fixed_batch.
	    """
	    # Reject a bad output_kind before spending GPU time on inference.
	    if output_kind not in ("image", "video", "text"):
	        return None, "", f"Unknown output_kind: {output_kind}", ""

	    import json as _json
	    import time as _time
	    from copy import deepcopy as _deepcopy
	    from datetime import datetime as _datetime

	    import torch as _torch
	    from inference_lance import (
	        validate_on_fixed_batch as _validate_on_fixed_batch,
	        save_prompt_results as _save_prompt_results,
	        clean_memory as _clean_memory,
	    )
	    from lance_gradio_t2v_v2t import (
	        TEXT_TEMPLATE as _TEXT_TEMPLATE,
	        TMP_INPUT_DIR as _TMP_INPUT_DIR,
	        RESULTS_ROOT as _RESULTS_ROOT,
	        ensure_dirs as _ensure_dirs,
	        extract_text_result as _extract_text_result,
	    )

	    PIPELINE.initialize()
	    _ensure_dirs()
	    timestamp = _datetime.now().strftime("%Y%m%d_%H%M%S_%f")

	    # Pretty-print JSON so the dataset loader's _read_jsonl line-by-line
	    # parse FAILS and falls through to the json.load + dict-transform path
	    # that produces the {"data": ..., "index": ...} records the samplers
	    # expect. See validation_dataset.py ~line 84.
	    prompt_file = _TMP_INPUT_DIR / f"{task_name}_{timestamp}.json"
	    # ensure_ascii=False emits raw non-ASCII characters, so pin the file
	    # encoding instead of inheriting whatever the locale default is.
	    prompt_file.write_text(
	        _json.dumps(payload, ensure_ascii=False, indent=2),
	        encoding="utf-8",
	    )

	    save_dir = _RESULTS_ROOT / f"{task_name}_{timestamp}"
	    save_dir.mkdir(parents=True, exist_ok=True)

	    # Work on deep copies so per-request overrides never leak into the
	    # shared base args held by the pipeline.
	    request_model_args = _deepcopy(PIPELINE.base_model_args)
	    request_model_args.cfg_text_scale = float(cfg_text_scale)

	    request_data_args = _deepcopy(PIPELINE.base_data_args)
	    request_data_args.val_dataset_config_file = str(prompt_file)

	    request_inference_args = _deepcopy(PIPELINE.base_inference_args)
	    request_inference_args.validation_num_timesteps = int(validation_num_timesteps)
	    request_inference_args.validation_timestep_shift = float(validation_timestep_shift)
	    request_inference_args.validation_data_seed = int(seed)
	    request_inference_args.validation_noise_seed = int(seed)
	    request_inference_args.video_height = int(height)
	    request_inference_args.video_width = int(width)
	    request_inference_args.num_frames = int(num_frames)
	    request_inference_args.resolution = resolution
	    request_inference_args.save_path_gen = str(save_dir)
	    request_inference_args.task = task_name
	    request_inference_args.text_template = _TEXT_TEMPLATE
	    request_inference_args.prompt_data_dict = {}

	    val_data_cpu = PIPELINE._build_request_batch(
	        prompt_file=prompt_file,
	        model_args=request_model_args,
	        data_args=request_data_args,
	        inference_args=request_inference_args,
	    )

	    print(
	        f"[app] {task_name} start | size={height}x{width} | "
	        f"frames={num_frames} | steps={validation_num_timesteps}",
	        flush=True,
	    )
	    start = _time.perf_counter()
	    with PIPELINE._generate_lock:
	        _torch.cuda.set_device(PIPELINE.device)
	        try:
	            _validate_on_fixed_batch(
	                fsdp_model=PIPELINE.model,
	                vae_model=PIPELINE.vae_model,
	                tokenizer=PIPELINE.tokenizer,
	                val_data_cpu=val_data_cpu,
	                training_args=request_inference_args,
	                model_args=request_model_args,
	                inference_args=request_inference_args,
	                new_token_ids=PIPELINE.new_token_ids,
	                image_token_id=PIPELINE.image_token_id,
	                device=PIPELINE.device,
	                save_source_video=False,
	                save_path_gen=str(save_dir),
	                save_path_gt="",
	            )
	            _save_prompt_results(
	                request_inference_args.prompt_data_dict,
	                str(save_dir),
	                PIPELINE.logger,
	            )
	        finally:
	            # Run the upstream cleanup hook even when inference raises, so a
	            # failed request does not skip it.
	            _clean_memory()
	    elapsed = _time.perf_counter() - start

	    # Result extraction by output kind. The output 4-tuple matches the
	    # shape the upstream pipeline returns so the frontend handlers stay
	    # uniform: (video_path, text_result, status_md, logs).
	    done = f"Done in {elapsed:.1f}s"
	    if output_kind == "image":
	        # Wildcards are required: a bare ".png" only matches a file that is
	        # literally named ".png". PNG outputs win over JPG ones.
	        image = _first_output_file(save_dir, ("*.png", "*.jpg"))
	        if image is None:
	            return None, "", f"Inference completed but no image in {save_dir}", ""
	        return str(image), "", done, ""
	    if output_kind == "video":
	        video = _first_output_file(save_dir, ("*.mp4",))
	        if video is None:
	            return None, "", f"Inference completed but no video in {save_dir}", ""
	        return str(video), "", done, ""
	    # Only "text" is left after the up-front output_kind check.
	    text = _extract_text_result(save_dir)
	    if not text:
	        return None, "", f"Inference completed but no text result in {save_dir}", ""
	    return None, text, done, ""


	def _image_inference(
	    prompt: str,
	    height: int,
	    width: int,
	    seed: int,
	    validation_num_timesteps: int,
	    validation_timestep_shift: float,
	    cfg_text_scale: float,
	):
	    """
	    Text-to-image request builder on top of _run_lance_task.

	    Trims the prompt (``None`` is treated as empty) and, when something is
	    left, runs the "t2i" task for a single frame with the "image_768res"
	    preset, asking for an image result. A blank prompt short-circuits with a
	    status message and never reaches the model.

	    Returns the same 4-tuple shape as _run_lance_task.
	    """
	    text = (prompt or "").strip()
	    if text:
	        # The payload maps one output filename to the prompt text.
	        return _run_lance_task(
	            task_name="t2i",
	            payload={"000000.png": text},
	            height=int(height),
	            width=int(width),
	            num_frames=1,
	            seed=int(seed),
	            resolution="image_768res",
	            validation_num_timesteps=int(validation_num_timesteps),
	            validation_timestep_shift=float(validation_timestep_shift),
	            cfg_text_scale=float(cfg_text_scale),
	            output_kind="image",
	        )
	    return None, "", "Please enter a prompt.", ""


	@spaces.GPU(duration=90)
	def run_text_to_image(
	    prompt: str,
	    seed: int,
	    height: int,
	    width: int,
	    validation_num_timesteps: int,
	    validation_timestep_shift: float,
	    cfg_text_scale: float,
	):
	    """
	    ZeroGPU entry point for text-to-image (90s GPU budget).

	    Normalises the numeric UI values and hands them to _image_inference,
	    returning its 4-tuple unchanged. The author's note puts a 768res /
	    30-step generation at 20 to 40 seconds on H200 (not verified here).
	    """
	    # Convert in the same order the values are consumed below.
	    px_height = int(height)
	    px_width = int(width)
	    rng_seed = int(seed)
	    steps = int(validation_num_timesteps)
	    shift = float(validation_timestep_shift)
	    guidance = float(cfg_text_scale)
	    return _image_inference(
	        prompt=prompt,
	        height=px_height,
	        width=px_width,
	        seed=rng_seed,
	        validation_num_timesteps=steps,
	        validation_timestep_shift=shift,
	        cfg_text_scale=guidance,
	    )


	@spaces.GPU(duration=60)
	def run_video_understanding(
	    input_video: str,
	    question: str,
	    seed: int,
	):
	    """
	    ZeroGPU entry point for video Q&A (60s GPU budget).

	    Forwards the uploaded clip and the question to ``PIPELINE.generate``
	    with ``task=TASK_X2T_VIDEO`` and an empty prompt. Only the seed comes
	    from the caller; every generation setting is filled with the imported
	    DEFAULT_* constants because the upstream signature requires them.

	    Returns whatever ``PIPELINE.generate`` returns -- per the original
	    author, the usual 4-tuple with the answer in the text slot.
	    """
	    generation_defaults = {
	        "height": DEFAULT_HEIGHT,
	        "width": DEFAULT_WIDTH,
	        "num_frames": DEFAULT_NUM_FRAMES,
	        "resolution": DEFAULT_RESOLUTION,
	        "validation_num_timesteps": DEFAULT_TIMESTEPS,
	        "validation_timestep_shift": DEFAULT_TIMESTEP_SHIFT,
	        "cfg_text_scale": DEFAULT_CFG_TEXT_SCALE,
	    }
	    return PIPELINE.generate(
	        task=TASK_X2T_VIDEO,
	        prompt="",
	        input_video=input_video,
	        question=question,
	        seed=int(seed),
	        **generation_defaults,
	    )


	@spaces.GPU(duration=60)
	def run_image_understanding(
	    input_image: str,
	    question: str,
	    seed: int,
	):
	    """
	    ZeroGPU entry point for image Q&A (60s GPU budget).

	    Validates that an image and a non-blank question were supplied, then
	    runs the "x2t_image" task through _run_lance_task and asks for a text
	    result. Generation settings use the imported DEFAULT_* constants with a
	    single frame and the "image_768res" preset.

	    Per the original author, the interleave_array payload format comes from
	    Lance's config/examples/x2t_image_example.json.

	    Returns the same 4-tuple shape as _run_lance_task; validation failures
	    return a status message without running the model.
	    """
	    cleaned_question = (question or "").strip()
	    # Image check first, then question, so the messages appear in UI order.
	    for is_missing, message in (
	        (not input_image, "Please upload an image."),
	        (not cleaned_question, "Please enter a question."),
	    ):
	        if is_missing:
	            return None, "", message, ""

	    # One image element followed by one text element; only the text element
	    # is flagged as a target.
	    text_element = [
	        "Look at the image carefully and answer the question.",
	        cleaned_question,
	        "",
	    ]
	    record = {
	        "interleave_array": [input_image, text_element],
	        "element_dtype_array": ["image", "text"],
	        "istarget_in_interleave": [0, 1],
	    }
	    return _run_lance_task(
	        task_name="x2t_image",
	        payload={"0001": record},
	        height=DEFAULT_HEIGHT,
	        width=DEFAULT_WIDTH,
	        num_frames=1,
	        seed=int(seed),
	        resolution="image_768res",
	        validation_num_timesteps=DEFAULT_TIMESTEPS,
	        validation_timestep_shift=DEFAULT_TIMESTEP_SHIFT,
	        cfg_text_scale=DEFAULT_CFG_TEXT_SCALE,
	        output_kind="text",
	    )


	@spaces.GPU(duration=120)
	def run_image_edit(
	    input_image: str,
	    prompt: str,
	    seed: int,
	    height: int,
	    width: int,
	    validation_num_timesteps: int,
	    validation_timestep_shift: float,
	    cfg_text_scale: float,
	):
	    """
	    ZeroGPU entry point for instruction-based image editing (120s budget).

	    Requires a source image and a non-blank instruction; otherwise returns a
	    status message without running the model. The payload lists the text
	    instruction first and then the source image twice, with only the last
	    slot flagged as a target. Per the original author this follows Lance's
	    config/examples/image_edit_example.json, where the pipeline expects
	    source AND target slots and the model fills in the second one.

	    Returns the same 4-tuple shape as _run_lance_task, with an image result.
	    """
	    instruction = (prompt or "").strip()
	    for is_missing, message in (
	        (not input_image, "Please upload an image."),
	        (not instruction, "Please enter an edit instruction."),
	    ):
	        if is_missing:
	            return None, "", message, ""

	    record = {
	        "interleave_array": [instruction, input_image, input_image],
	        "element_dtype_array": ["text", "image", "image"],
	        "istarget_in_interleave": [0, 0, 1],
	    }
	    return _run_lance_task(
	        task_name="image_edit",
	        payload={"0001": record},
	        height=int(height),
	        width=int(width),
	        num_frames=1,
	        seed=int(seed),
	        resolution="image_768res",
	        validation_num_timesteps=int(validation_num_timesteps),
	        validation_timestep_shift=float(validation_timestep_shift),
	        cfg_text_scale=float(cfg_text_scale),
	        output_kind="image",
	    )


	@spaces.GPU(duration=240)
	def run_video_edit(
	    input_video: str,
	    prompt: str,
	    seed: int,
	    height: int,
	    width: int,
	    num_frames: int,
	    resolution: str,
	    validation_num_timesteps: int,
	    validation_timestep_shift: float,
	    cfg_text_scale: float,
	):
	    """
	    ZeroGPU entry point for instruction-based video editing (240s budget).

	    Requires a source video and a non-blank instruction; otherwise returns a
	    status message without running the model. The payload has the same
	    layout as the image-edit one but with "video" element dtypes: the
	    instruction, then the source video twice, with only the last slot
	    flagged as a target. The author chose the same 240s budget as
	    text-to-video on the expectation that inference cost is similar.

	    Returns the same 4-tuple shape as _run_lance_task, with a video result.
	    """
	    instruction = (prompt or "").strip()
	    for is_missing, message in (
	        (not input_video, "Please upload a video."),
	        (not instruction, "Please enter an edit instruction."),
	    ):
	        if is_missing:
	            return None, "", message, ""

	    record = {
	        "interleave_array": [instruction, input_video, input_video],
	        "element_dtype_array": ["text", "video", "video"],
	        "istarget_in_interleave": [0, 0, 1],
	    }
	    return _run_lance_task(
	        task_name="video_edit",
	        payload={"0001": record},
	        height=int(height),
	        width=int(width),
	        num_frames=int(num_frames),
	        seed=int(seed),
	        resolution=resolution,
	        validation_num_timesteps=int(validation_num_timesteps),
	        validation_timestep_shift=float(validation_timestep_shift),
	        cfg_text_scale=float(cfg_text_scale),
	        output_kind="video",
	    )


	# =============================================================================
	# Gradio UI
	# =============================================================================
	# One tab per task. Layout follows the same column structure as the
	# upstream demo so users familiar with their reference UI feel at home.

	with gr.Blocks(title="nifty-lab") as demo:
	    # Intro text. Built line by line so the rendered Markdown does not
	    # depend on how the source happens to be indented. The task list names
	    # every tab wired below.
	    gr.Markdown(
	        "# nifty-lab\n"
	        "\n"
	        "A multimodal playground built on ByteDance's\n"
	        "[Lance](https://github.com/bytedance/Lance) model, served on\n"
	        "Hugging Face ZeroGPU. By [Igor Lima](https://github.com/IgorCSIS).\n"
	        "\n"
	        "Tasks wired: text-to-video, text-to-image, video understanding,\n"
	        "image Q&A, image edit, and video edit.\n"
	        "\n"
	        "First request after the Space wakes from idle takes about a minute\n"
	        "to warm the model. Subsequent requests are fast."
	    )

	    # Every handler returns a 4-tuple (media_path, text, status, logs).
	    # Each tab routes the slot it does not display into a gr.State.

	    # ---- Tab: Text to Video ------------------------------------------------
	    with gr.Tab("Text to Video"):
	        with gr.Row():
	            with gr.Column(scale=1):
	                t2v_prompt = gr.Textbox(
	                    label="Prompt",
	                    lines=5,
	                    placeholder="Describe the video you want to generate...",
	                )
	                with gr.Row():
	                    # Defaults at 480x848 give Lance enough pixels to make
	                    # something coherent. Lower if you want faster gens.
	                    t2v_height = gr.Slider(192, 1024, value=480, step=16, label="Height")
	                    t2v_width = gr.Slider(192, 1024, value=848, step=16, label="Width")
	                # 50 frames is ~2s at 25fps, fits comfortably in 240s GPU budget.
	                t2v_num_frames = gr.Slider(
	                    1, 121, value=50, step=1, label="Frames",
	                    info="50 frames is roughly 2 seconds. 121 is the model max.",
	                )
	                t2v_resolution = gr.Dropdown(
	                    label="Resolution preset",
	                    choices=VIDEO_RESOLUTION_CHOICES,
	                    value="video_480p",
	                )
	                t2v_seed = gr.Number(
	                    label="Seed",
	                    value=DEFAULT_BASIC_SEED,
	                    precision=0,
	                    info="-1 picks a fresh random seed each run.",
	                )
	                with gr.Accordion("Advanced", open=False):
	                    t2v_timesteps = gr.Slider(1, 100, value=DEFAULT_TIMESTEPS, step=1, label="Denoising steps")
	                    t2v_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
	                    t2v_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
	                t2v_run = gr.Button("Generate Video", variant="primary")
	            with gr.Column(scale=1):
	                t2v_output = gr.Video(label="Result")
	                t2v_status = gr.Markdown("Idle.")
	                t2v_logs = gr.Textbox(label="Run log", lines=12, max_lines=30)

	        # The pipeline returns a 4-tuple of (video_path, text, status, logs).
	        # Text result is unused for t2v but we still receive it, so we wire
	        # it to a hidden state to consume the value.
	        t2v_unused_text = gr.State("")
	        t2v_run.click(
	            fn=run_text_to_video,
	            inputs=[
	                t2v_prompt, t2v_seed, t2v_resolution, t2v_num_frames,
	                t2v_height, t2v_width, t2v_timesteps, t2v_shift, t2v_cfg,
	            ],
	            outputs=[t2v_output, t2v_unused_text, t2v_status, t2v_logs],
	        )

	    # ---- Tab: Text to Image -----------------------------------------------
	    with gr.Tab("Text to Image"):
	        with gr.Row():
	            with gr.Column(scale=1):
	                t2i_prompt = gr.Textbox(
	                    label="Prompt",
	                    lines=5,
	                    placeholder="A red panda walking through a snowy forest at dusk...",
	                )
	                with gr.Row():
	                    t2i_height = gr.Slider(256, 1024, value=768, step=16, label="Height")
	                    t2i_width = gr.Slider(256, 1024, value=768, step=16, label="Width")
	                t2i_seed = gr.Number(
	                    label="Seed",
	                    value=DEFAULT_BASIC_SEED,
	                    precision=0,
	                )
	                with gr.Accordion("Advanced", open=False):
	                    t2i_timesteps = gr.Slider(1, 100, value=30, step=1, label="Denoising steps")
	                    t2i_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
	                    t2i_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
	                t2i_run = gr.Button("Generate Image", variant="primary")
	            with gr.Column(scale=1):
	                t2i_output = gr.Image(label="Result")
	                t2i_status = gr.Markdown("Idle.")
	                t2i_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)

	        # Text slot is unused for image generation; park it in a state.
	        t2i_unused_text = gr.State("")
	        t2i_run.click(
	            fn=run_text_to_image,
	            inputs=[
	                t2i_prompt, t2i_seed, t2i_height, t2i_width,
	                t2i_timesteps, t2i_shift, t2i_cfg,
	            ],
	            outputs=[t2i_output, t2i_unused_text, t2i_status, t2i_logs],
	        )

	    # ---- Tab: Video Understanding -----------------------------------------
	    with gr.Tab("Video Understanding"):
	        with gr.Row():
	            with gr.Column(scale=1):
	                v2t_input = gr.Video(label="Upload a video")
	                v2t_question = gr.Textbox(
	                    label="Question",
	                    lines=3,
	                    placeholder="What is happening in this video?",
	                )
	                v2t_seed = gr.Number(
	                    label="Seed",
	                    value=DEFAULT_BASIC_SEED,
	                    precision=0,
	                )
	                v2t_run = gr.Button("Ask", variant="primary")
	            with gr.Column(scale=1):
	                v2t_output = gr.Textbox(label="Answer", lines=8)
	                v2t_status = gr.Markdown("Idle.")
	                v2t_logs = gr.Textbox(label="Run log", lines=12, max_lines=30)

	        # Video understanding returns (None_for_video, text_answer, status, logs).
	        # Discard the video slot; surface the text.
	        v2t_unused_video = gr.State(None)
	        v2t_run.click(
	            fn=run_video_understanding,
	            inputs=[v2t_input, v2t_question, v2t_seed],
	            outputs=[v2t_unused_video, v2t_output, v2t_status, v2t_logs],
	        )

	    # ---- Tab: Image Understanding -----------------------------------------
	    with gr.Tab("Image Q&A"):
	        with gr.Row():
	            with gr.Column(scale=1):
	                # type="filepath" so the handler receives a path string it
	                # can embed in the JSON payload.
	                i2t_input = gr.Image(label="Upload an image", type="filepath")
	                i2t_question = gr.Textbox(
	                    label="Question",
	                    lines=3,
	                    placeholder="What is happening in this image?",
	                )
	                i2t_seed = gr.Number(label="Seed", value=DEFAULT_BASIC_SEED, precision=0)
	                i2t_run = gr.Button("Ask", variant="primary")
	            with gr.Column(scale=1):
	                i2t_output = gr.Textbox(label="Answer", lines=8)
	                i2t_status = gr.Markdown("Idle.")
	                i2t_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)

	        # Media slot is unused for Q&A; park it in a state.
	        i2t_unused = gr.State(None)
	        i2t_run.click(
	            fn=run_image_understanding,
	            inputs=[i2t_input, i2t_question, i2t_seed],
	            outputs=[i2t_unused, i2t_output, i2t_status, i2t_logs],
	        )

	    # ---- Tab: Image Edit --------------------------------------------------
	    with gr.Tab("Image Edit"):
	        with gr.Row():
	            with gr.Column(scale=1):
	                ie_input = gr.Image(label="Source image", type="filepath")
	                ie_prompt = gr.Textbox(
	                    label="Edit instruction",
	                    lines=3,
	                    placeholder="Add a pearl necklace; convert to watercolor; etc.",
	                )
	                with gr.Row():
	                    ie_height = gr.Slider(256, 1024, value=768, step=16, label="Height")
	                    ie_width = gr.Slider(256, 1024, value=768, step=16, label="Width")
	                ie_seed = gr.Number(label="Seed", value=DEFAULT_BASIC_SEED, precision=0)
	                with gr.Accordion("Advanced", open=False):
	                    ie_timesteps = gr.Slider(1, 100, value=30, step=1, label="Denoising steps")
	                    ie_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
	                    ie_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
	                ie_run = gr.Button("Edit Image", variant="primary")
	            with gr.Column(scale=1):
	                ie_output = gr.Image(label="Result")
	                ie_status = gr.Markdown("Idle.")
	                ie_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)

	        # Text slot is unused for editing; park it in a state.
	        ie_unused = gr.State("")
	        ie_run.click(
	            fn=run_image_edit,
	            inputs=[
	                ie_input, ie_prompt, ie_seed,
	                ie_height, ie_width,
	                ie_timesteps, ie_shift, ie_cfg,
	            ],
	            outputs=[ie_output, ie_unused, ie_status, ie_logs],
	        )

	    # ---- Tab: Video Edit --------------------------------------------------
	    with gr.Tab("Video Edit"):
	        with gr.Row():
	            with gr.Column(scale=1):
	                ve_input = gr.Video(label="Source video")
	                ve_prompt = gr.Textbox(
	                    label="Edit instruction",
	                    lines=3,
	                    placeholder="Change the background; restyle as anime; etc.",
	                )
	                with gr.Row():
	                    ve_height = gr.Slider(192, 1024, value=480, step=16, label="Height")
	                    ve_width = gr.Slider(192, 1024, value=848, step=16, label="Width")
	                ve_num_frames = gr.Slider(1, 121, value=50, step=1, label="Frames")
	                ve_resolution = gr.Dropdown(
	                    label="Resolution preset",
	                    choices=VIDEO_RESOLUTION_CHOICES,
	                    value="video_480p",
	                )
	                ve_seed = gr.Number(label="Seed", value=DEFAULT_BASIC_SEED, precision=0)
	                with gr.Accordion("Advanced", open=False):
	                    ve_timesteps = gr.Slider(1, 100, value=DEFAULT_TIMESTEPS, step=1, label="Denoising steps")
	                    ve_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
	                    ve_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
	                ve_run = gr.Button("Edit Video", variant="primary")
	            with gr.Column(scale=1):
	                ve_output = gr.Video(label="Result")
	                ve_status = gr.Markdown("Idle.")
	                ve_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)

	        # Text slot is unused for editing; park it in a state.
	        ve_unused = gr.State("")
	        ve_run.click(
	            fn=run_video_edit,
	            inputs=[
	                ve_input, ve_prompt, ve_seed,
	                ve_height, ve_width, ve_num_frames, ve_resolution,
	                ve_timesteps, ve_shift, ve_cfg,
	            ],
	            outputs=[ve_output, ve_unused, ve_status, ve_logs],
	        )


	if __name__ == "__main__":
	    # Cap the request queue at 8 pending jobs before starting the server.
	    queued_demo = demo.queue(max_size=8)

	    # Per the original author's notes: SSR mode (Gradio 6 default) breaks
	    # @gradio/client 1.8 -- requests vanish silently because the client
	    # expects the old non-SSR endpoint shape -- so it is disabled explicitly.
	    # Theme also moved to launch() in Gradio 6, so it is passed here.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "ssr_mode": False,
	        "theme": gr.themes.Soft(),
	    }
	    queued_demo.launch(**launch_options)