# Copyright 2026 Igor Lima. Licensed under Apache 2.0 (see LICENSE_LANCE for the
# upstream Lance license; this file is original work by Igor Lima).
#
# nifty-lab: ZeroGPU adapter for ByteDance's Lance unified multimodal model.
#
# WHAT THIS IS
# ------------
# A small Gradio app that runs on Hugging Face Spaces with the ZeroGPU runtime.
# All of the heavy model-loading and inference logic lives in
# `lance_gradio_t2v_v2t.py`, which is a verbatim copy of ByteDance's reference
# Gradio script. This file wraps their `LanceT2VV2TPipeline` class for
# ZeroGPU's on-demand GPU lifecycle.
#
# WHY WE DIDN'T JUST RUN THEIR SCRIPT AS-IS
# -----------------------------------------
# Their script assumes a long-running dedicated GPU. They wrap inference in a
# `PipelinePool` that holds the GPU across requests via threads. ZeroGPU works
# on a per-request claim model: you request a GPU when you call a function
# decorated with `@spaces.GPU`, and the GPU is released when the function
# returns. No persistent pool. So we instantiate ONE pipeline, decorate the
# inference entry points, and trust ZeroGPU to schedule the GPU for us.
#
# SCOPE
# -----
# Stage 1 covered the two tasks the reference script supports out of the box:
#   1. text-to-video (TASK_T2V)
#   2. video understanding / Q&A (TASK_X2T_VIDEO)
#
# The Stage 2 tasks (t2i, image_edit, video_edit, x2t_image) are wired further
# down in this file as additional tabs. They go through `_run_lance_task`
# rather than the upstream `generate()`. The original plan noted that those
# tasks need a separate model variant (Lance_3B for image work) and additional
# plumbing in the pipeline; this file only downloads Lance_3B_Video, so verify
# the image tabs against that checkpoint before relying on them.

from __future__ import annotations

# -----------------------------------------------------------------------------
# CUDA runtime preload. MUST run before any transformers import.
# -----------------------------------------------------------------------------
# Why this exists: recent transformers versions eagerly import flash_attn
# inside modeling_utils.py, which dynamically links against libcudart.so.12
# at module load. On ZeroGPU, the GPU (and thus its CUDA libs) is only
# attached during @spaces.GPU calls, so libcudart isn't on the linker
# path at boot. Importing torch first doesn't help because torch only
# preloads CUDA when it detects a live GPU device.
#
# We work around it by hunting for libcudart.so.12 in the pip-installed
# nvidia and torch wheels and ctypes-loading it with RTLD_GLOBAL so the
# symbols are visible to anyone who dlopens later.
import ctypes
import glob
import os
import sys

def _preload_cuda_runtime() -> None:
    """Find libcudart.so.12 in pip-installed nvidia/torch wheels and preload it."""
    candidates: list[str] = []
    # nvidia-cuda-runtime-cu12 pip package
    candidates += glob.glob("/usr/local/lib/python*/site-packages/nvidia/cuda_runtime/lib/libcudart.so.12*")
    # Sometimes torch's bundled CUDA libs are loadable too, as a fallback
    candidates += glob.glob("/usr/local/lib/python*/site-packages/torch/lib/libcudart.so.12*")
    for path in candidates:
        try:
            ctypes.CDLL(path, mode=ctypes.RTLD_GLOBAL)
            print(f"[boot] preloaded {path}", flush=True)
            return
        except OSError as exc:
            print(f"[boot] failed to preload {path}: {exc}", flush=True)
    print("[boot] WARNING: no libcudart.so.12 found to preload, flash_attn import will fail", flush=True)

# Executed at import time, ahead of the torch / gradio / pipeline imports
# further down, so the preload attempt happens before they are loaded.
_preload_cuda_runtime()

# Now the rest of the imports are safe.
from pathlib import Path
from typing import Optional

import torch  # noqa: F401  imported early so its CUDA env init runs before lance code

import gradio as gr
import spaces  # Hugging Face ZeroGPU runtime decorator
from huggingface_hub import snapshot_download


# =============================================================================
# Model weight download
# =============================================================================
# We pull Lance_3B_Video at module-load time (not on first request) for two
# reasons:
#   1. It only needs CPU + network, which we have at Space startup. ZeroGPU
#      does NOT give us a GPU at module load.
#   2. The download is ~6GB and would otherwise consume most of the 300s GPU
#      budget on the first user request.
#
# After the first boot, Hugging Face Spaces caches the files in the Space's
# persistent storage, so subsequent boots skip re-downloading.

# Hub repository that fetch_weights() downloads from.
REPO_ID = "bytedance-research/Lance"
# Download root handed to snapshot_download as `local_dir`. It is a relative
# path, so it resolves against the process working directory.
WEIGHTS_ROOT = Path("downloads")

def fetch_weights() -> None:
    """
    Pull the weights Lance needs at runtime from the Hugging Face Hub.

    Everything is fetched from REPO_ID into WEIGHTS_ROOT with a single
    snapshot_download call, restricted to these artifacts:
      - Lance_3B_Video/      The main multimodal model checkpoint.
                             NOTE(review): size figures quoted in this file
                             disagree (~6GB vs ~28GB) -- confirm.
      - Qwen2.5-VL-ViT/      The vision transformer Lance imports from
                             Qwen2.5-VL. Per the original author's notes,
                             Lance's code hardcodes this path and expects
                             `config.json` + `vit.safetensors` inside it.
      - Wan2.2*              Top-level Wan2.2 video VAE checkpoint.
      - config.json          Top-level config, included defensively.

    Patterns that match nothing in the repo are not an error for
    snapshot_download, so after the download this function only LOGS whether
    Qwen2.5-VL-ViT/ and Wan2.2_VAE.pth are present. It does not raise when
    they are missing; the failure surfaces later during pipeline init.

    `resume_download` is intentionally not passed: current huggingface_hub
    releases treat it as a deprecated no-op (downloads resume whenever
    possible), so passing it only produces a deprecation warning.
    """
    print(f"[boot] downloading {REPO_ID} (Lance_3B_Video + Qwen2.5-VL-ViT + Wan2.2 VAE)...", flush=True)
    snapshot_download(
        repo_id=REPO_ID,
        local_dir=str(WEIGHTS_ROOT),
        allow_patterns=[
            "Lance_3B_Video/*",
            "Lance_3B_Video/**/*",
            "Qwen2.5-VL-ViT/*",
            "Qwen2.5-VL-ViT/**/*",
            # Wan2.2 video VAE checkpoint, used by WanVideoVAE during pipeline
            # initialize(). Lives at the top of the repo (no subdir).
            "Wan2.2*",
            # Top-level config.json (901 bytes). Tiny, defensively included
            # in case Lance's code reads it during init.
            "config.json",
        ],
    )

    # Sanity check: log what showed up. If a required artifact is missing,
    # the next runtime error will tell us exactly what to add to allow_patterns.
    vit_dir = WEIGHTS_ROOT / "Qwen2.5-VL-ViT"
    if vit_dir.exists():
        # Counts direct children of the directory (not a recursive walk).
        entry_count = sum(1 for _ in vit_dir.iterdir())
        print(f"[boot] Qwen2.5-VL-ViT/ landed with {entry_count} files", flush=True)
    else:
        print("[boot] WARNING: Qwen2.5-VL-ViT/ NOT in the Lance repo, model init will fail", flush=True)

    wan_path = WEIGHTS_ROOT / "Wan2.2_VAE.pth"
    if wan_path.exists():
        size_mb = wan_path.stat().st_size / (1024 * 1024)
        print(f"[boot] Wan2.2_VAE.pth landed at {size_mb:.0f} MB", flush=True)
    else:
        print("[boot] WARNING: Wan2.2_VAE.pth NOT in the Lance repo, VAE init will fail", flush=True)
    print("[boot] weight download complete.", flush=True)


# Run download at import time. On HF Spaces this happens once when the Space
# starts up; the user sees "Starting..." in the dashboard while it runs.
# This call must stay above the `lance_gradio_t2v_v2t` import below, which
# expects the downloaded files to already be in place.
fetch_weights()


# =============================================================================
# Pipeline import (must come AFTER fetch_weights so the import-time checks
# in their script find the downloaded files where they expect them)
# =============================================================================

from lance_gradio_t2v_v2t import (
    LanceT2VV2TPipeline,
    TASK_T2V,
    TASK_X2T_VIDEO,
    DEFAULT_HEIGHT,
    DEFAULT_WIDTH,
    DEFAULT_NUM_FRAMES,
    DEFAULT_TIMESTEPS,
    DEFAULT_TIMESTEP_SHIFT,
    DEFAULT_CFG_TEXT_SCALE,
    DEFAULT_RESOLUTION,
    DEFAULT_BASIC_SEED,
    VIDEO_RESOLUTION_CHOICES,
)


# A single pipeline instance. ZeroGPU only ever hands us one GPU at a time,
# so the PipelinePool from the upstream script (which manages multiple GPUs
# via threads) isn't relevant here. We instantiate now but defer actual
# model loading: `LanceT2VV2TPipeline.generate()` calls `self.initialize()`
# lazily on its first invocation, so the model loads the first time a user
# hits the GPU. After that, subsequent calls reuse the loaded model
# (as long as the Space hasn't gone cold).
#
# `_run_lance_task` below does not go through `generate()`; it calls
# `PIPELINE.initialize()` itself before touching the model attributes.
PIPELINE = LanceT2VV2TPipeline(device_id=0)


# =============================================================================
# ZeroGPU-decorated entry points
# =============================================================================
# One entry point per task instead of a single one routed by `task`, because
# each task has a different duration budget. Text-to-video can run close to
# the 300s cap on 50 frames at 480p. Video understanding finishes in 20-60s.
# Declaring tighter durations lets ZeroGPU schedule short tasks more
# aggressively and means the user's daily quota covers more requests.

@spaces.GPU(duration=240)
def run_text_to_video(
    prompt: str,
    seed: int,
    resolution: str,
    num_frames: int,
    height: int,
    width: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    ZeroGPU entry point for text-to-video.

    Coerces the UI values to the numeric types the pipeline expects and
    forwards them to `PIPELINE.generate` with `task=TASK_T2V`, no input
    video and an empty question. The prompt and resolution preset are
    passed through untouched.

    Returns whatever `PIPELINE.generate` returns; the UI wiring treats it
    as the 4-tuple (video_path, text_result, status_markdown, run_logs),
    where text_result is unused for this task.

    The GPU claim is declared as 240s. Per the notes elsewhere in this
    file, ZeroGPU's hard per-call cap is 300s, so this leaves 60s of
    headroom for 480p / 50-frame / 30-step generations.
    """
    generate_kwargs = {
        "task": TASK_T2V,
        "prompt": prompt,
        "input_video": None,
        "question": "",
        "height": int(height),
        "width": int(width),
        "num_frames": int(num_frames),
        "seed": int(seed),
        "resolution": resolution,
        "validation_num_timesteps": int(validation_num_timesteps),
        "validation_timestep_shift": float(validation_timestep_shift),
        "cfg_text_scale": float(cfg_text_scale),
    }
    return PIPELINE.generate(**generate_kwargs)


def _run_lance_task(
    task_name: str,
    payload: dict,
    height: int,
    width: int,
    num_frames: int,
    seed: int,
    resolution: str,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
    output_kind: str,
):
    """
    Generic Lance task runner. Builds inference args, runs the model,
    extracts the result.

    Args:
      task_name: one of "t2i", "t2v", "x2t_image", "x2t_video",
                 "image_edit", "video_edit". This drives Lance's
                 task-specific dataset routing inside ValidationDataset.
      payload: the dict that gets written to the prompt JSON file. Shape
               varies per task; build it in the task-specific wrapper
               using the patterns from config/examples/*_example.json.
      output_kind: "image", "video", or "text". Decides which kind of
                   file we glob for at the end and how we report status.

    We sidestep the upstream `LanceT2VV2TPipeline.generate()` because it
    hardcodes t2v + x2t_video. Everything else here mirrors that method's
    structure: clone the base args, override the request-specific bits,
    call _build_request_batch, then validate_on_fixed_batch.
    """
    import json as _json
    import time as _time
    from copy import deepcopy as _deepcopy
    from datetime import datetime as _datetime

    import torch as _torch
    from inference_lance import (
        validate_on_fixed_batch as _validate_on_fixed_batch,
        save_prompt_results as _save_prompt_results,
        clean_memory as _clean_memory,
    )
    from lance_gradio_t2v_v2t import (
        TEXT_TEMPLATE as _TEXT_TEMPLATE,
        TMP_INPUT_DIR as _TMP_INPUT_DIR,
        RESULTS_ROOT as _RESULTS_ROOT,
        ensure_dirs as _ensure_dirs,
        extract_text_result as _extract_text_result,
    )

    PIPELINE.initialize()
    _ensure_dirs()
    timestamp = _datetime.now().strftime("%Y%m%d_%H%M%S_%f")

    # Pretty-print JSON so the dataset loader's _read_jsonl line-by-line
    # parse FAILS and falls through to the json.load + dict-transform path
    # that produces the {"data": ..., "index": ...} records the samplers
    # expect. See validation_dataset.py ~line 84.
    prompt_file = _TMP_INPUT_DIR / f"{task_name}_{timestamp}.json"
    prompt_file.write_text(_json.dumps(payload, ensure_ascii=False, indent=2))

    save_dir = _RESULTS_ROOT / f"{task_name}_{timestamp}"
    save_dir.mkdir(parents=True, exist_ok=True)

    request_model_args = _deepcopy(PIPELINE.base_model_args)
    request_model_args.cfg_text_scale = float(cfg_text_scale)

    request_data_args = _deepcopy(PIPELINE.base_data_args)
    request_data_args.val_dataset_config_file = str(prompt_file)

    request_inference_args = _deepcopy(PIPELINE.base_inference_args)
    request_inference_args.validation_num_timesteps = int(validation_num_timesteps)
    request_inference_args.validation_timestep_shift = float(validation_timestep_shift)
    request_inference_args.validation_data_seed = int(seed)
    request_inference_args.validation_noise_seed = int(seed)
    request_inference_args.video_height = int(height)
    request_inference_args.video_width = int(width)
    request_inference_args.num_frames = int(num_frames)
    request_inference_args.resolution = resolution
    request_inference_args.save_path_gen = str(save_dir)
    request_inference_args.task = task_name
    request_inference_args.text_template = _TEXT_TEMPLATE
    request_inference_args.prompt_data_dict = {}

    val_data_cpu = PIPELINE._build_request_batch(
        prompt_file=prompt_file,
        model_args=request_model_args,
        data_args=request_data_args,
        inference_args=request_inference_args,
    )

    print(
        f"[app] {task_name} start | size={height}x{width} | "
        f"frames={num_frames} | steps={validation_num_timesteps}",
        flush=True,
    )
    start = _time.perf_counter()
    with PIPELINE._generate_lock:
        _torch.cuda.set_device(PIPELINE.device)
        _validate_on_fixed_batch(
            fsdp_model=PIPELINE.model,
            vae_model=PIPELINE.vae_model,
            tokenizer=PIPELINE.tokenizer,
            val_data_cpu=val_data_cpu,
            training_args=request_inference_args,
            model_args=request_model_args,
            inference_args=request_inference_args,
            new_token_ids=PIPELINE.new_token_ids,
            image_token_id=PIPELINE.image_token_id,
            device=PIPELINE.device,
            save_source_video=False,
            save_path_gen=str(save_dir),
            save_path_gt="",
        )
        _save_prompt_results(
            request_inference_args.prompt_data_dict,
            str(save_dir),
            PIPELINE.logger,
        )
        _clean_memory()
    elapsed = _time.perf_counter() - start

    # Result extraction by output kind. The output 4-tuple matches the
    # shape the upstream pipeline returns so the frontend handlers stay
    # uniform: (video_path, text_result, status_md, logs).
    if output_kind == "image":
        files = sorted(save_dir.glob("*.png")) + sorted(save_dir.glob("*.jpg"))
        if not files:
            return None, "", f"Inference completed but no image in {save_dir}", ""
        return str(files[0]), "", f"Done in {elapsed:.1f}s", ""
    if output_kind == "video":
        files = sorted(save_dir.glob("*.mp4"))
        if not files:
            return None, "", f"Inference completed but no video in {save_dir}", ""
        return str(files[0]), "", f"Done in {elapsed:.1f}s", ""
    if output_kind == "text":
        text = _extract_text_result(save_dir)
        if not text:
            return None, "", f"Inference completed but no text result in {save_dir}", ""
        return None, text, f"Done in {elapsed:.1f}s", ""
    return None, "", f"Unknown output_kind: {output_kind}", ""


def _image_inference(
    prompt: str,
    height: int,
    width: int,
    seed: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """Text-to-image. Thin wrapper around _run_lance_task."""
    prompt = (prompt or "").strip()
    if not prompt:
        return None, "", "Please enter a prompt.", ""
    return _run_lance_task(
        task_name="t2i",
        payload={"000000.png": prompt},
        height=int(height),
        width=int(width),
        num_frames=1,
        seed=int(seed),
        resolution="image_768res",
        validation_num_timesteps=int(validation_num_timesteps),
        validation_timestep_shift=float(validation_timestep_shift),
        cfg_text_scale=float(cfg_text_scale),
        output_kind="image",
    )


@spaces.GPU(duration=90)
def run_text_to_image(
    prompt: str,
    seed: int,
    height: int,
    width: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    ZeroGPU entry point for text-to-image, declared with a 90s GPU claim.

    Normalizes the numeric UI values and delegates to `_image_inference`.
    Note the parameter order here (seed before height/width) differs from
    `_image_inference`, which is why everything is passed by keyword.

    Per the original author's notes, generation at 768res / 30 steps lands
    in 20 to 40 seconds on H200.
    """
    normalized = {
        "height": int(height),
        "width": int(width),
        "seed": int(seed),
        "validation_num_timesteps": int(validation_num_timesteps),
        "validation_timestep_shift": float(validation_timestep_shift),
        "cfg_text_scale": float(cfg_text_scale),
    }
    return _image_inference(prompt=prompt, **normalized)


@spaces.GPU(duration=60)
def run_video_understanding(
    input_video: str,
    question: str,
    seed: int,
):
    """
    ZeroGPU entry point for video Q&A, declared with a 60s GPU claim.

    Forwards the uploaded clip and the question to `PIPELINE.generate`
    with `task=TASK_X2T_VIDEO` and an empty prompt. The generation-only
    settings (size, frame count, resolution preset, sampler settings, CFG
    scale) are filled with the module defaults because the signature
    requires them.

    Returns whatever `PIPELINE.generate` returns; the UI wiring treats it
    as (video_path, text_result, status_markdown, run_logs) and discards
    the video slot.
    """
    # Settings that are required by the signature but not chosen by the user.
    generation_defaults = {
        "height": DEFAULT_HEIGHT,
        "width": DEFAULT_WIDTH,
        "num_frames": DEFAULT_NUM_FRAMES,
        "resolution": DEFAULT_RESOLUTION,
        "validation_num_timesteps": DEFAULT_TIMESTEPS,
        "validation_timestep_shift": DEFAULT_TIMESTEP_SHIFT,
        "cfg_text_scale": DEFAULT_CFG_TEXT_SCALE,
    }
    return PIPELINE.generate(
        task=TASK_X2T_VIDEO,
        prompt="",
        input_video=input_video,
        question=question,
        seed=int(seed),
        **generation_defaults,
    )


@spaces.GPU(duration=60)
def run_image_understanding(
    input_image: str,
    question: str,
    seed: int,
):
    """
    ZeroGPU entry point for image Q&A, declared with a 60s GPU claim.

    A missing image or a blank question returns a status message without
    running inference (the image is checked first). Otherwise builds a
    single-sample payload -- the image followed by a text triple of fixed
    instruction, user question and empty answer slot -- and runs it as task
    "x2t_image" with output_kind "text".

    The interleave_array payload format comes from Lance's
    config/examples/x2t_image_example.json.

    Returns the 4-tuple (result_path, text_result, status_message, logs).
    """
    question = question.strip() if question else ""
    if not input_image:
        return None, "", "Please upload an image.", ""
    if not question:
        return None, "", "Please enter a question.", ""

    # Text element: instruction, the user's question, and an empty slot.
    text_element = ["Look at the image carefully and answer the question.", question, ""]
    sample = {
        "interleave_array": [input_image, text_element],
        "element_dtype_array": ["image", "text"],
        "istarget_in_interleave": [0, 1],
    }
    return _run_lance_task(
        task_name="x2t_image",
        payload={"0001": sample},
        height=DEFAULT_HEIGHT,
        width=DEFAULT_WIDTH,
        num_frames=1,
        seed=int(seed),
        resolution="image_768res",
        validation_num_timesteps=DEFAULT_TIMESTEPS,
        validation_timestep_shift=DEFAULT_TIMESTEP_SHIFT,
        cfg_text_scale=DEFAULT_CFG_TEXT_SCALE,
        output_kind="text",
    )


@spaces.GPU(duration=120)
def run_image_edit(
    input_image: str,
    prompt: str,
    seed: int,
    height: int,
    width: int,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    ZeroGPU entry point for instruction-based image editing (120s claim).

    A missing image or a blank instruction returns a status message
    without running inference (the image is checked first). Otherwise
    builds a single-sample payload and runs it as task "image_edit" with
    output_kind "image".

    Payload format from Lance's config/examples/image_edit_example.json:
    text prompt first, then the source image twice. Per the original
    author's notes, the pipeline expects source AND target slots and the
    second image slot (flagged as target) is what the model fills in.

    Returns the 4-tuple (result_path, text_result, status_message, logs).
    """
    prompt = prompt.strip() if prompt else ""
    if not input_image:
        return None, "", "Please upload an image.", ""
    if not prompt:
        return None, "", "Please enter an edit instruction.", ""

    # Slots: instruction, source image, target image (same file as source).
    sample = {
        "interleave_array": [prompt, input_image, input_image],
        "element_dtype_array": ["text", "image", "image"],
        "istarget_in_interleave": [0, 0, 1],
    }
    return _run_lance_task(
        task_name="image_edit",
        payload={"0001": sample},
        height=int(height),
        width=int(width),
        num_frames=1,
        seed=int(seed),
        resolution="image_768res",
        validation_num_timesteps=int(validation_num_timesteps),
        validation_timestep_shift=float(validation_timestep_shift),
        cfg_text_scale=float(cfg_text_scale),
        output_kind="image",
    )


@spaces.GPU(duration=240)
def run_video_edit(
    input_video: str,
    prompt: str,
    seed: int,
    height: int,
    width: int,
    num_frames: int,
    resolution: str,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
):
    """
    ZeroGPU entry point for instruction-based video editing (240s claim).

    A missing video or a blank instruction returns a status message
    without running inference (the video is checked first). Otherwise
    builds the same three-slot payload as image editing -- instruction,
    source, target -- but with "video" element dtypes, and runs it as task
    "video_edit" with output_kind "video".

    The 240s budget matches text-to-video; per the original author's
    notes the underlying inference cost is similar.

    Returns the 4-tuple (result_path, text_result, status_message, logs).
    """
    prompt = prompt.strip() if prompt else ""
    if not input_video:
        return None, "", "Please upload a video.", ""
    if not prompt:
        return None, "", "Please enter an edit instruction.", ""

    # Slots: instruction, source clip, target clip (same file as source).
    sample = {
        "interleave_array": [prompt, input_video, input_video],
        "element_dtype_array": ["text", "video", "video"],
        "istarget_in_interleave": [0, 0, 1],
    }
    return _run_lance_task(
        task_name="video_edit",
        payload={"0001": sample},
        height=int(height),
        width=int(width),
        num_frames=int(num_frames),
        seed=int(seed),
        resolution=resolution,
        validation_num_timesteps=int(validation_num_timesteps),
        validation_timestep_shift=float(validation_timestep_shift),
        cfg_text_scale=float(cfg_text_scale),
        output_kind="video",
    )


# =============================================================================
# Gradio UI
# =============================================================================
# One tab per task. Layout follows the same column structure as the
# upstream demo so users familiar with their reference UI feel at home.

with gr.Blocks(title="nifty-lab") as demo:
    # Page header. The task list must stay in sync with the tabs defined
    # below: every tab wired in this file is named here.
    gr.Markdown(
        """
        # nifty-lab

        A multimodal playground built on ByteDance's
        [Lance](https://github.com/bytedance/Lance) model, served on
        Hugging Face ZeroGPU. By [Igor Lima](https://github.com/IgorCSIS).

        Tasks wired: text-to-video, text-to-image, video understanding,
        image Q&A, image edit, and video edit.

        First request after the Space wakes from idle takes about a minute
        to warm the model. Subsequent requests are fast.
        """
    )

    # ---- Tab: Text to Video ------------------------------------------------
    with gr.Tab("Text to Video"):
        # Left column: inputs and the run button. Right column: outputs.
        with gr.Row():
            with gr.Column(scale=1):
                t2v_prompt = gr.Textbox(
                    label="Prompt",
                    lines=5,
                    placeholder="Describe the video you want to generate...",
                )
                with gr.Row():
                    # Defaults at 480x848 give Lance enough pixels to make
                    # something coherent. Lower if you want faster gens.
                    t2v_height = gr.Slider(192, 1024, value=480, step=16, label="Height")
                    t2v_width = gr.Slider(192, 1024, value=848, step=16, label="Width")
                # 50 frames is ~2s at 25fps, fits comfortably in 240s GPU budget.
                t2v_num_frames = gr.Slider(
                    1, 121, value=50, step=1, label="Frames",
                    info="50 frames is roughly 2 seconds. 121 is the model max.",
                )
                t2v_resolution = gr.Dropdown(
                    label="Resolution preset",
                    choices=VIDEO_RESOLUTION_CHOICES,
                    value="video_480p",
                )
                t2v_seed = gr.Number(
                    label="Seed",
                    value=DEFAULT_BASIC_SEED,
                    precision=0,
                    info="-1 picks a fresh random seed each run.",
                )
                with gr.Accordion("Advanced", open=False):
                    t2v_timesteps = gr.Slider(1, 100, value=DEFAULT_TIMESTEPS, step=1, label="Denoising steps")
                    t2v_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
                    t2v_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
                t2v_run = gr.Button("Generate Video", variant="primary")
            with gr.Column(scale=1):
                t2v_output = gr.Video(label="Result")
                t2v_status = gr.Markdown("Idle.")
                t2v_logs = gr.Textbox(label="Run log", lines=12, max_lines=30)

        # The pipeline returns a 4-tuple of (video_path, text, status, logs).
        # Text result is unused for t2v but we still receive it, so we wire
        # it to a hidden state to consume the value.
        t2v_unused_text = gr.State("")
        t2v_run.click(
            fn=run_text_to_video,
            # Positional: the order must match run_text_to_video's signature
            # (prompt, seed, resolution, num_frames, height, width,
            # validation_num_timesteps, validation_timestep_shift,
            # cfg_text_scale).
            inputs=[
                t2v_prompt, t2v_seed, t2v_resolution, t2v_num_frames,
                t2v_height, t2v_width, t2v_timesteps, t2v_shift, t2v_cfg,
            ],
            outputs=[t2v_output, t2v_unused_text, t2v_status, t2v_logs],
        )

    # ---- Tab: Text to Image -----------------------------------------------
    with gr.Tab("Text to Image"):
        # Left column: inputs and the run button. Right column: outputs.
        with gr.Row():
            with gr.Column(scale=1):
                t2i_prompt = gr.Textbox(
                    label="Prompt",
                    lines=5,
                    placeholder="A red panda walking through a snowy forest at dusk...",
                )
                with gr.Row():
                    t2i_height = gr.Slider(256, 1024, value=768, step=16, label="Height")
                    t2i_width = gr.Slider(256, 1024, value=768, step=16, label="Width")
                t2i_seed = gr.Number(
                    label="Seed",
                    value=DEFAULT_BASIC_SEED,
                    precision=0,
                )
                with gr.Accordion("Advanced", open=False):
                    # Step default is a literal 30 here rather than
                    # DEFAULT_TIMESTEPS, which the video tabs use.
                    t2i_timesteps = gr.Slider(1, 100, value=30, step=1, label="Denoising steps")
                    t2i_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
                    t2i_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
                t2i_run = gr.Button("Generate Image", variant="primary")
            with gr.Column(scale=1):
                t2i_output = gr.Image(label="Result")
                t2i_status = gr.Markdown("Idle.")
                t2i_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)

        # The handler returns (image_path, text, status, logs); the text slot
        # is unused for this task, so it is routed to a State holder.
        t2i_unused_text = gr.State("")
        t2i_run.click(
            fn=run_text_to_image,
            # Positional: the order must match run_text_to_image's signature
            # (prompt, seed, height, width, validation_num_timesteps,
            # validation_timestep_shift, cfg_text_scale).
            inputs=[
                t2i_prompt, t2i_seed, t2i_height, t2i_width,
                t2i_timesteps, t2i_shift, t2i_cfg,
            ],
            outputs=[t2i_output, t2i_unused_text, t2i_status, t2i_logs],
        )

    # ---- Tab: Video Understanding -----------------------------------------
    with gr.Tab("Video Understanding"):
        # Left column: clip, question, seed and the run button. Right
        # column: answer, status and run log.
        with gr.Row():
            with gr.Column(scale=1):
                v2t_input = gr.Video(label="Upload a video")
                v2t_question = gr.Textbox(
                    label="Question",
                    lines=3,
                    placeholder="What is happening in this video?",
                )
                v2t_seed = gr.Number(
                    label="Seed",
                    value=DEFAULT_BASIC_SEED,
                    precision=0,
                )
                v2t_run = gr.Button("Ask", variant="primary")
            with gr.Column(scale=1):
                v2t_output = gr.Textbox(label="Answer", lines=8)
                v2t_status = gr.Markdown("Idle.")
                v2t_logs = gr.Textbox(label="Run log", lines=12, max_lines=30)

        # Video understanding returns (None_for_video, text_answer, status, logs).
        # Discard the video slot; surface the text.
        v2t_unused_video = gr.State(None)
        v2t_run.click(
            fn=run_video_understanding,
            # Positional: matches run_video_understanding(input_video,
            # question, seed).
            inputs=[v2t_input, v2t_question, v2t_seed],
            outputs=[v2t_unused_video, v2t_output, v2t_status, v2t_logs],
        )

    # ---- Tab: Image Understanding -----------------------------------------
    with gr.Tab("Image Q&A"):
        with gr.Row():
            with gr.Column(scale=1):
                # type="filepath": run_image_understanding embeds this value
                # in the JSON prompt payload, so it has to be a path string.
                i2t_input = gr.Image(label="Upload an image", type="filepath")
                i2t_question = gr.Textbox(
                    label="Question",
                    lines=3,
                    placeholder="What is happening in this image?",
                )
                i2t_seed = gr.Number(label="Seed", value=DEFAULT_BASIC_SEED, precision=0)
                i2t_run = gr.Button("Ask", variant="primary")
            with gr.Column(scale=1):
                i2t_output = gr.Textbox(label="Answer", lines=8)
                i2t_status = gr.Markdown("Idle.")
                i2t_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)
        # The handler's first return slot (result path) is always None for
        # this task; route it to a State holder so the 4-tuple lines up.
        i2t_unused = gr.State(None)
        i2t_run.click(
            fn=run_image_understanding,
            # Positional: matches run_image_understanding(input_image,
            # question, seed).
            inputs=[i2t_input, i2t_question, i2t_seed],
            outputs=[i2t_unused, i2t_output, i2t_status, i2t_logs],
        )

    # ---- Tab: Image Edit --------------------------------------------------
    with gr.Tab("Image Edit"):
        with gr.Row():
            with gr.Column(scale=1):
                # type="filepath": run_image_edit embeds this value in the
                # JSON prompt payload, so it has to be a path string.
                ie_input = gr.Image(label="Source image", type="filepath")
                ie_prompt = gr.Textbox(
                    label="Edit instruction",
                    lines=3,
                    placeholder="Add a pearl necklace; convert to watercolor; etc.",
                )
                with gr.Row():
                    ie_height = gr.Slider(256, 1024, value=768, step=16, label="Height")
                    ie_width = gr.Slider(256, 1024, value=768, step=16, label="Width")
                ie_seed = gr.Number(label="Seed", value=DEFAULT_BASIC_SEED, precision=0)
                with gr.Accordion("Advanced", open=False):
                    # Step default is a literal 30 here rather than
                    # DEFAULT_TIMESTEPS, which the video tabs use.
                    ie_timesteps = gr.Slider(1, 100, value=30, step=1, label="Denoising steps")
                    ie_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
                    ie_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
                ie_run = gr.Button("Edit Image", variant="primary")
            with gr.Column(scale=1):
                ie_output = gr.Image(label="Result")
                ie_status = gr.Markdown("Idle.")
                ie_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)
        # Text slot of the handler's 4-tuple is unused for this task.
        ie_unused = gr.State("")
        ie_run.click(
            fn=run_image_edit,
            # Positional: the order must match run_image_edit's signature
            # (input_image, prompt, seed, height, width,
            # validation_num_timesteps, validation_timestep_shift,
            # cfg_text_scale).
            inputs=[
                ie_input, ie_prompt, ie_seed,
                ie_height, ie_width,
                ie_timesteps, ie_shift, ie_cfg,
            ],
            outputs=[ie_output, ie_unused, ie_status, ie_logs],
        )

    # ---- Tab: Video Edit --------------------------------------------------
    with gr.Tab("Video Edit"):
        with gr.Row():
            with gr.Column(scale=1):
                # run_video_edit embeds this value in the JSON prompt payload;
                # presumably it arrives as a file path -- TODO confirm.
                ve_input = gr.Video(label="Source video")
                ve_prompt = gr.Textbox(
                    label="Edit instruction",
                    lines=3,
                    placeholder="Change the background; restyle as anime; etc.",
                )
                with gr.Row():
                    ve_height = gr.Slider(192, 1024, value=480, step=16, label="Height")
                    ve_width = gr.Slider(192, 1024, value=848, step=16, label="Width")
                ve_num_frames = gr.Slider(1, 121, value=50, step=1, label="Frames")
                ve_resolution = gr.Dropdown(
                    label="Resolution preset",
                    choices=VIDEO_RESOLUTION_CHOICES,
                    value="video_480p",
                )
                ve_seed = gr.Number(label="Seed", value=DEFAULT_BASIC_SEED, precision=0)
                with gr.Accordion("Advanced", open=False):
                    ve_timesteps = gr.Slider(1, 100, value=DEFAULT_TIMESTEPS, step=1, label="Denoising steps")
                    ve_shift = gr.Number(label="Timestep shift", value=DEFAULT_TIMESTEP_SHIFT)
                    ve_cfg = gr.Number(label="CFG text scale", value=DEFAULT_CFG_TEXT_SCALE)
                ve_run = gr.Button("Edit Video", variant="primary")
            with gr.Column(scale=1):
                ve_output = gr.Video(label="Result")
                ve_status = gr.Markdown("Idle.")
                ve_logs = gr.Textbox(label="Run log", lines=8, max_lines=20)
        # Text slot of the handler's 4-tuple is unused for this task.
        ve_unused = gr.State("")
        ve_run.click(
            fn=run_video_edit,
            # Positional: the order must match run_video_edit's signature
            # (input_video, prompt, seed, height, width, num_frames,
            # resolution, validation_num_timesteps,
            # validation_timestep_shift, cfg_text_scale).
            inputs=[
                ve_input, ve_prompt, ve_seed,
                ve_height, ve_width, ve_num_frames, ve_resolution,
                ve_timesteps, ve_shift, ve_cfg,
            ],
            outputs=[ve_output, ve_unused, ve_status, ve_logs],
        )


if __name__ == "__main__":
    # Launch settings, kept in one place:
    #   - ssr_mode=False: SSR mode (Gradio 6 default) breaks @gradio/client
    #     1.8 -- requests vanish silently because the client expects the old
    #     non-SSR endpoint shape -- so it is disabled explicitly.
    #   - theme: moved to launch() in Gradio 6, so it is passed here.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "ssr_mode": False,
        "theme": gr.themes.Soft(),
    }
    # Request queue capped at 8 entries.
    demo.queue(max_size=8).launch(**launch_options)