import os
import sys
import subprocess
import tempfile
from typing import Iterable

import torch
import numpy as np
import gradio as gr
from PIL import Image
from types import SimpleNamespace
from huggingface_hub import snapshot_download

import spaces

from gradio.themes import Soft
from gradio.themes.utils import colors, fonts, sizes

colors.orange_red = colors.Color(
    name="orange_red", c50="#FFF0E5", c100="#FFE0CC", c200="#FFC299", c300="#FFA366",
    c400="#FF8533", c500="#FF4500", c600="#E63E00", c700="#CC3700", c800="#B33000",
    c900="#992900", c950="#802200",
)

class OrangeRedTheme(Soft):
    def __init__(
        self, *, primary_hue: colors.Color | str = colors.gray,
        secondary_hue: colors.Color | str = colors.orange_red,
        neutral_hue: colors.Color | str = colors.slate, text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
        ),
    ):
        super().__init__(
            primary_hue=primary_hue, secondary_hue=secondary_hue, neutral_hue=neutral_hue,
            text_size=text_size, font=font, font_mono=font_mono,
        )
        super().set(
            background_fill_primary="*primary_50",
            background_fill_primary_dark="*primary_900",
            body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
            body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
            button_primary_text_color="white",
            button_primary_text_color_hover="white",
            button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
            button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
            button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_700)",
            button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_600)",
            slider_color="*secondary_500",
            slider_color_dark="*secondary_600",
            block_title_text_weight="600", block_border_width="3px",
            block_shadow="*shadow_drop_lg", button_primary_shadow="*shadow_drop_lg",
            button_large_padding="11px", color_accent_soft="*primary_100",
            block_label_background_fill="*primary_200",
        )

orange_red_theme = OrangeRedTheme()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.__version__ =", torch.__version__)
print("torch.version.cuda =", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())
print("cuda device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("current device:", torch.cuda.current_device())
    print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

print("Using device:", device)

# Help the allocator survive the large-activation spikes during PiD pixel-space ops
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

PID_REPO_URL = "https://github.com/nv-tlabs/PiD.git"
PID_REPO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "PiD")

if not os.path.exists(PID_REPO_DIR):
    print(f"[pid] cloning {PID_REPO_URL} -> {PID_REPO_DIR}", flush=True)
    subprocess.check_call(["git", "clone", "--depth", "1", PID_REPO_URL, PID_REPO_DIR])
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", PID_REPO_DIR])

# PiD's loader resolves paths relative to CWD, so chdir into the repo root.
os.chdir(PID_REPO_DIR)
sys.path.insert(0, PID_REPO_DIR)

# Pull just the Flux-1 / Z-Image-compatible checkpoints from nvidia/PiD into the
# repo's expected checkpoints/ tree.
snapshot_download(
    repo_id="nvidia/PiD",
    local_dir=PID_REPO_DIR,
    allow_patterns=[
        "checkpoints/PiD_res2k_sr4x_official_flux_distill_4step/*",
        "checkpoints/PiD_res2kto4k_sr4x_official_flux_distill_4step/*",
        "checkpoints/ae.safetensors",
    ],
)

from pid._src.inference.checkpoint_registry import get_pid_checkpoint
#from pid._src.inference.create_dataset import XtCaptureCallback
from pid._src.inference.pipeline_registry import (
    decode_with_pipeline_vae,
    extract_latent,
    load_pipeline,
)
from pid._src.utils.model_loader import load_model_from_checkpoint


DTYPE = torch.bfloat16
BACKBONE = "zimage"
SR_SCALE = 4
PID_INFERENCE_STEPS = 4
MAX_SEED = 2**31 - 1

print("[pid] loading Z-Image pipeline...", flush=True)

# transformers 4.57's SDPA / eager mask builders both broadcast the mask
# function over (b, h, q, k) via torch.vmap, which trips ZeroGPU's
# __torch_function__ hijack when it tries to fake-allocate the indexed
# tensors. Replace vmap with explicit broadcasting — same result, same speed,
# no functorch transform context.
from transformers import masking_utils as _mu

def _broadcasting_vmap_for_bhqkv(mask_function, bh_indices: bool = True):
    def wrapped(b, h, q, k):
        if bh_indices:
            return mask_function(
                b[:, None, None, None],
                h[None, :, None, None],
                q[None, None, :, None],
                k[None, None, None, :],
            )
        return mask_function(b, h, q[:, None], k[None, :])
    return wrapped

_mu._vmap_for_bhqkv = _broadcasting_vmap_for_bhqkv

# Gemma2's forward does `normalizer = torch.tensor(hidden_size**0.5, dtype=...)`
# without a device kwarg, so it lands on CPU while hidden_states is on cuda.
# Vanilla CUDA tolerates the cross-device scalar op; ZeroGPU's __torch_function__
# hijack rejects it. Force torch.tensor calls inside Gemma2.forward onto the
# embedding's device.
import transformers.models.gemma2.modeling_gemma2 as _gm

_orig_gemma2_forward = _gm.Gemma2Model.forward

def _patched_gemma2_forward(self, *args, **kwargs):
    _orig_tt = torch.tensor
    dev = self.embed_tokens.weight.device
    def _tt(data, *a, **kw):
        kw.setdefault("device", dev)
        return _orig_tt(data, *a, **kw)
    torch.tensor = _tt
    try:
        return _orig_gemma2_forward(self, *args, **kwargs)
    finally:
        torch.tensor = _orig_tt

_gm.Gemma2Model.forward = _patched_gemma2_forward

pipeline, pipe_cfg = load_pipeline(BACKBONE, dtype=DTYPE)
pipeline.to("cuda")

print("[pid] loading TAEF1 (fast preview decoder)...", flush=True)
from diffusers import AutoencoderTiny
taef1 = AutoencoderTiny.from_pretrained(
    "madebyollin/taef1", torch_dtype=DTYPE, low_cpu_mem_usage=False
).to("cuda")
taef1.eval()

def _load_pid(ckpt_type: str):
    meta = get_pid_checkpoint(BACKBONE, ckpt_type)
    print(f"[pid] loading PiD decoder ({ckpt_type})...", flush=True)
    model, _ = load_model_from_checkpoint(
        experiment_name=meta.experiment,
        checkpoint_path=meta.checkpoint_path,
        config_file="pid/_src/configs/pid/config.py",
        enable_fsdp=False,
        strict=False,
    )
    model.eval()
    return model


pid_models = {
    "2k": _load_pid("2k"),
    "2kto4k": _load_pid("2kto4k"),
}


print("[pid] loading FLUX.2-Klein pipeline...", flush=True)
from diffusers import Flux2KleinPipeline

klein_pipe = Flux2KleinPipeline.from_pretrained(
    "black-forest-labs/FLUX.2-klein-4B",
    torch_dtype=DTYPE,
).to("cuda")
print("[pid] FLUX.2-Klein loaded.", flush=True)

print("[pid] ready", flush=True)


def _pick_pid_model(resolution: int):
    """2k decoder is trained at 2048px (sweet spot 512 → 2048); 2kto4k handles 1024 → 4K."""
    return pid_models["2kto4k"] if resolution > 512 else pid_models["2k"]


def _latent_to_pil(tensor: torch.Tensor) -> Image.Image:
    """PiD output is (C, T, H, W) with T=1 for image -> PIL.Image."""
    if tensor.dim() == 4:
        tensor = tensor.squeeze(1)
    arr = ((tensor.float().clamp(-1, 1) + 1) * 127.5).permute(1, 2, 0).cpu().numpy().astype(np.uint8)
    return Image.fromarray(arr)


def _taef1_preview(packed_latent: torch.Tensor, H: int, W: int) -> Image.Image:
    """Fast low-res decode of a Z-Image latent using TAEF1 (FLUX-1 compatible)."""
    with torch.no_grad():
        unpacked = extract_latent(pipeline, SimpleNamespace(images=packed_latent), pipe_cfg, H, W)
        scale = pipeline.vae.config.scaling_factor
        shift = getattr(pipeline.vae.config, "shift_factor", None) or 0.0
        denorm = unpacked.to(dtype=DTYPE) / scale + shift
        img = taef1.decode(denorm).sample
        img = (img.float().clamp(-1, 1) + 1) / 2
        arr = (img[0].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
    return Image.fromarray(arr)


def _pid_pixel_to_pil(x: torch.Tensor) -> Image.Image:
    """PiD pixel-space tensor (B, 3, H, W) in [-1, 1] -> PIL.Image."""
    arr = ((x[0].float().clamp(-1, 1) + 1) * 127.5).permute(1, 2, 0).cpu().numpy().astype(np.uint8)
    return Image.fromarray(arr)


def _pid_stream(
    pid_model,
    latent: torch.Tensor,
    baseline_01: torch.Tensor,
    sigma: float,
    caption: str,
    num_steps: int = PID_INFERENCE_STEPS,
):
    """Reimplementation of PiDDistillModel.generate_samples_from_batch that yields
    the current pixel-space tensor after each of the `num_steps` student-sampler
    iterations. Final yield is the clean output."""
    from contextlib import nullcontext

    B = 1
    lq_h, lq_w = baseline_01.shape[-2], baseline_01.shape[-1]
    img_h, img_w = lq_h * SR_SCALE, lq_w * SR_SCALE

    caption_embs, _ = pid_model._encode_text_raw([caption])
    caption_embs = caption_embs.to(**pid_model.tensor_kwargs)

    lq_video_or_image = (baseline_01 * 2.0 - 1.0).to(dtype=DTYPE, device="cuda")
    lq_latent = latent.to(dtype=DTYPE, device="cuda")
    degrade_sigma_tensor = torch.tensor([sigma], device="cuda", dtype=torch.float32)

    gen = torch.Generator(device="cuda").manual_seed(0)
    noise = torch.randn(B, 3, img_h, img_w, device="cuda", generator=gen)

    t_list = pid_model._get_t_list(device=torch.device("cuda"), num_steps=num_steps)
    autocast_ctx = (
        torch.autocast("cuda", dtype=pid_model.autocast_dtype)
        if pid_model.autocast_dtype
        else nullcontext()
    )
    net = pid_model.net
    net.eval()
    timescale = pid_model.fm_trainer.timescale
    student_sample_type = pid_model.config.student_sample_type
    prediction_type = pid_model.config.prediction_type

    x = noise
    with torch.no_grad(), autocast_ctx:
        steps_total = len(t_list) - 1
        for step_idx, (t_cur, t_next) in enumerate(zip(t_list[:-1], t_list[1:])):
            t_cur_batch = t_cur.expand(B)
            t_cur_scaled = t_cur_batch * timescale
            v_pred = net(
                x,
                t_cur_scaled,
                caption_embs,
                lq_video_or_image=lq_video_or_image,
                lq_latent=lq_latent,
                degrade_sigma=degrade_sigma_tensor,
            )
            if t_next.item() > 0:
                if student_sample_type == "ode":
                    v_for_step = pid_model._net_output_to_velocity(x, v_pred, t_cur_batch, prediction_type)
                    dt = t_next - t_cur
                    x = x + dt * v_for_step
                else:
                    x0_pred = pid_model._velocity_to_x0(x, v_pred, t_cur_batch)
                    eps_infer = torch.randn(
                        x0_pred.shape, device=x0_pred.device, dtype=x0_pred.dtype, generator=gen
                    )
                    s = [B] + [1] * (x.ndim - 1)
                    t_next_bcast = t_next.reshape(1).expand(s)
                    x = (1.0 - t_next_bcast) * x0_pred + t_next_bcast * eps_infer
            else:
                x = pid_model._velocity_to_x0(x, v_pred, t_cur_batch)
            yield step_idx + 1, steps_total, x.clone()


def _evenly_spaced_capture_steps(total_steps: int, num_captures: int) -> list[int]:
    """Pick N capture indices spread across [1, total_steps-1]."""
    if num_captures <= 0:
        return []
    raw = np.linspace(1, max(2, total_steps - 1), num_captures + 1)[1:]
    return sorted({int(round(x)) for x in raw})


def _resize_to_divisible(image: Image.Image, max_side: int = 1024, div: int = 16) -> Image.Image:
    """Resize so the longer side ≤ max_side and both dims divisible by `div`.
    Never upscales the input image."""
    w, h = image.size
    scale = min(max_side / w, max_side / h, 1.0)
    nw = max(div, (int(w * scale) // div) * div)
    nh = max(div, (int(h * scale) // div) * div)
    return image.resize((nw, nh), Image.LANCZOS)


def _encode_image_to_latent(image_01: torch.Tensor) -> torch.Tensor:
    """Encode a (1, 3, H, W) [0,1] float tensor to a VAE latent via the Z-Image VAE."""
    vae = pipeline.vae
    image_norm = image_01 * 2.0 - 1.0          # [0,1] → [-1,1]
    with torch.no_grad():
        latent = vae.encode(image_norm.to(dtype=DTYPE, device="cuda")).latent_dist.sample()
        scale = vae.config.scaling_factor
        shift = getattr(vae.config, "shift_factor", None) or 0.0
        latent = (latent - shift) * scale
    return latent


import random
import threading
import queue as _queue

def _generate_core(
    prompt: str,
    num_inference_steps: int = 28,
    guidance_scale: float = 5.0,
    seed: int = 0,
    resolution: int = 512,
    randomize_seed: bool = False,
):
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt.")

    if randomize_seed:
        seed = random.randint(0, 2**31 - 1)
    seed = int(seed)
    num_inference_steps = int(num_inference_steps)
    H = W = int(resolution)

    # initial: show the live preview, hide the final slider
    yield gr.update(visible=True, value=None, label="Generating Z-Image…"), gr.update(visible=False, value=None), gr.update(value=seed)

    # ---- Run Z-Image in a thread; stream taef1 previews via a queue ----
    preview_q: "_queue.Queue" = _queue.Queue()
    _DONE = object()

    def streaming_cb(pipe, step_index, timestep, callback_kwargs):
        try:
            preview = _taef1_preview(callback_kwargs["latents"], H, W)
            preview_q.put((step_index, preview))
        except Exception as e:
            print(f"[pid] taef1 preview failed at step {step_index}: {e}", flush=True)
        return callback_kwargs

    def run_pipeline():
        gen_torch = torch.Generator(device="cuda").manual_seed(int(seed))
        gen_kwargs = dict(
            prompt=prompt,
            height=H,
            width=W,
            num_inference_steps=num_inference_steps,
            guidance_scale=float(guidance_scale),
            num_images_per_prompt=1,
            output_type="latent",
            generator=gen_torch,
            callback_on_step_end=streaming_cb,
            callback_on_step_end_tensor_inputs=["latents"],
        )
        gen_kwargs.update(pipe_cfg.extra_generate_kwargs)
        try:
            with torch.no_grad():
                out = pipeline(**gen_kwargs)
            preview_q.put((_DONE, out))
        except Exception as e:
            preview_q.put((_DONE, e))

    thread = threading.Thread(target=run_pipeline, daemon=True)
    thread.start()

    raw_output = None
    while True:
        step_index, payload = preview_q.get()
        if step_index is _DONE:
            if isinstance(payload, Exception):
                raise payload
            raw_output = payload
            break
        label = f"Generating Z-Image — step {step_index + 1}/{num_inference_steps}"
        yield gr.update(visible=True, value=payload, label=label), gr.update(visible=False), gr.update()

    thread.join()
    final_latent = extract_latent(pipeline, raw_output, pipe_cfg, H, W)

    yield gr.update(visible=True, label="Decoding final Z-Image…"), gr.update(visible=False), gr.update()
    with torch.no_grad():
        baseline_01 = decode_with_pipeline_vae(pipeline, final_latent, pipe_cfg)
    zimage_img = Image.fromarray(
        (baseline_01[0].clamp(0, 1).permute(1, 2, 0).float().cpu().numpy() * 255).astype(np.uint8)
    )

    torch.cuda.empty_cache()

    final_sigma = float(pipeline.scheduler.sigmas[-1].item())
    pid_img = None
    pid_model = _pick_pid_model(H)
    for k, total, x in _pid_stream(pid_model, final_latent, baseline_01, final_sigma, prompt):
        pid_img = _pid_pixel_to_pil(x)
        yield (
            gr.update(visible=True, value=pid_img, label=f"Upscaling with PiD — step {k}/{total}"),
            gr.update(visible=False),
            gr.update(),
        )

    yield (
        gr.update(visible=False, value=None),
        gr.update(visible=True, value=(zimage_img, pid_img)),
        gr.update(),
    )


@spaces.GPU(duration=60)
def generate_large(*args, **kwargs):
    yield from _generate_core(*args, **kwargs)


@spaces.GPU(duration=90, size="xlarge")
def generate_xlarge(*args, **kwargs):
    yield from _generate_core(*args, **kwargs)


def generate(prompt, num_inference_steps, guidance_scale, seed, resolution, randomize_seed):
    fn = generate_xlarge if int(resolution) >= 1024 else generate_large
    yield from fn(prompt, num_inference_steps, guidance_scale, seed, resolution, randomize_seed)


def update_dimensions_on_upload(image: Image.Image):
    """Return markdown info string after safe resize."""
    if image is None:
        return "_Upload an image to see its processed dimensions._"
    resized = _resize_to_divisible(image)
    ow, oh = image.size
    nw, nh = resized.size
    return (
        f"**Input:** {ow} × {oh} px  →  "
        f"**Processed:** {nw} × {nh} px  →  "
        f"**PiD output:** {nw * SR_SCALE} × {nh * SR_SCALE} px"
    )


def _i2i_generate_core(
    input_image: Image.Image,
    prompt: str,
    seed: int = 0,
    randomize_seed: bool = True,
    guidance_scale: float = 1.0,
    steps: int = 4,
):
    if input_image is None:
        raise gr.Error("Please upload an input image.")
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt / description.")

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    seed = int(seed)

    input_image = _resize_to_divisible(input_image.convert("RGB"))
    W, H = input_image.size

    yield (
        gr.update(visible=True, value=None, label="Running FLUX.2-Klein…"),
        gr.update(visible=False, value=None),
        gr.update(value=seed),
    )

    gen_torch = torch.Generator(device="cuda").manual_seed(seed)
    with torch.no_grad():
        klein_out = klein_pipe(
            prompt=prompt,
            image=input_image,
            num_inference_steps=int(steps),
            guidance_scale=float(guidance_scale),
            generator=gen_torch,
            output_type="pil",
        )
    klein_img: Image.Image = klein_out.images[0]

    if klein_img.size != (W, H):
        klein_img = klein_img.resize((W, H), Image.LANCZOS)

    yield (
        gr.update(visible=True, value=klein_img, label="FLUX.2-Klein done — encoding for PiD…"),
        gr.update(visible=False),
        gr.update(),
    )

    torch.cuda.empty_cache()

    klein_arr = np.array(klein_img).astype(np.float32) / 255.0
    klein_tensor_01 = torch.from_numpy(klein_arr).permute(2, 0, 1).unsqueeze(0)

    final_latent = _encode_image_to_latent(klein_tensor_01)
    baseline_01  = klein_tensor_01.to(dtype=DTYPE, device="cuda")
    final_sigma  = float(pipeline.scheduler.sigmas[-1].item())

    pid_model = _pick_pid_model(max(H, W))
    pid_img = None
    for k, total, x in _pid_stream(
        pid_model, final_latent, baseline_01, final_sigma, prompt, num_steps=PID_INFERENCE_STEPS
    ):
        pid_img = _pid_pixel_to_pil(x)
        yield (
            gr.update(visible=True, value=pid_img, label=f"Upscaling with PiD — step {k}/{total}"),
            gr.update(visible=False),
            gr.update(),
        )

    yield (
        gr.update(visible=False, value=None),
        gr.update(visible=True, value=(klein_img, pid_img)),
        gr.update(),
    )


@spaces.GPU(duration=90, size="xlarge")
def i2i_generate(*args, **kwargs):
    yield from _i2i_generate_core(*args, **kwargs)

# PiD upscaler supports up to 1024px input (→ 4096px output with 2kto4k model).
# We clamp at 1024 to stay within VRAM budget.
UPSCALER_MAX_SIDE = 1024


def _upscaler_dim_info(image: Image.Image):
    """Dimension markdown shown when the user uploads an image."""
    if image is None:
        return "_Upload an image to see its upscale dimensions._"
    w, h = image.size
    scale = min(UPSCALER_MAX_SIDE / w, UPSCALER_MAX_SIDE / h, 1.0)
    nw = max(16, (int(w * scale) // 16) * 16)
    nh = max(16, (int(h * scale) // 16) * 16)
    out_w, out_h = nw * SR_SCALE, nh * SR_SCALE
    return (
        f"**Input:** {w} × {h} px  →  "
        f"**Processed:** {nw} × {nh} px  →  "
        f"**Upscaled output:** {out_w} × {out_h} px  "
        f"*({SR_SCALE}× via PiD)*"
    )


def _upscaler_core(
    input_image: Image.Image,
    prompt: str,
):
    """
    Pure PiD upscaler:
      1. Resize input so longer side ≤ 1024 and dims are divisible by 16.
      2. Encode to VAE latent (Z-Image VAE).
      3. Run PiD 4-step student sampler → 4× pixel-space output.
      4. Yield live step previews, then the final A/B slider.
    """
    if input_image is None:
        raise gr.Error("Please upload an image to upscale.")

    # caption is optional — use a generic fallback if blank
    caption = prompt.strip() if prompt and prompt.strip() else "high quality, detailed, sharp"

    img_rgb = input_image.convert("RGB")
    w, h = img_rgb.size
    scale = min(UPSCALER_MAX_SIDE / w, UPSCALER_MAX_SIDE / h, 1.0)
    nw = max(16, (int(w * scale) // 16) * 16)
    nh = max(16, (int(h * scale) // 16) * 16)
    if (nw, nh) != (w, h):
        img_rgb = img_rgb.resize((nw, nh), Image.LANCZOS)

    input_pil = img_rgb           # clean resized input shown on the left of the slider

    yield (
        gr.update(visible=True, value=input_pil, label="Encoding image…"),
        gr.update(visible=False, value=None),
    )

    # ── Encode to VAE latent ───────────────────────────────────────────────
    arr_01 = np.array(img_rgb).astype(np.float32) / 255.0
    tensor_01 = torch.from_numpy(arr_01).permute(2, 0, 1).unsqueeze(0)   # 1 3 H W  [0,1]

    latent      = _encode_image_to_latent(tensor_01)
    baseline_01 = tensor_01.to(dtype=DTYPE, device="cuda")
    sigma       = float(pipeline.scheduler.sigmas[-1].item())

    torch.cuda.empty_cache()

    # ── PiD 4-step upscaling ───────────────────────────────────────────────
    pid_model = _pick_pid_model(max(nw, nh))
    pid_img = None

    for k, total, x in _pid_stream(
        pid_model, latent, baseline_01, sigma, caption, num_steps=PID_INFERENCE_STEPS
    ):
        pid_img = _pid_pixel_to_pil(x)
        yield (
            gr.update(visible=True, value=pid_img, label=f"Upscaling with PiD — step {k}/{total}"),
            gr.update(visible=False),
        )

    # ── Done: show A/B slider ──────────────────────────────────────────────
    yield (
        gr.update(visible=False, value=None),
        gr.update(visible=True, value=(input_pil, pid_img)),
    )


@spaces.GPU(duration=90, size="xlarge")
def upscaler_run(*args, **kwargs):
    yield from _upscaler_core(*args, **kwargs)


DESCRIPTION = """
# PiD — Pixel Diffusion Decoder

**Text2Image** uses [Z-Image](https://huggingface.co/Tongyi-MAI/Z-Image) (live TAEF1 previews) then [PiD](https://huggingface.co/nvidia/PiD)'s 4-step pixel-diffusion decoder for 4× super-resolution. **Image2Image** uses FLUX.2-Klein for fast image-to-image then [PiD](https://huggingface.co/nvidia/PiD) for 4× upscaling. The slider on each tab compares the base model output vs the PiD upscale. — [visit github](https://github.com/PRITHIVSAKTHIUR/PiD-Image-Upscaler).
"""

css = """
.gradio-container { max-width: 1200px !important; margin: auto !important; }
.dark .gradio-container { color: var(--body-text-color); }
"""

with gr.Blocks(theme=orange_red_theme, css=css) as demo:

    gr.Markdown(DESCRIPTION)

    with gr.Tabs():

        with gr.Tab("Image2ImagePiD"):

            gr.Markdown(
                "Upload any image — **[FLUX.2-Klein](https://huggingface.co/black-forest-labs/FLUX.2-klein-4B)** refines it then "
                "**PiD** super-resolves the result 4×.  \n"
                "The slider compares the Klein output **(left)** to the PiD upscale **(right)**."
            )

            with gr.Row():
                with gr.Column(scale=1):
                    i2i_input = gr.Image(label="Input image", type="pil", height=380)
                    i2i_dim_info = gr.Markdown("_Upload an image to see its processed dimensions._")
                    i2i_prompt = gr.Textbox(
                        label="Prompt / description",
                        placeholder="Describe the image content or the desired style…",
                        lines=3,
                    )
                    i2i_run = gr.Button("Run", variant="primary")

                    with gr.Accordion("Advanced Settings", open=False, visible=True):
                        i2i_seed = gr.Slider(
                            label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0
                        )
                        i2i_rand = gr.Checkbox(label="Randomize seed", value=True)
                        i2i_guidance = gr.Slider(
                            label="Guidance Scale", minimum=0.0, maximum=10.0, step=0.1, value=1.0
                        )
                        i2i_steps = gr.Slider(
                            label="Steps", minimum=1, maximum=50, value=4, step=1
                        )

                with gr.Column(scale=2):
                    i2i_live = gr.Image(
                        label="Output", visible=True, show_label=True, type="pil", height=400
                    )
                    i2i_slider = gr.ImageSlider(
                        label="FLUX.2-Klein (left)  ↔  PiD 4× upscale (right)",
                        visible=False,
                        type="pil",
                        height=720,
                        max_height=720,
                    )

            i2i_input.upload(
                fn=update_dimensions_on_upload,
                inputs=i2i_input,
                outputs=i2i_dim_info,
            )
            i2i_run.click(
                fn=i2i_generate,
                inputs=[i2i_input, i2i_prompt, i2i_seed, i2i_rand, i2i_guidance, i2i_steps],
                outputs=[i2i_live, i2i_slider, i2i_seed],
            )

        with gr.Tab("Text2ImagePiD"):

            with gr.Row():
                prompt = gr.Textbox(
                    show_label=False,
                    placeholder="Describe what you want to generate…",
                    value="A photorealistic Labrador retriever resting beside a campfire at night, glowing warm firelight reflecting on detailed fur, cinematic outdoor atmosphere.",
                    max_lines=1,
                    scale=4,
                    container=False,
                )
                run = gr.Button("Run", variant="primary", scale=1)

            live_preview = gr.Image(label="Z-Image with PiD", visible=True, show_label=True, type="pil", height=720)
            slider = gr.ImageSlider(
                label="Z-Image (left)  ↔  PiD 4× upscale (right)",
                visible=False,
                type="pil",
                height=720,
                max_height=720,
            )

            with gr.Accordion("Advanced settings", open=False):
                with gr.Row():
                    resolution = gr.Radio(
                        label="Z-Image resolution",
                        choices=[512, 1024],
                        value=512,
                        info="512 → 2048² (PiD 2k); 1024 → 4096² (PiD 2kto4k)",
                    )
                    num_inference_steps = gr.Slider(
                        label="Z-Image steps", minimum=8, maximum=50, step=1, value=28
                    )
                with gr.Row():
                    guidance_scale = gr.Slider(
                        label="Guidance", minimum=1.0, maximum=10.0, step=0.5, value=5.0
                    )
                    seed = gr.Number(label="Seed", value=0, precision=0)
                    randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

            run.click(
                fn=generate,
                inputs=[prompt, num_inference_steps, guidance_scale, seed, resolution, randomize_seed],
                outputs=[live_preview, slider, seed],
            )

        with gr.Tab("Image-Upscaler-(preview)"):

            gr.Markdown(
                "Upload any image and **PiD** will upscale it **4×** directly — "
                "no text generation step needed.  \n"
                "An optional prompt / description helps PiD produce sharper, "
                "more faithful detail.  \n"
                "The slider compares the **original** (left) to the **PiD 4× upscale** (right)."
            )

            with gr.Row():

                with gr.Column(scale=1):
                    up_input = gr.Image(
                        label="Image to upscale",
                        type="pil",
                        height=400,
                    )
                    up_dim_info = gr.Markdown(
                        "_Upload an image to see its upscale dimensions._"
                    )
                    up_prompt = gr.Textbox(
                        label="Optional prompt / description",
                        placeholder="Describe the image for better detail (leave blank for auto)…",
                        lines=3,
                        visible=False,
                    )
                    up_run = gr.Button("Upscale 4×", variant="primary")

                with gr.Column(scale=2):
                    up_live = gr.Image(
                        label="Output",
                        visible=True,
                        show_label=True,
                        type="pil",
                        height=400,
                    )
                    up_slider = gr.ImageSlider(
                        label="Original (left)  ↔  PiD 4× upscale (right)",
                        visible=False,
                        type="pil",
                        height=720,
                        max_height=720,
                    )

            # live dimension info on upload
            up_input.upload(
                fn=_upscaler_dim_info,
                inputs=up_input,
                outputs=up_dim_info,
            )

            up_run.click(
                fn=upscaler_run,
                inputs=[up_input, up_prompt],
                outputs=[up_live, up_slider],
            )

if __name__ == "__main__":
    demo.queue().launch(mcp_server=True, ssr_mode=False, show_error=True)