# Copyright (c) 2026 Bytedance Ltd. and/or its affiliate
# Licensed under the Apache License, Version 2.0

"""Bernini Renderer Gradio demo — HuggingFace Spaces edition."""

import os
import tempfile
from datetime import datetime

import gradio as gr
import spaces
import torch

from bernini.pipeline import BerniniRendererPipeline
from bernini.cli import DEFAULT_NEG_PROMPT, GUIDANCE_MODES
from bernini.prompt_enhancer import PromptEnhancer, get_system_prompt_for_task

# Model repository id handed to BerniniRendererPipeline.from_pretrained() in
# get_pipeline().
HF_MODEL_ID = "ByteDance/Bernini-R-Diffusers"
# Per-process scratch directory; every generated file is written under it
# (see _output_path()).
SAVE_BASE = tempfile.mkdtemp(prefix="bernini_gradio_")
# mkdtemp() has already created the directory; exist_ok=True makes this a
# harmless safeguard.
os.makedirs(SAVE_BASE, exist_ok=True)

# Prompt Enhancement — configured via HF Secrets, not exposed to users
# Each value defaults to "" when its variable is unset. An empty API key turns
# prompt enhancement off; an empty base URL / model is passed on as None.
_PE_API_KEY = os.environ.get("BERNINI_PE_API_KEY", "")
_PE_BASE_URL = os.environ.get("BERNINI_PE_BASE_URL", "")
_PE_MODEL = os.environ.get("BERNINI_PE_MODEL", "")

# Task identifiers offered in the "Task type" dropdown. Every entry is also a
# key of GUIDANCE_MODE_BY_TASK and TASK_INPUTS below.
TASK_TYPE_CHOICES = ["t2i", "t2v", "i2i", "v2v", "mv2v", "r2v", "rv2v", "ads2v"]

# Guidance mode pre-selected in the UI when a task is chosen, and used as the
# fallback in _build_kwargs() when the guidance dropdown is empty.
GUIDANCE_MODE_BY_TASK = {
    "t2i":   "t2v_apg",
    "t2v":   "t2v_apg",
    "i2i":   "v2v",
    "v2v":   "v2v_apg",
    "mv2v":  "v2v_apg",
    "r2v":   "r2v_apg",
    "rv2v":  "rv2v",
    "ads2v": "v2v_apg",
}

# Which uploaded inputs each task forwards to the pipeline (see _build_kwargs):
#   "video"      - True: the uploaded video file(s) are passed as `video`.
#   "image_role" - "source": the single image is passed as `image`;
#                  "reference": the single image is prepended to `images`;
#                  "none": the single image is ignored.
#   "images"     - True: the gallery uploads are passed as `images`.
TASK_INPUTS = {
    "t2i":   {"video": False, "image_role": "none",      "images": False},
    "t2v":   {"video": False, "image_role": "none",      "images": False},
    "i2i":   {"video": False, "image_role": "source",    "images": False},
    "v2v":   {"video": True,  "image_role": "none",      "images": False},
    "mv2v":  {"video": True,  "image_role": "none",      "images": False},
    "r2v":   {"video": False, "image_role": "reference", "images": True},
    "rv2v":  {"video": True,  "image_role": "reference", "images": True},
    "ads2v": {"video": True,  "image_role": "reference", "images": True},
}

# Tasks whose result is a still image: num_frames is forced to 1 and the
# output file gets a .png extension instead of .mp4.
IMAGE_TASKS = {"t2i", "i2i"}

# Process-wide pipeline instance; stays None until get_pipeline() first runs.
PIPELINE = None

def get_pipeline():
    """Return the shared pipeline, constructing it on the first call.

    The instance is cached in the module-level ``PIPELINE`` so later calls
    reuse it instead of loading the model again.
    """
    global PIPELINE
    if PIPELINE is not None:
        return PIPELINE

    print(f"Loading pipeline from {HF_MODEL_ID} ...")
    load_options = {
        "device": torch.device("cuda"),
        "load_ckpt_weights": False,
        "use_unipc": True,
        "use_src_id_rotary_emb": True,
    }
    PIPELINE = BerniniRendererPipeline.from_pretrained(HF_MODEL_ID, **load_options)
    print("Pipeline loaded.")
    return PIPELINE

def _coerce_video_paths(video_input):
    if not video_input:
        return None
    if isinstance(video_input, str):
        return [video_input]
    if isinstance(video_input, list):
        out = []
        for v in video_input:
            if v is None:
                continue
            if isinstance(v, str):
                out.append(v)
            elif hasattr(v, "name"):
                out.append(v.name)
            elif isinstance(v, dict) and v.get("path"):
                out.append(v["path"])
        return out or None
    return None

def _coerce_gallery_paths(gallery_input):
    if not gallery_input:
        return None
    out = []
    for item in gallery_input:
        if isinstance(item, (list, tuple)) and item:
            item = item[0]
        if isinstance(item, str):
            out.append(item)
        elif isinstance(item, dict) and item.get("path"):
            out.append(item["path"])
        elif hasattr(item, "name"):
            out.append(item.name)
    return out or None

def _output_path(task_type):
    """Build a timestamped output file path under ``SAVE_BASE``.

    The file name is ``<task_type>_<timestamp>.<ext>``; the extension is
    ``png`` for tasks in ``IMAGE_TASKS`` and ``mp4`` for everything else.
    """
    # Microsecond resolution keeps names from back-to-back runs distinct.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    if task_type in IMAGE_TASKS:
        suffix = "png"
    else:
        suffix = "mp4"
    filename = f"{task_type}_{stamp}.{suffix}"
    return os.path.join(SAVE_BASE, filename)

def _build_kwargs(
    prompt, task_type, video_input, image_input, gallery_input, guidance_mode,
    max_image_size, num_inference_steps, num_frames, flow_shift, seed, fps,
    height, width, omega_V, omega_I, omega_TI, omega_scale, eta, momentum,
):
    """Translate raw UI values into keyword arguments for the pipeline call.

    Only the inputs that ``TASK_INPUTS`` enables for ``task_type`` are
    forwarded; numeric values are cast to ``int``/``float``.

    Raises:
        KeyError: ``task_type`` is not a key of ``TASK_INPUTS``.
        TypeError, ValueError: a numeric value cannot be converted.
    """
    spec = TASK_INPUTS[task_type]

    source_video = _coerce_video_paths(video_input) if spec["video"] else None
    reference_images = (
        _coerce_gallery_paths(gallery_input) if spec["images"] else None
    )

    # The single-image upload is either the edit source or one more reference,
    # depending on the task.
    role = spec["image_role"]
    source_image = None
    if role == "source":
        source_image = image_input if image_input else None
    elif role == "reference" and image_input:
        reference_images = [image_input, *(reference_images or [])]

    # Still-image tasks always render exactly one frame.
    frame_count = 1 if task_type in IMAGE_TASKS else num_frames

    return {
        "prompt": prompt or "",
        "neg_prompt": DEFAULT_NEG_PROMPT,
        "video": source_video,
        "image": source_image,
        "images": reference_images,
        "max_image_size": int(max_image_size),
        "num_inference_steps": int(num_inference_steps),
        "num_frames": int(frame_count),
        "flow_shift": float(flow_shift),
        "seed": int(seed),
        "fps": int(fps),
        "height": int(height),
        "width": int(width),
        # An empty guidance dropdown falls back to the task's default mode.
        "guidance_mode": guidance_mode or GUIDANCE_MODE_BY_TASK[task_type],
        "omega_V": float(omega_V),
        "omega_I": float(omega_I),
        "omega_TI": float(omega_TI),
        "omega_scale": float(omega_scale),
        "eta": float(eta),
        "momentum": float(momentum),
        "system_prompt": get_system_prompt_for_task(task_type),
    }

@spaces.GPU(duration=1200)
def generate_handler(
    prompt, task_type, video_input, image_input, gallery_input,
    guidance_mode, max_image_size, num_inference_steps, num_frames,
    flow_shift, seed, fps, height, width,
    omega_V, omega_I, omega_TI, omega_scale, eta, momentum,
    progress=gr.Progress(),
):
    """Run one generation request coming from the "Generate" button.

    Validates the request, optionally rewrites the prompt through the
    server-side prompt enhancer, runs the pipeline, and routes the result to
    the video or image output.

    Every failure is reported through the status string instead of being
    raised, so the UI always receives a full set of outputs.

    Returns:
        A 4-tuple ``(video_path, image_path, prompt_used, status)``. At most
        one of the two paths is set; both are ``None`` on failure.
    """
    # NOTE: `progress` is not referenced in the body; it is kept in the
    # signature so the handler's interface (and Gradio's view of it) is
    # unchanged.
    if not task_type:
        gr.Warning("Please select a task type first!")
        return None, None, "", "Please select a task type first!"
    if not (prompt or "").strip():
        gr.Warning("Please enter a prompt!")
        return None, None, "", "Please enter a prompt!"

    # Building the kwargs indexes the task tables and casts every numeric
    # field, so an unknown task or an empty numeric input (which arrives as
    # None) raises here. Report it the same way as the other validation errors.
    try:
        kwargs = _build_kwargs(
            prompt, task_type, video_input, image_input, gallery_input,
            guidance_mode, max_image_size, num_inference_steps, num_frames,
            flow_shift, seed, fps, height, width,
            omega_V, omega_I, omega_TI, omega_scale, eta, momentum,
        )
    except (KeyError, TypeError, ValueError) as e:
        message = f"Invalid input: {e}"
        gr.Warning(message)
        return None, None, "", message

    # Prompt enhancement via server-side key (not exposed to users)
    if _PE_API_KEY:
        try:
            rewriter = PromptEnhancer(
                api_key=_PE_API_KEY,
                base_url=_PE_BASE_URL or None,
                model=_PE_MODEL or None,
            )
            enhanced = rewriter(
                task_type,
                kwargs["prompt"],
                video=kwargs.get("video"),
                image=kwargs.get("image"),
                images=kwargs.get("images"),
            )
            if enhanced:
                kwargs["prompt"] = enhanced
        except Exception as e:
            # Best effort: enhancement problems must not block generation.
            gr.Warning(f"Prompt enhancement failed: {e}. Using original prompt.")

    kwargs["output_path"] = _output_path(task_type)

    # Loading the model can fail just like inference can, so both share one
    # handler and both end up in the status box.
    try:
        pipeline = get_pipeline()
        output_path = pipeline(write_output=True, **kwargs)
    except Exception as e:
        return None, None, kwargs["prompt"], f"Generation failed: {e}"

    if not output_path:
        # Avoid reporting a misleading "Done: None".
        return None, None, kwargs["prompt"], "Generation finished but produced no output file."

    out_video = out_image = None
    if output_path.endswith(".png") or task_type in IMAGE_TASKS:
        out_image = output_path
    else:
        out_video = output_path

    return out_video, out_image, kwargs["prompt"], f"Done: {output_path}"

def _on_task_change(task_type):
    """React to a new task selection in the UI.

    Returns:
        A pair ``(guidance_update, hint)``: a Gradio update that sets the
        guidance dropdown to the task's default mode (``None`` when no task
        is selected), and a short text listing the inputs the task uses.
    """
    spec = TASK_INPUTS.get(task_type, {})
    role = spec.get("image_role")

    labels = [
        label
        for wanted, label in (
            (spec.get("video"), "source video"),
            (role == "source", "single source image"),
            (role == "reference" or spec.get("images"), "reference image(s)"),
        )
        if wanted
    ]

    if labels:
        hint = "inputs: " + ", ".join(labels)
    else:
        hint = "text-only"
    if task_type in IMAGE_TASKS:
        hint += " | forced num_frames=1"

    default_mode = GUIDANCE_MODE_BY_TASK.get(task_type) if task_type else None
    return gr.update(value=default_mode), hint

# Declarative UI: a two-column page with inputs and parameters on the left and
# the generated results on the right. Event wiring follows the layout.
with gr.Blocks(title="Bernini Renderer Demo") as demo:
    gr.Markdown("# 🎬 Bernini Renderer Demo")
    gr.Markdown(
        "Unified video generation & editing — text-to-image, text-to-video, "
        "image editing, video editing, reference-to-video, and more.\n\n"
        "**Paper**: [arXiv 2605.22344](https://arxiv.org/abs/2605.22344) | "
        "**Model**: [ByteDance/Bernini-R](https://huggingface.co/ByteDance/Bernini-R)"
    )

    with gr.Row():
        # ---- Left column: prompt, media uploads, task and parameters ----
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### Input")
                prompt = gr.Textbox(label="Prompt", lines=3,
                    placeholder="Describe the scene or the editing instruction...")
                # One tab per kind of media input. Which of them is actually
                # forwarded to the pipeline depends on the selected task
                # (see TASK_INPUTS).
                with gr.Tabs():
                    with gr.TabItem("Video"):
                        video_input = gr.File(label="Upload video(s)",
                            file_count="multiple", file_types=["video"], type="filepath")
                    with gr.TabItem("Single image"):
                        image_input = gr.Image(
                            label="Upload an image (source for i2i, or a single reference)",
                            type="filepath")
                    with gr.TabItem("Multiple images"):
                        # NOTE(review): the label names only r2v / rv2v, but
                        # TASK_INPUTS also enables gallery images for ads2v.
                        gallery_input = gr.Gallery(label="Upload reference images (r2v / rv2v)",
                            columns=4, height="auto", interactive=True)

            with gr.Group():
                gr.Markdown("### Task")
                # Starts unselected; generate_handler rejects a request that
                # has no task type.
                task_type = gr.Dropdown(choices=TASK_TYPE_CHOICES, value=None,
                    label="Task type (required)", info="Auto-fills guidance_mode below")
                # Filled in by _on_task_change; if left empty, _build_kwargs
                # falls back to the task's default mode.
                guidance_mode = gr.Dropdown(choices=GUIDANCE_MODES, value=None, label="Guidance mode")
                # Shows which inputs the selected task uses.
                input_hint = gr.Markdown("")

            with gr.Group():
                gr.Markdown("### Basic parameters")
                with gr.Row():
                    max_image_size = gr.Slider(256, 1280, value=848, step=16, label="Max image size")
                    # Starting at 1 with step 4, the slider offers 1, 5, 9, ..., 121.
                    # The value is overridden to 1 for image tasks.
                    num_frames = gr.Slider(1, 121, value=49, step=4, label="Num frames")
                with gr.Row():
                    num_inference_steps = gr.Slider(10, 50, value=40, step=5, label="Inference steps")
                    flow_shift = gr.Slider(0.0, 12.0, value=5.0, step=0.5, label="Flow shift")
                with gr.Row():
                    seed = gr.Number(value=42, precision=0, label="Seed")
                    fps = gr.Slider(1, 30, value=16, step=1, label="FPS")
                with gr.Row():
                    height = gr.Number(value=480, precision=0, label="Height")
                    width = gr.Number(value=848, precision=0, label="Width")

            # Guidance weights are passed straight through to the pipeline as
            # floats; collapsed by default.
            with gr.Accordion("Guidance (advanced)", open=False):
                with gr.Row():
                    omega_V = gr.Slider(0.0, 10.0, value=1.25, step=0.05, label="omega_V")
                    omega_I = gr.Slider(0.0, 10.0, value=4.5, step=0.05, label="omega_I")
                    omega_TI = gr.Slider(0.0, 10.0, value=4.0, step=0.05, label="omega_TI")
                with gr.Row():
                    omega_scale = gr.Slider(0.0, 2.0, value=0.8, step=0.05, label="omega_scale")
                    eta = gr.Slider(0.0, 2.0, value=0.5, step=0.05, label="eta")
                    momentum = gr.Slider(-2.0, 2.0, value=0.0, step=0.05, label="momentum")

            generate_btn = gr.Button("Generate", variant="primary", size="lg")

        # ---- Right column: results ----
        with gr.Column(scale=1):
            gr.Markdown("### Output")
            # generate_handler fills at most one of the video / image outputs.
            output_video = gr.Video(label="Generated video")
            output_image = gr.Image(label="Generated image")
            # The prompt actually sent to the pipeline (after any enhancement).
            final_prompt = gr.Textbox(label="Prompt used", interactive=False, lines=3)
            output_status = gr.Textbox(label="Status", interactive=False, lines=2)

    # Selecting a task pre-fills the guidance mode and refreshes the input hint.
    task_type.change(fn=_on_task_change, inputs=task_type, outputs=[guidance_mode, input_hint])

    # The inputs are passed positionally, so their order must match the
    # parameter order of generate_handler; the outputs match its 4-tuple
    # return value.
    generate_btn.click(
        fn=generate_handler,
        inputs=[
            prompt, task_type, video_input, image_input, gallery_input,
            guidance_mode, max_image_size, num_inference_steps, num_frames,
            flow_shift, seed, fps, height, width,
            omega_V, omega_I, omega_TI, omega_scale, eta, momentum,
        ],
        outputs=[output_video, output_image, final_prompt, output_status],
    )

# Start the app only when this file is executed as a script. The queue is
# capped at 5 waiting requests, and event handlers default to a concurrency
# limit of 1.
if __name__ == "__main__":
    demo.queue(max_size=5, default_concurrency_limit=1).launch()