import spaces  # noqa: F401  must precede torch / diffusers

import subprocess
import tempfile
from pathlib import Path

import gradio as gr
import imageio.v3 as iio
import numpy as np
import torch
from diffusers import Cosmos3OmniPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler

# Checkpoint identifier handed to `from_pretrained` below (the UI header links
# to the matching huggingface.co model page).
MODEL_ID = "nvidia/Cosmos3-Nano"

# Build the pipeline once at import time: bfloat16 weights, safety checker on.
pipe = Cosmos3OmniPipeline.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    enable_safety_checker=True,
)
# Swap in a UniPC multistep scheduler derived from the pipeline's own scheduler
# config, overriding `flow_shift` to 10.0.
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=10.0)
# Order matters: the scheduler is replaced before the pipeline is moved to CUDA.
pipe.to("cuda")

# Sampling rate read from the sound tokenizer config; used as the sample rate
# when `generate` writes the audio track to a WAV file.
AUDIO_SR = int(pipe.sound_tokenizer.config.sampling_rate)

# Values of the "Mode" radio button; the handlers compare against MODE_IMAGE.
MODE_IMAGE = "Image"
MODE_VIDEO = "Video"

# Dropdown label -> (width, height) in pixels. Insertion order is the order in
# which the choices are listed in the "Resolution" dropdown.
RESOLUTIONS = {
    "720p (1280x720, slow)": (1280, 720),
    "480p (832x480, fast)": (832, 480),
    "360p (640x352, fastest)": (640, 352),
}


def _duration(mode, prompt, image, resolution, num_frames, steps, *_):
    """Estimate the GPU time budget for one `generate` call.

    Passed as `duration=` to `spaces.GPU`, so it receives the same arguments
    as `generate`; only the mode, resolution, frame count and step count
    influence the estimate, and trailing arguments are ignored.

    The estimate is a fixed 60 plus one unit per 8,000,000
    "pixel-frame-steps" (width * height * frames * steps), truncated to an
    int and capped at 1200. Image mode counts as a single frame.
    NOTE(review): the unit is presumably seconds, per the `spaces.GPU`
    `duration` convention - confirm.
    """
    width, height = RESOLUTIONS[resolution]

    if mode == MODE_IMAGE:
        frame_count = 1
    else:
        frame_count = num_frames

    workload = width * height * frame_count * steps
    estimate = int(60 + workload / 8_000_000)

    # Never request more than the 1200 ceiling.
    if estimate > 1200:
        return 1200
    return estimate


@spaces.GPU(duration=_duration)
def generate(
    mode,
    prompt,
    image,
    resolution,
    num_frames,
    steps,
    guidance,
    enable_sound,
    negative_prompt,
    seed,
    randomize_seed,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the pipeline once and write the result to a fresh temp directory.

    Args:
        mode: `MODE_IMAGE` or `MODE_VIDEO`; image mode renders a single frame.
        prompt: Text prompt; must contain non-whitespace characters.
        image: Optional conditioning image, forwarded to the pipeline as-is.
        resolution: A key of `RESOLUTIONS` selecting (width, height).
        num_frames: Frame count for video mode (ignored in image mode).
        steps: Number of inference steps.
        guidance: Guidance scale.
        enable_sound: Whether to request an audio track (video mode only).
        negative_prompt: Optional negative prompt; empty is sent as `None`.
        seed: Seed to use when `randomize_seed` is false.
        randomize_seed: When true, draw a fresh seed and ignore `seed`.
        progress: Gradio progress tracker (mirrors tqdm progress in the UI).

    Returns:
        `(image_path, video_path, seed)` where exactly one of the two paths is
        a string and the other is `None`, and `seed` is the integer seed that
        was actually used.

    Raises:
        gr.Error: If the prompt is blank, or if no seed was supplied while
            seed randomisation is turned off.
        subprocess.CalledProcessError: If muxing audio with ffmpeg fails.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt.")

    width, height = RESOLUTIONS[resolution]

    if randomize_seed:
        seed = int(torch.randint(0, 2**31 - 1, (1,)).item())
    elif seed is None:
        # An emptied "Seed" field reaches the handler as None; without this
        # guard int(None) would surface as an opaque TypeError in the UI.
        raise gr.Error("Please enter a seed or enable 'Randomize seed'.")
    seed = int(seed)
    generator = torch.Generator(device="cuda").manual_seed(seed)

    fps = 24
    is_image = mode == MODE_IMAGE
    # Audio is only ever requested for video output.
    sound = bool(enable_sound) and not is_image

    result = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt or None,
        image=image,
        num_frames=1 if is_image else int(num_frames),
        height=height,
        width=width,
        fps=fps,
        num_inference_steps=int(steps),
        guidance_scale=float(guidance),
        enable_sound=sound,
        generator=generator,
        output_type="pil",
    )

    # Each call gets its own directory so concurrent requests never collide.
    out_dir = Path(tempfile.mkdtemp(prefix="cosmos3_"))

    if is_image:
        # With num_frames=1 the first entry is presumably the only frame.
        img = result.video[0]
        img_path = out_dir / "image.png"
        img.save(img_path)
        return str(img_path), None, seed

    # Stack the frames into one array and encode a silent H.264 clip.
    frames = np.stack([np.asarray(f) for f in result.video], axis=0)
    silent_path = out_dir / "video.mp4"
    iio.imwrite(silent_path, frames, fps=fps, codec="libx264")

    video_path = silent_path
    if sound and result.sound is not None:
        # Imported lazily: only needed when an audio track is produced.
        import soundfile as sf

        wav = result.sound.detach().to(torch.float32).cpu().numpy()
        if wav.ndim == 2:
            # soundfile expects (frames, channels); this assumes the pipeline
            # returns (channels, samples) - TODO confirm.
            wav = wav.T
        wav_path = out_dir / "audio.wav"
        sf.write(wav_path, wav, AUDIO_SR)

        # Mux without re-encoding the video: copy the video stream, encode the
        # audio as AAC, and stop at the shorter of the two streams.
        video_path = out_dir / "video_with_audio.mp4"
        subprocess.run(
            [
                "ffmpeg", "-y", "-loglevel", "error",
                "-i", str(silent_path),
                "-i", str(wav_path),
                "-c:v", "copy", "-c:a", "aac", "-shortest",
                str(video_path),
            ],
            check=True,
        )

    return None, str(video_path), seed


# Example prompt paired with image mode in the "Examples" table.
EXAMPLE_T2I = (
    "A close-up portrait of an astronaut in a jungle, cold color palette, "
    "muted colors, detailed, 8k."
)
# Example prompt for silent video; also the initial value of the prompt box.
EXAMPLE_T2V = (
    "A cinematic wide shot of a vintage red convertible driving along a "
    "coastal highway at sunset. Waves crash against rocks below as the car "
    "speeds toward the horizon. Golden hour lighting, lens flare, smooth "
    "tracking shot."
)
# Example prompt paired with video mode and audio generation enabled.
EXAMPLE_T2VS = (
    "A close-up of rain hitting a window at night, neon city lights blurred "
    "in the background. Soft ambient sound of rain and distant traffic."
)


def _on_mode_change(mode):
    """Return component updates for a change of the "Mode" radio.

    Args:
        mode: The newly selected mode (`MODE_IMAGE` or `MODE_VIDEO`).

    Returns:
        A 4-tuple of `gr.update(...)` objects, in the order expected by the
        `mode.change` wiring: audio checkbox, frame slider, image output,
        video output. Video-only controls are hidden in image mode, and both
        output components are cleared so a stale result of the other kind is
        never shown.
    """
    is_image = mode == MODE_IMAGE

    if is_image:
        # Audio is video-only: hide the checkbox and force it off.
        sound_update = gr.update(visible=False, value=False)
    else:
        # Only reveal the checkbox. Passing value=None here would *set* the
        # value to None rather than leave it alone, wiping a value that an
        # example click has just filled in together with the mode.
        sound_update = gr.update(visible=True)

    return (
        sound_update,                                 # enable_sound
        gr.update(visible=not is_image),              # num_frames
        gr.update(visible=is_image, value=None),      # out_image
        gr.update(visible=not is_image, value=None),  # out_video
    )


# UI definition. Components are created in layout order, so the order of the
# statements below is the order in which they appear on the page.
with gr.Blocks(title="Cosmos3-Nano") as demo:
    gr.Markdown(
        """
        # NVIDIA Cosmos3-Nano — omnimodal world model
        Generate images or videos (with optional conditioning image and audio) using
        [`nvidia/Cosmos3-Nano`](https://huggingface.co/nvidia/Cosmos3-Nano)
        (16B params) via the Diffusers `Cosmos3OmniPipeline`.
        """
    )

    with gr.Row():
        # First column: all inputs.
        with gr.Column(scale=1):
            # The app starts in video mode; the initial visibility of the
            # output components below is set to match.
            mode = gr.Radio(
                label="Mode",
                choices=[MODE_IMAGE, MODE_VIDEO],
                value=MODE_VIDEO,
            )
            prompt = gr.Textbox(
                label="Prompt", lines=4, value=EXAMPLE_T2V,
                placeholder="Describe what to generate...",
            )
            # Delivered to `generate` as a PIL image (type="pil").
            image = gr.Image(
                label="Conditioning image (optional)", type="pil", height=240,
            )
            enable_sound = gr.Checkbox(label="Generate audio", value=False)
            with gr.Accordion("Advanced settings", open=False):
                negative_prompt = gr.Textbox(label="Negative prompt", lines=2, value="")
                # The default must be one of the RESOLUTIONS keys.
                resolution = gr.Dropdown(
                    label="Resolution",
                    choices=list(RESOLUTIONS.keys()),
                    value="480p (832x480, fast)",
                )
                # minimum=33 with step=4 keeps every selectable value of the
                # form 4k + 1 - presumably a pipeline requirement; confirm.
                num_frames = gr.Slider(
                    label="Frames (24 fps)", minimum=33, maximum=189, value=65, step=4
                )
                steps = gr.Slider(
                    label="Inference steps", minimum=15, maximum=50, value=25, step=1
                )
                guidance = gr.Slider(
                    label="Guidance scale", minimum=1.0, maximum=10.0, value=6.0, step=0.5
                )
                seed = gr.Number(label="Seed", value=0, precision=0)
                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
            run = gr.Button("Generate", variant="primary")

        # Second column: outputs. Only one of the image/video components is
        # visible at a time; `_on_mode_change` toggles them.
        with gr.Column(scale=1):
            out_image = gr.Image(label="Generated image", visible=False)
            out_video = gr.Video(label="Generated video", autoplay=True, visible=True)
            used_seed = gr.Number(label="Seed used", interactive=False)

    # Each example row fills in mode, prompt and the audio checkbox only.
    gr.Examples(
        examples=[
            [MODE_IMAGE, EXAMPLE_T2I, False],
            [MODE_VIDEO, EXAMPLE_T2V, False],
            [MODE_VIDEO, EXAMPLE_T2VS, True],
        ],
        inputs=[mode, prompt, enable_sound],
        label="Examples",
    )

    # The outputs order must match the tuple returned by `_on_mode_change`.
    mode.change(
        fn=_on_mode_change, inputs=[mode],
        outputs=[enable_sound, num_frames, out_image, out_video],
    )

    # The inputs order must match the positional parameters of `generate`;
    # the outputs order matches its (image_path, video_path, seed) return.
    run.click(
        fn=generate,
        inputs=[
            mode, prompt, image, resolution, num_frames, steps, guidance,
            enable_sound, negative_prompt, seed, randomize_seed,
        ],
        outputs=[out_image, out_video, used_seed],
    )


if __name__ == "__main__":
    # Turn on the request queue (max_size=10), then start the server with the
    # Citrus theme.
    # NOTE(review): passing `theme=` to launch() requires a Gradio release
    # whose launch() accepts it - confirm against the pinned version.
    queued_app = demo.queue(max_size=10)
    queued_app.launch(theme=gr.themes.Citrus())