"""Load this quantized Cosmos3-Nano (FP8, safetensors). Self-contained — no project `src/` needed.

Requires: diffusers (git main / >=0.39), nvidia-modelopt, torch (cu128), safetensors.

    from load_quantized import load
    pipe = load()                       # uses this dir, or pass a repo id / local dir
    import torch
    with torch.autocast("cuda", torch.bfloat16):
        img = pipe("a corgi astronaut", num_frames=1, height=480, width=480).video[0][0]

Format (Path B; see ../docs/reports/session_3.md): the FP8 transformer is stored as **safetensors**
(`transformer/diffusion_pytorch_model.safetensors`: 505 weight-only E4M3 weights + per-tensor
`weight_quantizer._amax`/`._scale` buffers) plus a tiny tensor-free `transformer/modelopt_state.pt`
structural sidecar (the quantizer layout). The original `transformer/modelopt_quantized.pt` is
**retained** as a fallback (loadable via `modelopt.torch.opt.restore`); this loader does NOT use it.

SECURITY: `modelopt_state.pt` is loaded with `torch.load(weights_only=False)`, which executes
arbitrary pickle. Load this checkpoint ONLY from a source you trust (a tampered sidecar = remote
code execution). The safetensors weights themselves are safe; only the structural sidecar is pickle.
"""
import glob
import os

import torch
from diffusers import Cosmos3OmniPipeline, Cosmos3OmniTransformer, UniPCMultistepScheduler
import modelopt.torch.opt as mto
from safetensors.torch import load_file


def load_transformer(local):
    """Materialize the quantized transformer from safetensors + the structural sidecar (no `.pt`)."""
    cfg = {**Cosmos3OmniTransformer.load_config(f"{local}/transformer/config.json"), "action_gen": False}
    tf = Cosmos3OmniTransformer.from_config(cfg).to(torch.bfloat16)
    state = torch.load(f"{local}/transformer/modelopt_state.pt", weights_only=False)
    restored = mto.restore_from_modelopt_state(tf, state)
    if restored is not None:
        tf = restored
    tensors = {}
    for shard in sorted(glob.glob(f"{local}/transformer/*.safetensors")):
        tensors.update(load_file(shard))
    tf.load_state_dict(tensors, strict=True)
    return tf


def load(repo_or_dir=".", device="cuda"):
    if os.path.isdir(repo_or_dir):
        local = repo_or_dir
    else:
        from huggingface_hub import snapshot_download

        local = snapshot_download(repo_or_dir)
    tf = load_transformer(local)
    pipe = Cosmos3OmniPipeline.from_pretrained(
        local, transformer=tf, torch_dtype=torch.bfloat16, enable_safety_checker=False
    )
    pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=10.0)
    return pipe.to(device)


if __name__ == "__main__":
    pipe = load()
    with torch.autocast("cuda", dtype=torch.bfloat16):  # required: float32 rotary tensors -> bf16 linears
        img = pipe("A red panda astronaut floating in a nebula, highly detailed",
                   num_frames=1, height=480, width=480).video[0][0]
    img.save("out.png")
    print("saved out.png")