#!/usr/bin/env python3
"""Probe: how much does the TRAINED per-pixel head contribute over a DEFAULT (untrained,
zero-residual) head? Renders the same val clips with (a) the trained head and (b) a freshly
re-initialized head (the init: opacity logit 2.0, log-scale 0 -> footprint, identity quat).
If (a)~(b), the learnable head adds ~nothing and PSNR is set by VGGT-depth + source color +
default footprint -> the bottleneck is the representation/backbone, not the head's optimization.
"""
import argparse, os, copy, sys
import numpy as np
import torch
from mapgs.config import load_config
from mapgs.data import UnifiedClipDataset
from mapgs.eval.metrics import psnr, ssim
from mapvggt import MapVGGT
DEV = "cuda"
sys.path.insert(0, "/mnt/william")
from scripts.train_mapvggt_full import prep, render_scene


def build_val(args):
    """Build a small validation dataset holding one clip per held-out segment.

    The unified dataset is loaded with ``split="train"``; the first
    ``args.val_segs`` segment ids (in sorted order) are treated as held out, and
    the first clip encountered for each of those segments is kept.

    Args:
        args: parsed CLI namespace; reads ``roots`` (comma-separated string),
            ``height``, ``width`` and ``val_segs``.

    Returns:
        A shallow copy of the loaded dataset whose ``clips`` attribute is
        replaced by the selected clips.
    """
    overrides = [
        "data.name=unified",
        f"data.root={args.roots}",
        f"data.height={args.height}",
        f"data.width={args.width}",
        "model.tokens.n_map=2048",
    ]
    cfg = load_config(overrides=overrides)
    dataset = UnifiedClipDataset(
        cfg, roots=args.roots.split(","), split="train", n_sup_views=6
    )

    def segid(p):
        # Segment id = first two "_"-separated fields of the clip's base name.
        base = os.path.basename(p.rstrip("/"))
        return "_".join(base.split("_")[:2])

    ordered_segments = sorted({segid(clip) for clip in dataset.clips})
    held_out = set(ordered_segments[:args.val_segs])

    # Insertion-ordered dict keeps the first clip seen for each held-out segment,
    # in the order the dataset lists them.
    first_clip_of = {}
    for clip in dataset.clips:
        key = segid(clip)
        if key in held_out:
            first_clip_of.setdefault(key, clip)

    val_dataset = copy.copy(dataset)
    val_dataset.clips = list(first_clip_of.values())
    return val_dataset


@torch.no_grad()
def eval_model(model, vds, n_in):
    """Render every clip of ``vds`` with ``model`` and aggregate PSNR / SSIM.

    Args:
        model: network that is called on the prepared inputs and passed to
            ``render_scene``; must expose ``eval()``, ``s_max`` and ``vggt``.
        vds: dataset indexable by clip position and exposing a ``clips`` sequence.
        n_in: forwarded unchanged to ``prep`` (presumably the number of input
            views -- confirm against ``prep``).

    Returns:
        Tuple ``(mean PSNR, mean SSIM, std of PSNR)`` over the clips whose PSNR
        is finite. If no clip yields a finite PSNR (including an empty dataset)
        all three values are NaN.
    """
    model.eval()
    # NOTE(review): presumably pins a scale/curriculum value to its maximum for
    # evaluation -- confirm against MapVGGT.
    model.cur_s = model.s_max
    model.vggt.eval()

    psnrs, ssims = [], []
    n_clips = len(vds.clips)
    # Index the dataset itself (not vds.clips) so its __getitem__ does the loading.
    for i in range(n_clips):
        d = prep(vds[i], n_in, DEV)
        gsm = model(d["in_img"], d["in_K"], d["in_c2w"], d["ap"], d["at"], d["an"])
        rgb, _ = render_scene(model, gsm, d, *d["sup_img"].shape[-2:], gain=1.0)
        p = float(psnr(rgb, d["sup_img"]))
        # NaN / +-inf PSNR would poison the mean, so such clips are left out.
        if not np.isfinite(p):
            continue
        psnrs.append(p)
        ssims.append(float(ssim(rgb, d["sup_img"])))

    skipped = n_clips - len(psnrs)
    if skipped:
        # Make dropped clips visible instead of silently shrinking the sample.
        print(f"warning: skipped {skipped}/{n_clips} clips with non-finite PSNR",
              file=sys.stderr, flush=True)

    if not psnrs:
        # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
        nan = float("nan")
        return nan, nan, nan
    return np.mean(psnrs), np.mean(ssims), np.std(psnrs)


def main():
    """Compare val PSNR/SSIM of the trained static head against a default head.

    Steps:
      1. Parse CLI arguments, load the checkpoint and build the val subset.
      2. Evaluate the model with the checkpoint weights ("TRAINED head").
      3. Overwrite ``model.head`` in place with a constant-output default and
         evaluate again ("DEFAULT head").
      4. Print both results and their difference.

    Raises:
        SystemExit: if no validation clips were selected, since every metric
            would be NaN.
    """
    import torch.nn as nn
    from safetensors.torch import load_file

    ap = argparse.ArgumentParser()
    ap.add_argument("--roots", default="/mnt/william/data/unified/waymo")
    ap.add_argument("--ckpt", default="/mnt/william/runs/abl_full_best.safetensors")
    ap.add_argument("--n-in", type=int, default=8)
    ap.add_argument("--height", type=int, default=256)
    ap.add_argument("--width", type=int, default=448)
    ap.add_argument("--val-segs", type=int, default=40)
    ap.add_argument("--clips", type=int, default=12)
    args = ap.parse_args()

    sd = load_file(args.ckpt)
    vds = build_val(args)
    vds.clips = vds.clips[:args.clips]
    print(f"probing {len(vds.clips)} val clips", flush=True)
    if len(vds.clips) == 0:
        raise SystemExit("no val clips selected; check --roots / --val-segs / --clips")

    # (a) trained head
    model = MapVGGT(with_map=True, with_dyn=True, finetune_backbone=False).to(DEV)
    # strict=False tolerates key mismatches; surface them so a checkpoint that
    # lacks the head weights is not silently reported as the "trained" head.
    load_result = model.load_state_dict(sd, strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        print(f"note: checkpoint load: {len(load_result.missing_keys)} missing, "
              f"{len(load_result.unexpected_keys)} unexpected keys",
              file=sys.stderr, flush=True)
    missing_head = [k for k in load_result.missing_keys if k.startswith("head.")]
    if missing_head:
        print(f"warning: head weights absent from checkpoint ({missing_head}); "
              "the 'TRAINED head' result does not reflect trained head weights",
              file=sys.stderr, flush=True)
    tp, ts, tsd = eval_model(model, vds, args.n_in)
    print(f"TRAINED head:  PSNR {tp:.2f}±{tsd:.2f} SSIM {ts:.3f}", flush=True)

    # (b) re-init the static head to its constructor init (zero residual, default footprint),
    #     keep map/dyn heads trained (we want to isolate the per-pixel static head)
    h = model.head
    with torch.no_grad():
        # With all weights and biases zeroed, the head output equals the last
        # layer's bias regardless of its input.
        for layer in (h[0], h[2], h[-1]):
            nn.init.zeros_(layer.weight)
            nn.init.zeros_(layer.bias)
        # NOTE(review): per the module docstring, channel 0 is taken to be the
        # opacity logit (2.0) and channel 4 the quaternion w (identity rotation);
        # confirm against the head's output layout in MapVGGT.
        h[-1].bias[0] = 2.0
        h[-1].bias[4] = 1.0
    # NOTE(review): if the dataset samples views randomly, (a) and (b) may see
    # different inputs; consider seeding before each eval_model call -- confirm.
    up, us, usd = eval_model(model, vds, args.n_in)
    print(f"DEFAULT head:  PSNR {up:.2f}±{usd:.2f} SSIM {us:.3f}", flush=True)
    print(f"\nHead's learned contribution: PSNR {tp-up:+.2f} dB  SSIM {ts-us:+.3f}", flush=True)


# Script entry point: run the probe only when executed directly, not on import.
if __name__ == "__main__":
    main()