"""
Evaluate quantized ONNX models on ImageNet-1k validation set.

Uses ONNX Runtime for inference. Loads the cached ImageNet dataset
directly from arrow shard files.
"""

import argparse
import os
import time
import io
import tempfile
import numpy as np
import onnx
import onnxruntime as ort
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from datasets import load_dataset
import pyarrow.ipc as ipc
from transformers import ViTForImageClassification, ViTImageProcessor
from sklearn.metrics import average_precision_score, precision_recall_fscore_support


# ---------------------------------------------------------------------------
# Dataset that reads directly from arrow shards
# ---------------------------------------------------------------------------

class ArrowImageNetDataset(Dataset):
    """Load ImageNet validation data from cached arrow shard files."""

    def __init__(self, arrow_dir, transform=None):
        self.transform = transform
        self.shards = []
        self.offsets = [0]

        # Load all valid arrow shards
        shard_files = sorted(
            f for f in os.listdir(arrow_dir)
            if f.startswith("imagenet-1k_validation-validation-") and f.endswith(".arrow")
        )

        for fname in shard_files:
            path = os.path.join(arrow_dir, fname)
            try:
                with open(path, "rb") as f:
                    reader = ipc.RecordBatchStreamReader(f)
                    table = reader.read_all()
                self.shards.append(table)
                self.offsets.append(self.offsets[-1] + len(table))
                print(f"  Loaded shard {fname}: {len(table)} rows")
            except Exception as e:
                print(f"  SKIP shard {fname}: {e}")

        self.total = self.offsets[-1]
        print(f"  Total images: {self.total}")

    def __len__(self):
        return self.total

    def __getitem__(self, idx):
        # Binary search for the correct shard
        lo, hi = 0, len(self.shards) - 1
        while lo < hi:
            mid = (lo + hi) // 2
            if self.offsets[mid + 1] <= idx:
                lo = mid + 1
            else:
                hi = mid
        shard_idx = lo
        local_idx = idx - self.offsets[shard_idx]

        table = self.shards[shard_idx]
        img_bytes = table.column("image")[local_idx].as_py()

        if isinstance(img_bytes, dict):
            img_bytes = img_bytes.get("bytes", img_bytes.get("path", b""))
        if isinstance(img_bytes, bytes):
            img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        else:
            img = Image.new("RGB", (224, 224))

        label = table.column("label")[local_idx].as_py()

        if self.transform:
            img = self.transform(img)

        return img, label


# ---------------------------------------------------------------------------
# Metrics (same as model_eval_test.py)
# ---------------------------------------------------------------------------

def compute_metrics(logits, labels, num_classes):
    probs = torch.softmax(torch.from_numpy(logits), dim=1).numpy()
    preds = probs.argmax(axis=1)
    N = len(labels)

    top1 = (preds == labels).sum() / N
    topk_vals = np.argsort(probs, axis=1)[:, ::-1]
    top5 = sum(labels[i] in topk_vals[i, :5] for i in range(N)) / N

    one_hot = np.zeros((N, num_classes), dtype=np.int32)
    one_hot[np.arange(N), labels] = 1
    aps = []
    for c in range(num_classes):
        if one_hot[:, c].sum() == 0:
            continue
        try:
            ap = average_precision_score(one_hot[:, c], probs[:, c])
        except ValueError:
            ap = 0.0
        aps.append(ap)
    mAP = np.mean(aps) if aps else 0.0

    prec_mac, rec_mac, f1_mac, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    prec_wt, rec_wt, f1_wt, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )

    return {
        "top1": top1,
        "top5": top5,
        "mAP": mAP,
        "precision_macro": prec_mac,
        "recall_macro": rec_mac,
        "f1_macro": f1_mac,
        "precision_weighted": prec_wt,
        "recall_weighted": rec_wt,
        "f1_weighted": f1_wt,
    }


# ---------------------------------------------------------------------------
# Model repair — fix known issues in quantized ONNX models
# ---------------------------------------------------------------------------

def repair_onnx_model(onnx_path):
    """Patch known bugs in quantized ONNX models before ORT inference.

    Currently fixes:
      - INT4 classifier DequantizeLinear axis: ModelOpt sets axis=0 but the
        scale shape [1000, 8] is correct for axis=1 with block_size=128.
        ORT validates ceil(Di/block_size) on the declared axis and rejects it.

    Returns the path to use for inference (original or a temp file with fixes).
    """
    if "int4/" not in onnx_path:
        return onnx_path

    model = onnx.load(onnx_path)
    fixed = False

    for node in model.graph.node:
        if node.op_type != "DequantizeLinear":
            continue
        if "classifier.weight" not in node.name:
            continue

        # Read attributes
        axis = None
        block_size = None
        for attr in node.attribute:
            if attr.name == "axis":
                axis = attr.i
            elif attr.name == "block_size":
                block_size = attr.i

        if axis != 0 or block_size is None:
            continue

        # Check whether scale shape matches axis=1 instead of axis=0
        weight_name = node.input[0]
        scale_name = node.input[1]
        weight_shape = scale_shape = None
        for init in model.graph.initializer:
            if init.name == weight_name:
                weight_shape = list(init.dims)
            if init.name == scale_name:
                scale_shape = list(init.dims)

        if weight_shape is None or scale_shape is None:
            continue

        # Expected scale shape if axis were 1: [D0, ceil(D1/block_size)]
        expected_axis1 = list(weight_shape)
        expected_axis1[1] = (expected_axis1[1] + block_size - 1) // block_size

        if scale_shape == expected_axis1:
            for attr in node.attribute:
                if attr.name == "axis":
                    attr.i = 1
                    fixed = True
                    print(f"  [repair] Fixed {node.name}: axis 0 -> 1 "
                          f"(scale {scale_shape} matches axis=1, block_size={block_size})")

    if not fixed:
        return onnx_path

    # Save repaired model to a temp file (persists for session lifetime)
    fd, tmp_path = tempfile.mkstemp(suffix=".onnx", prefix="repaired_")
    os.close(fd)
    onnx.save(model, tmp_path)
    print(f"  [repair] Saved repaired model to {tmp_path}")
    return tmp_path


# ---------------------------------------------------------------------------
# ONNX model evaluation
# ---------------------------------------------------------------------------

@torch.no_grad()
def evaluate_onnx(onnx_path, loader, num_classes, print_every=500):
    """Evaluate an ONNX model using ONNX Runtime with batch=1 (models have static shapes)."""
    # Repair known model bugs before loading
    repaired_path = repair_onnx_model(onnx_path)

    providers = []
    if "CUDAExecutionProvider" in ort.get_available_providers():
        providers.append(("CUDAExecutionProvider", {"device_id": 0}))
    providers.append("CPUExecutionProvider")

    session_options = ort.SessionOptions()
    session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
    # Disable memory pattern/reuse optimizations that conflict with dim_param
    # (unknown) dimensions in quantized models. Without this, ORT pre-allocates
    # buffers based on incorrectly resolved shapes, causing runtime crashes
    # on FP16 (Add node) and FP8 (MatMul in decomposed SDPA).
    session_options.enable_mem_pattern = False
    session_options.enable_mem_reuse = False

    session = ort.InferenceSession(repaired_path, sess_options=session_options, providers=providers)
    input_name = session.get_inputs()[0].name

    all_logits = []
    all_labels = []
    total = 0
    total_inference_time = 0.0  # strict inference-only timing

    start = time.time()  # process timer (includes data prep, inference, output, metrics)
    for batch_idx, (imgs, labels) in enumerate(loader):
        # Run one image at a time (model has static batch=1 in internal Reshape nodes)
        for i in range(imgs.size(0)):
            single_img = imgs[i:i+1].numpy()  # shape (1, C, H, W)
            t0 = time.perf_counter()
            outputs = session.run(None, {input_name: single_img})
            t1 = time.perf_counter()
            total_inference_time += (t1 - t0)
            all_logits.append(outputs[0])
            all_labels.append(np.array([labels[i].item()] if torch.is_tensor(labels[i]) else [labels[i]]))
        total += imgs.size(0)

        if print_every and (batch_idx + 1) % print_every == 0:
            elapsed = time.time() - start
            speed = total / elapsed
            print(f"  [{total:>6d} images]  {speed:.1f} img/s")

    all_logits = np.concatenate(all_logits, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    metrics = compute_metrics(all_logits, all_labels, num_classes)
    metrics["total_images"] = total
    elapsed = time.time() - start
    metrics["elapsed"] = elapsed
    metrics["avg_process_ms"] = elapsed / total * 1000 if total > 0 else 0.0
    metrics["avg_inference_ms"] = total_inference_time / total * 1000 if total > 0 else 0.0
    return metrics


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    parser = argparse.ArgumentParser(description="Evaluate quantized ONNX models")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--num_workers", type=int, default=4)
    parser.add_argument("--subset", type=int, default=0, help="Evaluate on first N images (0=all)")
    ALL_MODES = ["fp32", "fp16", "int8", "fp8", "int4"]
    parser.add_argument(
        "--mode", type=str, nargs="*", default=ALL_MODES, choices=ALL_MODES,
        help=f"Quantization mode(s) to evaluate (default: all). Choices: {ALL_MODES}",
    )
    args = parser.parse_args()

    # ------------------------------------------------------------------
    # Load model & processor
    # ------------------------------------------------------------------
    model_name = "google/vit-large-patch16-224"
    print(f"Loading {model_name} ...")
    
    processor = ViTImageProcessor.from_pretrained(model_name)
    input_size = (1, 3, 224, 224)  # (B, C, H, W)
    print(f" Input size: {input_size}")

    # Build a transform callable from the HF processor for use in DataLoader
    def transform(img):
        inputs = processor(images=img, return_tensors="pt")
        return inputs["pixel_values"].squeeze(0)  # (C, H, W)

    num_classes = 1000

    # Load dataset from cached arrow shards
    arrow_dir = os.path.expanduser(
        "~/.cache/huggingface/datasets/Tsomaros___imagenet-1k_validation/"
        "default/0.0.0/55405c49dece42420e68ddd5f80174f19b29ebaf/"
    )
    print(f"Loading dataset from arrow shards: {arrow_dir}")
    dataset = ArrowImageNetDataset(arrow_dir, transform=transform)

    if args.subset > 0:
        from torch.utils.data import Subset
        dataset = Subset(dataset, range(min(args.subset, len(dataset))))
        print(f"  Using subset: {args.subset} images")

    loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        pin_memory=True,
    )

    # Define models to evaluate
    models = {
        "FP32 (baseline)": "vit_large_patch16_224_fp32.onnx",
        "FP16": "fp16/vit_large_patch16_224_fp16.onnx",
        "INT8 entropy": "int8/vit_large_patch16_224_int8_entropy.onnx",
        "INT8 max": "int8/vit_large_patch16_224_int8_max.onnx",
        "FP8 entropy": "fp8/vit_large_patch16_224_fp8_entropy.onnx",
        "FP8 max": "fp8/vit_large_patch16_224_fp8_max.onnx",
        "INT4 awq_clip": "int4/vit_large_patch16_224_int4_awq_clip.onnx",
        "INT4 awq_lite (asym)": "int4/vit_large_patch16_224_int4_awq_lite_asym.onnx",
        "INT4 awq_lite (sym)": "int4/vit_large_patch16_224_int4_awq_lite.onnx",
        "INT4 awq_full": "int4/vit_large_patch16_224_int4_awq_full.onnx",
        "INT4 rtn_dq": "int4/vit_large_patch16_224_int4_rtn_dq.onnx",
    }

    # Filter by --mode selection
    mode_prefix = {"fp32": "FP32", "fp16": "FP16", "int8": "INT8", "fp8": "FP8", "int4": "INT4"}
    selected_prefixes = {mode_prefix[m] for m in args.mode}
    models = {k: v for k, v in models.items() if any(k.startswith(p) for p in selected_prefixes)}
    print(f"Evaluating modes: {args.mode}")

    # Filter to only existing files
    existing_models = {}
    for name, path in models.items():
        if os.path.exists(path):
            existing_models[name] = path
        else:
            print(f"  SKIP: {name} — file not found: {path}")

    results = {}

    for name, onnx_path in existing_models.items():
        print(f"\n{'='*60}")
        print(f"Evaluating: {name}")
        print(f"  Model: {onnx_path}")
        print(f"{'='*60}")

        try:
            metrics = evaluate_onnx(onnx_path, loader, num_classes)
            results[name] = metrics
            print(f"\n  Top-1 Accuracy:  {metrics['top1']*100:.3f}%")
            print(f"  Top-5 Accuracy:  {metrics['top5']*100:.3f}%")
            print(f"  mAP:             {metrics['mAP']:.4f}")
            print(f"  F1 (macro):      {metrics['f1_macro']:.4f}")
            print(f"  F1 (weighted):   {metrics['f1_weighted']:.4f}")
            print(f"  Time:            {metrics['elapsed']:.1f}s")
            print(f"  Avg Process:     {metrics['avg_process_ms']:.2f}ms/img")
            print(f"  Avg Inference:   {metrics['avg_inference_ms']:.2f}ms/img")
        except Exception as e:
            print(f"  FAILED: {e}")
            import traceback
            traceback.print_exc()
            results[name] = {"error": str(e)}

    # Print comparison table
    print(f"\n\n{'='*100}")
    print("Evaluation Comparison Table")
    print(f"{'='*100}")
    print(f"  {'Model':<25s} {'Images':>7s} {'Top-1%':>8s} {'Top-5%':>8s} {'mAP':>8s} {'F1_mac':>8s} {'F1_wt':>8s} {'Proc(ms)':>9s} {'Inf(ms)':>8s} {'Time':>8s}")
    print(f"  {'-'*25} {'-'*7} {'-'*8} {'-'*8} {'-'*8} {'-'*8} {'-'*8} {'-'*9} {'-'*8} {'-'*8}")

    for name, m in results.items():
        if "error" in m:
            print(f"  {name:<25s} FAILED: {m['error']}")
        else:
            print(
                f"  {name:<25s} "
                f"{m['total_images']:>7d} "
                f"{m['top1']*100:>8.3f} "
                f"{m['top5']*100:>8.3f} "
                f"{m['mAP']:>8.4f} "
                f"{m['f1_macro']:>8.4f} "
                f"{m['f1_weighted']:>8.4f} "
                f"{m['avg_process_ms']:>9.2f} "
                f"{m['avg_inference_ms']:>8.2f} "
                f"{m['elapsed']:>7.1f}s"
            )

    print(f"\n  Reference (timm model card): Top-1: 82.346%  |  Top-5: 96.394%")
    print(f"{'='*90}")

    # Find best INT8 model and copy as the canonical output
    int8_results = {k: v for k, v in results.items() if k.startswith("INT8") and "error" not in v}
    if int8_results:
        best_int8 = max(int8_results, key=lambda k: int8_results[k]["top1"])
        best_path = existing_models[best_int8]
        print(f"\n  Best INT8 model: {best_int8} ({best_path})")
        print(f"    Top-1: {int8_results[best_int8]['top1']*100:.3f}%")
        print(f"    Top-5: {int8_results[best_int8]['top5']*100:.3f}%")


if __name__ == "__main__":
    main()