#!/usr/bin/env python3
"""
Ultron Benchmarking — Post-Training Evaluation

Downloads trained checkpoints from HF Hub and evaluates on standard benchmarks
using lm-evaluation-harness.

Benchmarks (0-shot, matching Parcae/FineWeb paper suite):
  - HellaSwag
  - ARC-Easy / ARC-Challenge
  - PIQA
  - WinoGrande
  - BoolQ

Also tests depth extrapolation: same model evaluated at different loop counts.

Usage:
  python benchmark_ultron.py --model_id trojan0x/ultron-small-baseline
  python benchmark_ultron.py --model_id trojan0x/ultron-small-moe
  python benchmark_ultron.py --model_id trojan0x/ultron-small-baseline --depth_extrapolation
"""

import os
import sys
import json
import argparse
import types
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import asdict

from huggingface_hub import hf_hub_download, snapshot_download, HfApi
from transformers import AutoTokenizer

# Setup Ultron
def setup_ultron():
    """Fetch the Ultron source files from the HF Hub and make them importable.

    Downloads only the ``ultron/*.py`` files of the ``trojan0x/ultron`` repo
    and puts the resulting snapshot directory at the front of ``sys.path``.
    """
    from huggingface_hub import snapshot_download

    snapshot_dir = snapshot_download(
        repo_id="trojan0x/ultron",
        allow_patterns=["ultron/*.py"],
    )
    # Front of the path so this copy wins over any other ``ultron`` package.
    sys.path.insert(0, snapshot_dir)
    print(f"Ultron loaded from: {snapshot_dir}")

# Order matters: setup_ultron() puts the downloaded snapshot directory on
# sys.path, and the ``ultron`` import below relies on that entry to resolve.
setup_ultron()
from ultron.model import Ultron, UltronConfig


def load_model(model_id, device="cuda"):
    """Load a trained Ultron model from the HF Hub, ready for evaluation.

    Downloads ``ultron_final.pt`` from ``model_id``, rebuilds the
    ``UltronConfig`` from the checkpoint's ``"config"`` entry, restores the
    weights from ``"model_state_dict"``, moves the model to ``device`` and
    puts it in eval mode.

    Args:
        model_id: HF Hub repo ID hosting ``ultron_final.pt``.
        device: Device the model is moved to after the weights are loaded.

    Returns:
        A ``(model, cfg)`` tuple.

    Raises:
        KeyError: If the checkpoint has no ``"config"`` or
            ``"model_state_dict"`` entry.
    """
    print(f"Loading model from {model_id}...")

    # Download checkpoint
    ckpt_path = hf_hub_download(model_id, "ultron_final.pt")
    # SECURITY NOTE(review): weights_only=False unpickles arbitrary Python
    # objects, so a malicious checkpoint can execute code on load. Only point
    # this script at repos you trust; switching to weights_only=True needs a
    # check that the checkpoint holds nothing but tensors and plain containers.
    ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)

    # Reconstruct config
    cfg = UltronConfig(**ckpt["config"])

    # Build on CPU, load the weights, then move to the target device
    model = Ultron(cfg)
    model.load_state_dict(ckpt["model_state_dict"])
    model = model.to(device)
    model.eval()

    print(f"  Params: {model.get_num_params(False):,}")

    # Training metadata is informational only; a checkpoint saved without it
    # must not abort an otherwise successful load.
    step = ckpt.get("step")
    tokens_seen = ckpt.get("tokens_seen")
    if step is not None and tokens_seen is not None:
        print(f"  Trained for {step:,} steps, {tokens_seen:,} tokens")
    else:
        print("  Training metadata (step / tokens_seen) not found in checkpoint")

    print(f"  ρ(A): {model.get_spectral_radius():.6f}")

    return model, cfg


class UltronLMWrapper(nn.Module):
    """Wraps Ultron for lm-evaluation-harness compatibility.

    Exposes a ``config`` namespace, a ``device`` attribute and a ``forward``
    that returns an object with a ``.logits`` field, while running the wrapped
    model at a fixed number of loops.

    Args:
        model: The Ultron model to wrap; called as ``model(input_ids, n_loops=...)``.
        cfg: Config providing ``max_seq_len``, ``vocab_size``, ``dim`` and
            ``max_loop_iters``.
        n_loops: Loop count used for every forward pass. A falsy value
            (``None`` or ``0``) falls back to ``cfg.max_loop_iters``.
    """

    def __init__(self, model, cfg, n_loops=None):
        super().__init__()
        self.model = model
        self.n_loops = n_loops or cfg.max_loop_iters
        # Minimal stand-in for a HF config object, filled from the Ultron config.
        self.config = types.SimpleNamespace(
            max_position_embeddings=cfg.max_seq_len,
            vocab_size=cfg.vocab_size,
            model_type="ultron",
            hidden_size=cfg.dim,
        )

    @property
    def device(self):
        """Device of the wrapped model's first parameter.

        Computed on access rather than cached in ``__init__`` so it stays
        correct after ``to()`` moves the model. Falls back to CPU for a model
        without parameters.
        """
        first_param = next(self.model.parameters(), None)
        if first_param is None:
            return torch.device("cpu")
        return first_param.device

    def forward(self, input_ids, **kwargs):
        """Run the wrapped model at ``self.n_loops``; extra kwargs are ignored."""
        logits = self.model(input_ids, n_loops=self.n_loops)
        # lm-eval expects output.logits
        return types.SimpleNamespace(logits=logits)

    def parameters(self, recurse=True):
        """Return the wrapped model's parameters.

        Accepts ``recurse`` like ``nn.Module.parameters`` so callers that pass
        it do not fail with a ``TypeError``.
        """
        return self.model.parameters(recurse=recurse)

    def to(self, *args, **kwargs):
        """Move/cast the wrapped model and return the wrapper itself."""
        self.model = self.model.to(*args, **kwargs)
        return self


def evaluate(model_wrapper, tokenizer, tasks, limit=None, batch_size=8):
    """Evaluate ``model_wrapper`` on ``tasks`` with lm-evaluation-harness.

    The wrapper is handed to ``HFLM`` as a causal model whose maximum length
    is ``model_wrapper.config.max_position_embeddings``. All tasks run 0-shot
    and per-sample logs are disabled.

    Args:
        model_wrapper: Model object passed to ``HFLM`` as ``pretrained``.
        tokenizer: Tokenizer passed to ``HFLM``.
        tasks: Task names forwarded to ``simple_evaluate``.
        limit: Optional sample limit; only forwarded when not ``None``.
        batch_size: Batch size passed to ``HFLM``.

    Returns:
        The ``"results"`` entry of the ``simple_evaluate`` output.
    """
    import lm_eval
    from lm_eval.models.huggingface import HFLM

    context_len = model_wrapper.config.max_position_embeddings
    harness_model = HFLM(
        pretrained=model_wrapper,
        tokenizer=tokenizer,
        max_length=context_len,
        batch_size=batch_size,
        backend="causal",
    )

    eval_args = dict(
        model=harness_model,
        tasks=tasks,
        num_fewshot=0,
        log_samples=False,
    )
    # Leave ``limit`` out entirely when unset so the harness default applies.
    if limit is not None:
        eval_args["limit"] = limit

    outcome = lm_eval.simple_evaluate(**eval_args)
    return outcome["results"]


def print_results(results, label=""):
    """Print one table row per task with its preferred accuracy metric.

    ``acc_norm,none`` is used when a task reports it, otherwise ``acc,none``.
    Tasks that report neither are left out of the table.

    Args:
        results: Mapping of task name to a mapping of metric name to score.
        label: Optional title printed in a banner above the table.
    """
    if label:
        banner = "=" * 60
        print(f"\n{banner}")
        print(f"  {label}")
        print(banner)

    print(f"\n{'Task':<20} {'Metric':<20} {'Score':>8}")
    print("-" * 50)

    preferred = ("acc_norm,none", "acc,none")
    for task_name, task_scores in results.items():
        # First preferred metric the task actually reports, if any.
        chosen = next((name for name in preferred if name in task_scores), None)
        if chosen is None:
            continue
        print(f"{task_name:<20} {chosen:<20} {task_scores[chosen]:>8.4f}")
    print()


def _print_depth_summary(all_results, tasks):
    """Print a table of one score per task for every evaluated loop count."""
    print("\n" + "=" * 80)
    print("DEPTH EXTRAPOLATION SUMMARY")
    print("=" * 80)
    print(f"{'n_loops':<10}" + "".join(f"{task:<15}" for task in tasks))
    print("-" * (10 + 15 * len(tasks)))

    preferred = ("acc_norm,none", "acc,none")
    for n_loops, results in all_results.items():
        cells = []
        for task in tasks:
            task_scores = results.get(task, {})
            metric = next((m for m in preferred if m in task_scores), None)
            if metric is None:
                cells.append(f"{'N/A':<15}")
            else:
                cells.append(f"{task_scores[metric]:<15.4f}")
        print(f"{n_loops:<10}" + "".join(cells))


def main():
    """Command-line entry point: load a checkpoint, benchmark it, save results.

    Without ``--depth_extrapolation`` the model is evaluated once at
    ``cfg.max_loop_iters`` loops. With it, the model is evaluated at several
    loop counts and a summary table is printed. The results are always written
    to ``benchmark_results.json`` and, with ``--upload_results``, uploaded to
    the model repo on a best-effort basis.
    """
    parser = argparse.ArgumentParser(description="Ultron Benchmarking")
    parser.add_argument("--model_id", type=str, required=True,
                        help="HF Hub model ID (e.g., trojan0x/ultron-small-baseline)")
    parser.add_argument("--tasks", type=str, nargs="+",
                        default=["hellaswag", "arc_easy", "arc_challenge", "piqa", "winogrande", "boolq"])
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit eval samples per task (for quick testing)")
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--depth_extrapolation", action="store_true",
                        help="Test at multiple loop counts")
    parser.add_argument("--upload_results", action="store_true",
                        help="Upload results to the model repo")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token

    # Load model
    model, cfg = load_model(args.model_id, device)

    if args.depth_extrapolation:
        # Deduplicate and sort: when cfg.max_loop_iters equals one of the fixed
        # depths, a plain list would evaluate that depth twice.
        loop_counts = sorted({1, 2, 4, cfg.max_loop_iters, 12, 16})
        # The sweep runs every task once per depth, so it defaults to 200
        # samples per task unless --limit is given.
        sweep_limit = args.limit or 200
        all_results = {}

        for n_loops in loop_counts:
            print(f"\n--- Evaluating at {n_loops} loops ---")
            wrapper = UltronLMWrapper(model, cfg, n_loops=n_loops)
            results = evaluate(wrapper, tokenizer, args.tasks,
                               limit=sweep_limit, batch_size=args.batch_size)
            all_results[n_loops] = results
            print_results(results, f"n_loops = {n_loops}")

        _print_depth_summary(all_results, args.tasks)

        summary = {
            "model_id": args.model_id,
            "type": "depth_extrapolation",
            # Recorded so subsampled scores are not mistaken for full runs.
            "limit": sweep_limit,
            # JSON object keys must be strings.
            "results": {str(k): v for k, v in all_results.items()},
        }

    else:
        # Standard evaluation
        wrapper = UltronLMWrapper(model, cfg)
        results = evaluate(wrapper, tokenizer, args.tasks,
                           limit=args.limit, batch_size=args.batch_size)
        print_results(results, f"Benchmark Results: {args.model_id}")

        summary = {
            "model_id": args.model_id,
            "type": "standard",
            "n_loops": cfg.max_loop_iters,
            "limit": args.limit,
            "results": results,
        }

    # Save locally
    results_path = "benchmark_results.json"
    with open(results_path, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot encode natively.
        json.dump(summary, f, indent=2, default=str)
    print(f"\nResults saved to {results_path}")

    # Upload to Hub
    if args.upload_results:
        # Best-effort: the results are already on disk, so a failed upload is
        # reported but does not fail the run.
        try:
            api = HfApi()
            api.upload_file(
                path_or_fileobj=results_path,
                path_in_repo="benchmark_results.json",
                repo_id=args.model_id,
            )
            print(f"Results uploaded to {args.model_id}")
        except Exception as e:
            print(f"Upload failed: {e}")


# Run the benchmark CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()