#!/usr/bin/env python3
# ==========================================================
# High-speed multi-GPU evaluation for GLM-4.5-Air-HS adapters
# Uses 4ƗH200 for maximum throughput.
# ==========================================================
import gc
import os, json, math, torch, time
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from torch.utils.data import DataLoader, Dataset
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, barrier, destroy_process_group
import torch.distributed as dist

# ---------------- CONFIG ----------------
BASE_MODEL = "/workspace/Avinash/models/GLM-4.5-Air"
CHECKPOINT_DIR = "checkpoints"
DATA_PATH = "/workspace/Avinash/dataset/all_data.jsonl"
OUTPUT_PATH = "eval_scores.json"
MAX_SAMPLES = 1000        # subset for eval speed
BATCH_SIZE = 2            # safe for 80GB H200
SEQ_LEN = 2048
DTYPE = torch.bfloat16    # use bf16 for H200
# ----------------------------------------


class CodeDataset(Dataset):
    """Wrap a list of {"text": ...} records as tokenized 1-D input_id tensors.

    Each item is truncated to ``max_len`` tokens; padding is deferred to the
    collate function so batches only pad to their own longest sequence.
    """

    def __init__(self, data, tokenizer, max_len=2048):
        self.samples = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        text = self.samples[idx]["text"]
        tokens = self.tokenizer(
            text, truncation=True, max_length=self.max_len, return_tensors="pt"
        )
        # [0] drops the batch dim added by return_tensors="pt".
        return tokens["input_ids"][0]


def collate_fn(batch, pad_token_id=0):
    """Pad variable-length sequences and build attention masks and labels.

    Args:
        batch: list of 1-D LongTensors of token ids (variable lengths).
        pad_token_id: id used to right-pad shorter sequences.

    Returns:
        dict with "input_ids", "attention_mask" (1 = real token, 0 = pad)
        and "labels" — a copy of input_ids with pad positions set to -100
        so they are ignored by the model's cross-entropy loss.
    """
    lengths = [seq.size(0) for seq in batch]
    max_len = max(lengths)

    input_ids = []
    attention_masks = []
    for seq, seq_len in zip(batch, lengths):
        if seq_len < max_len:
            padding = torch.full((max_len - seq_len,), pad_token_id, dtype=seq.dtype)
            padded_seq = torch.cat([seq, padding], dim=0)
        else:
            padded_seq = seq
        mask = torch.zeros(max_len, dtype=torch.long)
        mask[:seq_len] = 1
        input_ids.append(padded_seq)
        attention_masks.append(mask)

    input_ids = torch.stack(input_ids, dim=0)
    attention_mask = torch.stack(attention_masks, dim=0)
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100  # mask pad tokens out of the loss
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


def load_subset(path, limit=MAX_SAMPLES):
    """Read up to ``limit`` JSONL records from ``path``, skipping bad lines."""
    data = []
    with open(path, "r") as f:
        for i, line in enumerate(f):
            if i >= limit:
                break
            try:
                data.append(json.loads(line))
            except Exception:
                # Best-effort: silently skip malformed JSONL lines.
                continue
    return data


def evaluate_checkpoint(ckpt_path, subset, rank, local_rank, world_size):
    """Evaluate one checkpoint - only rank 0 loads the model with device_map='auto'.

    Rank 0 shards the base model across all visible GPUs, attaches the PEFT
    adapter from ``ckpt_path``, and computes mean loss / perplexity over
    ``subset``. Non-zero ranks return None immediately; the caller is
    responsible for keeping them alive (barrier) until rank 0 finishes.

    Returns:
        dict with "avg_loss" and "perplexity" on rank 0, else None.
    """
    if rank != 0:
        # Other ranks do no work for this checkpoint; caller synchronizes.
        return None

    print(f"\nšŸš€ Evaluating {ckpt_path} on {world_size} GPUs", flush=True)
    print(f"šŸ“„ Loading base model with device_map='auto'...", flush=True)

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    if tokenizer.pad_token_id is None:
        if tokenizer.eos_token is None:
            raise ValueError("Tokenizer needs a pad_token or eos_token for batching.")
        tokenizer.pad_token = tokenizer.eos_token

    # Load model with automatic device mapping across all GPUs.
    # Use the module-level DTYPE constant (was hardcoded torch.bfloat16).
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=DTYPE,
        device_map="auto",  # Automatically shard across all GPUs
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )

    print(f"šŸ”§ Loading adapter from {ckpt_path}...", flush=True)
    model = PeftModel.from_pretrained(base, ckpt_path)
    model.eval()

    print(f"šŸ“Š Creating dataset and dataloader...", flush=True)
    dataset = CodeDataset(subset, tokenizer, max_len=SEQ_LEN)
    pad_token_id = tokenizer.pad_token_id

    # Bind the tokenizer's pad id into the collate function.
    def custom_collate(batch):
        return collate_fn(batch, pad_token_id=pad_token_id)

    loader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        pin_memory=True,
        num_workers=0,
        collate_fn=custom_collate,
    )

    total_loss = 0
    total_count = 0
    print(f"⚔ Starting evaluation...", flush=True)
    with torch.no_grad():
        for batch in tqdm(loader, ncols=100, desc="Evaluating"):
            # Move batch to first device (where the sharded model starts).
            first_device = next(model.parameters()).device
            batch = {k: v.to(first_device) for k, v in batch.items()}
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["labels"],
            )
            loss = outputs.loss.detach()
            # NOTE(review): HF loss is a per-token mean; weighting by batch
            # size approximates (not exactly equals) a token-weighted mean.
            batch_size = batch["input_ids"].size(0)
            total_loss += loss.item() * batch_size
            total_count += batch_size

    avg_loss = total_loss / max(total_count, 1)
    try:
        ppl = math.exp(avg_loss)
    except OverflowError:
        # A diverged checkpoint can push exp() past float range.
        ppl = float("inf")

    result = {
        "avg_loss": round(avg_loss, 4),
        "perplexity": round(ppl, 3),
    }
    print(
        f"āœ… {os.path.basename(ckpt_path)}: loss={avg_loss:.4f}, ppl={ppl:.2f}",
        flush=True,
    )

    # Clean up to free GPU/host memory before the next checkpoint.
    del loader
    del dataset
    del model
    del base
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

    return result


def main():
    """Entry point: set up torch.distributed and evaluate each checkpoint.

    Launched via torchrun; RANK / LOCAL_RANK / WORLD_SIZE come from the env.
    Only rank 0 performs the actual evaluation (sharding the model across
    all GPUs with device_map='auto'); other ranks synchronize at barriers.
    """
    rank = int(os.environ.get("RANK", 0))
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    world_size = int(os.environ.get("WORLD_SIZE", 1))

    # Set device BEFORE initializing process group.
    torch.cuda.set_device(local_rank)
    if not dist.is_initialized():
        init_process_group(backend="nccl")

    if rank == 0:
        print("šŸ” Loading subset of dataset...", flush=True)
    subset = load_subset(DATA_PATH)
    if rank == 0:
        print(f"Loaded {len(subset)} samples.", flush=True)

    # Rank 0 discovers checkpoints; the list (possibly empty) is broadcast
    # to every rank so all ranks take the same exit path.  (The original
    # returned early on rank 0 only, deadlocking other ranks at the barrier.)
    if rank == 0:
        target_checkpoints = [
            "checkpoint-5000",
            "checkpoint-6000",
            "checkpoint-7000",
            "final-checkpoint",
        ]
        checkpoints = []
        for ckpt_name in target_checkpoints:
            ckpt_path = os.path.join(CHECKPOINT_DIR, ckpt_name)
            if os.path.isdir(ckpt_path):
                checkpoints.append(ckpt_path)
            else:
                print(f"āš ļø Warning: {ckpt_name} not found", flush=True)
        if not checkpoints:
            print(f"āš ļø No target checkpoints found in {CHECKPOINT_DIR}", flush=True)
        else:
            print(f"šŸ“ Found {len(checkpoints)} checkpoints to evaluate", flush=True)
            print(f"šŸ“‹ Checkpoints: {checkpoints}", flush=True)
    else:
        checkpoints = None

    if rank == 0:
        print("šŸ”„ Broadcasting checkpoint list to all ranks...", flush=True)
    dist.barrier()

    if world_size > 1:
        checkpoint_obj = [checkpoints] if rank == 0 else [None]
        dist.broadcast_object_list(checkpoint_obj, src=0)
        checkpoints = checkpoint_obj[0]
        if rank == 0:
            print(f"āœ… All ranks have checkpoint list", flush=True)

    # Every rank sees the same (possibly empty) list and exits together.
    if not checkpoints:
        destroy_process_group()
        return

    all_results = {}
    start_time = time.time()
    for ckpt in checkpoints:
        result = evaluate_checkpoint(ckpt, subset, rank, local_rank, world_size)
        # Only rank 0 saves results.
        if rank == 0 and result is not None:
            ckpt_name = os.path.basename(ckpt)
            all_results[ckpt_name] = result
            # Save interim results so a crash doesn't lose finished evals.
            with open(OUTPUT_PATH, "w") as f:
                json.dump(all_results, f, indent=2)
            print(f"šŸ’¾ Interim results saved to {OUTPUT_PATH}", flush=True)

    if rank == 0:
        total_mins = (time.time() - start_time) / 60
        print(f"\nšŸ All evaluations done in {total_mins:.1f} min.")
        print(f"šŸ“Š Final results saved at {OUTPUT_PATH}")
        print("\nšŸ“ˆ Results sorted by perplexity:")
        for ckpt_name, metrics in sorted(
            all_results.items(), key=lambda x: x[1]["perplexity"]
        ):
            print(f" {ckpt_name}: loss={metrics['avg_loss']}, ppl={metrics['perplexity']}")

    # Keep non-zero ranks alive until rank 0 finishes evaluating, then
    # tear down together (the original destroyed without synchronizing).
    dist.barrier()
    destroy_process_group()


if __name__ == "__main__":
    main()