# File size: 9,546 Bytes
#
# 44217ec

"""
Eval harness for İvme-Conversate.

Wraps the custom model + tokenizer in an lm-eval compatible interface and runs
HellaSwag and ARC-Easy — the two benchmarks scored on the Tiny-ML leaderboard.

Usage:
    python eval.py --checkpoint checkpoints/ivme_base_ema.pt
    python eval.py --checkpoint checkpoints/ivme_base_ema.pt --tasks hellaswag,arc_easy
    python eval.py --checkpoint checkpoints/ivme_base_ema.pt --tasks hellaswag,arc_easy,piqa

Requirements:
    pip install lm-eval tokenizers torch
"""

from __future__ import annotations

import argparse
import json
import sys
import torch
import numpy as np
from tokenizers import Tokenizer

# lm-eval imports
from lm_eval.api.model import LM
from lm_eval.api.instance import Instance
import lm_eval

# Local
sys.path.insert(0, ".")
from model import IvmeConfig, IvmeConversate

TOKENIZER_PATH = "ivme_tokenizer.json"
DEFAULT_TASKS = "hellaswag,arc_easy"


# --------------------------------------------------------------------------- #
# lm-eval wrapper
# --------------------------------------------------------------------------- #
class IvmeLM(LM):
    """lm-eval adapter around an İvme-Conversate checkpoint and its tokenizer.

    Implements the three request types lm-eval dispatches to a model:
    ``loglikelihood``, ``loglikelihood_rolling`` and ``generate_until``.
    Sequences are run through the model one at a time; ``batch_size`` only
    controls how requests are chunked in ``loglikelihood``.
    """

    def __init__(self, checkpoint_path: str, device: str = "cuda", batch_size: int = 32):
        """Load the tokenizer from ``TOKENIZER_PATH`` and the model from a checkpoint.

        Args:
            checkpoint_path: Path to a ``torch.save``-d dict holding ``"cfg"``
                (the model config object) and ``"model"`` (the state dict).
            device: Requested torch device; CPU is used when CUDA is unavailable.
            batch_size: Number of requests handed to ``_loglikelihood_batch`` at once.
        """
        super().__init__()
        self._device = torch.device(device if torch.cuda.is_available() else "cpu")
        self._batch_size = batch_size

        # Load tokenizer
        print(f"[eval] loading tokenizer from {TOKENIZER_PATH}")
        self._tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
        self._tokenizer.no_truncation()
        self._tokenizer.no_padding()
        self.vocab_size = self._tokenizer.get_vocab_size()
        # May be None if the tokenizer has no "<|eos|>" entry; every user of
        # this attribute below checks for that.
        self.eos_token_id = self._tokenizer.token_to_id("<|eos|>")

        # Load model
        print(f"[eval] loading model from {checkpoint_path}")
        # SECURITY: weights_only=False unpickles arbitrary Python objects (it is
        # needed here because the checkpoint stores the config object under
        # "cfg"). Only load checkpoints from a trusted source.
        ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
        cfg = ckpt["cfg"]
        # Force SDPA for eval — no training kernels needed, wider compatibility
        cfg.attn_backend = "sdpa"
        self._model = IvmeConversate(cfg)
        self._model.load_state_dict(ckpt["model"])
        self._model.to(self._device)
        self._model.eval()
        n = self._model.num_params()
        print(f"[eval] model loaded: {n/1e6:.1f}M params on {self._device}")

    @property
    def max_length(self):
        """Maximum number of tokens the model accepts in one forward pass."""
        return self._model.cfg.max_seq_len

    @property
    def max_gen_toks(self):
        """Default cap on newly generated tokens for ``generate_until``."""
        return 256

    def tok_encode(self, text: str) -> list[int]:
        """Tokenize ``text`` and return its token ids."""
        return self._tokenizer.encode(text).ids

    def tok_decode(self, tokens: list[int]) -> str:
        """Decode token ids back into a string."""
        return self._tokenizer.decode(tokens)

    # ---- Internal helpers --------------------------------------------------- #

    def _forward_logits(self, token_ids: list[int]) -> torch.Tensor:
        """Run one sequence through the model and return its per-position logits.

        The batch dimension is dropped, so row ``i`` of the result is the
        prediction for the token that follows ``token_ids[i]``. bfloat16
        autocast is enabled on CUDA only.
        """
        input_ids = torch.tensor([token_ids], dtype=torch.long, device=self._device)
        with torch.no_grad(), torch.autocast(
            device_type=self._device.type,
            dtype=torch.bfloat16,
            enabled=self._device.type == "cuda",
        ):
            logits, _ = self._model(input_ids)
        return logits[0]

    # ---- Required lm-eval interface methods -------------------------------- #

    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        """Compute log-likelihood of each (context, continuation) pair.

        Returns one ``(log_prob, is_greedy)`` tuple per request, in request order.
        """
        results = []
        for i in range(0, len(requests), self._batch_size):
            batch = requests[i : i + self._batch_size]
            results.extend(self._loglikelihood_batch(batch))
        return results

    def _loglikelihood_batch(self, batch: list[Instance]) -> list[tuple[float, bool]]:
        """Score each request in ``batch``; see ``loglikelihood`` for the contract.

        Returns ``(-inf, False)`` for a request that leaves nothing scoreable.
        """
        results = []
        for req in batch:
            context, continuation = req.args

            # CRITICAL: tokenize context+continuation JOINTLY. With ByteLevel BPE,
            # tokenizing the continuation alone mishandles the leading space and
            # word-boundary merges, so the scored tokens wouldn't match what the
            # model actually predicts in context. We find the continuation's token
            # span by encoding the context alone only to measure its length.
            ctx_ids = self.tok_encode(context)
            full_ids = self.tok_encode(context + continuation)
            cont_len = len(full_ids) - len(ctx_ids)

            # Guard: joint tokenization can merge across the boundary leaving
            # cont_len=0 or even negative. Fall back to scoring the last token.
            if cont_len <= 0:
                cont_len = 1
                if len(full_ids) < 2:
                    # Sequence too short to score anything meaningful — skip.
                    results.append((-float("inf"), False))
                    continue

            tokens = full_ids
            if cont_len >= len(tokens):
                # Nothing precedes the continuation (e.g. empty context), so the
                # first continuation token has no logit predicting it. Condition
                # on EOS, as lm-eval does for empty contexts; without an EOS id
                # the first token simply cannot be scored.
                if self.eos_token_id is not None:
                    tokens = [self.eos_token_id] + tokens
                else:
                    cont_len = len(tokens) - 1
                    if cont_len < 1:
                        results.append((-float("inf"), False))
                        continue

            # Keep the most recent max_length inputs plus the final target.
            # The last token is only ever a target, so it is not fed to the model.
            window = tokens[-(self.max_length + 1):]
            # A continuation longer than the window can only be scored on the
            # part that still has a preceding input token.
            n_scored = min(cont_len, len(window) - 1)

            logits = self._forward_logits(window[:-1])

            # Row i of `logits` predicts window[i + 1], so the last n_scored rows
            # line up exactly with the last n_scored tokens of the window.
            cont_targets = torch.tensor(
                window[-n_scored:], dtype=torch.long, device=self._device
            )
            cont_logits = logits[-n_scored:, :]   # (n_scored, vocab)

            log_probs = torch.nn.functional.log_softmax(cont_logits.float(), dim=-1)
            token_log_probs = log_probs.gather(-1, cont_targets.unsqueeze(-1)).squeeze(-1)
            total_log_prob = token_log_probs.sum().item()

            greedy = (cont_logits.argmax(dim=-1) == cont_targets).all().item()
            results.append((total_log_prob, bool(greedy)))

        return results

    def loglikelihood_rolling(self, requests: list[Instance]) -> list[float]:
        """Compute rolling log-likelihood for perplexity tasks.

        The text is scored in consecutive, non-overlapping windows of at most
        ``max_length`` predictions. When the tokenizer defines ``<|eos|>`` it is
        prepended so that the first text token is scored too.
        """
        results = []
        for req in requests:
            text = req.args[0]
            ids = self.tok_encode(text)
            if self.eos_token_id is not None:
                # lm-eval's contract is the likelihood of the *entire* text
                # conditioned on the end-of-text token.
                ids = [self.eos_token_id] + ids
            total_ll = 0.0
            # Each window feeds ids[start : start+max_length] and predicts the
            # tokens one position to the right; fewer than two ids -> no window.
            for start in range(0, len(ids) - 1, self.max_length):
                chunk = ids[start : start + self.max_length + 1]
                logits = self._forward_logits(chunk[:-1])
                tgt = torch.tensor(chunk[1:], dtype=torch.long, device=self._device)
                log_probs = torch.nn.functional.log_softmax(logits.float(), dim=-1)
                total_ll += log_probs.gather(-1, tgt.unsqueeze(-1)).sum().item()
            results.append(total_ll)
        return results

    def generate_until(self, requests: list[Instance]) -> list[str]:
        """Greedy generation until stop string (used by some tasks).

        Each request carries ``(context, gen_kwargs)``; ``gen_kwargs["until"]``
        may be a single string or a list of strings, and ``"max_gen_toks"``
        overrides the default generation budget.
        """
        results = []
        for req in requests:
            context, gen_kwargs = req.args
            gen_kwargs = gen_kwargs or {}

            until = gen_kwargs.get("until", ["<|eos|>"])
            if until is None:
                until = ["<|eos|>"]
            elif isinstance(until, str):
                # A bare string must not be iterated character by character.
                until = [until]
            max_new = int(gen_kwargs.get("max_gen_toks", self.max_gen_toks))

            prompt = self.tok_encode(context)
            if not prompt and self.eos_token_id is not None:
                # An empty prompt gives the model nothing to condition on.
                prompt = [self.eos_token_id]
            # Never feed more than the model window; keep the most recent tokens.
            prompt = prompt[-self.max_length:]

            ids = torch.tensor([prompt], dtype=torch.long, device=self._device)
            with torch.no_grad():
                out = self._model.generate(ids, max_new_tokens=max_new,
                                           temperature=1.0, top_k=1)  # greedy
            new_ids = out[0, ids.shape[1]:].tolist()

            # Cut at EOS on the token level: decoding may drop special tokens,
            # in which case the "<|eos|>" stop string would never match the text.
            if self.eos_token_id is not None and self.eos_token_id in new_ids:
                new_ids = new_ids[: new_ids.index(self.eos_token_id)]

            text = self.tok_decode(new_ids)
            for stop in until:
                # Skip empty stops: "" is found in every string and would wipe
                # the whole output.
                if stop and stop in text:
                    text = text[:text.index(stop)]
            results.append(text)
        return results


# --------------------------------------------------------------------------- #
# Main
# --------------------------------------------------------------------------- #
def main():
    """Parse CLI arguments, run the requested lm-eval tasks, and report results.

    Prints a per-task accuracy summary and writes the full per-task metrics
    as JSON to the path given by ``--output``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--checkpoint", required=True)
    ap.add_argument("--tasks", default=DEFAULT_TASKS)
    ap.add_argument("--batch_size", type=int, default=32)
    ap.add_argument("--device", default="cuda")
    ap.add_argument("--output", default="eval_results.json")
    args = ap.parse_args()

    # Drop empty entries so a stray or trailing comma does not become a task
    # named "", and fail before the (slow) model load if nothing is left.
    task_list = [name for name in (t.strip() for t in args.tasks.split(",")) if name]
    if not task_list:
        ap.error("--tasks must list at least one task name")

    model = IvmeLM(args.checkpoint, device=args.device, batch_size=args.batch_size)

    print(f"\n[eval] running tasks: {task_list}")
    results = lm_eval.simple_evaluate(
        model=model,
        tasks=task_list,
        num_fewshot=0,       # zero-shot, matching the leaderboard
        batch_size=args.batch_size,
        log_samples=False,
    )
    if results is None:
        # NOTE(review): simple_evaluate appears to return None on non-primary
        # ranks in distributed runs — nothing to report from this process.
        return

    # Print a clean summary
    print("\n" + "=" * 52)
    print("  İvme-Conversate Eval Results")
    print("=" * 52)
    for task, metrics in results["results"].items():
        # Explicit None checks: a genuine accuracy of 0.0 is falsy and must not
        # be replaced by acc_norm or by the fallback.
        acc = metrics.get("acc,none")
        if acc is None:
            acc = metrics.get("acc_norm,none")
        if acc is None:
            acc = 0.0
        print(f"  {task:<20} {acc*100:.2f}%")
    print("=" * 52)
    print(f"  Model params : {model._model.num_params()/1e6:.1f}M")
    print(f"  Checkpoint   : {args.checkpoint}")
    print("  Eval mode    : zero-shot")
    print("=" * 52)

    # Save full results for the model card / leaderboard PR
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(results["results"], f, indent=2)
    print(f"\n[eval] full results saved -> {args.output}")


if __name__ == "__main__":
    main()