IvmeLabs
/

Ivme-Conversate-22M-Base

+"""
+Eval harness for İvme-Conversate.
+Wraps the custom model + tokenizer in an lm-eval compatible interface and runs
+HellaSwag and ARC-Easy — the two benchmarks scored on the Tiny-ML leaderboard.
+Usage:
+    python eval.py --checkpoint checkpoints/ivme_base_ema.pt
+    python eval.py --checkpoint checkpoints/ivme_base_ema.pt --tasks hellaswag,arc_easy
+    python eval.py --checkpoint checkpoints/ivme_base_ema.pt --tasks hellaswag,arc_easy,piqa
+Requirements:
+    pip install lm-eval tokenizers torch
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+import torch
+import numpy as np
+from tokenizers import Tokenizer
+# lm-eval imports
+from lm_eval.api.model import LM
+from lm_eval.api.instance import Instance
+import lm_eval
+# Local
+sys.path.insert(0, ".")
+from model import IvmeConfig, IvmeConversate
+TOKENIZER_PATH = "ivme_tokenizer.json"
+DEFAULT_TASKS = "hellaswag,arc_easy"
+# --------------------------------------------------------------------------- #
+# lm-eval wrapper
+# --------------------------------------------------------------------------- #
class IvmeLM(LM):
    """Adapter exposing an İvme-Conversate checkpoint through lm-eval's ``LM`` API.

    The wrapper owns the tokenizer and the model, and implements the three
    request types lm-eval dispatches: ``loglikelihood``,
    ``loglikelihood_rolling`` and ``generate_until``. Requests are scored one
    sequence at a time (no padding), so ``batch_size`` only controls how the
    request list is sliced.
    """

    def __init__(self, checkpoint_path: str, device: str = "cuda", batch_size: int = 32):
        """Load the tokenizer from ``TOKENIZER_PATH`` and the model weights.

        Args:
            checkpoint_path: Path to a checkpoint readable by ``torch.load``
                that holds a ``"cfg"`` entry (model config object) and a
                ``"model"`` entry (state dict).
            device: Requested torch device. Replaced by ``"cpu"`` whenever
                CUDA is not available.
            batch_size: Number of requests passed to
                ``_loglikelihood_batch`` per call.
        """
        super().__init__()
        self._device = torch.device(device if torch.cuda.is_available() else "cpu")
        self._batch_size = batch_size

        # Load tokenizer
        print(f"[eval] loading tokenizer from {TOKENIZER_PATH}")
        self._tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
        # Evaluation needs the raw, unpadded, untruncated token stream.
        self._tokenizer.no_truncation()
        self._tokenizer.no_padding()
        self.vocab_size = self._tokenizer.get_vocab_size()
        # None when the vocabulary has no "<|eos|>" entry; callers must check.
        self.eos_token_id = self._tokenizer.token_to_id("<|eos|>")

        # Load model
        print(f"[eval] loading model from {checkpoint_path}")
        # SECURITY NOTE: weights_only=False unpickles arbitrary objects (it is
        # needed here because the checkpoint stores the config object itself).
        # Only load checkpoints from a trusted source.
        ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
        cfg = ckpt["cfg"]
        # Force SDPA for eval — no training kernels needed, wider compatibility
        cfg.attn_backend = "sdpa"
        self._model = IvmeConversate(cfg)
        self._model.load_state_dict(ckpt["model"])
        self._model.to(self._device)
        self._model.eval()
        n = self._model.num_params()
        print(f"[eval] model loaded: {n/1e6:.1f}M params on {self._device}")

    @property
    def max_length(self):
        """Maximum sequence length, read from the model config."""
        return self._model.cfg.max_seq_len

    @property
    def max_gen_toks(self):
        """Default number of new tokens for ``generate_until`` requests."""
        return 256

    def tok_encode(self, text: str) -> list[int]:
        """Return the token ids the tokenizer produces for ``text``."""
        return self._tokenizer.encode(text).ids

    def tok_decode(self, tokens: list[int]) -> str:
        """Return the string the tokenizer decodes from ``tokens``."""
        return self._tokenizer.decode(tokens)

    # ---- Internal helpers --------------------------------------------------- #
    def _prefix_ids(self) -> list[int]:
        """Return ``[eos_token_id]`` when the tokenizer defines it, else ``[]``.

        Used as a stand-in context so the first real token of a sequence has a
        position to be predicted from.
        NOTE(review): assumes ``<|eos|>`` is an acceptable sequence-start
        token for this model — confirm against the training setup.
        """
        return [] if self.eos_token_id is None else [self.eos_token_id]

    def _forward_logits(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Run the model on ``input_ids`` without gradients and return logits.

        bfloat16 autocast is enabled only when running on CUDA. The model is
        expected to return a 2-tuple whose first element is the logits tensor
        (callers index it as ``[batch, position, vocab]``).
        """
        with torch.no_grad():
            with torch.autocast(device_type=self._device.type,
                                dtype=torch.bfloat16,
                                enabled=self._device.type == "cuda"):
                logits, _ = self._model(input_ids)
        return logits

    # ---- Required lm-eval interface methods -------------------------------- #
    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        """Compute log-likelihood of each (context, continuation) pair.

        Returns one ``(sum_logprob, is_greedy)`` tuple per request, in request
        order.
        """
        results = []
        for i in range(0, len(requests), self._batch_size):
            batch = requests[i : i + self._batch_size]
            results.extend(self._loglikelihood_batch(batch))
        return results

    def _loglikelihood_batch(self, batch: list[Instance]) -> list[tuple[float, bool]]:
        """Score every request in ``batch`` (one forward pass per request).

        For each request the summed log-probability of the continuation tokens
        is returned together with a flag telling whether greedy decoding would
        have produced exactly those tokens. Requests with nothing scorable
        yield ``(-inf, False)``.
        """
        results = []
        for req in batch:
            context, continuation = req.args
            # CRITICAL: tokenize context+continuation JOINTLY. With ByteLevel BPE,
            # tokenizing the continuation alone mishandles the leading space and
            # word-boundary merges, so the scored tokens wouldn't match what the
            # model actually predicts in context. The context is encoded alone
            # only to measure where the continuation span starts.
            ctx_ids = self.tok_encode(context)
            full_ids = self.tok_encode(context + continuation)
            cont_len = len(full_ids) - len(ctx_ids)

            # Guard: joint tokenization can merge across the boundary leaving
            # cont_len=0 or even negative. Fall back to scoring the last token.
            if cont_len <= 0:
                cont_len = 1
                if len(full_ids) < cont_len + 1:
                    # Sequence too short to score anything meaningful — skip.
                    results.append((-float("inf"), False))
                    continue

            all_ids = full_ids
            # The continuation covers the whole sequence (e.g. empty context):
            # without a leading token there is no logit that predicts the first
            # continuation token, so prepend a prefix token when one exists.
            if cont_len >= len(all_ids):
                all_ids = self._prefix_ids() + all_ids

            # Truncate from the left if too long, keeping the end of the sequence.
            if len(all_ids) > self.max_length:
                all_ids = all_ids[-self.max_length:]

            # Only tokens that have at least one preceding position can be
            # scored; clamp so logits and targets always stay aligned (this
            # also covers continuations longer than the model window).
            cont_len = min(cont_len, len(all_ids) - 1)
            if cont_len <= 0:
                results.append((-float("inf"), False))
                continue

            input_ids = torch.tensor([all_ids], dtype=torch.long, device=self._device)
            logits = self._forward_logits(input_ids)

            # logits[:, i, :] predicts the token at position i+1, so to score the
            # last cont_len tokens we read logits at [len-cont_len-1 : len-1].
            cont_targets = torch.tensor(all_ids[-cont_len:], dtype=torch.long,
                                        device=self._device)
            start = len(all_ids) - cont_len - 1  # >= 0 thanks to the clamp above
            cont_logits = logits[0, start : start + cont_len, :]   # (cont_len, vocab)
            log_probs = torch.nn.functional.log_softmax(cont_logits.float(), dim=-1)
            token_log_probs = log_probs[range(cont_len), cont_targets]
            total_log_prob = token_log_probs.sum().item()
            greedy = (cont_logits.argmax(dim=-1) == cont_targets).all().item()
            results.append((total_log_prob, bool(greedy)))
        return results

    def loglikelihood_rolling(self, requests: list[Instance]) -> list[float]:
        """Compute rolling log-likelihood for perplexity tasks.

        Each text is tokenized, prefixed with the EOS token when the tokenizer
        defines one (so the first real token is scored too), and processed in
        consecutive windows of at most ``max_length`` input tokens. Every
        token after the first position is scored exactly once. Returns the
        summed log-likelihood per request; texts with nothing scorable give
        ``0.0``.
        """
        results = []
        window = self.max_length
        for req in requests:
            text = req.args[0]
            ids = self._prefix_ids() + self.tok_encode(text)
            total_ll = 0.0
            # Windows overlap by one token: the last target of one window is
            # the first input of the next. `start < len(ids) - 1` guarantees
            # every chunk holds at least one (input, target) pair.
            for start in range(0, len(ids) - 1, window):
                chunk = ids[start : start + window + 1]
                inp = torch.tensor([chunk[:-1]], dtype=torch.long, device=self._device)
                tgt = torch.tensor(chunk[1:], dtype=torch.long, device=self._device)
                logits = self._forward_logits(inp)
                log_probs = torch.nn.functional.log_softmax(logits[0].float(), dim=-1)
                total_ll += log_probs[range(len(tgt)), tgt].sum().item()
            results.append(total_ll)
        return results

    def generate_until(self, requests: list[Instance]) -> list[str]:
        """Greedy generation until stop string (used by some tasks).

        For each request ``(context, gen_kwargs)``: ``gen_kwargs["until"]``
        may be a single string or a list of strings (default ``["<|eos|>"]``),
        and ``gen_kwargs["max_gen_toks"]`` overrides ``max_gen_toks``. The
        returned text is the generated continuation only, cut at the EOS token
        and at the earliest occurrence of any stop string.
        """
        results = []
        for req in requests:
            context, gen_kwargs = req.args
            until = gen_kwargs.get("until", ["<|eos|>"])
            # A bare string must not be iterated character by character.
            if isinstance(until, str):
                until = [until]
            elif until is None:
                until = []
            max_new = gen_kwargs.get("max_gen_toks", self.max_gen_toks)

            # An empty context would give a zero-length input; fall back to
            # the prefix token when the tokenizer defines one.
            ctx_ids = self.tok_encode(context) or self._prefix_ids()
            # Keep the most recent context so that context + new tokens fit in
            # the model window. NOTE(review): `generate` is defined elsewhere
            # and may already crop internally — confirm.
            max_ctx = max(1, self.max_length - max_new)
            ctx_ids = ctx_ids[-max_ctx:]

            ids = torch.tensor([ctx_ids], dtype=torch.long, device=self._device)
            with torch.no_grad():
                out = self._model.generate(ids, max_new_tokens=max_new,
                                           temperature=1.0, top_k=1)  # greedy
            new_ids = out[0, ids.shape[1]:].tolist()

            # Stop at the EOS token id itself: decoding may drop special
            # tokens (the tokenizers default), in which case the "<|eos|>"
            # stop string could never match the decoded text.
            if self.eos_token_id is not None and self.eos_token_id in new_ids:
                new_ids = new_ids[: new_ids.index(self.eos_token_id)]
            text = self.tok_decode(new_ids)

            # Cut at the earliest stop string; empty stops are ignored because
            # "" matches at index 0 and would erase the whole output.
            cut_points = [text.index(stop) for stop in until if stop and stop in text]
            if cut_points:
                text = text[: min(cut_points)]
            results.append(text)
        return results
+# --------------------------------------------------------------------------- #
+# Main
+# --------------------------------------------------------------------------- #
def main():
    """Parse CLI arguments, run the requested lm-eval tasks and report results.

    Prints a per-task accuracy summary to stdout and writes the full
    ``results["results"]`` mapping as JSON to the ``--output`` path.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--checkpoint", required=True)
    ap.add_argument("--tasks", default=DEFAULT_TASKS)
    ap.add_argument("--batch_size", type=int, default=32)
    ap.add_argument("--device", default="cuda")
    ap.add_argument("--output", default="eval_results.json")
    args = ap.parse_args()

    # Validate the task list before the (slow) checkpoint load. Empty entries
    # from stray commas such as "hellaswag," are dropped.
    task_list = [name for name in (t.strip() for t in args.tasks.split(",")) if name]
    if not task_list:
        ap.error("--tasks must name at least one task")

    model = IvmeLM(args.checkpoint, device=args.device, batch_size=args.batch_size)
    print(f"\n[eval] running tasks: {task_list}")
    results = lm_eval.simple_evaluate(
        model=model,
        tasks=task_list,
        num_fewshot=0,       # zero-shot, matching the leaderboard
        batch_size=args.batch_size,
        log_samples=False,
    )

    # Print a clean summary
    print("\n" + "=" * 52)
    print("  İvme-Conversate Eval Results")
    print("=" * 52)
    for task, metrics in results["results"].items():
        # Prefer plain accuracy, then normalized accuracy. Compare against
        # None explicitly so a genuine 0.0 score is not treated as missing.
        acc = metrics.get("acc,none")
        if acc is None:
            acc = metrics.get("acc_norm,none")
        # Tasks without an accuracy metric (e.g. perplexity tasks) are shown
        # as "n/a" rather than a misleading 0.00%.
        shown = "n/a" if acc is None else f"{acc*100:.2f}%"
        print(f"  {task:<20} {shown}")
    print("=" * 52)
    print(f"  Model params : {model._model.num_params()/1e6:.1f}M")
    print(f"  Checkpoint   : {args.checkpoint}")
    print(f"  Eval mode    : zero-shot")
    print("=" * 52)

    # Save full results for the model card / leaderboard PR
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(results["results"], f, indent=2)
    print(f"\n[eval] full results saved -> {args.output}")


if __name__ == "__main__":
    main()