"""
Eval harness for İvme-Conversate.
Wraps the custom model + tokenizer in an lm-eval compatible interface and runs
HellaSwag and ARC-Easy — the two benchmarks scored on the Tiny-ML leaderboard.
Usage:
python eval.py --checkpoint checkpoints/ivme_base_ema.pt
python eval.py --checkpoint checkpoints/ivme_base_ema.pt --tasks hellaswag,arc_easy
python eval.py --checkpoint checkpoints/ivme_base_ema.pt --tasks hellaswag,arc_easy,piqa
Requirements:
pip install lm-eval tokenizers torch
"""
from __future__ import annotations
import argparse
import json
import sys
import torch
import numpy as np
from tokenizers import Tokenizer
# lm-eval imports
from lm_eval.api.model import LM
from lm_eval.api.instance import Instance
import lm_eval
# Local
sys.path.insert(0, ".")
from model import IvmeConfig, IvmeConversate
# Path of the serialized tokenizer that IvmeLM.__init__ loads with
# Tokenizer.from_file. It is a relative path, so it is resolved against the
# current working directory of the process.
TOKENIZER_PATH = "ivme_tokenizer.json"
# Comma-separated task names used as the default value of the --tasks option.
DEFAULT_TASKS = "hellaswag,arc_easy"
# --------------------------------------------------------------------------- #
# lm-eval wrapper
# --------------------------------------------------------------------------- #
class IvmeLM(LM):
    """Adapter exposing an İvme-Conversate checkpoint through the lm-eval ``LM`` API.

    The wrapper owns the tokenizer (loaded from ``TOKENIZER_PATH``) and the
    model (rebuilt from the ``"cfg"`` and ``"model"`` entries of the checkpoint)
    and implements the three request types lm-eval dispatches:
    ``loglikelihood``, ``loglikelihood_rolling`` and ``generate_until``.
    """

    def __init__(self, checkpoint_path: str, device: str = "cuda", batch_size: int = 32):
        """Load the tokenizer and the checkpoint and put the model in eval mode.

        Args:
            checkpoint_path: File readable by ``torch.load`` that holds a
                ``"cfg"`` entry (model config) and a ``"model"`` entry
                (state dict).
            device: Requested torch device. A CUDA request falls back to CPU
                when CUDA is unavailable; any other device string is used as
                given instead of being silently forced to CPU.
            batch_size: Number of requests handed to ``_loglikelihood_batch``
                per call.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        super().__init__()
        batch_size = int(batch_size)
        if batch_size < 1:
            # A zero/negative step would make the batching loop fail or
            # silently produce no results at all.
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        if str(device).startswith("cuda") and not torch.cuda.is_available():
            device = "cpu"
        self._device = torch.device(device)
        self._batch_size = batch_size

        # Load tokenizer
        print(f"[eval] loading tokenizer from {TOKENIZER_PATH}")
        self._tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
        self._tokenizer.no_truncation()
        self._tokenizer.no_padding()
        self.vocab_size = self._tokenizer.get_vocab_size()
        # May be None when the vocabulary has no "<|eos|>" entry; every user of
        # this attribute below checks for that.
        self.eos_token_id = self._tokenizer.token_to_id("<|eos|>")

        # Load model
        print(f"[eval] loading model from {checkpoint_path}")
        # SECURITY NOTE: weights_only=False unpickles arbitrary objects (needed
        # here because the checkpoint stores the config object). Only load
        # checkpoints from a trusted source.
        ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
        cfg = ckpt["cfg"]
        # Force SDPA for eval — no training kernels needed, wider compatibility
        cfg.attn_backend = "sdpa"
        self._model = IvmeConversate(cfg)
        self._model.load_state_dict(ckpt["model"])
        self._model.to(self._device)
        self._model.eval()
        n = self._model.num_params()
        print(f"[eval] model loaded: {n/1e6:.1f}M params on {self._device}")

    @property
    def max_length(self):
        """Maximum sequence length, read from the model config (``max_seq_len``)."""
        return self._model.cfg.max_seq_len

    @property
    def max_gen_toks(self):
        """Default cap on newly generated tokens for ``generate_until``."""
        return 256

    def tok_encode(self, text: str) -> list[int]:
        """Return the token ids the tokenizer produces for ``text``."""
        return self._tokenizer.encode(text).ids

    def tok_decode(self, tokens: list[int]) -> str:
        """Return the text the tokenizer decodes from ``tokens``."""
        return self._tokenizer.decode(tokens)

    # ---- Internal helpers -------------------------------------------------- #
    def _forward_logits(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Run one gradient-free forward pass and return the logits.

        bfloat16 autocast is applied on CUDA only; other devices run the model
        in its native dtype.

        Raises:
            RuntimeError: If the model does not return one logit row per input
                position, which the scoring code below relies on.
        """
        with torch.no_grad():
            if self._device.type == "cuda":
                with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                    logits, _ = self._model(input_ids)
            else:
                logits, _ = self._model(input_ids)
        if logits.dim() != 3 or logits.shape[1] != input_ids.shape[1]:
            raise RuntimeError(
                "model must return logits of shape (batch, seq, vocab) for every "
                f"input position; got {tuple(logits.shape)} for input "
                f"{tuple(input_ids.shape)}"
            )
        return logits

    def _score_pair(self, context: str, continuation: str) -> tuple[float, bool]:
        """Return ``(log-likelihood, is_greedy)`` of ``continuation`` given ``context``."""
        # CRITICAL: tokenize context+continuation JOINTLY. With ByteLevel BPE,
        # tokenizing the continuation alone mishandles the leading space and
        # word-boundary merges, so the scored tokens wouldn't match what the
        # model actually predicts in context. The context is encoded on its own
        # only to measure how many leading tokens belong to it.
        ctx_len = len(self.tok_encode(context))
        full_ids = self.tok_encode(context + continuation)
        if ctx_len == 0 and self.eos_token_id is not None:
            # Nothing to condition on: follow the lm-eval convention and
            # condition on the EOS/prefix token instead of giving up with -inf.
            full_ids = [self.eos_token_id] + full_ids
            ctx_len = 1
        cont_len = len(full_ids) - ctx_len
        # Guard: joint tokenization can merge across the boundary leaving
        # cont_len=0 or even negative. Fall back to scoring the last token.
        if cont_len <= 0:
            cont_len = 1
        if len(full_ids) < cont_len + 1:
            # Sequence too short to score anything meaningful — skip.
            return (-float("inf"), False)

        # Truncate from the left if too long, keeping the end of the sequence.
        all_ids = full_ids[-self.max_length:]
        # Every scored token needs at least one preceding token inside the
        # window, otherwise logits and targets would be shifted against each
        # other. If the continuation alone fills the window, score its tail.
        cont_len = min(cont_len, len(all_ids) - 1)
        if cont_len < 1:
            return (-float("inf"), False)

        input_ids = torch.tensor([all_ids], dtype=torch.long, device=self._device)
        logits = self._forward_logits(input_ids)

        # logits[:, i, :] predicts the token at position i+1, so the last
        # cont_len tokens are scored by logits at [len-cont_len-1 : len-1].
        targets = torch.tensor(all_ids[-cont_len:], dtype=torch.long, device=self._device)
        start = len(all_ids) - cont_len - 1
        cont_logits = logits[0, start : start + cont_len, :].float()  # (cont_len, vocab)
        log_probs = torch.nn.functional.log_softmax(cont_logits, dim=-1)
        token_log_probs = log_probs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)
        is_greedy = (cont_logits.argmax(dim=-1) == targets).all().item()
        return (token_log_probs.sum().item(), bool(is_greedy))

    # ---- Required lm-eval interface methods -------------------------------- #
    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        """Compute log-likelihood of each (context, continuation) pair."""
        results = []
        for i in range(0, len(requests), self._batch_size):
            results.extend(self._loglikelihood_batch(requests[i : i + self._batch_size]))
        return results

    def _loglikelihood_batch(self, batch: list[Instance]) -> list[tuple[float, bool]]:
        """Score every request of ``batch``; requests are run one at a time."""
        return [self._score_pair(*req.args) for req in batch]

    def loglikelihood_rolling(self, requests: list[Instance]) -> list[float]:
        """Compute rolling log-likelihood for perplexity tasks.

        The whole text is scored. When the tokenizer has an ``<|eos|>`` id, it
        is prepended so that the first token is conditioned on it rather than
        being left out of the sum.
        """
        window = self.max_length
        results = []
        for req in requests:
            ids = self.tok_encode(req.args[0])
            if self.eos_token_id is not None:
                ids = [self.eos_token_id] + ids
            total_ll = 0.0
            # Consecutive windows overlap by one token: the last target of one
            # window is the first input of the next, so each token is scored
            # exactly once. Texts with fewer than two ids score 0.0.
            for start in range(0, len(ids) - 1, window):
                chunk = ids[start : start + window + 1]
                inp = torch.tensor([chunk[:-1]], dtype=torch.long, device=self._device)
                tgt = torch.tensor(chunk[1:], dtype=torch.long, device=self._device)
                logits = self._forward_logits(inp)
                log_probs = torch.nn.functional.log_softmax(logits[0].float(), dim=-1)
                total_ll += log_probs.gather(-1, tgt.unsqueeze(-1)).sum().item()
            results.append(total_ll)
        return results

    def generate_until(self, requests: list[Instance]) -> list[str]:
        """Greedy generation until stop string (used by some tasks).

        ``until`` may be a single string or a list of strings. Output is cut at
        the first generated ``<|eos|>`` id and then at the earliest stop string.
        """
        results = []
        for req in requests:
            context, gen_kwargs = req.args
            until = gen_kwargs.get("until", ["<|eos|>"])
            if until is None:
                until = ["<|eos|>"]
            elif isinstance(until, str):
                # Iterating a bare string would treat each character as a stop.
                until = [until]
            max_new = gen_kwargs.get("max_gen_toks", self.max_gen_toks)

            ctx_ids = self.tok_encode(context)
            if not ctx_ids and self.eos_token_id is not None:
                ctx_ids = [self.eos_token_id]
            # Keep the prompt plus the generated tokens inside the model window
            # by dropping the oldest prompt tokens.
            ctx_ids = ctx_ids[-max(1, self.max_length - max_new):]
            ids = torch.tensor([ctx_ids], dtype=torch.long, device=self._device)
            with torch.no_grad():
                out = self._model.generate(ids, max_new_tokens=max_new,
                                           temperature=1.0, top_k=1)  # greedy
            new_ids = out[0, ids.shape[1]:].tolist()
            # Cut on the EOS id rather than its text: the tokenizers library
            # skips special tokens by default when decoding, so the literal
            # stop string may never show up in the decoded text.
            if self.eos_token_id is not None and self.eos_token_id in new_ids:
                new_ids = new_ids[: new_ids.index(self.eos_token_id)]
            text = self.tok_decode(new_ids)
            for stop in until:
                if stop:  # an empty stop string would erase the whole output
                    text = text.partition(stop)[0]
            results.append(text)
        return results
# --------------------------------------------------------------------------- #
# Main
# --------------------------------------------------------------------------- #
def main():
    """Parse CLI options, run the requested lm-eval tasks and report the scores.

    Options: ``--checkpoint`` (required), ``--tasks`` (comma-separated names,
    default ``DEFAULT_TASKS``), ``--batch_size``, ``--device`` and ``--output``
    (path of the JSON file that receives ``results["results"]``).
    Evaluation is zero-shot. A summary with accuracy and normalized accuracy
    (whichever the task reports) is printed before the JSON file is written.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--checkpoint", required=True)
    ap.add_argument("--tasks", default=DEFAULT_TASKS)
    ap.add_argument("--batch_size", type=int, default=32)
    ap.add_argument("--device", default="cuda")
    ap.add_argument("--output", default="eval_results.json")
    args = ap.parse_args()

    # Validate the task list before the (slow) model load; stray commas such
    # as "hellaswag," must not turn into an empty task name.
    task_list = [name.strip() for name in args.tasks.split(",") if name.strip()]
    if not task_list:
        ap.error("--tasks must name at least one task")

    model = IvmeLM(args.checkpoint, device=args.device, batch_size=args.batch_size)
    print(f"\n[eval] running tasks: {task_list}")
    results = lm_eval.simple_evaluate(
        model=model,
        tasks=task_list,
        num_fewshot=0,  # zero-shot, matching the leaderboard
        batch_size=args.batch_size,
        log_samples=False,
    )
    if results is None:
        # simple_evaluate only returns results on the main rank.
        return

    # Print a clean summary
    print("\n" + "=" * 52)
    print(" İvme-Conversate Eval Results")
    print("=" * 52)
    for task, metrics in results["results"].items():
        # Show each metric under its own label: picking one with `or` hides
        # which metric is printed and treats a genuine 0.0 as "missing".
        shown = []
        for label, key in (("acc", "acc,none"), ("acc_norm", "acc_norm,none")):
            value = metrics.get(key)
            if isinstance(value, (int, float)):
                shown.append(f"{label} {value * 100:.2f}%")
        print(f" {task:<20} {'  '.join(shown) if shown else 'n/a'}")
    print("=" * 52)
    print(f" Model params : {model._model.num_params()/1e6:.1f}M")
    print(f" Checkpoint : {args.checkpoint}")
    print(" Eval mode : zero-shot")
    print("=" * 52)

    # Save full results for the model card / leaderboard PR
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(results["results"], f, indent=2)
    print(f"\n[eval] full results saved -> {args.output}")
# Run the evaluation only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()