transformer-weights / tests /test_perplexity.py
angerami's picture
feat: add eval metrics pipeline (perplexity, paper reference, dashboard overlay)
17def50
Raw
History Blame Contribute Delete
3.83 kB
"""Tests for the eval loop and collector pattern in compute_perplexity.py."""
import sys
import math
import types
from pathlib import Path
import numpy as np
import pytest
import torch
sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
from compute_perplexity import NLLCollector, eval_loop, to_long_format, run_inference
# ---------------------------------------------------------------------------
# NLLCollector
# ---------------------------------------------------------------------------
def test_nll_collector_basic():
c = NLLCollector()
# Simulate 100 tokens with NLL = 2.0
out = types.SimpleNamespace(loss=torch.tensor(2.0))
c.update(out, 100)
assert abs(c.result() - 2.0) < 1e-6
def test_nll_collector_weighted_average():
c = NLLCollector()
out1 = types.SimpleNamespace(loss=torch.tensor(4.0))
out2 = types.SimpleNamespace(loss=torch.tensor(2.0))
c.update(out1, 100) # 400 total
c.update(out2, 100) # 200 total
assert abs(c.result() - 3.0) < 1e-6 # (400+200)/200 = 3.0
def test_nll_collector_empty():
c = NLLCollector()
assert math.isnan(c.result())
def test_perplexity_from_nll():
nll = 3.0
ppl = math.exp(nll)
assert abs(ppl - math.e ** 3) < 1e-6
# ---------------------------------------------------------------------------
# eval_loop with a tiny stub model
# ---------------------------------------------------------------------------
class StubOutput:
def __init__(self, loss):
self.loss = torch.tensor(loss)
self.hidden_states = None
self.attentions = None
class StubModel(torch.nn.Module):
"""Returns a fixed loss value for any input."""
def __init__(self, fixed_loss=2.5, max_pos=128):
super().__init__()
self.config = types.SimpleNamespace(max_position_embeddings=max_pos)
self.fixed_loss = fixed_loss
def forward(self, input_ids=None, attention_mask=None, labels=None,
output_hidden_states=False, output_attentions=False):
return StubOutput(self.fixed_loss)
def test_eval_loop_constant_loss():
model = StubModel(fixed_loss=3.0, max_pos=64)
tokens = torch.randint(0, 1000, (200,))
results = eval_loop(model, tokens, device=torch.device("cpu"), stride=64)
assert abs(results["nll"] - 3.0) < 1e-4
assert abs(math.exp(results["nll"]) - math.exp(3.0)) < 1e-3
def test_eval_loop_respects_max_tokens():
model = StubModel(fixed_loss=1.0, max_pos=64)
tokens = torch.randint(0, 1000, (1000,))
results = eval_loop(model, tokens, device=torch.device("cpu"),
stride=64, max_tokens=128)
assert abs(results["nll"] - 1.0) < 1e-4
# ---------------------------------------------------------------------------
# run_inference flags
# ---------------------------------------------------------------------------
def test_run_inference_returns_loss():
model = StubModel(fixed_loss=2.0, max_pos=32)
ids = torch.randint(0, 100, (1, 10))
mask = torch.ones_like(ids)
out = run_inference(model, ids, mask)
assert hasattr(out, "loss")
assert abs(out.loss.item() - 2.0) < 1e-6
# ---------------------------------------------------------------------------
# to_long_format
# ---------------------------------------------------------------------------
def test_to_long_format_schema():
row = {"model": "gpt2", "revision": "main", "step": None,
"perplexity": 30.5, "nll": math.log(30.5), "bpb": 4.5, "corpus": "wikitext103"}
records = to_long_format(row)
metrics = {r["metric"] for r in records}
assert metrics == {"perplexity", "nll", "bpb"}
for r in records:
assert r["source"] == "eval_pass"
assert r["corpus"] == "wikitext103"
assert r["model"] == "gpt2"
assert isinstance(r["value"], float)