icarus112's picture
Upload folder using huggingface_hub
c383594 verified
"""Evaluation: factual probes + sampled factual English scoring.
Extracted from train.py (W1 modularization). Semantics unchanged.
Perf optimizations (eval_perf_fix):
- Probe mode: single forward per prompt instead of autoregressive gen
- Batch decode: all GPU work first, all CPU decode after
- Factual probes: one forward pass per prompt, run sequentially (a batched
  forward over all probes can exceed the cooperative launch limit)
"""
from __future__ import annotations
import os
import re as _re
import torch
from hydra.config import FACTUAL_SAMPLES, FACTUAL_BATCH, FACTUAL_GEN_TOKENS, USE_MDLM, MDLM_MASK_ID
from hydra.mdlm_decode import mdlm_next_token_logits
# Scoring strategy used by run_factual_english. "probe" (the default) does a
# single forward pass per prompt; export HYDRA_FACTUAL_MODE=gen to use the
# original autoregressive generation path. Any value other than "gen" is
# treated as probe mode by the dispatcher.
FACTUAL_MODE = os.getenv("HYDRA_FACTUAL_MODE", "probe")
def _next_token_logits(model, x: torch.Tensor) -> torch.Tensor:
    """Compute the logits for the token that would follow ``x``.

    The path taken depends on the module-level ``USE_MDLM`` flag.

    * ``USE_MDLM`` off: ``model(x, targets=None)`` is called directly. A
      3-D result is sliced at its last sequence position; a result of any
      other rank is returned whole. Either way it is cast to float.
    * ``USE_MDLM`` on (audit 2026-05-09, issue #16): an MDLM-trained model
      learned to reconstruct masked positions rather than to predict the
      next token autoregressively, so ``model(x)[:, -1, :]`` would measure
      the wrong distribution. The call is delegated to
      ``mdlm_next_token_logits``, which appends a single MASK slot and
      returns the prediction at that slot.

    Args:
        model: Callable model exposing ``config.vocab_size``.
        x: Token-id tensor; presumably shaped (B, T) -- confirm with callers.

    Returns:
        Next-token logits, expected to be 2-D (B, V). The non-MDLM path
        casts to float; the MDLM path returns whatever the helper yields.
    """
    if not USE_MDLM:
        raw = model(x, targets=None)
        last = raw[:, -1, :] if raw.dim() == 3 else raw
        return last.float()

    # A negative MDLM_MASK_ID is a sentinel for "use vocab_size - 1". Per the
    # original note, the mdlm_decode helper validates the resolved id via
    # validate_mask_token_id once the vocab size is known.
    if MDLM_MASK_ID < 0:
        resolved_mask_id = int(getattr(model.config, "vocab_size", 0)) - 1
    else:
        resolved_mask_id = MDLM_MASK_ID

    return mdlm_next_token_logits(
        model,
        x,
        mask_id=resolved_mask_id,
        vocab_size=int(model.config.vocab_size),
    )
# Scored prompts: each entry is (prompt, accepted answers). A prompt counts as
# a hit when any accepted answer is found by the active scoring mode.
FACTUAL_EVAL = [
    # --- Hard factual recall: needs memorized knowledge ---
    (
        "The capital of France is",
        ["Paris", "paris"],
    ),
    (
        "Water boils at",
        ["100", "boiling"],
    ),
    (
        "The largest planet in our solar system is",
        ["Jupiter", "jupiter"],
    ),
    # --- Easier completions: common collocations / patterns ---
    (
        "Once upon a",
        ["time"],
    ),
    (
        "Hello, my name",
        ["is", "'s"],
    ),
    (
        "The cat sat on the",
        ["mat", "floor", "rug", "table", "couch", "chair", "ground"],
    ),
    (
        "She opened the door and",
        ["walked", "saw", "found", "stepped", "looked", "went", "ran"],
    ),
    # --- Original hard prompts, retained for completeness ---
    (
        "The speed of light is approximately",
        ["299", "300", "186,000", "light speed"],
    ),
    (
        "Two plus two equals",
        ["4", "four"],
    ),
]

# Prompts whose top next-token predictions are printed by run_factual_probes.
# These are display-only: no expected answers are attached.
_FACTUAL_PROBES = [
    "The capital of France is",
    "Water boils at",
    "The largest planet in our solar system is",
    "The speed of light is approximately",
    "Shakespeare wrote",
]
def run_factual_probes(model, tokenizer, device, autocast_ctx) -> None:
    """Print the top-3 next-token predictions for each canonical probe prompt.

    Prompts are run one at a time rather than as a single padded batch: a
    batched forward with B=len(probes) can exceed the cooperative launch
    limit (SM residency cap).

    Logits are obtained through ``_next_token_logits`` so that, when MDLM
    training is active, the probe reads the masked-slot prediction instead
    of ``model(x)[:, -1, :]`` (audit 2026-05-09 issue #16). This keeps the
    probes consistent with the factual-English scoring paths.

    Args:
        model: Model under evaluation; switched to eval mode and left there.
        tokenizer: Object providing ``encode(str)`` and ``decode(list[int])``.
        device: Device on which the prompt tensor is created.
        autocast_ctx: Context manager wrapped around each forward pass; it
            is re-entered once per prompt, so it must be reusable.
    """
    print("\n--- Factual Probes ---")
    model.eval()
    for prompt_text in _FACTUAL_PROBES:
        ids = tokenizer.encode(prompt_text)
        x = torch.tensor([ids], device=device)
        with torch.no_grad(), autocast_ctx:
            # Row 0 of the (B, V) result: the batch holds a single prompt.
            last_logits = _next_token_logits(model, x)[0]
        probs = torch.softmax(last_logits.float(), dim=-1)
        # Only three predictions are displayed, so only three are extracted;
        # the min() guards against a vocabulary smaller than that.
        top = torch.topk(probs, min(3, probs.shape[-1]))
        completions = [tokenizer.decode([idx]) for idx in top.indices.tolist()]
        probs_list = [f"{p:.4f}" for p in top.values.tolist()]
        print(f' "{prompt_text}" -> {completions} (p={probs_list})')
    print("--- End Factual Probes ---\n")
# ---------------------------------------------------------------------------
# Probe mode: single forward per prompt (Fix D)
# ---------------------------------------------------------------------------
def _run_factual_english_probe(model, tokenizer, max_seq_len: int):
    """Score ``FACTUAL_EVAL`` with one forward pass per prompt (probe mode).

    For each (prompt, answers) pair the prompt alone is encoded, next-token
    logits are taken via ``_next_token_logits`` (which honours the MDLM
    contract, audit 2026-05-09 #16), and the top-K (K=20, capped at the
    vocabulary size) candidate tokens are decoded. K is generous on purpose,
    to roughly match gen mode, which draws several samples per prompt.

    A prompt is a hit when some decoded candidate, stripped and lower-cased,
    either equals an accepted answer or contains it as a whole word. Whole-
    word matching mirrors gen mode and avoids the false positives of raw
    substring containment (e.g. "is" inside "this", "4" inside "40").

    Tensors are created on "cuda" and the forward runs under CUDA bfloat16
    autocast, so a CUDA device is required.

    Args:
        model: Model under evaluation; switched to eval mode and left there.
        tokenizer: Object providing ``encode(str)`` and ``decode(list[int])``.
        max_seq_len: Unused here; accepted so both scoring modes share one
            signature.

    Returns:
        Tuple ``(score, hits, total)`` where ``score == hits / total`` and
        ``total == len(FACTUAL_EVAL)``.
    """
    print("---")
    print("factual_english_samples: (probe mode)")
    model.eval()
    # Same word pattern as gen mode; compiled once for the whole run.
    word_re = _re.compile(r"\b[\w'-]+\b")
    hits = 0
    with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
        for prompt, answers in FACTUAL_EVAL:
            prompt_ids = tokenizer.encode(prompt)
            x = torch.tensor([prompt_ids], device="cuda", dtype=torch.long)
            # Audit 2026-05-09 #16: route through MDLM contract if active.
            last_logits = _next_token_logits(model, x)[0]
            probs = torch.softmax(last_logits, dim=-1)
            # Check top-K predictions (generous: K=20 to match multi-sample gen)
            top_k = min(20, probs.shape[-1])
            top_ids = torch.topk(probs, top_k).indices.tolist()
            answers_lower = {a.lower() for a in answers}
            any_hit = False
            for tid in top_ids:
                tok = tokenizer.decode([tid]).strip().lower()
                # The equality test covers answers such as "'s" that the word
                # pattern cannot capture; the word test covers tokens that
                # carry punctuation around the answer (e.g. "paris,").
                if tok in answers_lower or not answers_lower.isdisjoint(word_re.findall(tok)):
                    any_hit = True
                    break
            if any_hit:
                hits += 1
            best_completion = tokenizer.decode([top_ids[0]])
            print(f" prompt: {prompt!r}")
            print(f" output: {(prompt + best_completion).replace(chr(10), ' ')!r}")
            print(f" hit: {any_hit} (probe top-{top_k})")
    score = hits / len(FACTUAL_EVAL)
    print("---")
    print(f"factual_english_score: {score:.4f}")
    print(f"factual_english_hits: {hits}/{len(FACTUAL_EVAL)}")
    return score, hits, len(FACTUAL_EVAL)
# ---------------------------------------------------------------------------
# Gen mode: original autoregressive path (Fix F: batch decode)
# ---------------------------------------------------------------------------
def _run_factual_english_gen(model, tokenizer, max_seq_len: int):
    """Score ``FACTUAL_EVAL`` by sampling continuations (gen mode).

    For every prompt, ``FACTUAL_SAMPLES`` continuations of up to
    ``FACTUAL_GEN_TOKENS`` tokens are sampled in batches of
    ``FACTUAL_BATCH``, cycling the temperature through 0.7 / 0.9 / 1.1 from
    one batch to the next. Each batch is generated fully before being moved
    to the CPU in one transfer; all text decoding happens afterwards.

    A prompt is a hit when any sampled continuation contains an accepted
    answer. Single-word answers must appear as a whole word. Answers that
    the word pattern cannot represent as one word ("186,000", "light
    speed", "'s") are matched as lower-cased substrings of the
    continuation; with word-set matching alone they could never match.

    The continuation is decoded from the generated token ids only, rather
    than by slicing the decoded text with ``len(prompt)``. The text slice is
    misaligned whenever decode(encode(prompt)) does not reproduce the prompt
    exactly.

    Tensors are created on "cuda" and the forward runs under CUDA bfloat16
    autocast, so a CUDA device is required.

    Args:
        model: Model under evaluation; switched to eval mode and left there.
        tokenizer: Object providing ``encode(str)`` and ``decode(list[int])``.
        max_seq_len: Generation for a batch stops once the running context
            reaches this many tokens.

    Returns:
        Tuple ``(score, hits, total)`` where ``score == hits / total`` and
        ``total == len(FACTUAL_EVAL)``.
    """
    print("---")
    print("factual_english_samples: (gen mode)")
    model.eval()
    num_samples = FACTUAL_SAMPLES
    # A non-positive batch size would never advance the sampling loop.
    batch = max(1, FACTUAL_BATCH)
    gen_tokens = FACTUAL_GEN_TOKENS
    temps = [0.7, 0.9, 1.1]
    word_re = _re.compile(r"\b[\w'-]+\b")
    hits = 0
    with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
        for prompt, answers in FACTUAL_EVAL:
            ids = tokenizer.encode(prompt)
            prompt_len = len(ids)
            answers_lower = [a.lower() for a in answers]
            # Split answers by whether the word pattern can capture them whole.
            word_answers = {a for a in answers_lower if word_re.fullmatch(a)}
            phrase_answers = [a for a in answers_lower if a not in word_answers]

            # GPU phase: generate every sample for this prompt.
            all_rows: list[list[int]] = []
            samples_done = 0
            batch_idx = 0
            while samples_done < num_samples:
                b = min(batch, num_samples - samples_done)
                temp = temps[batch_idx % len(temps)]
                batch_idx += 1
                ctx = torch.tensor([ids] * b, device="cuda", dtype=torch.long)
                for _ in range(gen_tokens):
                    # Audit 2026-05-09 #16: route through MDLM contract if active.
                    next_logits = _next_token_logits(model, ctx)
                    probs = torch.softmax(next_logits / temp, dim=-1)
                    next_id = torch.multinomial(probs, num_samples=1)
                    ctx = torch.cat([ctx, next_id], dim=1)
                    if ctx.size(1) >= max_seq_len:
                        break
                # Transfer to CPU in one shot, no per-row sync
                all_rows.extend(ctx.cpu().tolist())
                samples_done += b

            # CPU phase: decode and match. The first sample is always shown;
            # scanning stops at the first hit because later rows cannot
            # change the verdict or the printed lines.
            first_gen = tokenizer.decode(all_rows[0]) if all_rows else None
            hit_gen = None
            for idx, row in enumerate(all_rows):
                continuation = tokenizer.decode(row[prompt_len:]).strip()
                words = {w.lower() for w in word_re.findall(continuation)}
                continuation_lower = continuation.lower()
                if not word_answers.isdisjoint(words) or any(
                    phrase in continuation_lower for phrase in phrase_answers
                ):
                    hit_gen = first_gen if idx == 0 else tokenizer.decode(row)
                    break
            any_hit = hit_gen is not None
            if any_hit:
                hits += 1
            print(f" prompt: {prompt!r}")
            print(f" output: {(first_gen or '').replace(chr(10), ' ')!r}")
            print(f" hit: {any_hit} (any of {num_samples} samples, temps={temps}, gen={gen_tokens}tok)")
            if hit_gen is not None and hit_gen != first_gen:
                print(f" hit_sample: {hit_gen.replace(chr(10), ' ')!r}")
    score = hits / len(FACTUAL_EVAL)
    print("---")
    print(f"factual_english_score: {score:.4f}")
    print(f"factual_english_hits: {hits}/{len(FACTUAL_EVAL)}")
    return score, hits, len(FACTUAL_EVAL)
# ---------------------------------------------------------------------------
# Public entry point
# ---------------------------------------------------------------------------
def run_factual_english(model, tokenizer, max_seq_len: int):
    """Run factual-English scoring in the mode selected by ``FACTUAL_MODE``.

    ``FACTUAL_MODE == "gen"`` selects the autoregressive sampling scorer
    (enable it with HYDRA_FACTUAL_MODE=gen). Every other value selects the
    fast single-forward probe scorer, which is the default.

    All three arguments are forwarded unchanged, and the chosen scorer's
    ``(score, hits, total)`` result is returned as-is.
    """
    scorer = (
        _run_factual_english_gen
        if FACTUAL_MODE == "gen"
        else _run_factual_english_probe
    )
    return scorer(model, tokenizer, max_seq_len)