Spaces:

GAInTech
/

feather-a10g-large-runtime

Paused

App Files Files Community

feather-a10g-large-runtime / overlay /hydra /eval.py

icarus112

Upload folder using huggingface_hub

c383594 verified 11 days ago

raw

history blame contribute delete

9.96 kB

	"""Evaluation: factual probes + sampled factual English scoring.

	Extracted from train.py (W1 modularization). Semantics unchanged.

	Perf optimizations (eval_perf_fix):
	- Probe mode: single forward per prompt instead of autoregressive gen
	- Batch decode: all GPU work first, all CPU decode after
	- Factual probes: one forward per prompt (a single padded batch can exceed the cooperative launch limit)
	"""

	from __future__ import annotations

	import os
	import re as _re

	import torch

	from hydra.config import FACTUAL_SAMPLES, FACTUAL_BATCH, FACTUAL_GEN_TOKENS, USE_MDLM, MDLM_MASK_ID
	from hydra.mdlm_decode import mdlm_next_token_logits

	# Evaluation mode selector, read once at import time. "probe" (the default)
	# costs one forward pass per prompt; export HYDRA_FACTUAL_MODE=gen to run the
	# original autoregressive generation path instead.
	FACTUAL_MODE = os.getenv("HYDRA_FACTUAL_MODE", "probe")


	def _next_token_logits(model, x: torch.Tensor) -> torch.Tensor:
	    """Compute next-token logits for ``x``, honouring the MDLM training mode.

	    Audit 2026-05-09 issue #16: a model trained with MDLM learns to
	    reconstruct masked positions rather than to predict the next token
	    autoregressively, so ``model(x)[:, -1, :]`` reads the wrong
	    distribution. When ``USE_MDLM`` is set the call is therefore delegated
	    to ``mdlm_next_token_logits``, which appends a single MASK slot and
	    returns the prediction at that slot.

	    Without MDLM the model is called as ``model(x, targets=None)``; a 3-D
	    result is reduced to its last position, any other rank is passed
	    through whole, and the result is cast to float.

	    Args:
	        model: Callable model exposing ``config.vocab_size``.
	        x: Token-id tensor fed to the model.

	    Returns:
	        The logits tensor described above (intended shape ``(B, V)``).
	    """
	    if not USE_MDLM:
	        raw = model(x, targets=None)
	        last_step = raw[:, -1, :] if raw.dim() == 3 else raw
	        return last_step.float()

	    # A negative MDLM_MASK_ID is a sentinel meaning "use vocab_size - 1".
	    if MDLM_MASK_ID < 0:
	        resolved_mask = int(getattr(model.config, "vocab_size", 0)) - 1
	    else:
	        resolved_mask = MDLM_MASK_ID

	    return mdlm_next_token_logits(
	        model,
	        x,
	        mask_id=resolved_mask,
	        vocab_size=int(model.config.vocab_size),
	    )

	# (prompt, accepted answers) pairs scored by the factual-English evaluation.
	FACTUAL_EVAL = [
	    # Hard factual recall: requires memorized knowledge.
	    ("The capital of France is", ["Paris", "paris"]),
	    ("Water boils at", ["100", "boiling"]),
	    ("The largest planet in our solar system is", ["Jupiter", "jupiter"]),
	    # Easier completions: common collocations the model may pick up early.
	    ("Once upon a", ["time"]),
	    ("Hello, my name", ["is", "'s"]),
	    (
	        "The cat sat on the",
	        ["mat", "floor", "rug", "table", "couch", "chair", "ground"],
	    ),
	    (
	        "She opened the door and",
	        ["walked", "saw", "found", "stepped", "looked", "went", "ran"],
	    ),
	    # Original hard prompts, kept for completeness.
	    (
	        "The speed of light is approximately",
	        ["299", "300", "186,000", "light speed"],
	    ),
	    ("Two plus two equals", ["4", "four"]),
	]

	# Prompts whose top next-token predictions are printed by run_factual_probes.
	_FACTUAL_PROBES = [
	    "The capital of France is",
	    "Water boils at",
	    "The largest planet in our solar system is",
	    "The speed of light is approximately",
	    "Shakespeare wrote",
	]


	def run_factual_probes(model, tokenizer, device, autocast_ctx) -> None:
	    """Print the most likely next tokens for each canonical factual prompt.

	    Each prompt in ``_FACTUAL_PROBES`` gets its own forward pass: a batched
	    forward with B=len(probes) can exceed the cooperative launch limit
	    (SM residency cap), so the prompts are deliberately not padded into
	    one batch.

	    Logits are obtained through ``_next_token_logits`` so the probes follow
	    the same MDLM contract as the scored evaluation (audit 2026-05-09 #16):
	    with MDLM training on, reading the last position of a plain forward
	    pass measures the wrong distribution.

	    Args:
	        model: Model under evaluation; switched to eval mode here.
	        tokenizer: Object providing ``encode(str)`` and ``decode(list)``.
	        device: Device on which the prompt tensor is created.
	        autocast_ctx: Context manager entered around every forward pass.

	    Returns:
	        None. Results are written to stdout only.
	    """
	    print("\n--- Factual Probes ---")
	    model.eval()

	    for prompt_text in _FACTUAL_PROBES:
	        ids = tokenizer.encode(prompt_text)
	        x = torch.tensor([ids], device=device)
	        with torch.no_grad(), autocast_ctx:
	            # Row 0 is the only row: the batch holds a single prompt.
	            last_logits = _next_token_logits(model, x)[0]
	        probs = torch.softmax(last_logits.float(), dim=-1)
	        top5 = torch.topk(probs, 5)
	        # One host transfer for all indices instead of one .item() per token.
	        completions = [tokenizer.decode([idx]) for idx in top5.indices.tolist()]
	        probs_list = [f"{p:.4f}" for p in top5.values[:3].tolist()]
	        # Only the three best candidates are shown, as before.
	        print(f' "{prompt_text}" -> {completions[:3]} (p={probs_list})')
	    print("--- End Factual Probes ---\n")


	# ---------------------------------------------------------------------------
	# Probe mode: single forward per prompt (Fix D)
	# ---------------------------------------------------------------------------

	def _run_factual_english_probe(model, tokenizer, max_seq_len: int):
	    """Score ``FACTUAL_EVAL`` with one forward pass per prompt (probe mode).

	    For every (prompt, answers) pair only the prompt is encoded and passed
	    to ``_next_token_logits``. The 20 most probable next tokens (fewer when
	    the vocabulary is smaller) are decoded, stripped and lower-cased; the
	    prompt counts as a hit when any lower-cased answer occurs as a
	    substring of any of those token strings. The wide top-K is meant to be
	    as generous as gen mode, which samples at several temperatures.

	    Args:
	        model: Model under evaluation; switched to eval mode here.
	        tokenizer: Object providing ``encode(str)`` and ``decode(list)``.
	        max_seq_len: Unused in this mode; kept so both runners share the
	            signature used by ``run_factual_english``.

	    Returns:
	        Tuple ``(score, hits, total)`` where ``score = hits / total``.
	    """
	    print("---")
	    print("factual_english_samples: (probe mode)")
	    model.eval()
	    total = len(FACTUAL_EVAL)
	    hit_count = 0

	    with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
	        for prompt, answers in FACTUAL_EVAL:
	            token_ids = tokenizer.encode(prompt)
	            batch = torch.tensor([token_ids], device="cuda", dtype=torch.long)
	            # Audit 2026-05-09 #16: go through the MDLM-aware helper.
	            distribution = torch.softmax(_next_token_logits(model, batch)[0], dim=-1)

	            # Generous check: inspect up to 20 candidates, not just the argmax.
	            k = min(20, distribution.shape[-1])
	            candidate_ids = torch.topk(distribution, k).indices.tolist()
	            candidate_texts = [
	                tokenizer.decode([cid]).strip().lower() for cid in candidate_ids
	            ]

	            wanted = [answer.lower() for answer in answers]
	            matched = False
	            for text in candidate_texts:
	                if any(w in text for w in wanted):
	                    matched = True
	                    break
	            hit_count += int(matched)

	            top_text = tokenizer.decode([candidate_ids[0]])
	            shown = (prompt + top_text).replace(chr(10), ' ')
	            print(f" prompt: {prompt!r}")
	            print(f" output: {shown!r}")
	            print(f" hit: {matched} (probe top-{k})")

	    score = hit_count / total
	    print("---")
	    print(f"factual_english_score: {score:.4f}")
	    print(f"factual_english_hits: {hit_count}/{total}")
	    return score, hit_count, total


	# ---------------------------------------------------------------------------
	# Gen mode: original autoregressive path (Fix F: batch decode)
	# ---------------------------------------------------------------------------

	def _run_factual_english_gen(model, tokenizer, max_seq_len: int):
	    """Score ``FACTUAL_EVAL`` by sampling continuations (gen mode).

	    For each prompt, ``FACTUAL_SAMPLES`` continuations of up to
	    ``FACTUAL_GEN_TOKENS`` tokens are sampled in batches of
	    ``FACTUAL_BATCH``, cycling through the temperatures 0.7 / 0.9 / 1.1 per
	    batch. All GPU sampling for a prompt runs first; decoding and answer
	    matching happen afterwards on the CPU. A prompt is a hit when any
	    sample's continuation contains one of the accepted answers.

	    Matching rules:
	      * Answers that form a single word under the word regex must appear
	        as a whole word of the continuation (case-insensitive).
	      * Answers that can never be a single regex word (for example
	        "186,000", "light speed", "'s") are matched as case-insensitive
	        substrings; against the word set alone they could never hit.

	    The continuation is decoded from the generated token ids only, so the
	    result does not depend on ``decode(encode(prompt))`` reproducing the
	    prompt text character for character.

	    Args:
	        model: Model under evaluation; switched to eval mode here.
	        tokenizer: Object providing ``encode(str)`` and ``decode(list)``.
	        max_seq_len: Sampling for a batch stops once the context reaches
	            this many tokens.

	    Returns:
	        Tuple ``(score, hits, total)`` where ``score = hits / total``.
	    """
	    print("---")
	    print("factual_english_samples: (gen mode)")
	    model.eval()

	    num_samples = FACTUAL_SAMPLES
	    # A non-positive batch size would keep the sampling loop from advancing.
	    batch = max(1, FACTUAL_BATCH)
	    gen_tokens = FACTUAL_GEN_TOKENS
	    temps = [0.7, 0.9, 1.1]
	    word_re = _re.compile(r"\b[\w'-]+\b")
	    hits = 0

	    with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
	        for prompt, answers in FACTUAL_EVAL:
	            ids = tokenizer.encode(prompt)
	            prompt_len = len(ids)
	            answers_lower = [a.lower() for a in answers]
	            # Split answers by whether the word regex could ever yield them.
	            word_answers = [a for a in answers_lower if word_re.fullmatch(a)]
	            phrase_answers = [
	                a for a in answers_lower if a and not word_re.fullmatch(a)
	            ]

	            # Collect all generated token sequences on GPU first.
	            all_rows: list[list[int]] = []
	            samples_done = 0
	            batch_idx = 0
	            while samples_done < num_samples:
	                b = min(batch, num_samples - samples_done)
	                temp = temps[batch_idx % len(temps)]
	                batch_idx += 1
	                ctx = torch.tensor([ids] * b, device="cuda", dtype=torch.long)
	                for _ in range(gen_tokens):
	                    # Audit 2026-05-09 #16: route through MDLM contract if active.
	                    next_logits = _next_token_logits(model, ctx)
	                    probs = torch.softmax(next_logits / temp, dim=-1)
	                    next_id = torch.multinomial(probs, num_samples=1)
	                    ctx = torch.cat([ctx, next_id], dim=1)
	                    if ctx.size(1) >= max_seq_len:
	                        break
	                # Transfer to CPU in one shot, no per-row sync.
	                all_rows.extend(ctx.cpu().tolist())
	                samples_done += b

	            # CPU-side decode and matching; no GPU sync between decodes.
	            any_hit = False
	            first_gen = None
	            hit_gen = None
	            for row in all_rows:
	                # Decode only what the model produced after the prompt ids.
	                continuation = tokenizer.decode(row[prompt_len:]).strip()
	                words = {w.lower() for w in word_re.findall(continuation)}
	                hit = any(a in words for a in word_answers)
	                if not hit and phrase_answers:
	                    lowered = continuation.lower()
	                    hit = any(a in lowered for a in phrase_answers)

	                # The full text is only needed for the rows that get printed.
	                if first_gen is None or (hit and hit_gen is None):
	                    generated = tokenizer.decode(row)
	                    if first_gen is None:
	                        first_gen = generated
	                    if hit and hit_gen is None:
	                        hit_gen = generated
	                if hit:
	                    any_hit = True

	            if any_hit:
	                hits += 1
	            print(f" prompt: {prompt!r}")
	            print(f" output: {(first_gen or '').replace(chr(10), ' ')!r}")
	            print(f" hit: {any_hit} (any of {num_samples} samples, temps={temps}, gen={gen_tokens}tok)")
	            if hit_gen is not None and hit_gen != first_gen:
	                print(f" hit_sample: {hit_gen.replace(chr(10), ' ')!r}")

	    score = hits / len(FACTUAL_EVAL)
	    print("---")
	    print(f"factual_english_score: {score:.4f}")
	    print(f"factual_english_hits: {hits}/{len(FACTUAL_EVAL)}")
	    return score, hits, len(FACTUAL_EVAL)


	# ---------------------------------------------------------------------------
	# Public entry point
	# ---------------------------------------------------------------------------

	def run_factual_english(model, tokenizer, max_seq_len: int):
	    """Run the factual-English evaluation in the configured mode.

	    ``FACTUAL_MODE == "gen"`` selects the autoregressive sampling runner
	    (set HYDRA_FACTUAL_MODE=gen); every other value selects the fast probe
	    runner. The chosen runner's return value is passed back unchanged.
	    """
	    runner = (
	        _run_factual_english_gen
	        if FACTUAL_MODE == "gen"
	        else _run_factual_english_probe
	    )
	    return runner(model, tokenizer, max_seq_len)