Spaces:

GAInTech
/

feather-a10g-large-runtime

Paused

File size: 9,960 Bytes

c383594

"""Evaluation: factual probes + sampled factual English scoring.

Extracted from train.py (W1 modularization). Semantics unchanged.

Perf optimizations (eval_perf_fix):
- Probe mode: single forward per prompt instead of autoregressive gen
- Batch decode: all GPU work first, all CPU decode after
- Factual probes: one forward pass per prompt (a batched forward can exceed
  the cooperative launch limit)
"""

from __future__ import annotations

import os
import re as _re

import torch

from hydra.config import FACTUAL_SAMPLES, FACTUAL_BATCH, FACTUAL_GEN_TOKENS, USE_MDLM, MDLM_MASK_ID
from hydra.mdlm_decode import mdlm_next_token_logits

# Scoring mode for run_factual_english, read once at import time from the
# HYDRA_FACTUAL_MODE environment variable. "gen" selects the original
# autoregressive generation path; the default "probe" uses one forward pass
# per prompt.
FACTUAL_MODE = os.getenv("HYDRA_FACTUAL_MODE", "probe")


def _next_token_logits(model, x: torch.Tensor) -> torch.Tensor:
    """Compute logits for the token that follows each sequence in ``x``.

    The path taken depends on the ``USE_MDLM`` config flag:

    * ``USE_MDLM`` off: ``model(x, targets=None)`` is called directly. A 3-D
      result is reduced to its last position along dim 1; any other result
      is passed through. Either way it is cast with ``.float()``.
    * ``USE_MDLM`` on (Audit 2026-05-09 issue #16): an MDLM-trained model
      learned to reconstruct masked positions, not to predict the next token
      autoregressively, so reading ``model(x)[:, -1, :]`` would measure the
      wrong distribution. The call is delegated to
      ``mdlm_next_token_logits``, which appends a single MASK slot and
      returns the prediction at that slot.

    Args:
        model: Callable model. ``model.config.vocab_size`` is read on the
            MDLM path.
        x: Token ids; callers in this module pass a (B, T) long tensor.

    Returns:
        Next-token logits, intended to be a 2D (B, V) tensor in float
        precision. NOTE(review): on the MDLM path the helper's result is
        returned as-is; its shape and dtype are not checked here.
    """
    if not USE_MDLM:
        out = model(x, targets=None)
        last_step = out[:, -1, :] if out.dim() == 3 else out
        return last_step.float()

    vocab_size = int(model.config.vocab_size)
    # A negative MDLM_MASK_ID is a sentinel meaning "use the last vocab id".
    mask_id = MDLM_MASK_ID if MDLM_MASK_ID >= 0 else vocab_size - 1
    return mdlm_next_token_logits(
        model,
        x,
        mask_id=mask_id,
        vocab_size=vocab_size,
    )

# Scored prompts for run_factual_english. Each entry is
# (prompt, acceptable_answers); both scoring modes lowercase the answers
# before comparing them with model output.
FACTUAL_EVAL = [
    # Hard factual recall: requires specific memorized knowledge.
    (
        "The capital of France is",
        ["Paris", "paris"],
    ),
    (
        "Water boils at",
        ["100", "boiling"],
    ),
    (
        "The largest planet in our solar system is",
        ["Jupiter", "jupiter"],
    ),
    # Easier completions: common collocations / patterns the model may
    # pick up.
    (
        "Once upon a",
        ["time"],
    ),
    (
        "Hello, my name",
        ["is", "'s"],
    ),
    (
        "The cat sat on the",
        ["mat", "floor", "rug", "table", "couch", "chair", "ground"],
    ),
    (
        "She opened the door and",
        ["walked", "saw", "found", "stepped", "looked", "went", "ran"],
    ),
    # Original hard prompts, kept for completeness.
    (
        "The speed of light is approximately",
        ["299", "300", "186,000", "light speed"],
    ),
    (
        "Two plus two equals",
        ["4", "four"],
    ),
]

# Prompts used by run_factual_probes, which prints the most likely next
# tokens for each one. These carry no expected answers and are not scored.
_FACTUAL_PROBES = [
    "The capital of France is",
    "Water boils at",
    "The largest planet in our solar system is",
    "The speed of light is approximately",
    "Shakespeare wrote",
]


def run_factual_probes(model, tokenizer, device, autocast_ctx) -> None:
    """Print the most likely next tokens for each canonical factual prompt.

    For every prompt in ``_FACTUAL_PROBES`` the next-token distribution is
    computed, the top 5 candidates are ranked, and the best 3 are printed
    together with their probabilities. Nothing is scored or returned.

    Prompts are run one at a time, not as a single padded batch: a batched
    forward with B=len(probes) can exceed the cooperative launch limit
    (SM residency cap).

    Logits are obtained through ``_next_token_logits`` so that the MDLM
    contract (Audit 2026-05-09 issue #16) is honoured here the same way it
    is in the scoring functions; reading ``model(x)[:, -1, :]`` directly
    measures the wrong distribution when MDLM training is on.

    Args:
        model: Model to probe. It is switched to eval mode and left there.
        tokenizer: Must provide ``encode(str)`` and ``decode(list_of_ids)``.
        device: Device on which the prompt tensor is created.
        autocast_ctx: Context manager entered around each forward pass.
    """
    print("\n--- Factual Probes ---")
    model.eval()

    ranked = 5  # candidates ranked per prompt
    shown = 3   # how many of them are printed

    for prompt_text in _FACTUAL_PROBES:
        ids = tokenizer.encode(prompt_text)
        x = torch.tensor([ids], device=device, dtype=torch.long)
        with torch.no_grad(), autocast_ctx:
            next_logits = _next_token_logits(model, x)[0]
        probs = torch.softmax(next_logits.float(), dim=-1)
        # Clamp k so a vocabulary smaller than `ranked` does not raise.
        top = torch.topk(probs, min(ranked, probs.shape[-1]))
        # One host transfer for the ids instead of a sync per .item() call.
        top_ids = top.indices[:shown].tolist()
        completions = [tokenizer.decode([tid]) for tid in top_ids]
        probs_list = [f"{p:.4f}" for p in top.values[:shown].tolist()]
        print(f'  "{prompt_text}" -> {completions} (p={probs_list})')
    print("--- End Factual Probes ---\n")


# ---------------------------------------------------------------------------
# Probe mode: single forward per prompt (Fix D)
# ---------------------------------------------------------------------------

def _run_factual_english_probe(model, tokenizer, max_seq_len: int):
    """Score ``FACTUAL_EVAL`` with a single forward pass per prompt.

    For each ``(prompt, answers)`` pair only the prompt is encoded. The
    next-token logits after it come from ``_next_token_logits`` (which
    honours the MDLM contract), and the 20 most likely tokens (fewer when
    the vocabulary is smaller) are decoded one by one. The prompt counts as
    a hit when any lowercased answer is a substring of any stripped,
    lowercased candidate token. Looking at the top 20 instead of only the
    argmax is deliberately generous, to match gen mode, which samples
    several times at multiple temperatures.

    NOTE(review): because the comparison is substring containment, short
    answers such as "is" or "4" also match longer tokens that merely
    contain them.

    Args:
        model: Model to evaluate. It is switched to eval mode and left
            there. Inputs are created on the device of its first parameter
            and autocast uses that device type; when the model exposes no
            parameters the original "cuda" default is used.
        tokenizer: Must provide ``encode(str)`` and ``decode(list_of_ids)``.
        max_seq_len: Unused in this mode; accepted so both scoring modes
            share one signature.

    Returns:
        Tuple ``(score, hits, total)`` where ``score = hits / total`` and
        ``total = len(FACTUAL_EVAL)``.
    """
    print("---")
    print("factual_english_samples: (probe mode)")
    model.eval()

    # Follow the model's placement instead of hard-coding "cuda", so the
    # eval also runs on CPU or on a non-default GPU.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device("cuda")

    hits = 0
    with torch.no_grad(), torch.amp.autocast(device_type=device.type, dtype=torch.bfloat16):
        for prompt, answers in FACTUAL_EVAL:
            prompt_ids = tokenizer.encode(prompt)
            x = torch.tensor([prompt_ids], device=device, dtype=torch.long)
            # Audit 2026-05-09 #16: route through MDLM contract if active.
            last_logits = _next_token_logits(model, x)[0]

            probs = torch.softmax(last_logits, dim=-1)
            # Check top-K predictions (generous: K=20 to match multi-sample gen)
            top_k = min(20, probs.shape[-1])
            top_ids = torch.topk(probs, top_k).indices.tolist()
            top_tokens = [tokenizer.decode([tid]).strip().lower() for tid in top_ids]

            answers_lower = [a.lower() for a in answers]
            any_hit = any(
                any(a in tok for a in answers_lower)
                for tok in top_tokens
            )
            if any_hit:
                hits += 1

            # The printed "output" is the prompt plus the single best token.
            best_completion = tokenizer.decode([top_ids[0]])
            print(f"  prompt: {prompt!r}")
            print(f"  output: {(prompt + best_completion).replace(chr(10), ' ')!r}")
            print(f"  hit:    {any_hit} (probe top-{top_k})")

    score = hits / len(FACTUAL_EVAL)
    print("---")
    print(f"factual_english_score: {score:.4f}")
    print(f"factual_english_hits:  {hits}/{len(FACTUAL_EVAL)}")
    return score, hits, len(FACTUAL_EVAL)


# ---------------------------------------------------------------------------
# Gen mode: original autoregressive path (Fix F: batch decode)
# ---------------------------------------------------------------------------

def _run_factual_english_gen(model, tokenizer, max_seq_len: int):
    """Score ``FACTUAL_EVAL`` by sampling continuations autoregressively.

    For each prompt, ``FACTUAL_SAMPLES`` continuations are sampled in
    batches of at most ``FACTUAL_BATCH``. Each batch uses one temperature,
    cycling through 0.7, 0.9 and 1.1 from batch to batch. Up to
    ``FACTUAL_GEN_TOKENS`` tokens are generated per sample, stopping early
    once a sequence reaches ``max_seq_len``. All generation for a prompt
    runs first; the token rows are then decoded on the CPU.

    A prompt counts as a hit when any sample's continuation contains an
    accepted answer, compared case-insensitively:

    * answers that form a single word under the word pattern are matched
      against the set of whole words in the continuation;
    * answers that are not a single word under that pattern (for example
      "186,000", "light speed" or "'s") are matched as substrings of the
      continuation, because they can never be members of the word set.

    The continuation is decoded from the generated token ids only, so the
    result does not depend on ``decode(encode(prompt))`` reproducing the
    prompt text character for character.

    Args:
        model: Model to evaluate. It is switched to eval mode and left
            there. Inputs are created on the device of its first parameter
            and autocast uses that device type; when the model exposes no
            parameters the original "cuda" default is used.
        tokenizer: Must provide ``encode(str)`` and ``decode(list_of_ids)``.
        max_seq_len: Generation for a batch stops once the sequence length
            reaches this value.

    Returns:
        Tuple ``(score, hits, total)`` where ``score = hits / total`` and
        ``total = len(FACTUAL_EVAL)``.
    """
    print("---")
    print("factual_english_samples: (gen mode)")
    model.eval()

    num_samples = FACTUAL_SAMPLES
    batch = FACTUAL_BATCH
    gen_tokens = FACTUAL_GEN_TOKENS
    temps = [0.7, 0.9, 1.1]
    # Compiled once; used both to split continuations into words and to
    # classify answers as single-word or phrase.
    word_re = _re.compile(r"\b[\w'-]+\b")
    hits = 0

    # Follow the model's placement instead of hard-coding "cuda", so the
    # eval also runs on CPU or on a non-default GPU.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device("cuda")

    with torch.no_grad(), torch.amp.autocast(device_type=device.type, dtype=torch.bfloat16):
        for prompt, answers in FACTUAL_EVAL:
            ids = tokenizer.encode(prompt)
            prompt_len = len(ids)
            answers_lower = [a.lower() for a in answers]
            word_answers = {a for a in answers_lower if word_re.fullmatch(a)}
            phrase_answers = [a for a in answers_lower if a not in word_answers]

            # Collect all generated token sequences on GPU first
            all_rows: list[list[int]] = []
            samples_done = 0
            batch_idx = 0
            while samples_done < num_samples:
                b = min(batch, num_samples - samples_done)
                temp = temps[batch_idx % len(temps)]
                batch_idx += 1
                ctx = torch.tensor([ids] * b, device=device, dtype=torch.long)
                for _ in range(gen_tokens):
                    # Audit 2026-05-09 #16: route through MDLM contract if active.
                    next_logits = _next_token_logits(model, ctx)
                    probs = torch.softmax(next_logits / temp, dim=-1)
                    next_id = torch.multinomial(probs, num_samples=1)
                    ctx = torch.cat([ctx, next_id], dim=1)
                    if ctx.size(1) >= max_seq_len:
                        break
                # Transfer to CPU in one shot, no per-row sync
                all_rows.extend(ctx.cpu().tolist())
                samples_done += b

            # CPU-side batch decode — no GPU sync between decodes
            any_hit = False
            first_gen = None
            hit_gen = None
            for row in all_rows:
                continuation = tokenizer.decode(row[prompt_len:]).strip()
                words = {w.lower() for w in word_re.findall(continuation)}
                hit = not word_answers.isdisjoint(words)
                if not hit and phrase_answers:
                    lowered = continuation.lower()
                    hit = any(p in lowered for p in phrase_answers)
                # Full rows are decoded only when needed for display.
                if first_gen is None:
                    first_gen = tokenizer.decode(row)
                if hit:
                    any_hit = True
                    hit_gen = tokenizer.decode(row)
                    # Later rows cannot change anything that is reported.
                    break
            if any_hit:
                hits += 1
            print(f"  prompt: {prompt!r}")
            print(f"  output: {(first_gen or '').replace(chr(10), ' ')!r}")
            print(f"  hit:    {any_hit} (any of {num_samples} samples, temps={temps}, gen={gen_tokens}tok)")
            if hit_gen is not None and hit_gen != first_gen:
                print(f"  hit_sample: {hit_gen.replace(chr(10), ' ')!r}")

    score = hits / len(FACTUAL_EVAL)
    print("---")
    print(f"factual_english_score: {score:.4f}")
    print(f"factual_english_hits:  {hits}/{len(FACTUAL_EVAL)}")
    return score, hits, len(FACTUAL_EVAL)


# ---------------------------------------------------------------------------
# Public entry point
# ---------------------------------------------------------------------------

def run_factual_english(model, tokenizer, max_seq_len: int):
    """Score the factual English prompts with the mode chosen at import time.

    ``FACTUAL_MODE`` (taken from the ``HYDRA_FACTUAL_MODE`` environment
    variable) equal to ``"gen"`` runs the autoregressive scorer; any other
    value runs the fast probe scorer, which is the default.

    Returns:
        Whatever the selected scorer returns: ``(score, hits, total)``.
    """
    scorer = (
        _run_factual_english_gen
        if FACTUAL_MODE == "gen"
        else _run_factual_english_probe
    )
    return scorer(model, tokenizer, max_seq_len)