sad / scripts /inference_ar.py

Add files using upload-large-folder tool

8b0aeb2 verified 20 days ago

13.8 kB

	#!/usr/bin/env python3
	"""
	inference_ar.py — Autoregressive left-to-right sampling from a trained ARModel.

	Companion to train_ar.py. Standard next-token sampling: given a prompt (or just
	BOS), run the model on the current prefix and sample the next token until we
	hit max_new_tokens or EOS.

	Supports:
	- greedy (temperature=0)
	- temperature
	- top-k
	- top-p (nucleus)

	Usage:
	python scripts/inference_ar.py \\
	--config configs/ar_owt.yaml \\
	--checkpoint outputs/ar_baseline/latest.pt \\
	--num_samples 4 \\
	--max_new_tokens 256

	# conditional (prompt from the training stream)
	python scripts/inference_ar.py --config ... --checkpoint ... \\
	--mode conditional --prompt_len 32 --num_samples 4
	"""

	from __future__ import annotations

	import argparse
	import sys
	from pathlib import Path

	ROOT = Path(__file__).resolve().parents[1] # sad/
	from typing import Optional

	import torch
	import torch.nn.functional as F
	import yaml

	sys.path.insert(0, str(ROOT))

	from src.models.ar_model import ARModel
	from src.data import build_owt_dataloader


	# ─────────────────────────────────────────────────────────────────────────────
	# Sampling helpers
	# ─────────────────────────────────────────────────────────────────────────────

	def _apply_top_k(logits: torch.Tensor, top_k: int) -> torch.Tensor:
	"""Zero out everything below the top-k largest logits (per row)."""
	if top_k <= 0 or top_k >= logits.size(-1):
	return logits
	topk_vals, _ = logits.topk(top_k, dim=-1)
	threshold = topk_vals[..., -1:].expand_as(logits)
	return torch.where(logits < threshold, torch.full_like(logits, float("-inf")), logits)


	def _apply_top_p(logits: torch.Tensor, top_p: float) -> torch.Tensor:
	"""Keep the smallest prefix of sorted probabilities whose sum ≥ top_p."""
	if top_p >= 1.0 or top_p <= 0.0:
	return logits
	sorted_logits, sorted_idx = logits.sort(dim=-1, descending=True)
	probs = sorted_logits.softmax(dim=-1)
	cum = probs.cumsum(dim=-1)
	# Tokens whose cumulative prob is already past top_p (excluding the first
	# crossing token itself) are removed.
	remove = cum - probs > top_p
	sorted_logits = sorted_logits.masked_fill(remove, float("-inf"))
	# Scatter back to original vocab order.
	out = torch.empty_like(sorted_logits)
	out.scatter_(-1, sorted_idx, sorted_logits)
	return out


	def _sample_next(
	logits: torch.Tensor,
	temperature: float,
	top_k: int,
	top_p: float,
	) -> torch.Tensor:
	"""Sample one token per row from the last-step logits [B, V]."""
	if temperature <= 0.0:
	return logits.argmax(dim=-1)
	logits = logits / temperature
	logits = _apply_top_k(logits, top_k)
	logits = _apply_top_p(logits, top_p)
	probs = logits.softmax(dim=-1)
	return torch.multinomial(probs, num_samples=1).squeeze(-1)


	# ─────────────────────────────────────────────────────────────────────────────
	# Sampler
	# ─────────────────────────────────────────────────────────────────────────────

	class ARSampler:
	"""
	Plain left-to-right AR sampler with KV cache.

	Pass 1 (prompt): one `forward_cached` call over the full prompt builds the
	initial KV cache of length P.
	Pass 2..: single-token `forward_cached` calls that append one (k, v) slice
	per layer per step. Total length capped at `max_seq_len` (512).
	"""

	def __init__(
	self,
	model: ARModel,
	tokenizer,
	device: torch.device,
	dtype: torch.dtype = torch.bfloat16,
	):
	self.model = model
	self.tokenizer = tokenizer
	self.device = device
	self.dtype = dtype
	self.max_seq_len: int = model.max_seq_len
	self.vocab_size: int = model.vocab_size

	@torch.no_grad()
	def generate(
	self,
	prompt_ids: torch.Tensor,
	max_new_tokens: int,
	temperature: float = 1.0,
	top_k: int = 0,
	top_p: float = 1.0,
	eos_token_id: Optional[int] = None,
	stop_on_eos: bool = True,
	) -> torch.Tensor:
	"""
	Args:
	prompt_ids: [B, P] int64
	max_new_tokens: number of new tokens to append
	Returns:
	full sequence [B, P + k] where k ≤ max_new_tokens.
	"""
	device = self.device
	seq = prompt_ids.to(device=device, dtype=torch.long)
	B, P = seq.shape
	assert P < self.max_seq_len, f"prompt length {P} >= max_seq_len {self.max_seq_len}"

	autocast_device = "cuda" if device.type == "cuda" else "cpu"
	done = torch.zeros(B, dtype=torch.bool, device=device)

	# ── Pass 1: consume the prompt, build the initial KV cache ─────────
	with torch.autocast(device_type=autocast_device, dtype=self.dtype):
	logits, past_kv = self.model.forward_cached(seq, past_kv_list=None)
	next_logits = logits[:, -1, :].float() # [B, V]
	next_tok = _sample_next(next_logits, temperature, top_k, top_p) # [B]
	if eos_token_id is not None:
	done = done \| (next_tok == eos_token_id)
	seq = torch.cat([seq, next_tok.unsqueeze(-1)], dim=1)

	# ── Pass 2..: single-token appends using the growing KV cache ──────
	for _ in range(max_new_tokens - 1):
	if stop_on_eos and eos_token_id is not None and done.all():
	break
	if seq.size(1) >= self.max_seq_len:
	break

	with torch.autocast(device_type=autocast_device, dtype=self.dtype):
	logits, past_kv = self.model.forward_cached(
	seq[:, -1:], past_kv_list=past_kv,
	)
	next_logits = logits[:, -1, :].float()
	next_tok = _sample_next(next_logits, temperature, top_k, top_p)

	# Frozen rows keep emitting EOS so the batch stays rectangular.
	if eos_token_id is not None:
	next_tok = torch.where(
	done, torch.full_like(next_tok, eos_token_id), next_tok,
	)
	done = done \| (next_tok == eos_token_id)

	seq = torch.cat([seq, next_tok.unsqueeze(-1)], dim=1)

	return seq.cpu()


	# ─────────────────────────────────────────────────────────────────────────────
	# Plumbing
	# ─────────────────────────────────────────────────────────────────────────────

	def _unwrap(model):
	while True:
	if hasattr(model, "_orig_mod"):
	model = model._orig_mod
	elif hasattr(model, "module"):
	model = model.module
	else:
	return model


	def load_config(path: str) -> dict:
	with open(path) as f:
	return yaml.safe_load(f)


	def build_tokenizer(config: dict):
	from transformers import AutoTokenizer
	tok = AutoTokenizer.from_pretrained(
	ROOT / "tokenizers" / "gpt2",
	local_files_only=True,
	)
	if tok.eos_token is None:
	tok.add_special_tokens({"eos_token": "<\|endoftext\|>"})
	if tok.bos_token is None:
	tok.bos_token = tok.eos_token
	if tok.pad_token is None:
	tok.pad_token = tok.eos_token
	config["model"]["vocab_size"] = len(tok)
	return tok


	def build_model(config: dict, device: torch.device) -> ARModel:
	mc = config["model"]
	return ARModel(
	vocab_size=mc["vocab_size"],
	hidden_size=mc["hidden_size"],
	n_blocks=mc["n_blocks"],
	n_heads=mc["n_heads"],
	max_seq_len=mc["max_seq_len"],
	dropout=mc.get("dropout", 0.0),
	).to(device)


	def parse_args():
	p = argparse.ArgumentParser()
	p.add_argument("--checkpoint", type=str, required=True)
	p.add_argument("--config", type=str, default="configs/ar_owt.yaml")
	p.add_argument("--num_samples", type=int, default=1)
	p.add_argument("--max_new_tokens", type=int, default=256)
	p.add_argument("--temperature", type=float, default=1.0)
	p.add_argument("--top_k", type=int, default=0,
	help="0 = disabled")
	p.add_argument("--top_p", type=float, default=1.0,
	help="1.0 = disabled")
	p.add_argument("--seed", type=int, default=42)
	p.add_argument("--device", type=str,
	default="cuda" if torch.cuda.is_available() else "cpu")
	p.add_argument("--dtype", type=str, default="bf16",
	choices=["bf16", "fp16", "fp32"])
	p.add_argument("--mode", type=str, default="unconditional",
	choices=["unconditional", "conditional"],
	help="unconditional: start from BOS only. "
	"conditional: take a prefix from the training stream.")
	p.add_argument("--prompt_len", type=int, default=32,
	help="(conditional) number of leading tokens drawn from data.")
	p.add_argument("--prompt_text", type=str, default=None,
	help="(optional) override prompt with a user-provided string. "
	"Encoded with the GPT-2 tokenizer.")
	p.add_argument("--data_seed", type=int, default=0,
	help="(conditional) seed for shuffling the training split.")
	p.add_argument("--no_stop_on_eos", action="store_true",
	help="Disable early-stop on EOS; always emit max_new_tokens.")
	return p.parse_args()


	def resolve_dtype(name: str) -> torch.dtype:
	return {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[name]


	def _build_prompt_ids(args, config, tokenizer, device) -> torch.Tensor:
	"""Returns [B, P] int64 prompt tensor on `device`."""
	bos = tokenizer.bos_token_id
	assert bos is not None, "tokenizer has no bos_token_id"

	if args.prompt_text is not None:
	ids = tokenizer(args.prompt_text, return_tensors="pt")["input_ids"][0].tolist()
	if not ids or ids[0] != bos:
	ids = [bos] + ids
	prompt = torch.tensor(ids, dtype=torch.long, device=device)
	return prompt.unsqueeze(0).expand(args.num_samples, -1).contiguous()

	if args.mode == "unconditional":
	return torch.full((args.num_samples, 1), bos, dtype=torch.long, device=device)

	# conditional: pull a batch from the OWT train split, take the first
	# `prompt_len` tokens (already [BOS]-prefixed by the dataloader).
	data_cfg = config.get("data", {})
	seq_len = config["model"]["max_seq_len"]
	cache_dir = data_cfg.get("cache_dir", None)
	if cache_dir is not None and not Path(cache_dir).is_absolute():
	repo_root = ROOT
	candidate = repo_root / cache_dir
	if candidate.exists():
	cache_dir = str(candidate)
	loader = build_owt_dataloader(
	tokenizer,
	split="train[:-100000]",
	seq_len=seq_len,
	batch_size=args.num_samples,
	num_workers=0,
	cache_dir=cache_dir,
	seed=args.data_seed,
	mode=data_cfg.get("mode", "subsample"),
	shard_across_ranks=False,
	)
	batch = next(iter(loader))
	return batch["input_ids"][:args.num_samples, :args.prompt_len].to(device)


	def main():
	args = parse_args()
	torch.manual_seed(args.seed)

	device = torch.device(args.device)
	dtype = resolve_dtype(args.dtype)

	config = load_config(args.config)
	tokenizer = build_tokenizer(config)

	model = build_model(config, device).to(dtype)
	ckpt = torch.load(args.checkpoint, map_location=device)
	raw_state = ckpt.get("model", ckpt)
	_unwrap(model).load_state_dict(raw_state, strict=False)
	model.eval()
	print(f"Loaded checkpoint: {args.checkpoint} (step={ckpt.get('step', '?')})")

	prompt_ids = _build_prompt_ids(args, config, tokenizer, device)
	P = prompt_ids.size(1)
	print(f"Sampling {args.num_samples} sequences ({args.mode}) "
	f"prompt_len={P} max_new_tokens={args.max_new_tokens} "
	f"T={args.temperature} top_k={args.top_k} top_p={args.top_p}")

	sampler = ARSampler(
	model=_unwrap(model),
	tokenizer=tokenizer,
	device=device,
	dtype=dtype,
	)
	out_ids = sampler.generate(
	prompt_ids=prompt_ids,
	max_new_tokens=args.max_new_tokens,
	temperature=args.temperature,
	top_k=args.top_k,
	top_p=args.top_p,
	eos_token_id=tokenizer.eos_token_id,
	stop_on_eos=not args.no_stop_on_eos,
	) # [B, P + k]

	print("\n" + "=" * 72)
	for i, ids in enumerate(out_ids):
	ids_list = ids.tolist()
	print(f"[Sample {i + 1}]")
	prompt_text = tokenizer.decode(ids_list[:P], skip_special_tokens=True)
	gen_text = tokenizer.decode(ids_list[P:], skip_special_tokens=True)
	if P > 1 or args.prompt_text is not None:
	print(f"<prompt ({P} tok)> {prompt_text}")
	print(f"<generated> {gen_text}")
	else:
	print(tokenizer.decode(ids_list, skip_special_tokens=True))
	print()


	if __name__ == "__main__":
	main()