lta / LTA_openwebtext_dualt /scripts /flowtext_decode_lab.py

Add files using upload-large-folder tool

0241b9f verified 6 days ago

22.6 kB

	#!/usr/bin/env python3
	"""Decode-sweep lab for FlowText OpenWebText checkpoints.

	The goal is to debug inference without touching training. We try several
	simplex-valid update rules, generate many candidates, and rank them with
	anti-collapse diagnostics instead of pure self-likelihood.

	Run from the flowtext_standard_bench repository root.
	"""

	from __future__ import annotations

	import argparse
	import json
	import math
	import re
	import sys
	from collections import Counter
	from dataclasses import dataclass, asdict
	from pathlib import Path
	from typing import Iterable, List, Sequence

	import torch
	import torch.nn.functional as F

	# The repository root is the parent of the directory holding this script.
	# It is put at the front of the import path (once) so that project-local
	# modules can be imported when the script is launched directly.
	REPO_ROOT = Path(__file__).resolve().parents[1]
	if all(entry != str(REPO_ROOT) for entry in sys.path):
	    sys.path.insert(0, str(REPO_ROOT))

	from eval import build_model_from_ckpt
	from flowtext_lab.bridges import smooth_onehot
	from flowtext_lab.decode import model_time_for_step, sample_noise_simplex, state_for_model
	from flowtext_lab.tokenization import BpeTextTokenizer


	# Coarse tokenizer used only for the text diagnostics: a run of ASCII
	# letters, a run of digits, or any single other non-whitespace character.
	# The bars are plain regex alternation; an escaped bar would match a literal
	# pipe character and the pattern would find nothing in ordinary text.
	WORD_RE = re.compile(r"[A-Za-z]+|\d+|[^\sA-Za-z\d]")


	@dataclass
	class DecodeConfig:
	    """Settings for one decode variant of the sweep.

	    ``label`` and ``rule`` are required; every other field has a default.
	    Instances are mutable so a caller can override fields after construction.
	    """

	    # Human-readable name used in logs and in the output records.
	    label: str
	    # Update rule: "flowmap", "replace", "geometric" or "centered_residual".
	    rule: str
	    # Number of refinement iterations.
	    steps: int = 64
	    # Time-schedule mode handed to the model-time helper.
	    model_t_mode: str = "flow"
	    # Step size for the "replace", "geometric" and "centered_residual" rules.
	    eta: float = 0.5
	    # Multiplier applied to the "flowmap" step size.
	    damping: float = 1.0
	    # Upper bound on the "flowmap" step size; values <= 0 disable the cap.
	    max_gamma: float = 1.0
	    # Logits are divided by this temperature before the softmax.
	    endpoint_temp: float = 1.0
	    # Lower bound applied to every probability after each update.
	    state_floor: float = 1e-8
	    # Which distribution is returned: "state", "endpoint" or "blend".
	    final_from: str = "state"
	    # Weight of the uniform distribution mixed into the state each step.
	    noise_mix: float = 0.0
	    # Decay of that weight: "linear", "sqrt", anything else keeps it constant.
	    noise_decay: str = "linear"
	    # Additive bias on the end-of-sequence logit (0.0 leaves logits untouched).
	    eos_logit_bias: float = 0.0


	def tokenize_for_metrics(text: str) -> list[str]:
	    """Return every non-overlapping ``WORD_RE`` match in ``text``, in order."""
	    return [match.group(0) for match in WORD_RE.finditer(text)]


	def repeated_ngram_frac(tokens: Sequence[str], n: int) -> float:
	    """Return the fraction of n-grams that repeat an earlier n-gram.

	    Each occurrence of an n-gram beyond its first counts as one repeat; the
	    number of repeats is divided by the total number of n-grams. A sequence
	    shorter than ``n`` yields ``0.0``.
	    """
	    if len(tokens) < n:
	        return 0.0
	    shifted = (tokens[offset:] for offset in range(n))
	    grams = list(zip(*shifted))
	    # total - distinct equals the sum of (count - 1) over all n-grams.
	    repeats = len(grams) - len(set(grams))
	    return repeats / max(len(grams), 1)


	def text_metrics(text: str) -> dict:
	    """Compute anti-collapse diagnostics and a heuristic quality score.

	    The text is split with ``tokenize_for_metrics``; "words" are the tokens
	    made only of ASCII letters, lower-cased. The returned dict holds:

	    * ``quality``: the combined score defined below,
	    * ``chars`` / ``tokens`` / ``words``: sizes of the text and token lists,
	    * ``eos_count``: occurrences of the ``<|endoftext|>`` marker,
	    * ``bad_char_count``: occurrences of the U+FFFD replacement character,
	    * ``rep3`` / ``rep4``: repeated 3-gram / 4-gram fractions over all
	      lower-cased tokens,
	    * ``distinct1`` / ``distinct2``: distinct word and word-bigram ratios,
	    * ``digit_frac`` / ``punct_frac``: share of digit and punctuation tokens,
	    * ``max_word_frac``: share of the most frequent word (``1.0`` when the
	      text contains no words, so empty output gets the full penalty).
	    """
	    toks = tokenize_for_metrics(text)
	    lowered = [t.lower() for t in toks]
	    words = [t.lower() for t in toks if re.fullmatch(r"[A-Za-z]+", t)]
	    n_tok = max(len(toks), 1)
	    n_words = max(len(words), 1)
	    word_counts = Counter(words)
	    max_word_frac = word_counts.most_common(1)[0][1] / n_words if word_counts else 1.0
	    distinct1 = len(set(words)) / n_words if words else 0.0
	    bigrams = list(zip(words, words[1:]))
	    distinct2 = len(set(bigrams)) / max(len(bigrams), 1) if bigrams else 0.0
	    digit_frac = sum(t.isdigit() for t in toks) / n_tok
	    punct_frac = sum(bool(re.fullmatch(r"[,.;:!?]+", t)) for t in toks) / n_tok
	    # The marker must be spelled with bare pipes; with backslashes in the
	    # literal the count never matches decoded text and the penalty is lost.
	    eos_count = text.count("<|endoftext|>")
	    bad_char_count = text.count("�")
	    rep3 = repeated_ngram_frac(lowered, 3)
	    rep4 = repeated_ngram_frac(lowered, 4)
	    # This score is deliberately simple and non-oracle. It rewards length and
	    # lexical variety while heavily penalizing classic collapse artifacts.
	    quality = (
	        min(len(text) / 700.0, 1.0)
	        + 0.35 * distinct2
	        + 0.15 * distinct1
	        - 0.30 * eos_count
	        - 2.60 * rep3
	        - 1.60 * rep4
	        - 1.30 * digit_frac
	        - 0.65 * punct_frac
	        - 1.35 * max_word_frac
	        - 0.35 * bad_char_count
	    )
	    return {
	        "quality": float(quality),
	        "chars": len(text),
	        "tokens": len(toks),
	        "words": len(words),
	        "eos_count": eos_count,
	        "bad_char_count": bad_char_count,
	        "rep3": float(rep3),
	        "rep4": float(rep4),
	        "distinct1": float(distinct1),
	        "distinct2": float(distinct2),
	        "digit_frac": float(digit_frac),
	        "punct_frac": float(punct_frac),
	        "max_word_frac": float(max_word_frac),
	    }


	def decode_text(tokenizer: BpeTextTokenizer, ids: Sequence[int]) -> str:
	    """Decode ``ids`` to text without stopping at EOS or dropping special tokens."""
	    keep_everything = {"stop_at_eos": False, "skip_special_tokens": False}
	    return tokenizer.decode(ids, **keep_everything)


	def encode_prompt(tokenizer: BpeTextTokenizer, prompt: str, max_len: int) -> list[int]:
	    """Encode ``prompt`` with the wrapped tokenizer and keep the first ``max_len`` ids."""
	    encoding = tokenizer.tokenizer.encode(prompt)
	    token_ids = list(encoding.ids)
	    return token_ids[:max_len]


	@torch.no_grad()
	def build_initial_state(
	    tokenizer: BpeTextTokenizer,
	    prompts: list[str],
	    restarts: int,
	    max_len: int,
	    target_prob: float,
	    eps: float,
	    noise_init: str,
	    noise_sigma: float,
	    dirichlet_init_concentration: float,
	    device: torch.device,
	) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, list[str]]:
	    """Build the starting state for a batch of ``restarts`` candidates per prompt.

	    Each prompt is encoded once (truncated to ``max_len`` ids) and repeated
	    ``restarts`` times, so the batch has ``len(prompts) * restarts`` rows in
	    prompt-major order. All positions start from ``sample_noise_simplex``;
	    the prompt prefix of every row is then overwritten with the smoothed
	    one-hot of its tokens and marked as locked.

	    Returns ``(probs, attn, lock, lock_probs, expanded)``:

	    * ``probs``: the noise state with prompt prefixes filled in
	      (presumably ``(batch, max_len, vocab)`` -- confirm against
	      ``sample_noise_simplex``),
	    * ``attn``: all-True bool mask of shape ``(batch, max_len)``,
	    * ``lock``: bool mask, True on prompt positions,
	    * ``lock_probs``: float32 ``(batch, max_len, vocab)``, zero except on
	      locked positions where it holds the smoothed prompt distribution,
	    * ``expanded``: the prompt string belonging to each batch row.
	    """
	    encoded = [(prompt, encode_prompt(tokenizer, prompt, max_len=max_len)) for prompt in prompts]
	    expanded: list[str] = [prompt for prompt, _ids in encoded for _restart in range(restarts)]
	    prompt_ids: list[list[int]] = [ids for _prompt, ids in encoded for _restart in range(restarts)]

	    batch = len(prompt_ids)
	    grid = (batch, max_len)
	    attn = torch.ones(grid, dtype=torch.bool, device=device)
	    probs = sample_noise_simplex(
	        grid,
	        tokenizer.vocab_size,
	        device,
	        eps,
	        noise_mode=noise_init,
	        target_prob=target_prob,
	        noise_sigma=noise_sigma,
	        dirichlet_concentration=dirichlet_init_concentration,
	    )
	    lock = torch.zeros(grid, dtype=torch.bool, device=device)
	    lock_probs = torch.zeros((batch, max_len, tokenizer.vocab_size), dtype=torch.float32, device=device)
	    for row, ids in enumerate(prompt_ids):
	        prefix_len = len(ids)
	        if prefix_len == 0:
	            # Empty prompt: unconditional generation, nothing to pin.
	            continue
	        ids_t = torch.tensor(ids, dtype=torch.long, device=device).unsqueeze(0)
	        smoothed = smooth_onehot(ids_t, tokenizer.vocab_size, target_prob, eps)[0]
	        probs[row, :prefix_len] = smoothed
	        lock_probs[row, :prefix_len] = smoothed
	        lock[row, :prefix_len] = True
	    return probs, attn, lock, lock_probs, expanded


	def flowmap_gamma(step: int, steps: int, damping: float, max_gamma: float, eps: float) -> float:
	    """Return the interpolation weight for one "flowmap" step.

	    With ``s = step / steps`` and ``t_next = (step + 1) / steps`` the base
	    weight is ``(t_next - s) / (1 - s)``, i.e. the fraction of the remaining
	    time covered by this step (the denominator is floored at ``eps``). The
	    base weight is scaled by ``damping`` and, when ``max_gamma`` is positive,
	    capped at ``max_gamma``. ``steps`` below 1 is treated as 1.
	    """
	    total = max(steps, 1)
	    now = step / total
	    nxt = (step + 1) / total
	    remaining = max(1.0 - now, eps)
	    gamma = float(damping) * ((nxt - now) / remaining)
	    if max_gamma > 0:
	        return min(gamma, float(max_gamma))
	    return gamma


	@torch.no_grad()
	def decode_batch(
	    model,
	    init_probs: torch.Tensor,
	    attn: torch.Tensor,
	    lock: torch.Tensor,
	    lock_probs: torch.Tensor,
	    cfg: DecodeConfig,
	    eps: float,
	    eos_id: int | None = None,
	) -> torch.Tensor:
	    """Iteratively refine a batch of per-position distributions.

	    Each of the ``cfg.steps`` iterations asks ``model`` for logits, turns
	    them into an endpoint distribution (temperature ``cfg.endpoint_temp``,
	    optional additive bias on ``eos_id``) and moves the current state toward
	    that endpoint with the rule named by ``cfg.rule``:

	    * ``"flowmap"``: linear step of size ``flowmap_gamma(...)``,
	    * ``"replace"``: convex mix with weight ``cfg.eta``,
	    * ``"geometric"``: mix in log space, renormalised with a softmax,
	    * ``"centered_residual"``: add ``cfg.eta`` times the mean-centred
	      difference.

	    After every step an optional uniform "leak" (``cfg.noise_mix`` with
	    ``cfg.noise_decay``) is mixed in, probabilities are floored at
	    ``max(cfg.state_floor, eps)`` and renormalised over the last axis, and
	    positions where ``lock`` is True are reset to ``lock_probs``.

	    Args:
	        model: callable invoked as ``model(state, t, attn)`` returning logits.
	        init_probs: starting distributions; copied, never modified
	            (assumed ``(batch, seq, vocab)`` -- confirm with the caller).
	        attn: attention mask forwarded to ``model`` unchanged.
	        lock: bool mask of positions that must keep ``lock_probs``.
	        lock_probs: distributions to restore on locked positions.
	        cfg: decode settings.
	        eps: numerical floor used for clamping and divisions.
	        eos_id: index of the EOS logit to bias; ignored when ``None`` or
	            outside the logit range.

	    Returns:
	        The final state (``cfg.final_from == "state"`` or any unrecognised
	        value), the last endpoint (``"endpoint"``) or their equal-weight
	        average (``"blend"``); the latter two are re-locked and
	        renormalised. With ``cfg.steps == 0`` the "endpoint" is the
	        initial state itself.

	    Raises:
	        ValueError: if ``cfg.rule`` is not one of the rules above (raised on
	            the first iteration).
	    """
	    probs = init_probs.float().clone()
	    device = probs.device
	    locked = lock.unsqueeze(-1)
	    last_endpoint = probs
	    for step in range(cfg.steps):
	        t = model_time_for_step(cfg.model_t_mode, step, cfg.steps, probs.size(0), device, dtype=torch.float32)
	        logits = model(state_for_model(model, probs, eps), t, attn).float()
	        if cfg.endpoint_temp != 1.0:
	            logits = logits / float(cfg.endpoint_temp)
	        if cfg.eos_logit_bias != 0.0 and eos_id is not None and 0 <= eos_id < logits.size(-1):
	            logits[..., eos_id] = logits[..., eos_id] + float(cfg.eos_logit_bias)
	        endpoint = F.softmax(logits, dim=-1)
	        last_endpoint = endpoint

	        if cfg.rule == "flowmap":
	            gamma = flowmap_gamma(step, cfg.steps, cfg.damping, cfg.max_gamma, eps)
	            new_probs = probs + gamma * (endpoint - probs)
	        elif cfg.rule == "replace":
	            new_probs = (1.0 - cfg.eta) * probs + cfg.eta * endpoint
	        elif cfg.rule == "geometric":
	            log_mix = (1.0 - cfg.eta) * torch.log(probs.clamp_min(eps)) + cfg.eta * torch.log(endpoint.clamp_min(eps))
	            new_probs = F.softmax(log_mix, dim=-1)
	        elif cfg.rule == "centered_residual":
	            # Add a zero-sum probability residual, then project back to simplex.
	            residual = endpoint - probs
	            residual = residual - residual.mean(dim=-1, keepdim=True)
	            new_probs = probs + cfg.eta * residual
	        else:
	            raise ValueError(f"Unknown decode rule: {cfg.rule}")

	        if cfg.noise_mix > 0:
	            # Fraction of the schedule still ahead after this step.
	            remaining = 1.0 - (step + 1) / max(cfg.steps, 1)
	            if cfg.noise_decay == "linear":
	                lam = cfg.noise_mix * remaining
	            elif cfg.noise_decay == "sqrt":
	                lam = cfg.noise_mix * math.sqrt(max(0.0, remaining))
	            else:
	                # Any other value keeps the leak constant.
	                lam = cfg.noise_mix
	            if lam > 0:
	                uniform = torch.full_like(new_probs, 1.0 / new_probs.size(-1))
	                new_probs = (1.0 - lam) * new_probs + lam * uniform

	        # Project back onto the simplex: floor, renormalise, re-pin the prompt.
	        new_probs = new_probs.clamp_min(max(float(cfg.state_floor), eps))
	        new_probs = new_probs / new_probs.sum(dim=-1, keepdim=True).clamp_min(eps)
	        new_probs = torch.where(locked, lock_probs, new_probs)
	        probs = new_probs

	    if cfg.final_from == "endpoint":
	        out = last_endpoint
	    elif cfg.final_from == "blend":
	        out = 0.5 * probs + 0.5 * last_endpoint
	    else:
	        return probs
	    out = torch.where(locked, lock_probs, out)
	    return out / out.sum(dim=-1, keepdim=True).clamp_min(eps)


	@torch.no_grad()
	def pseudo_likelihood_scores(
	    model,
	    tokenizer: BpeTextTokenizer,
	    probs: torch.Tensor,
	    attn: torch.Tensor,
	    lock: torch.Tensor,
	    target_prob: float,
	    eps: float,
	    repeats: int,
	    mask_frac: float,
	    rerank_t: float,
	) -> torch.Tensor:
	    """Score each decoded row by masked pseudo-log-likelihood.

	    The argmax tokens of ``probs`` are taken as the candidate text. For each
	    of ``max(1, repeats)`` rounds a random subset of the eligible positions
	    (attended and not locked, each chosen with probability ``mask_frac``) is
	    replaced by simplex noise, the model is evaluated at time ``rerank_t``,
	    and the log-probability it assigns to the candidate token at every
	    masked position is accumulated. A row with eligible positions always
	    gets at least one masked position per round.

	    Returns a float32 tensor with one value per row: the mean log-probability
	    over all masked positions (``0.0`` for a row that never had one).
	    """
	    token_ids = probs.argmax(dim=-1)
	    clean = smooth_onehot(token_ids, tokenizer.vocab_size, target_prob, eps)
	    eligible = attn & (~lock)
	    totals = torch.zeros(token_ids.size(0), dtype=torch.float32, device=token_ids.device)
	    n_scored = torch.zeros_like(totals)
	    for _ in range(max(1, repeats)):
	        score_mask = (torch.rand_like(token_ids.float()) < mask_frac) & eligible
	        for row in range(token_ids.size(0)):
	            if not eligible[row].any() or score_mask[row].any():
	                continue
	            # Nothing was drawn for this row: force one random eligible slot.
	            candidates = torch.nonzero(eligible[row], as_tuple=False).flatten()
	            pick = torch.randint(0, candidates.numel(), (1,), device=token_ids.device)
	            score_mask[row, candidates[pick]] = True
	        noise = sample_noise_simplex(
	            (token_ids.size(0), token_ids.size(1)),
	            tokenizer.vocab_size,
	            token_ids.device,
	            eps,
	            noise_mode="logistic_normal",
	            target_prob=target_prob,
	            noise_sigma=-1.0,  # meaning defined by sample_noise_simplex -- TODO confirm
	        )
	        masked_input = torch.where(score_mask.unsqueeze(-1), noise, clean)
	        # Locked (prompt) positions keep the distribution they were decoded with.
	        masked_input = torch.where(lock.unsqueeze(-1), probs, masked_input)
	        t = torch.full((token_ids.size(0),), float(rerank_t), dtype=torch.float32, device=token_ids.device)
	        logits = model(state_for_model(model, masked_input, eps), t, attn).float()
	        all_logp = F.log_softmax(logits, dim=-1)
	        token_logp = all_logp.gather(-1, token_ids.unsqueeze(-1)).squeeze(-1)
	        totals += (token_logp * score_mask.float()).sum(dim=-1)
	        n_scored += score_mask.float().sum(dim=-1)
	    return totals / n_scored.clamp_min(1.0)


	def default_configs(steps: int, config_set: str) -> list[DecodeConfig]:
	    """Return the predefined sweep named ``config_set``, each config using ``steps``.

	    Known sets are ``"focused_flowmap"``, ``"best_flowmap"``,
	    ``"final_projection"``, ``"eos_sweep"`` and ``"broad"``.

	    Raises:
	        ValueError: for any other ``config_set``.
	    """

	    def flow(label: str, damping: float = 1.0, max_gamma: float = 1.0, **extra) -> DecodeConfig:
	        # "flowmap" rule with damping and max_gamma always passed explicitly.
	        return DecodeConfig(label, "flowmap", steps=steps, damping=damping, max_gamma=max_gamma, **extra)

	    def mix(label: str, rule: str, eta: float, **extra) -> DecodeConfig:
	        # Any eta-driven rule ("replace", "geometric", "centered_residual").
	        return DecodeConfig(label, rule, steps=steps, eta=eta, **extra)

	    if config_set == "focused_flowmap":
	        return [
	            flow("flowmap_t1p00_d1p0"),
	            flow("flowmap_t1p10_d1p0", endpoint_temp=1.10),
	            flow("flowmap_t1p25_d1p0", endpoint_temp=1.25),
	            flow("flowmap_t1p40_d1p0", endpoint_temp=1.40),
	            flow("flowmap_t1p60_d1p0", endpoint_temp=1.60),
	            flow("flowmap_t1p25_d0p7", damping=0.7, endpoint_temp=1.25),
	            flow("flowmap_t1p40_d0p7", damping=0.7, endpoint_temp=1.40),
	            flow("flowmap_t1p60_d0p7", damping=0.7, endpoint_temp=1.60),
	            flow("flowmap_t1p25_g0p5", max_gamma=0.5, endpoint_temp=1.25),
	            flow("flowmap_t1p40_g0p5", max_gamma=0.5, endpoint_temp=1.40),
	        ]

	    if config_set == "best_flowmap":
	        return [
	            flow("flowmap_t1p25_d0p7", damping=0.7, endpoint_temp=1.25),
	            flow("flowmap_t1p25_d1p0", endpoint_temp=1.25),
	            flow("flowmap_t1p35_d1p0", endpoint_temp=1.35),
	            flow("flowmap_t1p40_d1p0", endpoint_temp=1.40),
	        ]

	    if config_set == "final_projection":
	        # Three base settings, each read out from state, endpoint and blend.
	        bases = (
	            ("flowmap_t1p35", 1.0, 1.35),
	            ("flowmap_t1p40", 1.0, 1.40),
	            ("flowmap_t1p25_d0p7", 0.7, 1.25),
	        )
	        return [
	            flow(f"{stem}_{source}", damping=damping, endpoint_temp=temp, final_from=source)
	            for stem, damping, temp in bases
	            for source in ("state", "endpoint", "blend")
	        ]

	    if config_set == "eos_sweep":
	        return [
	            flow("flowmap_t1p35_eos0", endpoint_temp=1.35, eos_logit_bias=0.0),
	            flow("flowmap_t1p35_eos-1", endpoint_temp=1.35, eos_logit_bias=-1.0),
	            flow("flowmap_t1p35_eos-2", endpoint_temp=1.35, eos_logit_bias=-2.0),
	            flow("flowmap_t1p35_eos-3", endpoint_temp=1.35, eos_logit_bias=-3.0),
	            flow("flowmap_t1p40_eos-2", endpoint_temp=1.40, eos_logit_bias=-2.0),
	            flow("flowmap_t1p25_d0p7_eos-2", damping=0.7, endpoint_temp=1.25, eos_logit_bias=-2.0),
	        ]

	    if config_set == "broad":
	        return [
	            flow("flowmap64", final_from="state"),
	            flow("flowmap_temp1p25", endpoint_temp=1.25),
	            flow("flowmap_temp0p85", endpoint_temp=0.85),
	            mix("replace_eta0p35", "replace", 0.35),
	            mix("replace_eta0p50", "replace", 0.50),
	            mix("replace_eta0p65", "replace", 0.65),
	            mix("replace_eta0p50_temp1p25", "replace", 0.50, endpoint_temp=1.25),
	            mix("geometric_eta0p25", "geometric", 0.25),
	            mix("geometric_eta0p50", "geometric", 0.50),
	            mix("centered_residual_eta0p20", "centered_residual", 0.20),
	            mix("replace_eta0p50_floor1e6", "replace", 0.50, state_floor=1e-6),
	            mix("replace_eta0p50_leak", "replace", 0.50, noise_mix=0.03, noise_decay="sqrt"),
	        ]

	    raise ValueError(f"Unknown config_set: {config_set}")


	def aggregate(rows: list[dict]) -> dict:
	    """Average the headline metrics over ``rows``.

	    Returns a dict mapping ``"mean_<metric>"`` to the arithmetic mean of that
	    metric; an empty ``rows`` list yields ``0.0`` for every metric.
	    """
	    metric_names = (
	        "quality",
	        "eos_count",
	        "rep3",
	        "rep4",
	        "distinct1",
	        "distinct2",
	        "digit_frac",
	        "max_word_frac",
	    )
	    denom = max(len(rows), 1)
	    means = {}
	    for name in metric_names:
	        total = sum(float(row[name]) for row in rows)
	        means[f"mean_{name}"] = total / denom
	    return means


	def main() -> None:
	    """Run the decode sweep from the command line.

	    Loads the tokenizer and checkpoint, then for every config of the chosen
	    ``--config_set``: builds a fresh noisy initial state for all prompts and
	    restarts, decodes it (optionally in chunks of ``--decode_batch_size``
	    rows), scores every candidate with ``text_metrics`` and, when
	    ``--score_repeats`` is positive, with ``pseudo_likelihood_scores``.

	    One ``summary`` record per config and the top ``--topk`` ``sample``
	    records per prompt are written as JSON lines to ``--output``; the best
	    sample of each prompt is also printed.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--checkpoint", required=True)
	    parser.add_argument("--tokenizer_path", required=True)
	    parser.add_argument("--max_len", type=int, default=128)
	    parser.add_argument("--steps", type=int, default=64)
	    parser.add_argument("--restarts", type=int, default=64)
	    parser.add_argument("--target_prob", type=float, default=0.99)
	    parser.add_argument("--eps", type=float, default=1e-8)
	    parser.add_argument("--model_t_mode", choices=["linear", "flow", "const0", "const05", "const1", "random"], default="flow")
	    parser.add_argument("--noise_init", choices=["uniform", "logistic_normal", "dirichlet"], default="dirichlet")
	    parser.add_argument("--noise_sigma", type=float, default=-1.0)
	    parser.add_argument("--dirichlet_init_concentration", type=float, default=1.0)
	    # Prompts are separated by a bare "|"; the leading separator yields an
	    # empty first prompt.
	    parser.add_argument("--prompts", default="|The|In the early morning|Scientists have|The company said|A young woman")
	    parser.add_argument("--score_repeats", type=int, default=0)
	    parser.add_argument("--score_mask_frac", type=float, default=0.5)
	    parser.add_argument("--rerank_t", type=float, default=0.5)
	    parser.add_argument("--pl_weight", type=float, default=0.0)
	    parser.add_argument("--output", default="runs/decode_lab/latest_decode_lab.jsonl")
	    parser.add_argument("--config_set", default="broad", choices=["broad", "focused_flowmap", "best_flowmap", "final_projection", "eos_sweep"])
	    parser.add_argument("--decode_batch_size", type=int, default=0)
	    parser.add_argument("--topk", type=int, default=5)
	    parser.add_argument("--seed", type=int, default=20260428)
	    args = parser.parse_args()

	    torch.manual_seed(args.seed)
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    tokenizer = BpeTextTokenizer.from_file(args.tokenizer_path)
	    ckpt = torch.load(args.checkpoint, map_location="cpu")
	    model = build_model_from_ckpt(ckpt, tokenizer.vocab_size, args.max_len, device)
	    model.eval()

	    prompts = args.prompts.split("|")
	    # Keep the first empty prompt: it is unconditional generation.
	    print(f"[info] device={device} prompts={prompts} restarts={args.restarts} steps={args.steps}")
	    print(f"[info] checkpoint={args.checkpoint}")

	    out_path = Path(args.output)
	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    configs = default_configs(args.steps, args.config_set)
	    for cfg in configs:
	        cfg.model_t_mode = args.model_t_mode
	    # Records are dumped with ensure_ascii=False, so the file must be UTF-8
	    # regardless of the platform's default encoding.
	    with out_path.open("w", encoding="utf-8") as f:
	        for cfg in configs:
	            init, attn, lock, lock_probs, expanded = build_initial_state(
	                tokenizer=tokenizer,
	                prompts=prompts,
	                restarts=args.restarts,
	                max_len=args.max_len,
	                target_prob=args.target_prob,
	                eps=args.eps,
	                noise_init=args.noise_init,
	                noise_sigma=args.noise_sigma,
	                dirichlet_init_concentration=args.dirichlet_init_concentration,
	                device=device,
	            )
	            if args.decode_batch_size > 0 and init.size(0) > args.decode_batch_size:
	                # Decode in row chunks and park each result on the CPU.
	                decoded_parts = []
	                for start in range(0, init.size(0), args.decode_batch_size):
	                    end = min(start + args.decode_batch_size, init.size(0))
	                    part = decode_batch(
	                        model,
	                        init[start:end],
	                        attn[start:end],
	                        lock[start:end],
	                        lock_probs[start:end],
	                        cfg,
	                        args.eps,
	                        tokenizer.eos_id,
	                    )
	                    decoded_parts.append(part.detach().cpu())
	                    print(f"[chunk] {cfg.label} decoded {end}/{init.size(0)}", flush=True)
	                decoded = torch.cat(decoded_parts, dim=0)
	            else:
	                decoded = decode_batch(model, init, attn, lock, lock_probs, cfg, args.eps, tokenizer.eos_id)
	            ids = decoded.argmax(dim=-1).detach().cpu().tolist()
	            texts = [decode_text(tokenizer, row) for row in ids]
	            rows = []
	            for i, text in enumerate(texts):
	                m = text_metrics(text)
	                m.update({"candidate": i, "prompt": expanded[i], "text": text})
	                rows.append(m)
	            if args.score_repeats > 0:
	                # Chunked decoding leaves `decoded` on the CPU; move it back.
	                decoded_for_score = decoded.to(device) if decoded.device != device else decoded
	                pl = pseudo_likelihood_scores(
	                    model,
	                    tokenizer,
	                    decoded_for_score,
	                    attn,
	                    lock,
	                    args.target_prob,
	                    args.eps,
	                    repeats=args.score_repeats,
	                    mask_frac=args.score_mask_frac,
	                    rerank_t=args.rerank_t,
	                ).detach().cpu().tolist()
	                for row, score in zip(rows, pl):
	                    row["pseudo_logp"] = float(score)
	                    row["rank_score"] = float(row["quality"] + args.pl_weight * score)
	            else:
	                for row in rows:
	                    row["pseudo_logp"] = None
	                    row["rank_score"] = float(row["quality"])

	            summary = {"type": "summary", "config": asdict(cfg), "agg": aggregate(rows)}
	            f.write(json.dumps(summary, ensure_ascii=False) + "\n")
	            print("\n" + "=" * 96)
	            print("[config]", cfg.label, asdict(cfg))
	            print("[metrics]", json.dumps(summary["agg"], ensure_ascii=False))
	            for prompt in prompts:
	                subset = [r for r in rows if r["prompt"] == prompt]
	                subset.sort(key=lambda r: r["rank_score"], reverse=True)
	                for rank, row in enumerate(subset[: args.topk], 1):
	                    rec = {"type": "sample", "config": asdict(cfg), "rank": rank, **row}
	                    f.write(json.dumps(rec, ensure_ascii=False) + "\n")
	                    if rank <= 1:
	                        print(f"\n--- best prompt={prompt!r} rank_score={row['rank_score']:.4f} quality={row['quality']:.4f} ---")
	                        print(row["text"])

	            # Drop the per-config tensors before the next config allocates.
	            del init, attn, lock, lock_probs, decoded
	            if torch.cuda.is_available():
	                torch.cuda.empty_cache()

	    print(f"[done] wrote {out_path}")


	if __name__ == "__main__":
	    main()