Rename public-facing CodonGPT strings to CodonTranslator

4547beb verified 4 days ago

54.2 kB

	#!/usr/bin/env python
	"""
	Teacher-forced (and optional free-run) evaluation on a random subset of your
	dataset to measure codon token cross-entropy and AA token accuracy, using the
	same conditioning pathway as training.

	Supports either a CSV file or Parquet input via a directory/glob (e.g.,
	./data/val/*.parquet).

	Usage examples:
	# CSV input
	python eval.py \
	--model_path outputs/checkpoint-21000 \
	--data_path random_sample_1000.csv \
	--embeddings_dir embeddings \
	--num_samples 10 \
	--batch_size 10 \
	--device cuda

	# Parquet glob input
	python eval.py \
	--model_path outputs/checkpoint-21000 \
	--data_path "./data/val/*.parquet" \
	--embeddings_dir embeddings \
	--num_samples 64 \
	--batch_size 32 \
	--device cuda
	"""

	import argparse
	import json
	import logging
	import random
	from pathlib import Path
	from typing import List, Optional, Tuple
	import glob

	import torch
	import torch.nn.functional as F
	import pandas as pd

	from src.sampler import CodonSampler
	from src.dataset import SpeciesEmbeddingStore, StreamSeqDataset, stage_collate_fn
	from torch.utils.data import DataLoader


	logging.basicConfig(
	format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
	datefmt="%m/%d/%Y %H:%M:%S",
	level=logging.INFO,
	)
	logger = logging.getLogger("eval_tf")


	def parse_args():
	p = argparse.ArgumentParser("Teacher-forced evaluation of CodonTranslator")
	p.add_argument("--model_path", required=True, type=str,
	help="Path to checkpoint dir (with config.json / model.safetensors)")
	# Input data: CSV file or Parquet glob/dir
	p.add_argument("--data_path", required=False, type=str, default=None,
	help="CSV file or Parquet glob/dir (e.g., ./data/val/*.parquet)")
	# Back-compat: --csv_path still accepted (deprecated)
	p.add_argument("--csv_path", required=False, type=str, default=None,
	help="[Deprecated] CSV with columns: Taxon, protein_seq, cds_DNA")
	p.add_argument("--embeddings_dir", type=str, default=None,
	help="Species embeddings directory (recommended for parity)")
	p.add_argument("--num_samples", type=int, default=10)
	p.add_argument("--batch_size", type=int, default=10)
	p.add_argument("--seed", type=int, default=42)
	p.add_argument("--device", type=str, default="cuda")
	p.add_argument("--workers", type=int, default=0,
	help="DataLoader workers for --eval_all streaming mode")
	# Free-run (sampling) evaluation options
	p.add_argument("--free_run", action="store_true",
	help="If set, perform real sampling instead of teacher forcing and compare to ground-truth codon sequences")
	p.add_argument("--temperature", type=float, default=0.8)
	p.add_argument("--top_k", type=int, default=50)
	p.add_argument("--top_p", type=float, default=0.9)
	p.add_argument("--control_mode", type=str, choices=["fixed","variable"], default="fixed")
	p.add_argument("--enforce_translation", action="store_true",
	help="Hard-mask decoding to codons matching target amino acid at each position during free-run evaluation")
	# Full-dataset streaming eval (no sampling)
	p.add_argument("--eval_all", action="store_true",
	help="Stream over all rows from --data_path and compute aggregated metrics (memory-safe)")
	p.add_argument("--max_records", type=int, default=0,
	help="When --eval_all is set: limit to first N samples (0 = all)")
	p.add_argument("--debug_aa_check", action="store_true",
	help="Print per-sample agreement between CDS→AA (standard code) and provided protein")
	# Per-sequence export over standard splits ./data/val and ./data/test
	p.add_argument("--export_per_sequence", action="store_true",
	help="Process ./data/val and ./data/test parquets in batches and export a per-sequence CSV")
	p.add_argument("--splits_root", type=str, default="./data",
	help="Root directory that contains val/ and test/ subfolders with parquet files")
	p.add_argument("--out_csv", type=str, default="outputs/eval_per_sequence.csv",
	help="Output CSV path for per-sequence export")
	p.add_argument("--export_splits", nargs="+", default=["val", "test"],
	help="Subdirectories under --splits_root to process (default: val test)")
	p.add_argument("--max_rows_per_split", type=int, default=0,
	help="When --export_per_sequence is set: limit number of rows per split (0 = all)")
	p.add_argument("--progress", action="store_true",
	help="Show progress bars during per-sequence export")
	# Capacity and evaluation controls
	p.add_argument("--no_truncation", action="store_true",
	help="Fit prefix caps so generated codon length equals protein length (avoids capacity truncation)")
	p.add_argument("--species_prefix_cap", type=int, default=0,
	help="When >0 and --no_truncation is set, cap species token prefix to this many tokens; 0 = no species cap")
	return p.parse_args()


	def _is_parquet_path(p: str) -> bool:
	lower = p.lower()
	return lower.endswith(".parquet") or lower.endswith(".parq")


	def _expand_paths(maybe_path_or_glob: str) -> List[str]:
	"""Expand a path/glob or directory into a sorted list of files.
	Prioritize Parquet when scanning a directory.
	"""
	paths: List[str] = []
	P = Path(maybe_path_or_glob)
	if P.is_dir():
	paths.extend(sorted(str(x) for x in P.rglob("*.parquet")))
	paths.extend(sorted(str(x) for x in P.rglob("*.parq")))
	paths.extend(sorted(str(x) for x in P.rglob("*.csv")))
	paths.extend(sorted(str(x) for x in P.rglob("*.tsv")))
	paths.extend(sorted(str(x) for x in P.rglob("*.csv.gz")))
	paths.extend(sorted(str(x) for x in P.rglob("*.tsv.gz")))
	else:
	paths = sorted(glob.glob(str(P)))
	# Dedup while preserving order
	out: List[str] = []
	seen = set()
	for x in paths:
	if x not in seen:
	out.append(x)
	seen.add(x)
	return out


	def _load_random_samples_from_parquet(files: List[str], num_samples: int, seed: int) -> pd.DataFrame:
	"""Collect up to num_samples rows from a list of Parquet files, reading by row group.
	Reads only the required columns and shuffles files/row-groups for decent coverage.
	"""
	try:
	import pyarrow.parquet as pq # type: ignore
	except Exception as e: # pragma: no cover
	raise ImportError("pyarrow is required to read parquet files") from e

	rng = random.Random(seed)
	req = ["Taxon", "protein_seq", "cds_DNA"]
	files = [f for f in files if _is_parquet_path(f)]
	if not files:
	raise FileNotFoundError("No Parquet files found to read")
	files = files.copy()
	rng.shuffle(files)

	collected: List[pd.DataFrame] = []
	remaining = int(max(0, num_samples))
	for fp in files:
	if remaining <= 0:
	break
	pf = pq.ParquetFile(fp)
	nrg = int(pf.num_row_groups or 0)
	if nrg <= 0:
	rgs = [0]
	else:
	rgs = list(range(nrg))
	rng.shuffle(rgs)
	# Only keep columns that exist in this file
	cols = [c for c in req if c in pf.schema.names]
	if len(cols) < len(req):
	missing = sorted(set(req) - set(cols))
	raise ValueError(f"Parquet missing required columns {missing} in {fp}")
	for rg in rgs:
	if remaining <= 0:
	break
	table = pf.read_row_group(rg, columns=cols)
	df = table.to_pandas(types_mapper=None)
	if df.empty:
	continue
	if len(df) > remaining:
	df = df.sample(n=remaining, random_state=rng.randint(0, 2**31 - 1))
	collected.append(df)
	remaining -= len(df)
	if not collected:
	return pd.DataFrame(columns=req)
	out = pd.concat(collected, ignore_index=True)
	# Final shuffle for randomness
	out = out.sample(frac=1.0, random_state=seed).reset_index(drop=True)
	# If we somehow overshot, trim
	if len(out) > num_samples:
	out = out.iloc[:num_samples].reset_index(drop=True)
	return out


	def _preferred_pooling(model_dir: Path) -> str:
	"""
	Best-effort pooling detection:
	- First try checkpoint configs for an explicit hint
	- Fallback to 'last'
	Note: we'll further override this using the embeddings_dir contents if provided.
	"""
	for cfg_name in ("trainer_config.json", "config.json"):
	fp = model_dir / cfg_name
	if fp.exists():
	try:
	with open(fp) as f:
	cfg = json.load(f)
	return str(cfg.get("species_pooling", "last"))
	except Exception:
	continue
	return "last"


	def _detect_pooling_from_embeddings_dir(emb_dir: Path) -> Optional[str]:
	"""Detect actual available pooling format from embeddings_dir contents."""
	fixed_files = [emb_dir / "species_embeddings.bin", emb_dir / "species_metadata.json", emb_dir / "species_vocab.json"]
	seq_files = [emb_dir / "species_tok_emb.bin", emb_dir / "species_index.json", emb_dir / "species_vocab.json"]
	if all(p.exists() for p in fixed_files):
	return "last"
	if all(p.exists() for p in seq_files):
	return "sequence"
	return None


	@torch.no_grad()
	def eval_batch(
	sampler: CodonSampler,
	species_store: Optional[SpeciesEmbeddingStore],
	species_names: List[str],
	protein_seqs: List[str],
	dna_cds_list: List[str],
	) -> Tuple[List[float], List[float]]:
	"""Evaluate a batch in teacher-forced mode.

	Returns per-sample (avg_ce_loss, aa_token_acc).
	"""
	tok = sampler.tokenizer
	pad_id = tok.pad_token_id
	eos_id = tok.eos_token_id

	# Encode DNA to codon ids and align lengths (trim to min protein length)
	codon_ids = []
	seq_lens = []
	for dna, prot in zip(dna_cds_list, protein_seqs):
	# Trim to min length between DNA codons and protein AA
	C_dna = len(dna) // 3
	C_prot = len(prot)
	C = max(min(C_dna, C_prot), 1)
	dna_trim = dna[: 3 * C]
	ids = tok.encode_codon_seq(dna_trim, validate=False)
	ids.append(eos_id)
	codon_ids.append(ids)
	seq_lens.append(len(ids))

	B = len(codon_ids)
	T = max(seq_lens)
	codons = torch.full((B, T), pad_id, dtype=torch.long)
	mask = torch.zeros((B, T), dtype=torch.bool)
	for i, ids in enumerate(codon_ids):
	L = len(ids)
	codons[i, :L] = torch.tensor(ids, dtype=torch.long)
	mask[i, :L] = True

	# inputs/labels aligned to training convention:
	# model predicts next codon after a learned start token; labels are the
	# same positions as inputs (not shifted by 1), with PAD/EOS masked out.
	input_ids = codons[:, :-1]
	labels_base = codons[:, :-1].clone()
	# Mask out PAD and EOS like trainer.evaluate()
	labels_base[labels_base == pad_id] = -100
	labels_base[labels_base == eos_id] = -100

	# Build conditioning dict similar to training and sampler
	cond = {"control_mode": "fixed"}

	if species_store is not None and species_names:
	sid_list = [species_store.vocab.get(s, -1) for s in species_names]
	num_unknown = sum(1 for x in sid_list if x < 0)
	if num_unknown > 0:
	logger.warning(f"{num_unknown}/{len(sid_list)} species not found in embeddings vocab; using zero embeddings")
	result = species_store.batch_get(sid_list)
	if isinstance(result, tuple):
	sp_tok, _ = result # [B, Ls, Ds]
	cond["species_tok_emb_src"] = sp_tok.to(sampler.device)
	cond["species_tok_emb_tgt"] = sp_tok.to(sampler.device)
	else:
	sp = result # [B, Ds]
	cond["species_emb_src"] = sp.to(sampler.device)
	cond["species_emb_tgt"] = sp.to(sampler.device)
	elif species_names:
	# On-the-fly species embeddings using Qwen (sequence pooling for training parity)
	seq_emb, _lens = sampler._qwen_embed_names(species_names, pooling="sequence")
	seq_emb = seq_emb.to(sampler.device)
	cond["species_tok_emb_src"] = seq_emb
	cond["species_tok_emb_tgt"] = seq_emb

	# Match training: pass raw protein sequences; the model tokenizes internally
	cond["protein_seqs"] = protein_seqs

	# Move tensors to device
	device = sampler.device
	input_ids = input_ids.to(device)
	labels_base = labels_base.to(device)

	sampler.model.eval()
	outputs = sampler.model(codon_ids=input_ids, cond=cond, labels=labels_base, return_dict=True)
	logits = outputs["logits"] # [B, Lmax, V] aligned to per-sample capacity after prefix
	try:
	prefix_len = outputs.get("prefix_len", 0)
	if isinstance(prefix_len, torch.Tensor):
	prefix_len_dbg = int(prefix_len.max().item()) if prefix_len.numel() > 0 else 0
	else:
	prefix_len_dbg = int(prefix_len)
	logger.debug(f"Prefix length(max)={prefix_len_dbg}, input_len={input_ids.size(1)}")
	except Exception:
	pass

	# Align labels/masks to logits length and per-sample caps
	Bsz, Lmax, V = logits.size(0), logits.size(1), logits.size(2)
	labels_aligned = torch.full((Bsz, Lmax), -100, dtype=labels_base.dtype, device=logits.device)
	common_cols = min(labels_base.size(1), Lmax)
	if common_cols > 0:
	labels_aligned[:, :common_cols] = labels_base[:, :common_cols]
	per_cap = outputs.get("per_cap", None)
	if isinstance(per_cap, torch.Tensor) and per_cap.numel() == Bsz:
	ar = torch.arange(Lmax, device=logits.device).unsqueeze(0)
	cap_mask = ar < per_cap.to(device=logits.device).unsqueeze(1) # [B,Lmax]
	else:
	cap_mask = torch.ones_like(labels_aligned, dtype=torch.bool, device=logits.device)

	# Mask labels beyond per-cap to -100 so CE ignores them
	labels_masked = labels_aligned.clone().to(device=logits.device)
	labels_masked[~cap_mask] = -100

	# Cross-entropy per sample (include EOS target; ignore PAD)
	loss_flat = F.cross_entropy(
	logits.reshape(-1, V),
	labels_masked.reshape(-1),
	ignore_index=-100,
	reduction="none",
	).view(Bsz, Lmax)

	# Accuracy per sample
	preds = logits.argmax(dim=-1)
	num_special = int(getattr(tok, "num_special_tokens", 0) or 0)
	supervised = (labels_masked != -100) & cap_mask
	if num_special > 0:
	supervised = supervised & (labels_aligned >= num_special)
	correct = (preds == labels_aligned) & supervised

	per_sample_ce: List[float] = []
	per_sample_acc: List[float] = []
	per_sample_aa_acc: List[float] = []
	codon2aa = tok.codon2aa_char_map() if hasattr(tok, "codon2aa_char_map") else {}
	per_cap = outputs.get("per_cap", None)
	per_cap_int = None
	if isinstance(per_cap, torch.Tensor) and per_cap.numel() == Bsz:
	per_cap_int = torch.clamp(per_cap.to(dtype=torch.long, device=logits.device), min=0, max=Lmax)

	for i in range(B):
	# Average CE over valid positions
	valid = (labels_masked[i] != -100) & cap_mask[i]
	if num_special > 0:
	valid = valid & (labels_aligned[i] >= num_special)
	ce = (loss_flat[i][valid].mean().item() if valid.any() else 0.0)
	per_sample_ce.append(ce)

	# Codon-level accuracy over supervised positions
	denom = supervised[i].sum().item()
	acc = (correct[i].sum().item() / denom) if denom > 0 else 0.0
	# AA-level accuracy per sample (match trainer)
	aa_acc = 0.0
	if per_cap_int is not None and codon2aa and i < len(protein_seqs):
	cap = int(per_cap_int[i].item())
	if cap > 0:
	mask_row = supervised[i, :cap]
	if mask_row.any():
	preds_row = preds[i, :cap][mask_row]
	prot = protein_seqs[i]
	seq_len = min(len(prot), preds_row.size(0))
	if seq_len > 0:
	pred_aa = ''.join(codon2aa.get(int(t.item()), 'X') for t in preds_row[:seq_len])
	truth_aa = prot[:seq_len]
	aa_matches = sum(1 for j in range(seq_len) if pred_aa[j] == truth_aa[j])
	aa_acc = aa_matches / seq_len
	per_sample_aa_acc.append(aa_acc)

	return per_sample_ce, per_sample_aa_acc


	def _dna_to_codons(dna: str) -> List[str]:
	dna = dna.strip().upper()
	return [dna[i:i+3] for i in range(0, len(dna) - (len(dna) % 3), 3)]


	def _aa_from_dna_standard(dna: str, tok) -> str:
	dna = dna.strip().upper()
	gc = getattr(tok, "_genetic_code", {})
	aa = []
	for j in range(0, len(dna) - (len(dna) % 3), 3):
	aa.append(gc.get(dna[j:j+3], 'X'))
	return ''.join(aa)


	def _aa_agreement(dna: str, protein: str, tok) -> Tuple[float, int, int]:
	"""Return (match_ratio, compared_len, first_mismatch_idx or -1) under standard code."""
	dna = dna.strip().upper()
	protein = protein.strip().upper()
	L = min(len(dna) // 3, len(protein))
	if L <= 0:
	return 0.0, 0, -1
	aa_pred = _aa_from_dna_standard(dna[: 3 * L], tok)
	truth = protein[:L]
	mism_idx = -1
	matches = 0
	for i, (a, b) in enumerate(zip(aa_pred, truth)):
	if a == b:
	matches += 1
	elif mism_idx < 0:
	mism_idx = i
	return (matches / L), L, mism_idx


	@torch.no_grad()
	def eval_streaming_all(
	sampler: CodonSampler,
	species_store: SpeciesEmbeddingStore,
	data_path: str,
	batch_size: int,
	num_workers: int,
	max_records: int = 0,
	):
	"""Stream over all rows from CSV/Parquet inputs and compute dataset-level metrics.

	Mirrors trainer.evaluate() for parity.
	"""
	device = sampler.device
	tok = sampler.tokenizer
	pad_id = int(tok.pad_token_id)
	eos_id = int(tok.eos_token_id)
	num_special = int(tok.num_special_tokens)
	codon2aa = tok.codon2aa_char_map()

	# Build streaming dataset and loader
	from pathlib import Path as _Path
	import glob as _glob
	def _expand(pat: str) -> List[str]:
	P = _Path(pat)
	if P.is_dir():
	paths: List[str] = []
	paths.extend(sorted(str(x) for x in P.rglob("*.parquet")))
	paths.extend(sorted(str(x) for x in P.rglob("*.parq")))
	paths.extend(sorted(str(x) for x in P.rglob("*.csv")))
	paths.extend(sorted(str(x) for x in P.rglob("*.tsv")))
	paths.extend(sorted(str(x) for x in P.rglob("*.csv.gz")))
	paths.extend(sorted(str(x) for x in P.rglob("*.tsv.gz")))
	else:
	paths = sorted(_glob.glob(str(P)))
	# de-dup
	seen = set(); out = []
	for x in paths:
	if x not in seen:
	out.append(x); seen.add(x)
	return out

	paths = _expand(data_path)
	if not paths:
	raise FileNotFoundError(f"No input files matched: {data_path}")

	species_vocab_path = str((Path(species_store.embeddings_dir) / "species_vocab.json").resolve())
	ds = StreamSeqDataset(
	files=paths,
	tokenizer=tok,
	species_vocab_path=species_vocab_path,
	unknown_species_id=0,
	csv_chunksize=200_000,
	shuffle_buffer=0,
	shard_across_ranks=False,
	)
	_dl_kwargs = dict(
	batch_size=int(batch_size),
	shuffle=False,
	drop_last=False,
	num_workers=int(max(0, num_workers)),
	collate_fn=stage_collate_fn,
	pin_memory=True,
	persistent_workers=(int(num_workers) > 0),
	)
	if int(num_workers) > 0:
	_dl_kwargs["prefetch_factor"] = 4
	loader = DataLoader(ds, **_dl_kwargs)

	loss_sum = 0.0
	loss_tokens = 0
	codon_correct = 0
	codon_total = 0
	aa_correct = 0
	aa_total = 0

	seen = 0
	for batch in loader:
	if not batch:
	continue
	if int(max_records) > 0 and seen >= int(max_records):
	break
	codon_ids = batch["codon_ids"].to(device)
	input_ids = codon_ids[:, :-1]
	labels = codon_ids[:, :-1].clone()
	labels[labels == pad_id] = -100
	labels[labels == eos_id] = -100

	# Build cond using species_store and protein_seqs
	cond = {"control_mode": "fixed", "protein_seqs": batch.get("protein_seqs", [])}
	sids = batch.get("species_ids")
	if torch.is_tensor(sids):
	sids_list = sids.detach().cpu().tolist()
	else:
	sids_list = [int(x) for x in sids]
	res = species_store.batch_get(sids_list)
	if isinstance(res, tuple):
	sp_tok, _ = res
	cond["species_tok_emb_src"] = sp_tok.to(device)
	cond["species_tok_emb_tgt"] = sp_tok.to(device)
	else:
	cond["species_emb_src"] = res.to(device)
	cond["species_emb_tgt"] = res.to(device)

	out = sampler.model(codon_ids=input_ids, cond=cond, labels=labels, return_dict=True)
	loss = out.get("loss")
	per_cap = out.get("per_cap")
	logits = out.get("logits")

	tokens_in_batch = 0
	if per_cap is not None:
	tokens_in_batch = int(torch.clamp(per_cap.detach(), min=0).sum().item())
	loss_tokens += tokens_in_batch
	if loss is not None and tokens_in_batch > 0:
	loss_sum += float(loss.detach().item()) * tokens_in_batch

	if logits is None or logits.size(1) == 0 or per_cap is None:
	seen += input_ids.size(0)
	continue
	max_cap = logits.size(1)
	batch_size = logits.size(0)
	labels_aligned = torch.full((batch_size, max_cap), -100, dtype=labels.dtype, device=labels.device)
	common = min(labels.size(1), max_cap)
	if common > 0:
	labels_aligned[:, :common] = labels[:, :common]
	per_cap_int = torch.clamp(per_cap.to(dtype=torch.long), min=0, max=max_cap)
	for row in range(batch_size):
	cap = int(per_cap_int[row].item())
	if cap < max_cap:
	labels_aligned[row, cap:] = -100
	supervised = labels_aligned != -100
	if num_special > 0:
	supervised = supervised & (labels_aligned >= num_special)
	if not supervised.any():
	seen += batch_size
	continue
	preds = logits.argmax(dim=-1)
	codon_correct += int((preds[supervised] == labels_aligned[supervised]).sum().item())
	codon_total += int(supervised.sum().item())

	# protein list
	prot_list = cond.get("protein_seqs", [])
	for row in range(batch_size):
	cap = int(per_cap_int[row].item())
	if cap <= 0:
	continue
	mask_row = supervised[row, :cap]
	if not mask_row.any():
	continue
	preds_row = preds[row, :cap][mask_row]
	prot = prot_list[row] if (isinstance(prot_list, list) and row < len(prot_list)) else ""
	if not prot:
	continue
	seq_len = min(len(prot), preds_row.size(0))
	if seq_len <= 0:
	continue
	pred_aa = ''.join(codon2aa.get(int(t.item()), 'X') for t in preds_row[:seq_len])
	truth_aa = prot[:seq_len]
	aa_correct += sum(1 for i in range(seq_len) if pred_aa[i] == truth_aa[i])
	aa_total += seq_len
	seen += batch_size

	mean_ce = (loss_sum / loss_tokens) if loss_tokens > 0 else 0.0
	codon_acc = (float(codon_correct) / codon_total) if codon_total > 0 else 0.0
	aa_acc = (float(aa_correct) / aa_total) if aa_total > 0 else 0.0
	logger.info(
	f"Full-dataset summary → tokens={loss_tokens} CE={mean_ce:.4f} CODON-acc={codon_acc:.4f} AA-acc={aa_acc:.4f}"
	)
	return mean_ce, codon_acc, aa_acc


	@torch.no_grad()
	def sample_and_score_batched(
	sampler: CodonSampler,
	species_names: List[str],
	protein_seqs: List[str],
	target_dnas: List[str],
	temperature: float,
	top_k: int,
	top_p: float,
	control_mode: str,
	batch_size: int,
	enforce_translation: bool,
	no_truncation: bool = False,
	species_prefix_cap: int = 64,
	) -> Tuple[List[float], List[float]]:
	"""Free-run sampling in batches; returns per-sample (codon_acc, aa_acc)."""
	N = len(species_names)
	# Compute target lengths in codons (min of DNA and AA lengths)
	tgt_lengths = []
	tgt_codons_list = []
	for prot, dna in zip(protein_seqs, target_dnas):
	cods = _dna_to_codons(dna)
	L = min(len(cods), len(prot))
	if L <= 0:
	L = 1
	cods = ["ATG"] # harmless default
	tgt_lengths.append(L)
	tgt_codons_list.append(cods[:L])

	# Bucket indices by target length to maximize batching
	buckets: dict[int, List[int]] = {}
	for i, L in enumerate(tgt_lengths):
	buckets.setdefault(L, []).append(i)

	codon_accs = [0.0] * N
	aa_accs = [0.0] * N

	# Helper AA translation
	vocab = sampler.tokenizer._genetic_code
	def dna_to_aa(dna: str) -> str:
	dna = dna.strip().upper()
	aa = []
	for j in range(0, len(dna) - (len(dna) % 3), 3):
	aa.append(vocab.get(dna[j:j+3], 'X'))
	return ''.join(aa)

	for L, idxs in buckets.items():
	# Optionally tighten protein prefix so prefix+start+L ≤ capacity (species kept full unless capped)
	prev_sp = getattr(sampler.model, "max_species_prefix", 0)
	prev_pp = getattr(sampler.model, "max_protein_prefix", 0)
	if bool(no_truncation):
	try:
	capacity = int(getattr(sampler.model, "max_position_embeddings", 1024))
	# If requested, apply a species token cap; otherwise keep as-is
	store = getattr(sampler, "species_store", None)
	if store is not None and getattr(store, "is_legacy", False) and int(species_prefix_cap) > 0:
	setattr(sampler.model, "max_species_prefix", int(species_prefix_cap))
	# Build a representative cond for this bucket to measure exact prefix length
	batch_idx_probe = idxs[: min(len(idxs), max(1, min(batch_size, 8)))]
	sp_probe = [species_names[i] for i in batch_idx_probe]
	pr_probe = [protein_seqs[i] for i in batch_idx_probe]
	# Map species to ids via store vocab
	cond_probe = {"control_mode": "fixed", "protein_seqs": pr_probe}
	if store is not None:
	sid_list = [store.vocab.get(s, -1) for s in sp_probe]
	res = store.batch_get(sid_list)
	if isinstance(res, tuple):
	sp_tok, _ = res
	cond_probe["species_tok_emb_src"] = sp_tok.to(sampler.device)
	cond_probe["species_tok_emb_tgt"] = sp_tok.to(sampler.device)
	else:
	cond_probe["species_emb_src"] = res.to(sampler.device)
	cond_probe["species_emb_tgt"] = res.to(sampler.device)
	# Iteratively reduce protein prefix cap until remaining ≥ L
	for _ in range(3):
	out0 = sampler.model(
	codon_ids=torch.zeros(len(batch_idx_probe), 0, dtype=torch.long, device=sampler.device),
	cond=cond_probe,
	return_dict=True,
	use_cache=True,
	)
	pref = out0.get("prefix_len")
	if isinstance(pref, torch.Tensor) and pref.numel() > 0:
	pref_max = int(pref.max().item())
	else:
	pref_max = int(pref) if isinstance(pref, int) else 0
	remaining = capacity - (pref_max + 1)
	if remaining >= int(L):
	break
	need = int(L) - max(0, int(remaining))
	cur_pp = int(getattr(sampler.model, "max_protein_prefix", 0) or 0)
	new_pp = max(0, cur_pp - need) if cur_pp > 0 else max(0, pref_max - (capacity - 1 - int(L)))
	setattr(sampler.model, "max_protein_prefix", int(new_pp))
	except Exception:
	pass
	# Process in mini-batches
	for k in range(0, len(idxs), batch_size):
	batch_idx = idxs[k:k+batch_size]
	sp_b = [species_names[i] for i in batch_idx]
	pr_b = [protein_seqs[i] for i in batch_idx]
	# Sample in one call
	out = sampler.sample(
	num_sequences=len(batch_idx),
	sequence_length=L,
	species=sp_b,
	protein_sequences=pr_b,
	control_mode=control_mode,
	temperature=temperature,
	top_k=top_k,
	top_p=top_p,
	return_intermediate=False,
	progress_bar=False,
	enforce_translation=enforce_translation,
	)
	gen_list: List[str] = out["sequences"] # DNA strings
	# Score each
	for pos, idx in enumerate(batch_idx):
	tgt_codons = tgt_codons_list[idx]
	gen_codons = _dna_to_codons(gen_list[pos])[:L]
	matches = sum(1 for a,b in zip(gen_codons, tgt_codons) if a == b)
	codon_accs[idx] = (matches / L) if L > 0 else 0.0
	gen_aa = dna_to_aa(''.join(gen_codons))
	tgt_aa = protein_seqs[idx][:L]
	# Treat non-canonical AA in target as "match any"
	canonical = set("ACDEFGHIKLMNPQRSTVWY")
	aa_matches = sum(1 for a,b in zip(gen_aa, tgt_aa) if (b not in canonical) or (a == b))
	aa_accs[idx] = (aa_matches / L) if L > 0 else 0.0
	# Restore caps
	if bool(no_truncation):
	try:
	setattr(sampler.model, "max_species_prefix", prev_sp)
	setattr(sampler.model, "max_protein_prefix", prev_pp)
	except Exception:
	pass

	return codon_accs, aa_accs


	@torch.no_grad()
	def generate_and_score_batched(
	sampler: CodonSampler,
	species_names: List[str],
	protein_seqs: List[str],
	target_dnas: List[str],
	temperature: float,
	top_k: int,
	top_p: float,
	control_mode: str,
	batch_size: int,
	enforce_translation: bool,
	no_truncation: bool = False,
	species_prefix_cap: int = 64,
	) -> Tuple[List[str], List[float], List[float]]:
	"""Like sample_and_score_batched but also returns generated DNA sequences per sample."""
	N = len(species_names)
	tgt_lengths = []
	tgt_codons_list = []
	for prot, dna in zip(protein_seqs, target_dnas):
	cods = _dna_to_codons(dna)
	L = min(len(cods), len(prot))
	if L <= 0:
	L = 1
	cods = ["ATG"]
	tgt_lengths.append(L)
	tgt_codons_list.append(cods[:L])

	buckets: dict[int, List[int]] = {}
	for i, L in enumerate(tgt_lengths):
	buckets.setdefault(L, []).append(i)

	gen_all = [""] * N
	codon_accs = [0.0] * N
	aa_accs = [0.0] * N

	vocab = sampler.tokenizer._genetic_code
	def dna_to_aa(dna: str) -> str:
	dna = dna.strip().upper()
	aa = []
	for j in range(0, len(dna) - (len(dna) % 3), 3):
	aa.append(vocab.get(dna[j:j+3], 'X'))
	return ''.join(aa)

	for L, idxs in buckets.items():
	prev_sp = getattr(sampler.model, "max_species_prefix", 0)
	prev_pp = getattr(sampler.model, "max_protein_prefix", 0)
	if bool(no_truncation):
	try:
	capacity = int(getattr(sampler.model, "max_position_embeddings", 1024))
	store = getattr(sampler, "species_store", None)
	if store is not None and getattr(store, "is_legacy", False) and int(species_prefix_cap) > 0:
	setattr(sampler.model, "max_species_prefix", int(species_prefix_cap))
	batch_idx_probe = idxs[: min(len(idxs), max(1, min(batch_size, 8)))]
	sp_probe = [species_names[i] for i in batch_idx_probe]
	pr_probe = [protein_seqs[i] for i in batch_idx_probe]
	cond_probe = {"control_mode": "fixed", "protein_seqs": pr_probe}
	if store is not None:
	sid_list = [store.vocab.get(s, -1) for s in sp_probe]
	res = store.batch_get(sid_list)
	if isinstance(res, tuple):
	sp_tok, _ = res
	cond_probe["species_tok_emb_src"] = sp_tok.to(sampler.device)
	cond_probe["species_tok_emb_tgt"] = sp_tok.to(sampler.device)
	else:
	cond_probe["species_emb_src"] = res.to(sampler.device)
	cond_probe["species_emb_tgt"] = res.to(sampler.device)
	for _ in range(3):
	out0 = sampler.model(
	codon_ids=torch.zeros(len(batch_idx_probe), 0, dtype=torch.long, device=sampler.device),
	cond=cond_probe,
	return_dict=True,
	use_cache=True,
	)
	pref = out0.get("prefix_len")
	pref_max = int(pref.max().item()) if isinstance(pref, torch.Tensor) and pref.numel() > 0 else (int(pref) if isinstance(pref, int) else 0)
	remaining = capacity - (pref_max + 1)
	if remaining >= int(L):
	break
	need = int(L) - max(0, int(remaining))
	cur_pp = int(getattr(sampler.model, "max_protein_prefix", 0) or 0)
	new_pp = max(0, cur_pp - need) if cur_pp > 0 else max(0, pref_max - (capacity - 1 - int(L)))
	setattr(sampler.model, "max_protein_prefix", int(new_pp))
	except Exception:
	pass
	for k in range(0, len(idxs), batch_size):
	batch_idx = idxs[k:k+batch_size]
	sp_b = [species_names[i] for i in batch_idx]
	pr_b = [protein_seqs[i] for i in batch_idx]
	out = sampler.sample(
	num_sequences=len(batch_idx),
	sequence_length=L,
	species=sp_b,
	protein_sequences=pr_b,
	control_mode=control_mode,
	temperature=temperature,
	top_k=top_k,
	top_p=top_p,
	return_intermediate=False,
	progress_bar=False,
	enforce_translation=enforce_translation,
	)
	gen_list: List[str] = out["sequences"]
	for pos, idx in enumerate(batch_idx):
	gen_seq = gen_list[pos]
	gen_all[idx] = gen_seq
	tgt_codons = tgt_codons_list[idx]
	gen_codons = _dna_to_codons(gen_seq)[:L]
	matches = sum(1 for a,b in zip(gen_codons, tgt_codons) if a == b)
	codon_accs[idx] = (matches / L) if L > 0 else 0.0
	gen_aa = dna_to_aa(''.join(gen_codons))
	tgt_aa = protein_seqs[idx][:L]
	canonical = set("ACDEFGHIKLMNPQRSTVWY")
	aa_matches = sum(1 for a,b in zip(gen_aa, tgt_aa) if (b not in canonical) or (a == b))
	aa_accs[idx] = (aa_matches / L) if L > 0 else 0.0
	if bool(no_truncation):
	try:
	setattr(sampler.model, "max_species_prefix", prev_sp)
	setattr(sampler.model, "max_protein_prefix", prev_pp)
	except Exception:
	pass

	return gen_all, codon_accs, aa_accs


	def export_per_sequence_over_splits(
	sampler: CodonSampler,
	splits: List[str],
	splits_root: str,
	out_csv: str,
	batch_size: int,
	temperature: float,
	top_k: int,
	top_p: float,
	control_mode: str,
	enforce_translation: bool,
	progress: bool = False,
	max_rows_per_split: int = 0,
	no_truncation: bool = False,
	species_prefix_cap: int = 0,
	) -> None:
	"""Process ./data/val and ./data/test (or under splits_root) and write a per-sequence CSV."""
	try:
	import pyarrow.parquet as pq # type: ignore
	except Exception as e:
	raise ImportError("pyarrow is required for Parquet evaluation/export") from e

	from pathlib import Path as _P
	import os as _os
	total_written = 0
	# Pre-create CSV with header so users can tail it immediately
	header_cols = [
	"split",
	"organism",
	"protein_seq",
	"codon_seq",
	"predicted_seq",
	"codon_similarity",
	"amino_acid_recovery_rate",
	]
	_P(out_csv).parent.mkdir(parents=True, exist_ok=True)
	if not _P(out_csv).exists() or _os.path.getsize(out_csv) == 0:
	with open(out_csv, "w", newline="") as f:
	f.write(",".join(header_cols) + "\n")
	logging.info(f"Initialized CSV with header → {out_csv}")
	for split in splits:
	rows_remaining = int(max_rows_per_split) if int(max_rows_per_split) > 0 else None
	dir_path = Path(splits_root) / split
	files = sorted(str(p) for p in dir_path.glob("*.parquet"))
	if not files:
	logging.warning(f"No parquet files found in {dir_path}, skipping split {split}")
	continue
	logging.info(f"Processing split '{split}' with {len(files)} files ...")
	try:
	from tqdm import tqdm # type: ignore
	_wrap = (lambda it, kw: tqdm(it, kw)) if progress else (lambda it, **kw: it)
	except Exception:
	_wrap = (lambda it, **kw: it)
	stop_split = False
	for fp in _wrap(files, desc=f"{split} files", unit="file"):
	if rows_remaining is not None and rows_remaining <= 0:
	break
	pf = pq.ParquetFile(fp)
	nrg = int(pf.num_row_groups or 0)
	rgs = list(range(max(nrg, 1)))
	# Build a per-file rows progress bar (prefer total rows from metadata when available)
	rows_total = None
	try:
	if pf.metadata is not None:
	rows_total = 0
	for rg_idx in rgs:
	rg_md = pf.metadata.row_group(rg_idx)
	if rg_md is not None and rg_md.num_rows is not None:
	rows_total += int(rg_md.num_rows)
	except Exception:
	rows_total = None
	rows_pbar = None
	if progress:
	try:
	from tqdm import tqdm # type: ignore
	rows_pbar = tqdm(total=rows_total, desc=f"{split}:{Path(fp).name}", unit="rows", leave=False)
	except Exception:
	rows_pbar = None

	for rg in rgs:
	if rows_remaining is not None and rows_remaining <= 0:
	stop_split = True
	break
	table = pf.read_row_group(rg, columns=["Taxon", "protein_seq", "cds_DNA"])
	df = table.to_pandas()
	if df.empty:
	continue
	species = df["Taxon"].astype(str).tolist()
	proteins = df["protein_seq"].astype(str).str.upper().tolist()
	dnas = df["cds_DNA"].astype(str).str.upper().tolist()

	# Generate predictions and metrics in streaming mini-batches to keep
	# memory stable and update progress frequently
	N = len(species)
	for off in range(0, N, batch_size):
	if rows_remaining is not None and rows_remaining <= 0:
	stop_split = True
	break
	sp_b = species[off: off + batch_size]
	pr_b = proteins[off: off + batch_size]
	dn_b = dnas[off: off + batch_size]
	gen_list, codon_accs, aa_accs = generate_and_score_batched(
	sampler,
	sp_b,
	pr_b,
	dn_b,
	temperature=temperature,
	top_k=top_k,
	top_p=top_p,
	control_mode=control_mode,
	batch_size=batch_size,
	enforce_translation=enforce_translation,
	no_truncation=bool(no_truncation),
	species_prefix_cap=int(species_prefix_cap),
	)
	rows_batch: List[dict] = []
	for sp, pr, dn, gen, cacc, aacc in zip(sp_b, pr_b, dn_b, gen_list, codon_accs, aa_accs):
	L = min(len(pr), len(dn) // 3)
	tgt_dna = dn[: 3 * L]
	rows_batch.append({
	"split": split,
	"organism": sp,
	"protein_seq": pr,
	"codon_seq": tgt_dna,
	"predicted_seq": gen,
	"codon_similarity": float(cacc),
	"amino_acid_recovery_rate": float(aacc),
	})
	if rows_batch:
	if rows_remaining is not None and len(rows_batch) > rows_remaining:
	rows_batch = rows_batch[: rows_remaining]
	out_exists = _P(out_csv).exists() and _os.path.getsize(out_csv) > 0
	df_out = pd.DataFrame(rows_batch)
	_P(out_csv).parent.mkdir(parents=True, exist_ok=True)
	df_out.to_csv(out_csv, mode='a', header=not out_exists, index=False)
	total_written += len(rows_batch)
	if rows_remaining is not None:
	rows_remaining -= len(rows_batch)
	if rows_pbar is not None:
	try:
	rows_pbar.update(len(rows_batch))
	except Exception:
	pass
	if rows_remaining is not None and rows_remaining <= 0:
	stop_split = True
	break
	if rows_pbar is not None:
	try:
	rows_pbar.close()
	except Exception:
	pass
	if stop_split:
	break
	logging.info(f"Per-sequence export complete → {out_csv} (rows={total_written})")


	def main():
	args = parse_args()
	random.seed(args.seed)
	torch.manual_seed(args.seed)

	model_dir = Path(args.model_path)
	pooling = _preferred_pooling(model_dir)
	logger.info(f"Preferred species_pooling from checkpoint: {pooling}")

	# Set up species store (recommended for parity)
	species_store = None
	if args.embeddings_dir:
	emb_dir = Path(args.embeddings_dir)
	detected = _detect_pooling_from_embeddings_dir(emb_dir)
	if detected is not None and detected != pooling:
	logger.info(f"Overriding pooling from checkpoint ({pooling}) → embeddings_dir format ({detected})")
	pooling = detected
	species_store = SpeciesEmbeddingStore(args.embeddings_dir, pooling=pooling)
	logger.info(f"Loaded species store with {len(species_store.vocab)} species (pooling={pooling})")

	# Load sampler/model (uses same construction as sampling)
	sampler = CodonSampler(
	model_path=args.model_path,
	device=("cuda" if args.device == "cuda" and torch.cuda.is_available() else "cpu"),
	species_store=species_store,
	)

	# Load input data and sample rows
	if bool(args.export_per_sequence):
	export_per_sequence_over_splits(
	sampler,
	splits=list(args.export_splits),
	splits_root=str(args.splits_root),
	out_csv=str(args.out_csv),
	batch_size=int(args.batch_size),
	temperature=float(args.temperature),
	top_k=int(args.top_k),
	top_p=float(args.top_p),
	control_mode=str(args.control_mode),
	enforce_translation=bool(args.enforce_translation),
	progress=bool(args.progress),
	max_rows_per_split=int(args.max_rows_per_split),
	no_truncation=bool(args.no_truncation),
	species_prefix_cap=int(args.species_prefix_cap),
	)
	return

	data_path = args.data_path or args.csv_path
	if data_path is None:
	raise SystemExit("Please provide --data_path (CSV or Parquet glob/dir). --csv_path remains as a deprecated alias.")

	# Expand paths to decide CSV vs Parquet
	paths = _expand_paths(data_path)
	if not paths:
	raise FileNotFoundError(f"No input files matched: {data_path}")

	if all(_is_parquet_path(p) for p in paths):
	logger.info(f"Reading up to {args.num_samples} samples from {len(paths)} parquet files ...")
	df_s = _load_random_samples_from_parquet(paths, int(args.num_samples), int(args.seed))
	else:
	# Fallback to CSV/TSV single file behavior (back-compat). If multiple files match, use the first.
	csv_file = None
	for pth in paths:
	if pth.lower().endswith((".csv", ".tsv", ".csv.gz", ".tsv.gz")):
	csv_file = pth
	break
	if csv_file is None:
	raise ValueError(f"Unsupported input for --data_path: {paths[0]}")
	logger.info(f"Reading CSV file: {csv_file}")
	df = pd.read_csv(csv_file)
	required = {"Taxon", "protein_seq", "cds_DNA"}
	if not required.issubset(set(df.columns)):
	missing = required - set(df.columns)
	raise ValueError(f"CSV missing required columns: {sorted(missing)}")
	if args.num_samples > len(df):
	logger.warning(f"num_samples {args.num_samples} > CSV rows {len(df)}; reducing")
	args.num_samples = len(df)
	# Random sample without replacement
	indices = random.sample(range(len(df)), args.num_samples)
	df_s = df.iloc[indices].reset_index(drop=True)

	if len(df_s) == 0:
	raise ValueError("No samples loaded from the provided data_path")

	logger.info(f"Loaded {len(df_s)} samples for evaluation")

	species = df_s["Taxon"].astype(str).tolist()
	proteins = df_s["protein_seq"].astype(str).str.upper().tolist()
	dnas = df_s["cds_DNA"].astype(str).str.upper().tolist()

	if not args.free_run:
	if bool(args.eval_all):
	if not args.embeddings_dir:
	raise SystemExit("--eval_all requires --embeddings_dir for species vocab/embeddings")
	# Stream the entire dataset and compute dataset-level metrics (training-parity)
	eval_streaming_all(
	sampler,
	species_store if species_store is not None else SpeciesEmbeddingStore(args.embeddings_dir, pooling=pooling),
	data_path,
	batch_size=int(args.batch_size),
	num_workers=int(args.workers),
	max_records=int(args.max_records),
	)
	return
	# Optional: print per-sample CDS→AA agreement (standard code)
	if bool(args.debug_aa_check):
	for idx, (sp, pr, dn) in enumerate(zip(species, proteins, dnas), start=1):
	ratio, Lcmp, first_bad = _aa_agreement(dn, pr, sampler.tokenizer)
	flag = "OK" if ratio == 1.0 and Lcmp > 0 else ("EMPTY" if Lcmp == 0 else "MISMATCH")
	extra = f" first_mismatch={first_bad}" if first_bad >= 0 else ""
	logger.info(f"AA-CHECK Sample {idx:02d}: {flag} match={ratio:.3f} len={Lcmp}{extra} Taxon={sp}")
	# (No dataset-level filtering to keep evaluation simple.)
	# Teacher-forced evaluation (random subset)
	per_ce_all: List[float] = []
	per_aa_acc_all: List[float] = []
	per_codon_acc_all: List[float] = []
	bs = max(1, int(args.batch_size))
	for i in range(0, len(species), bs):
	sp_b = species[i:i+bs]
	pr_b = proteins[i:i+bs]
	dn_b = dnas[i:i+bs]
	ce, aa_acc = eval_batch(sampler, species_store, sp_b, pr_b, dn_b)
	# Also compute per-sample codon-acc using the same batch forward for consistency
	# Re-run lightweight preds for codon-acc is unnecessary because eval_batch already
	# computed supervised mask and preds internally; instead, recompute quickly here
	# by calling eval_batch and deriving codon-acc inside it. For simplicity and clarity
	# we re-derive codon-acc below using the same masking rules.
	per_ce_all.extend(ce)
	per_aa_acc_all.extend(aa_acc)

	# Derive codon-acc for this batch
	# Prepare a mirrored forward to access logits and masks (small overhead acceptable)
	tok = sampler.tokenizer
	pad_id = tok.pad_token_id
	eos_id = tok.eos_token_id
	codon_ids_local = []
	for dna, prot in zip(dn_b, pr_b):
	C_dna = len(dna) // 3
	C_prot = len(prot)
	C = max(min(C_dna, C_prot), 1)
	dna_trim = dna[: 3 * C]
	ids = tok.encode_codon_seq(dna_trim, validate=False)
	ids.append(eos_id)
	codon_ids_local.append(ids)
	B_b = len(codon_ids_local)
	T_b = max(len(x) for x in codon_ids_local)
	codons_b = torch.full((B_b, T_b), pad_id, dtype=torch.long)
	mask_b = torch.zeros((B_b, T_b), dtype=torch.bool)
	for j, ids in enumerate(codon_ids_local):
	Lb = len(ids)
	codons_b[j, :Lb] = torch.tensor(ids, dtype=torch.long)
	mask_b[j, :Lb] = True
	input_ids_b = codons_b[:, :-1].to(sampler.device)
	labels_b = codons_b[:, :-1].clone()
	labels_b[labels_b == pad_id] = -100
	labels_b[labels_b == eos_id] = -100
	cond_b = {"control_mode": "fixed"}
	if species_store is not None and sp_b:
	sids_b = [species_store.vocab.get(s, -1) for s in sp_b]
	res_b = species_store.batch_get(sids_b)
	if isinstance(res_b, tuple):
	sp_tok_b, _ = res_b
	cond_b["species_tok_emb_src"] = sp_tok_b.to(sampler.device)
	cond_b["species_tok_emb_tgt"] = sp_tok_b.to(sampler.device)
	else:
	sp_fix_b = res_b
	cond_b["species_emb_src"] = sp_fix_b.to(sampler.device)
	cond_b["species_emb_tgt"] = sp_fix_b.to(sampler.device)
	cond_b["protein_seqs"] = pr_b
	out_b = sampler.model(codon_ids=input_ids_b, cond=cond_b, labels=labels_b.to(sampler.device), return_dict=True)
	logits_b = out_b["logits"]
	per_cap_b = out_b.get("per_cap")
	if logits_b is not None and per_cap_b is not None:
	Bsz, Lmax, V = logits_b.size(0), logits_b.size(1), logits_b.size(2)
	labels_aligned_b = torch.full((Bsz, Lmax), -100, dtype=labels_b.dtype, device=logits_b.device)
	common_cols_b = min(labels_b.size(1), Lmax)
	if common_cols_b > 0:
	labels_aligned_b[:, :common_cols_b] = labels_b.to(logits_b.device)[:, :common_cols_b]
	ar = torch.arange(Lmax, device=logits_b.device).unsqueeze(0)
	cap_mask_b = ar < per_cap_b.to(device=logits_b.device).unsqueeze(1)
	labels_masked_b = labels_aligned_b.clone()
	labels_masked_b[~cap_mask_b] = -100
	preds_b = logits_b.argmax(dim=-1)
	num_special = int(getattr(tok, "num_special_tokens", 0) or 0)
	supervised_b = (labels_masked_b != -100) & cap_mask_b
	if num_special > 0:
	supervised_b = supervised_b & (labels_aligned_b >= num_special)
	for r in range(Bsz):
	denom = int(supervised_b[r].sum().item())
	cod_acc = (float((preds_b[r][supervised_b[r]] == labels_aligned_b[r][supervised_b[r]]).sum().item()) / denom) if denom > 0 else 0.0
	per_codon_acc_all.append(cod_acc)

	for idx, (ce, aa, ca) in enumerate(zip(per_ce_all, per_aa_acc_all, per_codon_acc_all), start=1):
	logger.info(f"Sample {idx:02d}: CE={ce:.4f} CODON-acc={ca:.4f} AA-acc={aa:.4f}")
	if per_ce_all:
	mean_ce = sum(per_ce_all) / len(per_ce_all)
	mean_aa = sum(per_aa_acc_all) / len(per_aa_acc_all) if per_aa_acc_all else 0.0
	mean_codon = sum(per_codon_acc_all) / len(per_codon_acc_all) if per_codon_acc_all else 0.0
	logger.info(f"Summary over {len(per_ce_all)} samples → mean CE={mean_ce:.4f}, mean CODON-acc={mean_codon:.4f}, mean AA-acc={mean_aa:.4f}")
	else:
	# Free-run sampling evaluation vs ground-truth DNA (codon-level), batched
	codon_accs, aa_accs = sample_and_score_batched(
	sampler,
	species,
	proteins,
	dnas,
	temperature=args.temperature,
	top_k=args.top_k,
	top_p=args.top_p,
	control_mode=args.control_mode,
	batch_size=int(args.batch_size),
	enforce_translation=bool(args.enforce_translation),
	no_truncation=bool(args.no_truncation),
	species_prefix_cap=int(args.species_prefix_cap),
	)
	for idx, (cacc, aacc) in enumerate(zip(codon_accs, aa_accs), start=1):
	logger.info(f"Sample {idx:02d}: CODON-acc={cacc:.4f} AA-acc={aacc:.4f}")
	if codon_accs:
	mean_c = sum(codon_accs) / len(codon_accs)
	mean_a = sum(aa_accs) / len(aa_accs)
	logger.info(f"Summary over {len(codon_accs)} samples → mean CODON-acc={mean_c:.4f}, mean AA-acc={mean_a:.4f}")


	if __name__ == "__main__":
	main()