Atom2.7m / lm_eval_fusion.py

Upload Atom2.7m model

271e253 verified 1 day ago

12.3 kB

	"""lm-eval wrapper for Atom2.7m checkpoints.

	The standard ``hf`` lm-eval model does not use the fusion tokenizer wrapper and
	does not pass arithmetic feature streams. This model keeps lm-eval's
	log-likelihood interface while encoding with ``tokenizer_utils.load_tokenizer``
	and forwarding ``place_ids`` and ``role_ids``.
	"""

	from __future__ import annotations

	from contextlib import nullcontext
	from pathlib import Path
	from typing import Any

	import torch
	import torch.nn.functional as F
	from lm_eval.api.model import LM
	from lm_eval.api.registry import register_model
	from tqdm import tqdm
	from transformers import AutoModelForCausalLM

	from tokenizer_utils import EOT_ID, FusionTokenizer, load_tokenizer


	def _parse_bool(value: Any, default: bool = False) -> bool:
	    """Interpret a loosely typed flag as a boolean.

	    Args:
	        value: ``None``, a ``bool``, or any object whose ``str()`` form
	            spells a flag (for example a string taken from model args).
	        default: Result returned when ``value`` is ``None``.

	    Returns:
	        ``default`` for ``None``; ``value`` itself for a ``bool``; otherwise
	        ``True`` only when the stripped, lower-cased text is one of
	        ``"1"``, ``"true"``, ``"yes"`` or ``"on"``.
	    """
	    if value is None:
	        return default
	    if isinstance(value, bool):
	        return value
	    # Any text outside the accepted spellings maps to False; nothing raises.
	    return str(value).strip().lower() in {"1", "true", "yes", "on"}


	def _parse_batch_size(value: int | str | None, max_batch_size: int | None) -> int:
	    """Resolve a batch-size setting to a concrete positive integer.

	    Args:
	        value: ``None`` (meaning 1), an integer, a numeric string, or
	            ``"auto"`` / ``"auto:<anything>"``.
	        max_batch_size: Size used for the ``"auto"`` forms; 64 is used when
	            this is ``None`` or 0. The suffix after ``"auto:"`` is ignored.

	    Returns:
	        The batch size as an ``int``.

	    Raises:
	        ValueError: If the text is not an integer, or the resolved size is
	            smaller than 1.
	    """
	    if value is None:
	        return 1
	    if isinstance(value, int):
	        size = int(value)
	    else:
	        text = str(value).strip().lower()
	        if text == "auto" or text.startswith("auto:"):
	            size = int(max_batch_size or 64)
	        else:
	            size = int(text)
	    # A zero or negative size would later be used as a range() step.
	    if size < 1:
	        raise ValueError(f"batch_size must be a positive integer, got {value!r}")
	    return size


	def _dtype_from_name(value: str | torch.dtype | None) -> torch.dtype | None:
	    """Translate a dtype spelling into a ``torch.dtype``.

	    Args:
	        value: ``None``, ``"auto"``, a ``torch.dtype``, or a name such as
	            ``"bf16"``, ``"torch.float16"`` or ``"float"``. Names are matched
	            case-insensitively, with surrounding whitespace and a
	            ``torch.`` prefix removed.

	    Returns:
	        ``None`` for ``None`` / ``"auto"``, the dtype unchanged when one is
	        passed in, otherwise the dtype the name stands for.

	    Raises:
	        ValueError: If the name is not one of the supported spellings.
	    """
	    if value is None:
	        return None
	    if isinstance(value, torch.dtype):
	        return value
	    normalized = str(value).strip().replace("torch.", "").lower()
	    if normalized == "auto":
	        return None
	    aliases = {
	        "bf16": torch.bfloat16,
	        "bfloat16": torch.bfloat16,
	        "fp16": torch.float16,
	        "float16": torch.float16,
	        "half": torch.float16,
	        "fp32": torch.float32,
	        "float32": torch.float32,
	        "float": torch.float32,
	    }
	    try:
	        return aliases[normalized]
	    except KeyError:
	        raise ValueError(f"Unsupported dtype: {value!r}") from None


	@register_model("atom2.7m")
	class FusionGPTLM(LM):
	    """Fusion-tokenizer GPT adapter for lm-eval log-likelihood tasks.

	    Text is encoded with the fusion tokenizer, and the ``place_ids`` and
	    ``role_ids`` streams it produces are passed to the model alongside
	    ``input_ids``. ``loglikelihood`` and ``loglikelihood_rolling`` are
	    implemented; ``generate_until`` raises ``NotImplementedError``.
	    """

	    def __init__(
	        self,
	        pretrained: str = "outputs/fusion_run/final_model",
	        tokenizer_dir: str = "tokenizer_4k",
	        batch_size: int | str | None = 1,
	        max_batch_size: int | None = 64,
	        max_length: int | None = None,
	        device: str | None = "cuda",
	        dtype: str | torch.dtype | None = "auto",
	        mixed_precision_dtype: str | torch.dtype | None = "auto",
	        trust_remote_code: bool | str | None = None,
	        **_: Any,
	    ) -> None:
	        """Load tokenizer and checkpoint and resolve the runtime settings.

	        Args:
	            pretrained: Path given to ``AutoModelForCausalLM.from_pretrained``.
	            tokenizer_dir: Directory given to ``load_tokenizer``.
	            batch_size: Requests per forward pass (int, numeric string or
	                ``"auto"``).
	            max_batch_size: Batch size used when ``batch_size`` is ``"auto"``.
	            max_length: Maximum number of input positions per forward pass.
	                When falsy, ``config.block_size``, then
	                ``config.max_position_embeddings``, then 512 is used.
	            device: Torch device string; ``None`` or ``"auto"`` picks CUDA
	                when available and CPU otherwise.
	            dtype: Optional dtype the model is cast to after loading.
	            mixed_precision_dtype: Autocast dtype; ``"auto"`` means bfloat16
	                on CUDA and no autocast on other devices.
	            trust_remote_code: Accepted and ignored; the checkpoint is
	                always loaded with ``trust_remote_code=True``.
	            **_: Any further keyword arguments are accepted and ignored.

	        Raises:
	            ValueError: If the resolved ``max_length`` is smaller than 1.
	        """
	        super().__init__()
	        # NOTE(review): the caller's value is discarded and remote code is
	        # always trusted below. Only point ``pretrained`` at checkpoints
	        # whose bundled code you are willing to execute.
	        del trust_remote_code
	        if device is None or device == "auto":
	            device = "cuda" if torch.cuda.is_available() else "cpu"
	        self._device = torch.device(device)
	        self.batch_size = _parse_batch_size(batch_size, max_batch_size)
	        self.tokenizer: FusionTokenizer = load_tokenizer(Path(tokenizer_dir))
	        self.model = AutoModelForCausalLM.from_pretrained(
	            Path(pretrained),
	            trust_remote_code=True,
	        ).to(self._device)
	        model_dtype = _dtype_from_name(dtype)
	        if model_dtype is not None:
	            self.model = self.model.to(dtype=model_dtype)
	        if mixed_precision_dtype == "auto":
	            self.mixed_precision_dtype = (
	                torch.bfloat16 if self._device.type == "cuda" else None
	            )
	        else:
	            self.mixed_precision_dtype = _dtype_from_name(mixed_precision_dtype)
	        self.model.eval()
	        config = self.model.config
	        self.max_length = int(
	            max_length
	            or getattr(config, "block_size", None)
	            or getattr(config, "max_position_embeddings", 512)
	        )
	        # A non-positive window would stop the rolling loop from advancing.
	        if self.max_length < 1:
	            raise ValueError(f"max_length must be positive, got {self.max_length}")

	    @property
	    def device(self) -> torch.device:
	        """Torch device that the model and every input tensor are placed on.

	        Defined here because the scoring methods read ``self.device`` while
	        ``__init__`` only assigns ``self._device``.
	        NOTE(review): confirm whether the ``LM`` base class also provides it.
	        """
	        return self._device

	    @property
	    def eot_token_id(self) -> int:
	        """Token id used as the prefix for empty contexts and as padding."""
	        return EOT_ID

	    def tok_encode(
	        self,
	        string: str,
	        add_special_tokens: bool | None = None,
	        left_truncate_len: int | None = None,
	        **_: Any,
	    ) -> list[int]:
	        """Encode ``string`` with the fusion tokenizer and return its ids.

	        Args:
	            string: Text to encode.
	            add_special_tokens: Accepted and ignored.
	            left_truncate_len: When given, only the last this-many ids are
	                kept; 0 yields an empty list.
	            **_: Any further keyword arguments are accepted and ignored.
	        """
	        del add_special_tokens
	        ids = self.tokenizer.encode(string).input_ids
	        if left_truncate_len is not None:
	            # ``ids[-0:]`` is the whole list, so zero needs its own branch.
	            ids = ids[-left_truncate_len:] if left_truncate_len else []
	        return ids

	    def tok_decode(self, tokens, skip_special_tokens: bool = True) -> str:
	        """Decode a single token id or a sequence of ids back to text."""
	        if isinstance(tokens, int):
	            tokens = [tokens]
	        return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)

	    def _autocast(self):
	        """Return the autocast context shared by both scoring methods.

	        Autocast is active only on CUDA and only when a mixed-precision
	        dtype is configured; otherwise a no-op context is returned.
	        """
	        if self.device.type != "cuda" or self.mixed_precision_dtype is None:
	            return nullcontext()
	        return torch.autocast(
	            device_type=self.device.type,
	            dtype=self.mixed_precision_dtype,
	        )

	    def _padded_tensor(self, rows, fill, width: int, dtype=torch.long) -> torch.Tensor:
	        """Right-pad each row to ``width`` with ``fill`` and build one tensor."""
	        padded = [list(row) + [fill] * (width - len(row)) for row in rows]
	        return torch.tensor(padded, dtype=dtype, device=self.device)

	    def _encode_request(
	        self,
	        context: str,
	        continuation: str,
	    ) -> tuple[list[int], list[int], list[int], list[int], int]:
	        """Encode one (context, continuation) pair.

	        Trailing whitespace of the context is moved to the front of the
	        continuation before encoding. When no context tokens remain (empty
	        or whitespace-only context), a single EOT token with place/role id 0
	        is used as the context, so that the first continuation token still
	        has a position predicting it.

	        Returns:
	            ``(ids, place_ids, role_ids, continuation_ids, context_len)``
	            where ``continuation_ids == ids[context_len:]``.

	        Raises:
	            ValueError: If no continuation token lies beyond the context.
	        """
	        trailing = len(context) - len(context.rstrip())
	        if trailing > 0:
	            continuation = context[-trailing:] + continuation
	            context = context[:-trailing]

	        context_len = len(self.tokenizer.encode(context).input_ids) if context else 0
	        encoding = self.tokenizer.encode(context + continuation)
	        if context_len == 0:
	            # Without a context token nothing would predict the first
	            # continuation token, so an EOT prefix stands in for it.
	            ids = [self.eot_token_id] + encoding.input_ids
	            place_ids = [0] + encoding.place_ids
	            role_ids = [0] + encoding.role_ids
	            context_len = 1
	        else:
	            ids = encoding.input_ids
	            place_ids = encoding.place_ids
	            role_ids = encoding.role_ids
	        continuation_ids = ids[context_len:]

	        if not continuation_ids:
	            raise ValueError("Continuation encoded to zero tokens")
	        return ids, place_ids, role_ids, continuation_ids, context_len

	    def _window_request(self, ids, place_ids, role_ids, continuation_ids, context_len):
	        """Left-truncate one encoded request to the model window.

	        Returns:
	            ``(inputs, places, roles, score_start, score_end, scored_ids)``.
	            ``inputs[i]`` predicts the token following it, positions
	            ``score_start:score_end`` are the ones predicting continuation
	            tokens, and ``scored_ids`` are exactly those target tokens.

	        Raises:
	            ValueError: If truncation leaves no continuation token to score.
	        """
	        # Keep max_length inputs plus the one extra final target token.
	        window_start = max(0, len(ids) - (self.max_length + 1))
	        score_start = max(context_len - 1, window_start) - window_start
	        score_end = len(ids) - 1 - window_start
	        if score_end <= score_start:
	            raise ValueError("No continuation tokens remain after truncation")
	        scored_ids = continuation_ids[-(score_end - score_start):]
	        return (
	            ids[window_start:-1],
	            place_ids[window_start:-1],
	            role_ids[window_start:-1],
	            score_start,
	            score_end,
	            scored_ids,
	        )

	    def loglikelihood(
	        self,
	        requests: list["Instance"],
	        disable_tqdm: bool = False,
	    ) -> list[tuple[float, bool]]:
	        """Score each request's continuation given its context.

	        Args:
	            requests: Objects whose ``args`` is ``(context, continuation)``.
	            disable_tqdm: Suppress the progress bars.

	        Returns:
	            One ``(log_likelihood, is_greedy)`` pair per request, in order.
	            ``log_likelihood`` is the summed log-probability of the scored
	            continuation tokens; ``is_greedy`` is true when the argmax
	            prediction matches every one of them.

	        Raises:
	            ValueError: If a continuation has no tokens to score.
	        """
	        quiet = disable_tqdm or self.rank != 0
	        encoded = [
	            self._encode_request(context, continuation)
	            for context, continuation in tqdm(
	                [req.args for req in requests],
	                desc="Fusion tokenizing inputs",
	                disable=quiet,
	            )
	        ]
	        results: list[tuple[float, bool]] = []
	        for start in tqdm(
	            range(0, len(encoded), self.batch_size),
	            desc="Running fusion loglikelihood requests",
	            disable=quiet,
	        ):
	            windows = [
	                self._window_request(*request)
	                for request in encoded[start : start + self.batch_size]
	            ]
	            width = max(len(window[0]) for window in windows)
	            # Padding uses EOT for tokens and 0 for the feature streams;
	            # the attention mask marks which positions are real.
	            input_tensor = self._padded_tensor(
	                [w[0] for w in windows], self.eot_token_id, width
	            )
	            place_tensor = self._padded_tensor([w[1] for w in windows], 0, width)
	            role_tensor = self._padded_tensor([w[2] for w in windows], 0, width)
	            attention_mask = self._padded_tensor(
	                [[True] * len(w[0]) for w in windows], False, width, dtype=torch.bool
	            )

	            with torch.inference_mode(), self._autocast():
	                logits = self.model(
	                    input_ids=input_tensor,
	                    place_ids=place_tensor,
	                    role_ids=role_tensor,
	                    attention_mask=attention_mask,
	                ).logits
	            log_probs = F.log_softmax(logits.float(), dim=-1)

	            for row, (_, _, _, score_start, score_end, scored_ids) in enumerate(windows):
	                row_log_probs = log_probs[row, score_start:score_end]
	                # The targets at the scored positions are exactly the
	                # surviving continuation tokens.
	                targets = torch.tensor(scored_ids, dtype=torch.long, device=self.device)
	                token_log_probs = torch.gather(
	                    row_log_probs, 1, targets.unsqueeze(-1)
	                ).squeeze(-1)
	                greedy = torch.equal(row_log_probs.argmax(dim=-1), targets)
	                results.append((float(token_log_probs.sum().item()), bool(greedy)))

	        return results

	    def loglikelihood_rolling(
	        self,
	        requests: list["Instance"],
	        disable_tqdm: bool = False,
	    ) -> list[float]:
	        """Return the total log-likelihood of each request's full text.

	        The text is scored in consecutive, non-overlapping windows of at
	        most ``max_length`` tokens. The first window is conditioned on an
	        EOT token, and every later window on the single token before it.
	        The forward pass uses the same autocast setting as ``loglikelihood``.

	        Args:
	            requests: Objects whose ``args`` is a one-element tuple ``(text,)``.
	            disable_tqdm: Suppress the progress bar.

	        Returns:
	            One summed log-probability per request; 0.0 for text that
	            encodes to no tokens.
	        """
	        results = []
	        for (text,) in tqdm(
	            [req.args for req in requests],
	            desc="Running fusion rolling loglikelihood",
	            disable=disable_tqdm or self.rank != 0,
	        ):
	            encoding = self.tokenizer.encode(text)
	            n_tokens = len(encoding.input_ids)
	            # Shifting every stream by one EOT/0 entry lets each window be
	            # a plain slice: element ``start`` is the conditioning token.
	            ids = [self.eot_token_id] + list(encoding.input_ids)
	            places = [0] + list(encoding.place_ids)
	            roles = [0] + list(encoding.role_ids)
	            total = 0.0
	            for start in range(0, n_tokens, self.max_length):
	                end = min(n_tokens, start + self.max_length)
	                input_ids = torch.tensor(
	                    [ids[start:end]], dtype=torch.long, device=self.device
	                )
	                place_ids = torch.tensor(
	                    [places[start:end]], dtype=torch.long, device=self.device
	                )
	                role_ids = torch.tensor(
	                    [roles[start:end]], dtype=torch.long, device=self.device
	                )
	                targets = torch.tensor(
	                    ids[start + 1 : end + 1], dtype=torch.long, device=self.device
	                )
	                with torch.inference_mode(), self._autocast():
	                    logits = self.model(
	                        input_ids=input_ids,
	                        place_ids=place_ids,
	                        role_ids=role_ids,
	                    ).logits[0]
	                log_probs = F.log_softmax(logits.float(), dim=-1)
	                total += float(
	                    torch.gather(log_probs, 1, targets.unsqueeze(-1)).sum().item()
	                )
	            results.append(total)
	        return results

	    def generate_until(self, requests, disable_tqdm: bool = False) -> list[str]:
	        """Generation is not supported; always raises ``NotImplementedError``."""
	        raise NotImplementedError(
	            "FusionGPTLM currently supports loglikelihood tasks. "
	            "Use tasks with multiple-choice/loglikelihood output."
	        )