from math import sqrt
from functools import partial
from typing import Self
from collections.abc import Generator
from collections import deque
import torch
from torch import Tensor
from torch.nn import (
Module,
ModuleList,
Sequential,
Embedding,
Linear,
SiLU,
RMSNorm,
Dropout1d,
CrossEntropyLoss,
Parameter,
)
from torch.nn.functional import softmax, scaled_dot_product_attention
from torch.nn.utils.parametrize import register_parametrization, remove_parametrizations
from torch.utils.checkpoint import checkpoint as torch_checkpoint
from transformers import PretrainedConfig, PreTrainedModel
from caching import KVCache, DynamicKVBlock
from data import IGNORE_INDEX
class NoPEGPT(Module):
"""A generative pretrained transformer with no positional embeddings."""
def __init__(
self,
vocabulary_size: int,
embedding_dimensions: int,
num_heads: int,
num_layers: int,
feed_forward_ratio: int,
dropout: float,
):
super().__init__()
if vocabulary_size <= 0:
raise ValueError(
f"Vocabulary size must be greater than 0, {vocabulary_size} given."
)
if num_layers <= 0:
raise ValueError(f"Num layers must be greater than 0, {num_layers} given.")
token_embeddings = Embedding(vocabulary_size, embedding_dimensions)
output_layer = Linear(embedding_dimensions, vocabulary_size, bias=False)
output_layer.weight = token_embeddings.weight # Tie weights
self.token_embeddings = token_embeddings
self.body = ModuleList(
[
DecoderBlock(
embedding_dimensions,
num_heads,
feed_forward_ratio,
dropout,
)
for _ in range(num_layers)
]
)
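        # By default run each decoder block directly; enable_activation_checkpointing()
        # swaps this for torch.utils.checkpoint to trade compute for memory.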
self.checkpoint = lambda layer, x: layer(x)
self.output_norm = RMSNorm(embedding_dimensions)
self.output_layer = output_layer
self.loss_function = CrossEntropyLoss(ignore_index=IGNORE_INDEX)
self.vocabulary_size: int = vocabulary_size
self.embedding_dimensions: int = embedding_dimensions
self.num_heads: int = num_heads
self.num_layers: int = num_layers
@property
def num_trainable_params(self) -> int:
return sum(param.numel() for param in self.parameters() if param.requires_grad)
def enable_activation_checkpointing(self) -> None:
"""Instead of memorizing the activations of the forward pass, recompute them at various checkpoints."""
self.checkpoint = partial(torch_checkpoint, use_reentrant=False)
def freeze_model_parameters(self) -> None:
"""Freeze all model parameters to prevent them from being updated during training."""
for param in self.parameters():
param.requires_grad = False
@torch.no_grad()
def resize_token_embeddings(self, vocabulary_size: int) -> None:
"""Resize the token embeddings to accommodate a new vocabulary size."""
if vocabulary_size <= 0:
raise ValueError(
f"Vocabulary size must be greater than 0, {vocabulary_size} given."
)
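        # Copy the overlapping rows into a fresh embedding table so previously learned vectors are preserved.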
new_embeddings = Embedding(vocabulary_size, self.embedding_dimensions)
new_embeddings = new_embeddings.to(self.token_embeddings.weight.device)
num_tokens_to_copy = min(vocabulary_size, self.token_embeddings.num_embeddings)
new_embeddings.weight[:num_tokens_to_copy, :] = self.token_embeddings.weight[
:num_tokens_to_copy, :
]
        # Initialize the new embedding rows with a Kaiming-style normal distribution (std = 1 / sqrt(embedding_dimensions)).
for i in range(num_tokens_to_copy, vocabulary_size):
new_embeddings.weight[i] = torch.randn(self.embedding_dimensions) / sqrt(
self.embedding_dimensions
)
self.token_embeddings.weight = new_embeddings.weight
self.token_embeddings.num_embeddings = new_embeddings.num_embeddings
self.output_layer.weight = self.token_embeddings.weight # Retie weights
self.vocabulary_size = vocabulary_size
def unfreeze_token_embeddings(self) -> None:
"""Unfreeze the token embeddings to allow for fine-tuning."""
self.token_embeddings.weight.requires_grad = True
def add_lora_parameters(self, rank: int, alpha: float, dropout: float) -> None:
"""Reparameterize the weights of the model using LoRA adapters."""
for module in self.body:
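            # The fused QKV projection is given 3x the rank since its weight stacks
            # the query, key, and value projections along the output dimension.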
register_parametrization(
module.attention.qkv_proj,
"weight",
LoRA.from_linear(module.attention.qkv_proj, 3 * rank, alpha, dropout),
)
register_parametrization(
module.attention.out_proj,
"weight",
LoRA.from_linear(module.attention.out_proj, rank, alpha, dropout),
)
register_parametrization(
module.mlp.layers[0],
"weight",
LoRA.from_linear(module.mlp.layers[0], rank, alpha, dropout),
)
register_parametrization(
module.mlp.layers[2],
"weight",
LoRA.from_linear(module.mlp.layers[2], rank, alpha, dropout),
)
def lora_state_dict(self) -> dict[str, Tensor]:
"""Return a state dict containing only the LoRA parameters."""
return {
            name: tensor for name, tensor in self.state_dict().items() if "lora" in name
}
def merge_lora_parameters(self) -> None:
"""Merge the LoRA parameters with the original parameters."""
for module in self.modules():
if hasattr(module, "parametrizations"):
                lora_params = list(module.parametrizations.keys())
for name in lora_params:
remove_parametrizations(module, name)
def forward(
self, x: Tensor, y: Tensor | None = None
) -> tuple[Tensor, Tensor | None]:
"""A forward pass optimized for batch training."""
z = self.token_embeddings(x)
for layer in self.body:
z = self.checkpoint(layer, z)
z = self.output_norm(z)
z = self.output_layer(z)
if y is not None:
y_pred = z.view(-1, z.size(-1))
            labels = y.view(-1)  # Flatten the batch and time dimensions.
loss = self.loss_function(y_pred, labels)
else:
loss = None
return z, loss
@torch.no_grad()
def predict(self, x: Tensor, kv_cache: KVCache) -> Tensor:
"""A forward pass optimized for next-token prediction."""
z = self.token_embeddings(x)
for layer, kv_block in zip(self.body, kv_cache):
z = layer.predict(z, kv_block)
        z = z[:, -1, :]  # Keep only the last token embedding from each sequence in the batch.
z = self.output_norm(z)
z = self.output_layer(z)
return z
@torch.no_grad()
def generate(
self,
prompt: Tensor,
max_tokens: int = 1000,
context_length: int = 1024,
temperature: float = 1.0,
top_k: int = 500,
top_p: float = 0.9,
repeat_penalty: float = 0.1,
repeat_window: int = 50,
) -> Generator[tuple[Tensor, Tensor], None, int]:
"""
Given a prompt, sample the next {max_tokens} tokens from the model weighted
by their predicted probabilities and filtered by the {top_k} and {top_p}.
"""
if max_tokens <= 0:
raise ValueError(f"Max tokens must be greater than 0, {max_tokens} given.")
if context_length <= 0:
raise ValueError(
f"Context length must be greater than 0, {context_length} given."
)
if temperature <= 0:
raise ValueError(
f"Temperature must be greater than 0, {temperature} given."
)
if top_k <= 0 or top_k > self.vocabulary_size:
raise ValueError(
f"Top k must be between 1 and {self.vocabulary_size}, {top_k} given."
)
if top_p <= 0.0 or top_p > 1.0:
raise ValueError(f"Top p must be between 0 and 1, {top_p} given.")
if repeat_penalty < 0.0 or repeat_penalty > 1.0:
raise ValueError(
f"Repeat penalty must be between 0 and 1, {repeat_penalty} given."
)
if repeat_window <= 0:
raise ValueError(
f"Repeat window must be greater than 0, {repeat_window} given."
)
kv_cache = KVCache(self, 1, context_length).to(prompt.device)
prompt = prompt[-context_length:]
previous_tokens = deque(maxlen=repeat_window)
num_tokens = 0
while num_tokens < max_tokens:
logits = self.predict(prompt.unsqueeze(0), kv_cache).squeeze()
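            # Penalize recently generated tokens by subtracting a fraction of each
            # token's logit magnitude, discouraging short-window repetition.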
for previous_token in previous_tokens:
logits[previous_token] -= repeat_penalty * torch.abs(
logits[previous_token]
)
logits, indices = torch.topk(logits, top_k, sorted=True)
logits /= temperature
probabilities = softmax(logits, dim=0)
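            # Nucleus (top-p) filtering: keep the longest prefix of top-k candidates whose
            # cumulative probability mass does not exceed top_p, retaining at minimum the
            # single most likely token.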
cumulative_probability_mass = torch.cumsum(probabilities, dim=0)
min_probability_mass = cumulative_probability_mass[0]
threshold_p = max(top_p, min_probability_mass.item())
selected_indices = cumulative_probability_mass <= threshold_p
logits = logits[selected_indices]
indices = indices[selected_indices]
probabilities = softmax(logits, dim=0)
offset = torch.multinomial(probabilities, num_samples=1).squeeze()
next_token = indices[offset]
probability = probabilities[offset]
yield next_token, probability
num_tokens += 1
previous_tokens.append(next_token)
prompt = next_token.unsqueeze(0)
return num_tokens
class NoPEGPTHuggingFaceConfig(PretrainedConfig):
"""Provide a monolithic configuration object to enable compatibility with HuggingFace Transformers API."""
model_type = "nope-gpt"
def __init__(
self,
vocabulary_size: int = 50257,
embedding_dimensions: int = 1024,
num_heads: int = 16,
num_layers: int = 24,
feed_forward_ratio: int = 4,
dropout: float = 0.1,
**kwargs,
):
self.vocabulary_size = vocabulary_size
self.embedding_dimensions = embedding_dimensions
self.num_heads = num_heads
self.num_layers = num_layers
self.feed_forward_ratio = feed_forward_ratio
self.dropout = dropout
super().__init__(**kwargs)
class NoPEGPTHuggingFaceModel(PreTrainedModel):
"""Wrap model to enable compatibility with HuggingFace Transformers API."""
config_class = NoPEGPTHuggingFaceConfig
def __init__(self, config: NoPEGPTHuggingFaceConfig):
super().__init__(config)
self.model = NoPEGPT(
config.vocabulary_size,
config.embedding_dimensions,
config.num_heads,
config.num_layers,
config.feed_forward_ratio,
config.dropout,
)
def forward(self, x: Tensor, y: Tensor | None = None) -> dict[str, Tensor | None]:
logits, loss = self.model.forward(x, y)
return {
"logits": logits,
"loss": loss,
}
class DecoderBlock(Module):
"""Decoder block with multi-head attention, multilayer perceptron, and residual connections."""
def __init__(
self,
embedding_dimensions: int,
num_heads: int,
feed_forward_ratio: int,
dropout: float,
):
super().__init__()
self.norm1 = RMSNorm(embedding_dimensions)
self.attention = SelfAttention(embedding_dimensions, num_heads, dropout)
self.norm2 = RMSNorm(embedding_dimensions)
self.mlp = MLP(embedding_dimensions, feed_forward_ratio, dropout)
def forward(self, x: Tensor) -> Tensor:
z = self.norm1(x)
z = self.attention(z)
z = x + z # Residual connection
x = z
z = self.norm2(x)
z = self.mlp(z)
z = x + z # Residual connection
return z
@torch.no_grad()
def predict(self, x: Tensor, kv_block: DynamicKVBlock) -> Tensor:
"""A forward pass optimized for next-token prediction."""
z = self.norm1(x)
z = self.attention.predict(z, kv_block)
z = x + z # Residual connection
x = z
z = self.norm2(x)
z = self.mlp.predict(z)
z = x + z # Residual connection
return z
class SelfAttention(Module):
"""Multihead self-attention with causal masking."""
def __init__(self, embedding_dimensions: int, num_heads: int, dropout: float):
super().__init__()
if embedding_dimensions <= 0:
raise ValueError(
f"Embedding dimensions must be greater than 0, {embedding_dimensions} given."
)
if num_heads <= 0:
raise ValueError(f"Num heads must be greater than 0, {num_heads} given.")
if embedding_dimensions % num_heads != 0:
raise ValueError(
f"Embedding dimensions must be divisible by num heads, {embedding_dimensions} and {num_heads} given."
)
self.qkv_proj = Linear(
embedding_dimensions, 3 * embedding_dimensions, bias=False
)
self.out_proj = Linear(embedding_dimensions, embedding_dimensions, bias=False)
head_dimensions: int = embedding_dimensions // num_heads
scale: float = 1.0 / sqrt(head_dimensions)
self.embedding_dimensions: int = embedding_dimensions
self.num_heads: int = num_heads
self.head_dimensions: int = head_dimensions
self.scale: float = scale
self.dropout: float = dropout
def forward(self, x: Tensor) -> Tensor:
b, t, d = x.size()
q, k, v = self.qkv_proj(x).split(self.embedding_dimensions, dim=-1)
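        # Reshape each projection to (batch, heads, tokens, head_dimensions) for SDPA.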
q = q.view(b, t, self.num_heads, self.head_dimensions).transpose(1, 2)
k = k.view(b, t, self.num_heads, self.head_dimensions).transpose(1, 2)
v = v.view(b, t, self.num_heads, self.head_dimensions).transpose(1, 2)
z = scaled_dot_product_attention(
q,
k,
v,
scale=self.scale,
dropout_p=self.dropout if self.training else 0,
is_causal=True,
)
z = z.transpose(1, 2).contiguous().view(b, t, d)
z = self.out_proj(z)
return z
@torch.no_grad()
def predict(self, x: Tensor, kv_block: DynamicKVBlock) -> Tensor:
"""A forward pass optimized for next-token prediction."""
b, t, d = x.size()
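        # A single query token means we are in the autoregressive decode phase; past keys
        # and values come from the cache, so no causal mask is needed. Multi-token inputs
        # (prompt prefill) still require causal masking.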
is_autoregressive_phase = t == 1
q, k, v = self.qkv_proj(x).split(self.embedding_dimensions, dim=-1)
q = q.view(b, t, self.num_heads, self.head_dimensions).transpose(1, 2)
k = k.view(b, t, self.num_heads, self.head_dimensions).transpose(1, 2)
v = v.view(b, t, self.num_heads, self.head_dimensions).transpose(1, 2)
k, v = kv_block.update(k, v)
z = scaled_dot_product_attention(
q,
k,
v,
scale=self.scale,
is_causal=not is_autoregressive_phase,
)
z = z.transpose(1, 2).contiguous().view(b, t, d)
z = self.out_proj(z)
return z
class MLP(Module):
"""A two layer fully-connected network with dropout."""
def __init__(
self, embedding_dimensions: int, feed_forward_ratio: int, dropout: float
):
super().__init__()
if feed_forward_ratio not in {1, 2, 4}:
raise ValueError("Feed-forward ratio must be either 1, 2, or 4.")
hidden_dimensions: int = feed_forward_ratio * embedding_dimensions
self.layers = Sequential(
Linear(embedding_dimensions, hidden_dimensions, bias=False),
SiLU(),
Linear(hidden_dimensions, embedding_dimensions, bias=False),
)
self.dropout = Dropout1d(p=dropout)
def forward(self, x: Tensor) -> Tensor:
return self.dropout(self.layers(x))
def predict(self, x: Tensor) -> Tensor:
return self.layers(x)
class LoRA(Module):
"""Low rank weight decomposition transformation."""
@classmethod
def from_linear(
cls, linear: Linear, rank: int, alpha: float, dropout: float
) -> Self:
out_features, in_features = linear.weight.shape
return cls(in_features, out_features, rank, alpha, dropout)
def __init__(
self,
in_features: int,
out_features: int,
rank: int,
alpha: float,
dropout: float,
):
super().__init__()
if rank <= 0:
raise ValueError(f"Rank must be greater than 0, {rank} given.")
if alpha <= 0.0:
raise ValueError(f"Alpha must be greater than 0, {alpha} given.")
lora_a = torch.randn(rank, in_features) / sqrt(rank)
lora_b = torch.zeros(out_features, rank)
self.lora_a = Parameter(lora_a)
self.lora_b = Parameter(lora_b)
self.dropout = Dropout1d(dropout)
self.alpha: float = alpha
def forward(self, weight: Tensor) -> Tensor:
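        # Add the scaled low-rank update to the wrapped weight each time the
        # parametrized weight is accessed.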
z = self.lora_b @ self.dropout(self.lora_a)
z *= self.alpha
return weight + z
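

if __name__ == "__main__":
    # Minimal usage sketch (not part of the trained checkpoint): the hyperparameters
    # below are illustrative placeholders, and the prompt is a stand-in for token ids
    # produced by whatever tokenizer the model was trained with.
    model = NoPEGPT(
        vocabulary_size=50257,
        embedding_dimensions=256,
        num_heads=4,
        num_layers=2,
        feed_forward_ratio=4,
        dropout=0.1,
    )
    model.eval()

    prompt = torch.tensor([1, 2, 3], dtype=torch.int64)

    for token, probability in model.generate(prompt, max_tokens=10):
        print(token.item(), probability.item())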