import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from functools import partial
from einops import rearrange, repeat
from typing import Optional, Tuple, Union


Linear = partial(nn.Linear, bias=False)
LayerNorm = partial(nn.LayerNorm, bias=False)


def rotate_half(x, interleaved=False):
    if not interleaved:
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)
    else:
        x1, x2 = x[..., ::2], x[..., 1::2]
        return rearrange(
            torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2
        )


def apply_rotary_emb_torch(x, cos, sin, interleaved=False, _inplace=False):
    """
    x: (batch_size, seqlen, nheads, headdim)
    cos, sin: (seqlen, rotary_dim / 2)
    """
    ro_dim = cos.shape[-1] * 2
    assert ro_dim <= x.shape[-1]
    seqlen = x.size(1)
    cos = cos[:seqlen]
    sin = sin[:seqlen]
    # Broadcast cos/sin over the head dimension; the repeat pattern must match
    # the pairing used by rotate_half for the chosen (non-)interleaved layout.
    pattern = "s d -> s 1 (2 d)" if not interleaved else "s d -> s 1 (d 2)"
    cos = repeat(cos, pattern)
    sin = repeat(sin, pattern)
    return torch.cat(
        [
            x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin,
            x[..., ro_dim:],
        ],
        dim=-1,
    )


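# Illustrative shape sketch (not part of the module): how apply_rotary_emb_torch
# is typically called, assuming a head dimension of 64 that is fully rotated.
#
#   x = torch.randn(2, 128, 8, 64)                                # (batch, seqlen, nheads, headdim)
#   inv_freq = 1.0 / (10000 ** (torch.arange(0, 64, 2).float() / 64))
#   freqs = torch.outer(torch.arange(128).float(), inv_freq)      # (seqlen, headdim / 2)
#   out = apply_rotary_emb_torch(x, torch.cos(freqs), torch.sin(freqs))
#   assert out.shape == x.shape
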
class RotaryEmbedding(torch.nn.Module):
    def __init__(
        self,
        dim: int,
        base=10000.0,
        interleaved=False,
        scale_base=None,
        scaling_factor=1.0,
        pos_idx_in_fp32=True,
        device=None,
    ):
        super().__init__()
        self.dim = dim
        self.base = float(base)
        self.pos_idx_in_fp32 = pos_idx_in_fp32
        self.interleaved = interleaved
        self.scale_base = scale_base
        self.scaling_factor = scaling_factor
        self.device = device

        self._seq_len_cached = 0
        self._cos_cached = None
        self._sin_cached = None
        self._cos_k_cached = None
        self._sin_k_cached = None
        self.reset_parameters()

    def reset_parameters(self):
        inv_freq = self._compute_inv_freq(self.device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        arange = torch.arange(0, self.dim, 2, device=self.device, dtype=torch.float32)
        scale = (
            (arange + 0.4 * self.dim) / (1.4 * self.dim)
            if self.scale_base is not None
            else None
        )
        self.register_buffer("scale", scale)

    def _compute_inv_freq(self, device=None):
        return 1 / (
            self.base
            ** (
                torch.arange(0, self.dim, 2, device=device, dtype=torch.float32)
                / self.dim
            )
        )

    def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached is None
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
            or (self.training and self._cos_cached.is_inference())
        ):
            self._seq_len_cached = seqlen
            if self.pos_idx_in_fp32:
                t = torch.arange(seqlen, device=device, dtype=torch.float32)
                t /= self.scaling_factor
                if self.inv_freq.dtype != torch.float32:
                    inv_freq = self.inv_freq.to(torch.float32)
                else:
                    inv_freq = self.inv_freq
            else:
                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
                t /= self.scaling_factor
                inv_freq = self.inv_freq
            freqs = torch.outer(t, inv_freq)

            if self.scale is None:
                self._cos_cached = torch.cos(freqs).to(dtype)
                self._sin_cached = torch.sin(freqs).to(dtype)
            else:
                power = (
                    torch.arange(
                        seqlen, dtype=self.scale.dtype, device=self.scale.device
                    )
                    - seqlen // 2
                ) / self.scale_base
                scale = self.scale.to(device=power.device) ** power.unsqueeze(-1)
                self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
                self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
                self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
                self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)

    def forward(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        q: (batch, seqlen, nheads, headdim)
        k: (batch, seqlen, nheads, headdim)
        """
        self._update_cos_sin_cache(q.shape[1], device=q.device, dtype=q.dtype)
        assert self._cos_cached is not None
        assert self._sin_cached is not None
        if self.scale is None:
            return (
                apply_rotary_emb_torch(
                    q,
                    self._cos_cached,
                    self._sin_cached,
                    self.interleaved,
                    True,
                ),
                apply_rotary_emb_torch(
                    k,
                    self._cos_cached,
                    self._sin_cached,
                    self.interleaved,
                    True,
                ),
            )
        else:
            raise NotImplementedError(
                "Scaled (scale_base) rotary embeddings are not supported in this forward pass."
            )


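# Illustrative usage sketch (assumed sizes, not part of the module itself):
# RotaryEmbedding expects q and k laid out as (batch, seqlen, nheads, headdim)
# and returns rotated tensors of the same shape.
#
#   rope = RotaryEmbedding(dim=64)
#   q = torch.randn(2, 128, 8, 64)
#   k = torch.randn(2, 128, 8, 64)
#   q_rot, k_rot = rope(q, k)                         # both (2, 128, 8, 64)
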
class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size: int, n_heads: int, rotary: bool = True):
        super().__init__()
        self.hidden_size = hidden_size
        self.n_heads = n_heads
        self.d_head = self.hidden_size // self.n_heads
        self.layernorm_qkv = nn.Sequential(
            LayerNorm(hidden_size), Linear(hidden_size, hidden_size * 3)
        )
        self.out_proj = Linear(hidden_size, hidden_size)
        self.q_ln = LayerNorm(hidden_size)
        self.k_ln = LayerNorm(hidden_size)
        self.reshaper = partial(rearrange, pattern="b s (h d) -> b h s d", h=n_heads)
        self.rotary = RotaryEmbedding(hidden_size // n_heads) if rotary else None

    def _apply_rotary(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        q = q.unflatten(-1, (self.n_heads, self.d_head))
        k = k.unflatten(-1, (self.n_heads, self.d_head))
        q, k = self.rotary(q, k)
        q = q.flatten(-2, -1)
        k = k.flatten(-2, -1)
        return q, k

    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        b, L, _ = x.shape
        # Expand a (b, L) padding mask to (b, 1, L, L) so it broadcasts across
        # heads inside scaled_dot_product_attention.
        if attention_mask is not None and attention_mask.dim() == 2:
            attention_mask = attention_mask[:, None, None, :].expand(b, 1, L, L).bool()
        qkv = self.layernorm_qkv(x)
        q, k, v = torch.chunk(qkv, 3, dim=-1)
        q, k = self.q_ln(q).to(q.dtype), self.k_ln(k).to(q.dtype)
        if self.rotary:
            q, k = self._apply_rotary(q, k)
        q, k, v = map(self.reshaper, (q, k, v))
        a = F.scaled_dot_product_attention(q, k, v, attention_mask)
        a = rearrange(a, "b h s d -> b s (h d)")
        return self.out_proj(a)


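# Illustrative usage sketch (hidden size and mask semantics assumed: 1 = keep,
# 0 = padding), showing the expected input/output shapes of MultiHeadAttention.
#
#   mha = MultiHeadAttention(hidden_size=256, n_heads=8)
#   x = torch.randn(2, 64, 256)                       # (batch, seqlen, hidden)
#   mask = torch.ones(2, 64, dtype=torch.bool)        # (batch, seqlen) padding mask
#   y = mha(x, attention_mask=mask)                   # (2, 64, 256)
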
class PAttention(nn.Module):
    """
    Cross-attention mechanism for token-parameter attention (b, L, d) -> (b, L, n_tokens) -> (b, L, d)
    """
    def __init__(
        self,
        hidden_size: int,
        n_tokens: int,
        dropout: float = 0.2,
    ):
        super().__init__()
        self.n_tokens = n_tokens
        self.Wq = Linear(hidden_size, hidden_size)
        self.Pk = nn.Parameter(torch.randn(1, n_tokens, hidden_size))
        self.Pv = nn.Parameter(torch.randn(1, n_tokens, hidden_size))
        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        x: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        b, L, _ = x.size()
        if attention_mask is not None:
            # Broadcast the (b, L) padding mask to (b, L, n_tokens); the learned
            # key/value parameter tokens themselves are never padded.
            attention_mask = attention_mask[:, :, None].expand(b, L, self.n_tokens).bool()

        q = self.Wq(x)
        out = F.scaled_dot_product_attention(q, self.Pk, self.Pv, attn_mask=attention_mask, is_causal=False)
        return self.dropout(out)


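# Illustrative usage sketch (assumed sizes): PAttention attends each input
# position to a fixed set of learned key/value "parameter tokens", so the
# output keeps the (batch, seqlen, hidden) shape of the input.
#
#   pattn = PAttention(hidden_size=256, n_tokens=32)
#   x = torch.randn(2, 64, 256)
#   y = pattn(x)                                      # (2, 64, 256)
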
class AttentionLogitsSequence(nn.Module):
    """
    Cross-attention mechanism for token-parameter attention (b, L, d) -> (b, L, num_labels) -> (b, num_labels)
    """
    def __init__(self, hidden_size: int, num_labels: int = 1, sim_type: str = 'dot'):
        super().__init__()
        self.num_labels = num_labels
        self.Wp = nn.Parameter(torch.randn(1, hidden_size, num_labels))
        self.Wx = Linear(hidden_size, hidden_size)
        self.sim_type = sim_type

    def mean_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None):
        if attention_mask is None:
            return emb.mean(dim=1)
        else:
            return (emb * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)

    def dot_product(self, x: torch.Tensor, p: torch.Tensor):
        return torch.matmul(x, p)

    def euclidean_distance(self, x: torch.Tensor, p: torch.Tensor):
        # (b, L, d, 1) - (b, 1, d, num_labels) -> norm over d -> (b, L, num_labels)
        x_exp = x.unsqueeze(-1)
        p_exp = p.unsqueeze(1)
        dist = torch.norm(x_exp - p_exp, p=2, dim=2)
        return -dist

    def cosine_similarity(
        self,
        x: torch.Tensor,
        p: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if attention_mask is not None:
            x = x * attention_mask
        x = F.normalize(x, p=2, dim=-1)
        p = F.normalize(p, p=2, dim=1)
        cos_sims = torch.matmul(x, p)
        # Allow a small tolerance for floating-point error.
        assert (
            cos_sims.max().item() <= 1.0 + 1e-4 and cos_sims.min().item() >= -1.0 - 1e-4
        ), "Cosine similarity values should be between -1 and 1"
        return cos_sims

    def forward(
        self,
        x: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        b, L, _ = x.size()
        p = self.Wp.expand(b, -1, -1)
        x = self.Wx(x)

        if attention_mask is None:
            attention_mask = torch.ones(b, L, device=x.device, dtype=x.dtype)
        attention_mask = attention_mask.unsqueeze(-1)

        if self.sim_type == 'dot':
            y = self.dot_product(x, p)
        elif self.sim_type == 'euclidean':
            y = self.euclidean_distance(x, p)
        elif self.sim_type == 'cosine':
            y = self.cosine_similarity(x, p, attention_mask)
        else:
            raise ValueError(f"Invalid similarity type: {self.sim_type}")

        logits = self.mean_pooling(y, attention_mask)
        return logits, y, x


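# Illustrative usage sketch (assumed sizes): AttentionLogitsSequence scores each
# position against num_labels learned prototype vectors and mean-pools over the
# sequence, returning sequence-level logits plus the per-token scores.
#
#   head = AttentionLogitsSequence(hidden_size=256, num_labels=3, sim_type='dot')
#   x = torch.randn(2, 64, 256)
#   logits, token_scores, projected = head(x)         # (2, 3), (2, 64, 3), (2, 64, 256)
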
class AttentionLogitsToken(nn.Module):
    """
    Cross-attention mechanism for token-parameter attention (b, L, d) -> (b, L, num_labels)
    """
    def __init__(self, hidden_size: int, num_labels: int = 1, sim_type: str = 'dot'):
        super().__init__()
        self.num_labels = num_labels
        self.Wp = nn.Parameter(torch.randn(1, hidden_size, num_labels))
        self.Wx = Linear(hidden_size, hidden_size)
        self.sim_type = sim_type

    def dot_product(self, x: torch.Tensor, p: torch.Tensor):
        return torch.matmul(x, p)

    def euclidean_distance(self, x: torch.Tensor, p: torch.Tensor):
        # (b, L, d, 1) - (b, 1, d, num_labels) -> norm over d -> (b, L, num_labels)
        dist = torch.norm(x.unsqueeze(-1) - p.unsqueeze(1), p=2, dim=2)
        return -dist

    def cosine_similarity(
        self,
        x: torch.Tensor,
        p: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if attention_mask is not None:
            x = x * attention_mask.unsqueeze(-1)
        x = F.normalize(x, p=2, dim=-1)
        p = F.normalize(p, p=2, dim=1)
        cos_sims = torch.matmul(x, p)
        # Allow a small tolerance for floating-point error.
        assert (
            cos_sims.max().item() <= 1.0 + 1e-4 and cos_sims.min().item() >= -1.0 - 1e-4
        ), "Cosine similarity values should be between -1 and 1"
        return cos_sims

    def forward(
        self,
        x: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.Tensor:
        b, L, d = x.size()
        p = self.Wp.expand(b, -1, -1)
        x = self.Wx(x)
        if self.sim_type == 'dot':
            logits = self.dot_product(x, p)
        elif self.sim_type == 'euclidean':
            logits = self.euclidean_distance(x, p)
        elif self.sim_type == 'cosine':
            logits = self.cosine_similarity(x, p, attention_mask)
        else:
            raise ValueError(f"Invalid similarity type: {self.sim_type}")
        return logits


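# Illustrative usage sketch (assumed sizes): AttentionLogitsToken produces
# per-token logits, one score per learned prototype column of Wp.
#
#   head = AttentionLogitsToken(hidden_size=256, num_labels=5, sim_type='cosine')
#   x = torch.randn(2, 64, 256)
#   mask = torch.ones(2, 64)
#   token_logits = head(x, attention_mask=mask)       # (2, 64, 5)
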
class MultiHeadPAttention(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        n_heads: int,
        n_tokens: int,
        dropout: float = 0.2,
        rotary: bool = True,
        causal: bool = False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.n_heads = n_heads
        self.d_head = self.hidden_size // self.n_heads
        self.Wq = PAttention(hidden_size, n_tokens=n_tokens, dropout=dropout)
        self.Wk = PAttention(hidden_size, n_tokens=n_tokens, dropout=dropout)
        self.Wv = PAttention(hidden_size, n_tokens=n_tokens, dropout=dropout)
        self.out_proj = Linear((hidden_size // n_heads) * n_heads, hidden_size)
        self.q_ln = LayerNorm(hidden_size)
        self.k_ln = LayerNorm(hidden_size)
        self.reshaper = partial(rearrange, pattern="b s (h d) -> b h s d", h=n_heads)
        self.rotary = RotaryEmbedding(hidden_size // n_heads) if rotary else None
        self.causal = causal

    def _apply_rotary(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        q = q.unflatten(-1, (self.n_heads, self.d_head))
        k = k.unflatten(-1, (self.n_heads, self.d_head))
        q, k = self.rotary(q, k)
        q = q.flatten(-2, -1)
        k = k.flatten(-2, -1)
        return q, k

    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        b, L, _ = x.shape
        if attention_mask is not None and attention_mask.dim() == 2:
            attention_mask = attention_mask[:, None, None, :].expand(b, 1, L, L).bool()
        q = self.Wq(x)
        k = self.Wk(x)
        v = self.Wv(x)
        q, k = self.q_ln(q).to(q.dtype), self.k_ln(k).to(q.dtype)
        if self.rotary:
            q, k = self._apply_rotary(q, k)
        q, k, v = map(self.reshaper, (q, k, v))
        a = F.scaled_dot_product_attention(
            q, k, v, attention_mask if not self.causal else None, is_causal=self.causal
        )
        a = rearrange(a, "b h s d -> b s (h d)")
        return self.out_proj(a)

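# Illustrative usage sketch (assumed sizes): MultiHeadPAttention first projects
# q/k/v through PAttention blocks (token-parameter attention) and then runs
# standard multi-head self-attention over the sequence.
#
#   mhpa = MultiHeadPAttention(hidden_size=256, n_heads=8, n_tokens=32)
#   x = torch.randn(2, 64, 256)
#   mask = torch.ones(2, 64)
#   y = mhpa(x, attention_mask=mask)                  # (2, 64, 256)
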