| | from typing import Optional, Tuple |
| | from dataclasses import dataclass |
| | import torch |
| | import torch.nn as nn |
| |
|
| | from transformers.modeling_outputs import ( |
| | SequenceClassifierOutput, |
| | ) |
| |
|
| | from typing import Optional, Tuple |
| |
|
| | import torch |
| | import torch.utils.checkpoint |
| | from torch import nn |
| |
|
| | from dataclasses import dataclass |
| | from transformers.activations import ACT2FN, ACT2CLS |
| | from transformers.modeling_utils import PreTrainedModel |
| | from transformers.utils import logging |
| | from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutput, CausalLMOutputWithPast |
| | from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask |
| | import xformers.ops as xops |
| |
|
| | from collections import OrderedDict |
| |
|
| | logger = logging.get_logger(__name__) |
| |
|
| | import torch |
| | import torch.nn as nn |
| | from einops import rearrange, einsum |
| | from transformers.pytorch_utils import Conv1D |
| |
|
| |
|
| | import torch |
| | from torch.amp import autocast |
| | from torch import nn, einsum, Tensor |
| |
|
| | from einops import rearrange, repeat |
| | from typing import Optional, Union |
| |
|
| | from .configuration_decodon import DeCodonConfig |
| |
|
| | logger = logging.get_logger(__name__) |
| |
|
| |
|
def rotate_half(x):
    """Rotate adjacent feature pairs: (x1, x2) -> (-x2, x1) along the last dim.

    This is the "imaginary part" term used by rotary embeddings: each
    consecutive pair of features is treated as a 2-D point and rotated by
    90 degrees.
    """
    pairs = x.unflatten(-1, (-1, 2))
    first, second = pairs.unbind(dim=-1)
    rotated = torch.stack((-second, first), dim=-1)
    return rotated.flatten(start_dim=-2)
| |
|
| |
|
@autocast(device_type="cuda", enabled=False)
def apply_rotary_emb(freqs, t, start_index=0, scale=1.0):
    """
    Apply rotary position embeddings to a slice of the feature dimension.

    Parameters
    ----------
    freqs : Tensor
        Rotation angles per position: (seq_len, dim)
    t : Tensor
        Tensor to rotate: (..., seq_len, n_heads, dim)
    start_index : int
        First feature index to rotate; features outside
        [start_index, start_index + freqs.shape[-1]) pass through untouched.
        (default: 0)
    scale : float or Tensor
        XPos-style scale multiplied into the rotated slice. (default: 1.0)

    Returns
    -------
    Tensor
        Same shape as ``t`` with the selected feature slice rotated.
    """
    rot_dim = freqs.shape[-1]
    end_index = start_index + rot_dim

    assert (
        rot_dim <= t.shape[-1]
    ), f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}"

    # Split features into untouched left/right margins and the rotated middle.
    left = t[..., :start_index]
    middle = t[..., start_index:end_index]
    right = t[..., end_index:]

    if isinstance(scale, float):
        scale = torch.tensor(scale, device=middle.device, dtype=middle.dtype)

    rotated = (middle * freqs.cos() * scale) + (rotate_half(middle) * freqs.sin() * scale)
    return torch.cat((left, rotated, right), dim=-1)
| |
|
| |
|
| | |
def apply_learned_rotations(rotations, t, start_index=0, freq_ranges=None):
    """
    Apply (possibly learned) per-position rotation angles to ``t``.

    When ``freq_ranges`` is given, every angle is first scaled by each
    frequency in the range and the result is flattened into one angle axis.
    Angles are then duplicated across each 2-D feature pair before delegating
    to ``apply_rotary_emb``.
    """
    angles = rotations
    if freq_ranges is not None:
        angles = einsum("..., f -> ... f", angles, freq_ranges)
        angles = rearrange(angles, "... r f -> ... (r f)")

    angles = repeat(angles, "... n -> ... (n r)", r=2)
    return apply_rotary_emb(angles, t, start_index=start_index)
| |
|
| |
|
| | """ |
| | Inspired from https://github.com/lucidrains/rotary-embedding-torch |
| | """ |
| |
|
class RotaryEmbedding(nn.Module):
    """
    Rotary Embeddings implementation inspired by https://github.com/lucidrains/rotary-embedding-torch.

    Rotary Positional Embeddings (RoPE) encode position information of tokens with a
    rotation matrix that naturally incorporates explicit relative position dependency.

    Parameters
    ----------
    emb_dim : int
        Embedding dimension. Usually set to the dim of each head in the attention module.
    freqs : Optional[Tensor]
        Custom frequencies to apply to query/key tensors. (default: None)
    theta : float
        Base constant used for computing rotation angles.
    learned_freq : bool (default: False)
        Whether to learn the frequencies.
    use_xpos : bool (default: False)
        Whether to employ XPos technique for resolving length extrapolation issue.
        NOTE: This can only be enabled for autoregressive models like GPT.
    xpos_scale_base : int (default: 512)
        The base for the scale factor used in XPos technique.
    interpolate_factor : float (default: 1.0)
        Length interpolation factor for extending context length of the pretrained model.
        Final model's context length = pretrained_model_context_length * interpolate_factor.
    theta_rescale_factor : float (default: 1.0)
        The factor to rescale the theta.
    cache_if_possible : bool (default: True)
        Whether to cache the frequencies/scales if possible.
    """

    def __init__(
        self,
        emb_dim,
        freqs: Optional[Tensor] = None,
        theta=1e4,
        learned_freq=False,
        use_xpos=False,
        xpos_scale_base=512,
        interpolate_factor=1.0,
        theta_rescale_factor=1.0,
        cache_if_possible=True,
    ):
        super().__init__()

        # Theta rescaling for context extension (as in rotary-embedding-torch);
        # a rescale factor of 1.0 leaves theta unchanged.
        theta *= theta_rescale_factor ** (emb_dim / (emb_dim - 2))

        if freqs is None:
            # Standard RoPE inverse-frequency spectrum: theta^(-2i / emb_dim).
            freqs = 1.0 / (
                theta
                ** (torch.arange(0, emb_dim, 2)[: (emb_dim // 2)].float() / emb_dim)
            )

        self.cache_if_possible = cache_if_possible

        # Lazily-filled caches; re-registered with real tensors on first use
        # (see forward / get_scale).
        self.register_buffer("cached_freqs", None, persistent=False)
        self.register_buffer("cached_scales", None, persistent=False)

        # Stored as a Parameter so the frequencies can optionally be learned;
        # requires_grad toggles learnability.
        self.freqs = nn.Parameter(freqs, requires_grad=learned_freq)

        self.learned_freq = learned_freq

        assert interpolate_factor >= 1.0
        self.interpolate_factor = interpolate_factor

        # XPos decay scales; skipped entirely when use_xpos is disabled.
        self.use_xpos = use_xpos
        if not use_xpos:
            self.register_buffer("scale", None, persistent=False)
            return

        scale = (torch.arange(0, emb_dim, 2) + 0.4 * emb_dim) / (1.4 * emb_dim)
        self.scale_base = xpos_scale_base
        self.register_buffer("scale", scale, persistent=False)

    @property
    def device(self):
        # Device of the frequency parameter (the module's reference device).
        return self.freqs.device

    def rotate_queries_or_keys(self, t, offset=0, freq_seq_len=None, scale=None):
        """
        Rotate a single queries-or-keys tensor by absolute position.

        Parameters
        ----------
        t : Tensor
            tensor to rotate: (batch_size, seq_len, num_heads, head_dim)
        offset : int
            Position offset added to every sequence index. (default: 0)
        freq_seq_len : Optional[int]
            If given, compute frequencies for this (>= seq_len) length instead
            of t's own length. (default: None)
        scale : Optional[Tensor]
            Pre-computed XPos scale; must be provided when use_xpos is enabled.
        """
        seq_len = t.shape[1]
        assert (
            not self.use_xpos or scale is not None
        ), "you must use `.rotate_queries_and_keys` method instead and pass in both queries and keys, for length extrapolatable rotary embeddings"

        if freq_seq_len is not None:
            assert freq_seq_len >= seq_len
            seq_len = freq_seq_len

        # Positions, shifted by offset and shrunk by the interpolation factor
        # for context-length extension.
        seq = (
            torch.arange(seq_len, device=t.device, dtype=t.dtype) + offset
        ) / self.interpolate_factor

        freqs = self.forward(
            seq,
            seq_len=seq_len,
            offset=offset,
        ).to(t.dtype)

        # Insert a singleton heads dimension so freqs broadcast over heads.
        freqs = rearrange(freqs, "n d -> n 1 d")

        if scale is not None:
            # NOTE(review): this reshape assumes a Tensor scale; a float scale
            # (e.g. 1.0) would fail here — confirm callers only pass tensors.
            scale = rearrange(scale, "n d -> n 1 d")

        if scale is None:
            scale = torch.tensor(1.0, device=t.device, dtype=t.dtype)

        return apply_rotary_emb(freqs, t, scale=scale)

    def rotate_queries_and_keys(self, q, k):
        """
        Rotate queries and keys together with XPos scaling (q scaled by s,
        k by s^-1 so their product depends only on relative position).

        Parameters
        ----------
        q : Tensor
            queries tensor: (batch_size, seq_len, num_heads, head_dim)
        k : Tensor
            keys tensor: (batch_size, seq_len, num_heads, head_dim)
        """
        assert self.use_xpos
        # Sequence length is taken from q; q and k are assumed to share it.
        seq_len = q.shape[-3]

        seq = (
            torch.arange(seq_len, device=q.device, dtype=q.dtype)
        ) / self.interpolate_factor

        freqs = self.forward(seq, seq_len=seq_len)
        scale = self.get_scale(seq, seq_len=seq_len)

        # Broadcast over the heads dimension.
        freqs = rearrange(freqs, "n d -> n 1 d")
        scale = rearrange(scale, "n d -> n 1 d")

        # Reciprocal scales for keys (XPos).
        rotated_q = apply_rotary_emb(freqs, q, scale=scale)
        rotated_k = apply_rotary_emb(freqs, k, scale=scale**-1)

        # Restore the callers' dtypes (rotation runs in the freqs dtype).
        rotated_q = rotated_q.type(q.dtype)
        rotated_k = rotated_k.type(k.dtype)

        return rotated_q, rotated_k

    def get_scale(self, t: Tensor, seq_len: Optional[int] = None, offset=0):
        """Compute (and, when possible, cache) XPos decay scales for positions ``t``."""
        assert self.use_xpos

        should_cache = self.cache_if_possible and seq_len is not None

        if (
            should_cache
            and self.cached_scales is not None
            and (seq_len + offset) <= self.cached_scales.shape[0]
        ):
            return self.cached_scales[offset : (offset + seq_len)]

        scale = 1.0
        # use_xpos is always True here (asserted above); the guard is kept as-is.
        if self.use_xpos:
            # Power is centered on the sequence midpoint so scales decay
            # symmetrically around it.
            power = (t - len(t) // 2) / self.scale_base
            scale = self.scale ** rearrange(power, "n -> n 1")
            # Duplicate so the scale covers both halves of each rotated pair.
            scale = torch.cat((scale, scale), dim=-1)

        if should_cache:
            # Re-register to overwrite the placeholder buffer from __init__.
            self.register_buffer("cached_scales", scale, persistent=False)

        return scale

    def rotate_queries_with_cached_keys(self, q, k, offset=0):
        """
        Rotate a query block against (already concatenated) cached keys.

        NOTE(review): everything after the first ``return`` below is
        unreachable — the offset/scale logic that would handle q_len < k_len
        and the non-XPos case is dead code. As written, this method always
        rotates q and k via ``rotate_queries_and_keys`` (which asserts
        use_xpos) and keeps only the last query position, matching
        incremental decoding with a full concatenated query. Confirm the
        early return is intentional before cleaning this up.
        """
        q_len, k_len = q.shape[1], k.shape[1]
        assert q_len <= k_len

        rotated_q, rotated_k = self.rotate_queries_and_keys(q, k)

        # Keep only the newest query position (incremental decoding step).
        rotated_q = rotated_q[:, -1:, ...]

        return rotated_q, rotated_k

        # --- unreachable below (see NOTE above) ---
        seq = (
            torch.arange(k_len, device=q.device, dtype=q.dtype)
        ) / self.interpolate_factor

        if self.use_xpos:
            q_scale = self.get_scale(seq[-q_len:]).to(q.dtype)
            k_scale = self.get_scale(seq).to(k.dtype)

        else:
            k_scale = 1.0
            q_scale = 1.0

        rotated_q = self.rotate_queries_or_keys(
            q, scale=q_scale, offset=k_len - q_len + offset
        )
        rotated_k = self.rotate_queries_or_keys(k, scale=k_scale**-1)

        return rotated_q, rotated_k

    @autocast(device_type="cuda", enabled=False)
    def forward(self, t: Tensor, seq_len=None, offset=0):
        """Compute per-position rotation angles of shape (..., seq_len, emb_dim)."""
        # Learned frequencies change every step, so they are never cached.
        should_cache = (
            self.cache_if_possible and not self.learned_freq and seq_len is not None
        )

        if (
            should_cache
            and self.cached_freqs is not None
            and (offset + seq_len) <= self.cached_freqs.shape[0]
        ):
            return self.cached_freqs[offset : (offset + seq_len)].detach()

        freqs = self.freqs

        # Outer product of positions and inverse frequencies, then duplicate
        # each angle for the paired (x1, x2) feature dims.
        freqs = einsum("..., f -> ... f", t, freqs)
        freqs = repeat(freqs, "... n -> ... (n r)", r=2)

        if should_cache:
            # Re-register to overwrite the placeholder buffer from __init__.
            self.register_buffer("cached_freqs", freqs.detach(), persistent=False)

        return freqs
| |
|
| |
|
| |
|
class MultiHeadedSelfAttention(nn.Module):
    """
    Multi-Headed Self Attention module supported with Flash Attention and Rotary Embeddings.

    Parameters
    ----------
    q_input_dim: int
        The input dimension of the query tensor.
    kv_input_dim: int
        The input dimension of the key and value tensors.
    qk_proj_dim: int
        The projected dimension of the query and key tensors.
    v_proj_dim: int
        The projected dimension of the value tensors.
    num_heads: int
        Number of attention heads.
    dropout: float
        Dropout rate to apply to the attention scores.
    projection_layer: str
        The type of projection layer to use. Either 'linear' or 'conv'.
        Basically both are linear projections, but 'conv' uses Conv1D layer as proposed in the original GPT2 paper.
    use_flash_attn: bool
        Whether to use Flash Attention or not. If True, Flash Attention will be used.
        NOTE: Flash Attention is required to be installed.
    use_rotary_emb: bool
        Whether to use Rotary Embeddings or not.
    rotary_theta: float
        The base for the geometric progression used to compute the rotation angles.
    rotary_use_xpos: bool
        Whether to use XPos technique for resolving length extrapolation issue.
        NOTE: This can only be enabled for autoregressive models like GPT.
    is_cross_attention: bool
        Whether the module attends across two different sequences; disables
        the packed flash-attention fast path.
    """

    def __init__(
        self,
        q_input_dim,
        kv_input_dim,
        qk_proj_dim,
        v_proj_dim,
        num_heads,
        dropout: float = 0.0,
        projection_layer: str = "linear",
        use_flash_attn: bool = True,
        use_rotary_emb: bool = False,
        rotary_theta: float = 1e4,
        rotary_use_xpos: bool = False,
        is_cross_attention: bool = False,
        **kwargs,
    ):
        super().__init__()
        assert (
            qk_proj_dim % num_heads == 0
        ), "qk_proj_dim must be divisible by num_heads"
        assert v_proj_dim % num_heads == 0, "v_proj_dim must be divisible by num_heads"

        self.num_heads = num_heads
        self.dropout_rate = dropout
        self.projection_layer = projection_layer
        self.use_rotary_emb = use_rotary_emb
        self.is_cross_attention = is_cross_attention

        # The packed-QKV flash kernel only supports self-attention, so it is
        # disabled for cross-attention; also gracefully degrade when the
        # flash_attn package is missing.
        if use_flash_attn and not is_cross_attention:
            try:
                from flash_attn import flash_attn_qkvpacked_func

                self.use_flash_attn = True
                self.flashattn_fn = flash_attn_qkvpacked_func
            except ImportError:
                print("flash_attn not installed, reverting to default attention")
                self.use_flash_attn = False
                self.flashattn_fn = None
        else:
            self.use_flash_attn = False
            self.flashattn_fn = None

        if self.projection_layer == "linear":
            self.query = nn.Linear(q_input_dim, qk_proj_dim)
            self.key = nn.Linear(kv_input_dim, qk_proj_dim)
            self.value = nn.Linear(kv_input_dim, v_proj_dim)
        elif self.projection_layer == "conv":
            # GPT-2 style Conv1D: a linear projection with transposed weight layout.
            self.query = Conv1D(qk_proj_dim, q_input_dim)
            self.key = Conv1D(qk_proj_dim, kv_input_dim)
            self.value = Conv1D(v_proj_dim, kv_input_dim)
        else:
            raise ValueError(
                f"projection_layer must be either 'linear' or 'conv', got {projection_layer}"
            )

        if self.use_rotary_emb:
            # Rotate only half of each head's feature dims (partial RoPE).
            self.rotary_emb = RotaryEmbedding(
                emb_dim=qk_proj_dim // num_heads // 2,
                theta=rotary_theta,
                use_xpos=rotary_use_xpos,
            )

        self.dr_rate = dropout
        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        x_q,
        x_kv,
        is_causal=False,
        attention_bias=None,
        attention_mask=None,
        output_attentions=False,
        query=None,
        key=None,
        value=None,
        use_cache=False,
    ):
        """
        Applies a classical self attention operation.

        Parameters
        ----------
        x_q: torch.Tensor
            The query tensor of shape (batch_size, query_seq_len, emb_dim)
        x_kv: torch.Tensor
            The key/value tensor of shape (batch_size, kv_seq_len, emb_dim)
        is_causal: bool
            Whether to apply a lower-triangular causal mask. (default: False)
        attention_bias: torch.Tensor
            The attention bias to apply to the attention scores. (default: None)
        attention_mask: torch.Tensor
            The attention mask to apply to the attention scores. Shape: (batch_size, q_len, kv_seq_len)
        output_attentions: bool
            Whether to also return attention probabilities; forces the vanilla
            attention path. (default: False)
        query, key, value: torch.Tensor
            Pre-projected q/k/v tensors, used instead of x_q/x_kv (e.g. when a
            decoding cache has already been concatenated in).
        use_cache: bool
            When True, also return a (key, value, query) cache tuple.

        Returns
        -------
        (x, attention_probs) — or (x, attention_probs, cache) when use_cache —
        where attention_probs is None unless output_attentions is True.
        """
        assert (x_q is not None and x_kv is not None) or (
            query is not None and key is not None and value is not None
        ), "Either x_q and x_kv or query, key and value must be provided"

        past_memory_provided = (
            query is not None and key is not None and value is not None
        )

        if query is None:
            q_len = x_q.size(1)
            k_len = x_kv.size(1)

            query = self.query(x_q)
            key = self.key(x_kv)
            value = self.value(x_kv)

        else:
            q_len = query.size(1)
            k_len = key.size(1)

        if use_cache:
            # Cache layout is (key, value, query); consumers (DeCodonAttention)
            # unpack in that order.
            cache = (key.clone(), value.clone(), query.clone())

        # Split the projection dim into heads: (b, seq, h, head_dim).
        q = rearrange(query, "b q (h d) -> b q h d", h=self.num_heads)
        k = rearrange(key, "b k (h d) -> b k h d", h=self.num_heads)
        v = rearrange(value, "b v (h d) -> b v h d", h=self.num_heads)

        if self.use_rotary_emb:
            if use_cache and past_memory_provided:
                # NOTE(review): when this branch runs AND use_xpos is set, q/k
                # appear to be rotated a second time just below — confirm this
                # double application is intended.
                q, k = self.rotary_emb.rotate_queries_with_cached_keys(q, k)
            if self.rotary_emb.use_xpos:
                q, k = self.rotary_emb.rotate_queries_and_keys(q, k)
            else:
                q = self.rotary_emb.rotate_queries_or_keys(q)
                k = self.rotary_emb.rotate_queries_or_keys(k)

        if (
            self.use_flash_attn
            and not use_cache
            and not output_attentions
            and attention_bias is None
        ):
            # Packed flash-attention fast path (no cache, no probs, no bias).
            # NOTE(review): inputs are hard-cast to bfloat16 here — confirm
            # that is acceptable for every deployment target.
            qkv = torch.stack([q, k, v], dim=2).to(torch.bfloat16)
            x = self.flashattn_fn(
                qkv=qkv,
                dropout_p=self.dropout_rate if self.training else 0.0,
                causal=is_causal,
                deterministic=False,
                return_attn_probs=False,
            )

            # NOTE(review): assumes x_q is not None; if q/k/v were passed in
            # directly (x_q is None) this raises — confirm callers.
            x = x.to(x_q.dtype)
        elif self.use_flash_attn and not output_attentions:
            # xformers memory-efficient path (used e.g. when caching).
            attn_bias = xops.LowerTriangularMask() if is_causal else attention_bias

            if attention_mask is not None:
                if attn_bias is None:
                    attn_bias = attention_mask
                else:
                    if isinstance(attn_bias, torch.Tensor):
                        attn_bias = attn_bias + attention_mask
                    else:
                        # Fold the mask into the xformers mask object, then
                        # materialize it to a dense (q_len, k_len) tensor.
                        attn_bias.add_bias(bias=attention_mask)

                        attn_bias = attn_bias.materialize(
                            shape=(q_len, k_len),
                            device=q.device,
                            dtype=q.dtype,
                        )
            else:
                if isinstance(attn_bias, torch.Tensor) and len(attn_bias.shape) == 3:
                    # Expand a (b, q, k) bias across heads -> (b, h, q, k).
                    attn_bias = (
                        attn_bias.unsqueeze(1)
                        .expand(-1, self.num_heads, -1, -1)
                        .float()
                    )
                else:
                    # NOTE(review): if attn_bias is None here (non-causal, no
                    # bias, no mask) this raises AttributeError — confirm
                    # callers always provide a mask/bias on this path.
                    attn_bias = attn_bias.materialize(
                        shape=(q_len, k_len),
                        device=q.device,
                        dtype=q.dtype,
                    )

            if isinstance(attn_bias, xops.LowerTriangularMask):
                attn_bias = attn_bias.materialize(
                    shape=(q_len, k_len),
                    device=q.device,
                    dtype=q.dtype,
                )

            # xformers expects the bias' trailing dims aligned to multiples
            # of 8; pad up when needed.
            need_adjustment = False
            if attn_bias.shape[-2] % 8 != 0:
                nearest_multiple_q = 8 * (1 + attn_bias.shape[-2] // 8)
                need_adjustment = True
            else:
                nearest_multiple_q = attn_bias.shape[-2]

            if attn_bias.shape[-1] % 8 != 0:
                nearest_multiple_k = 8 * (1 + attn_bias.shape[-1] // 8)
                need_adjustment = True
            else:
                nearest_multiple_k = attn_bias.shape[-1]

            if need_adjustment:
                # Pad to aligned sizes, then slice back: the slice keeps the
                # padded tensor's aligned row stride (presumably why the pad
                # exists at all — confirm against the xformers requirement).
                # NOTE(review): assumes attn_bias is 4-D (b, h, q, k) here and
                # leaves the padding in default float32 — verify both.
                new_attn_bias = torch.zeros(
                    attn_bias.shape[0],
                    attn_bias.shape[1],
                    nearest_multiple_q,
                    nearest_multiple_k,
                ).to(attn_bias.device)
                new_attn_bias[:, :, : attn_bias.shape[-2], : attn_bias.shape[-1]] = (
                    attn_bias
                )

                x = xops.memory_efficient_attention(
                    query=q,
                    key=k,
                    value=v,
                    op=None,
                    attn_bias=new_attn_bias[:, :, :q_len, :k_len],
                    p=self.dr_rate,
                )
            else:
                attn_bias = attn_bias.to(q.dtype)
                # NOTE(review): if attn_bias was already expanded to num_heads
                # above, this repeat multiplies the head dim again; if it is
                # still 2-D the repeat fails — confirm which shapes reach here.
                attn_bias = attn_bias.repeat(1, self.num_heads, 1, 1)
                x = xops.memory_efficient_attention(
                    query=q,
                    key=k,
                    value=v,
                    op=None,
                    attn_bias=attn_bias,
                    p=self.dr_rate,
                )

        else:
            # Vanilla attention path (the only one that can return probs).
            # NOTE(review): `einsum` is imported from BOTH einops and torch at
            # the top of this file; the later `from torch import ... einsum`
            # shadows einops'. This call uses the einops convention (tensors
            # first, pattern last), which torch.einsum does not accept —
            # confirm this path actually runs / fix the import shadowing.
            attention_scores = einsum(q, k, "b q h d, b k h d -> b h q k")
            attention_scores = attention_scores / (q.size(-1) ** 0.5)

            if attention_bias is not None:
                # Broadcast a (b, q, k) bias across heads.
                attn_bias = attention_bias.unsqueeze(1).expand(
                    -1, self.num_heads, -1, -1
                )

            else:
                attn_bias = None

            if attention_mask is not None:
                if attn_bias is None:
                    attn_bias = attention_mask
                else:
                    attn_bias = attn_bias + attention_mask

            # NOTE(review): if both attention_bias and attention_mask are None,
            # attn_bias is None and this addition raises — confirm callers.
            attention_scores = attention_scores + attn_bias

            attention_probs = attention_scores.softmax(dim=-1)
            attention_probs = self.dropout(attention_probs)

            x = einsum(attention_probs, v, "b h q k, b k h d -> b q h d")

        # Merge heads back into a single feature dimension.
        x = rearrange(x, "b q h d -> b q (h d)", h=self.num_heads)

        if use_cache:
            if output_attentions:
                return x, attention_probs, cache
            else:
                return x, None, cache
        else:
            if output_attentions:
                return x, attention_probs
            else:
                return x, None
| |
|
class DeCodonPreTrainedModel(PreTrainedModel):
    """
    Abstract base class for DeCodon models: handles weight initialization and
    exposes the standard interface for downloading and loading pretrained
    checkpoints.
    """

    base_model_prefix = "decodon"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize weights following the MAGNETO scheme."""
        if isinstance(module, nn.Linear):
            # Xavier-normal scaled by the configured MAGNETO gain.
            nn.init.xavier_normal_(module.weight, gain=self.config.gamma_init)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # Padding embeddings stay at zero.
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _set_gradient_checkpointing(self, module, value=False):
        """Toggle gradient checkpointing on individual decoder layers."""
        if isinstance(module, DeCodonLayer):
            module.gradient_checkpointing = value
| |
|
| |
|
class DeCodonEmbeddings(nn.Module):
    """
    Input embeddings for DeCodon: word + token-type embeddings, plus absolute
    position embeddings when configured.

    NOTE(review): ``self.ln`` is constructed here but never applied in
    ``forward`` — presumably intentional (pre-LayerNorm blocks downstream),
    but confirm; it is kept so checkpoints keep their keys.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
        )
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size
        )
        self.token_type_embeddings = nn.Embedding(
            config.type_vocab_size, config.hidden_size
        )

        # Unused in forward(); kept for state-dict compatibility (see class note).
        self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.position_embedding_type = getattr(
            config, "position_embedding_type", "absolute"
        )

        # Pre-built [0, 1, ..., max_pos) index row, sliced per batch.
        self.register_buffer(
            "position_ids",
            torch.arange(config.max_position_embeddings).expand((1, -1)),
            persistent=False,
        )
        # All-zero token types used when the caller provides none.
        self.register_buffer(
            "token_type_ids",
            torch.zeros(self.position_ids.size(), dtype=torch.long),
            persistent=False,
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        """Embed token ids (or raw embeddings) with type and position info."""
        input_shape = (
            input_ids.size() if input_ids is not None else inputs_embeds.size()[:-1]
        )
        seq_length = input_shape[1]

        if position_ids is None:
            # Shift by the cache length so incremental decoding stays aligned.
            position_ids = self.position_ids[
                :, past_key_values_length : seq_length + past_key_values_length
            ]

        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                # Use the registered all-zero buffer, broadcast to the batch.
                token_type_ids = self.token_type_ids[:, :seq_length].expand(
                    input_shape[0], seq_length
                )
            else:
                token_type_ids = torch.zeros(
                    input_shape, dtype=torch.long, device=self.position_ids.device
                )

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds + self.token_type_embeddings(token_type_ids)
        if self.position_embedding_type == "absolute":
            embeddings = embeddings + self.position_embeddings(position_ids)

        return self.dropout(embeddings)
| |
|
| |
|
class DeCodonAttention(nn.Module):
    """
    Pre-LayerNorm causal self-attention block with a residual connection.

    Wraps ``MultiHeadedSelfAttention`` (RoPE/XPos + optional flash attention)
    and applies: LN -> attention -> LN -> dense -> dropout -> residual add.
    """

    def __init__(self, config):
        super().__init__()

        self.pre_layer_norm = nn.LayerNorm(
            config.hidden_size, eps=config.layer_norm_eps
        )
        self.post_attn_dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.post_layer_norm = nn.LayerNorm(
            config.hidden_size, eps=config.layer_norm_eps
        )
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.self_attention = MultiHeadedSelfAttention(
            q_input_dim=config.hidden_size,
            kv_input_dim=config.hidden_size,
            qk_proj_dim=config.hidden_size,
            v_proj_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_probs_dropout_prob,
            projection_layer="conv",
            use_flash_attn=config.use_flash_attn,
            use_rotary_emb=config.use_rotary_emb,
            rotary_theta=config.rotary_theta,
            rotary_use_xpos=True,  # decoder-only model, so XPos is enabled
        )

    def forward(
        self,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        attention_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = False,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
        normed = self.pre_layer_norm(hidden_states)

        if past_key_values is None:
            attn_outputs = self.self_attention(
                x_q=normed,
                x_kv=normed,
                is_causal=True,
                attention_bias=None,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )
        else:
            # Project only the new tokens, then prepend the cached
            # projections. Cache layout is (key, value, query), as produced
            # by MultiHeadedSelfAttention when use_cache=True.
            new_query = self.self_attention.query(normed)
            new_key = self.self_attention.key(normed)
            new_value = self.self_attention.value(normed)

            past_key, past_value, past_query = past_key_values

            attn_outputs = self.self_attention(
                x_q=None,
                x_kv=None,
                query=torch.cat((past_query, new_query), dim=1),
                key=torch.cat((past_key, new_key), dim=1),
                value=torch.cat((past_value, new_value), dim=1),
                is_causal=True,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                use_cache=use_cache,
                attention_bias=None,
            )

        out = attn_outputs[0]
        out = self.post_layer_norm(out)
        out = self.post_attn_dense(out)
        out = self.dropout(out)
        out = hidden_states + out

        # Pass through any (attention_probs, cache) extras unchanged.
        return (out,) + attn_outputs[1:]
| |
|
| |
|
class DeCodonFFN(nn.Module):
    """
    Position-wise feed-forward network with pre- and post-LayerNorm:
    LN -> Conv1D up-projection -> activation -> LN -> Conv1D down-projection
    -> dropout.

    NOTE(review): no residual connection is applied inside this module, and
    DeCodonLayer does not add one around it either — confirm that is intended.
    """

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size
        inner = config.intermediate_size

        self.pre_layer_norm = nn.LayerNorm(hidden, eps=config.layer_norm_eps)
        self.intermediate_dense = Conv1D(inner, hidden)
        self.post_layer_norm = nn.LayerNorm(inner, eps=config.layer_norm_eps)
        self.post_dense = Conv1D(hidden, inner)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Resolve a string activation name via ACT2FN; callables pass through.
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(
        self, hidden_states: Optional[Tuple[torch.FloatTensor]]
    ) -> torch.FloatTensor:
        """Run the normalized two-layer MLP over the last dimension."""
        x = self.pre_layer_norm(hidden_states)
        x = self.intermediate_act_fn(self.intermediate_dense(x))
        x = self.post_layer_norm(x)
        return self.dropout(self.post_dense(x))
| |
|
| |
|
class DeCodonLayer(nn.Module):
    """
    Single decoder layer: causal self-attention followed by the
    position-wise feed-forward block.
    """

    def __init__(self, config):
        super().__init__()
        self.attention = DeCodonAttention(config)
        self.output = DeCodonFFN(config)

    def forward(
        self,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        attention_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = False,
    ) -> Union[
        Tuple[torch.Tensor],
        Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]],
    ]:
        attn_result = self.attention(
            hidden_states,
            attention_mask,
            output_attentions=output_attentions,
            past_key_values=past_key_values,
            use_cache=use_cache,
        )

        # attn_result = (hidden, [attn_probs], [cache]); forward the extras.
        extras = attn_result[1:]
        ffn_out = self.output(attn_result[0])
        return (ffn_out,) + extras
| |
|
| |
|
class DeCodonStack(nn.Module):
    """
    Stack of ``DeCodonLayer`` decoder blocks.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.blocks = nn.ModuleList(
            [DeCodonLayer(config) for _ in range(config.num_hidden_layers)]
        )
        # NOTE(review): set but never consulted in forward() — confirm whether
        # gradient checkpointing was meant to be wired up here.
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        use_cache: Optional[bool] = False,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutput]:
        """Run every block, optionally collecting hidden states / attentions / caches."""
        if past_key_values is None:
            past_key_values = [None] * len(self.blocks)
            past_length = 0
        else:
            # NOTE(review): computed but unused in this method.
            past_length = past_key_values[0][0].size(-2)

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        presents = () if use_cache else None

        for block, layer_past in zip(self.blocks, past_key_values):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = block(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                past_key_values=layer_past,
                use_cache=use_cache,
            )
            hidden_states = layer_outputs[0]

            if use_cache:
                presents = presents + (layer_outputs[2],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            candidates = (
                hidden_states,
                presents,
                all_hidden_states,
                all_self_attentions,
            )
            return tuple(v for v in candidates if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
| |
|
| |
|
| | class DeCodonModule(DeCodonPreTrainedModel): |
| | """ |
| | The DeCodon Module (Decoder only) without any task-specific head on top. |
| | """ |
| | |
    def __init__(self, config):
        super().__init__(config)

        # Token/type/position embeddings, decoder stack, and final LayerNorm.
        self.embeddings = DeCodonEmbeddings(config)
        self.decoder = DeCodonStack(config)
        self.ln_f = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.gradient_checkpointing = False

        # HuggingFace hook: weight init and any post-load processing.
        self.post_init()
| | |
    def set_input_embeddings(self, new_embeddings):
        # HF PreTrainedModel API hook: swap the word-embedding table.
        self.embeddings.word_embeddings = new_embeddings
| |
|
| | def forward( |
| | self, |
| | input_ids: Optional[torch.LongTensor] = None, |
| | attention_mask: Optional[torch.FloatTensor] = None, |
| | token_type_ids: Optional[torch.LongTensor] = None, |
| | position_ids: Optional[torch.LongTensor] = None, |
| | inputs_embeds: Optional[torch.FloatTensor] = None, |
| | past_key_values: Optional[Tuple[torch.FloatTensor]] = None, |
| | output_attentions: Optional[bool] = None, |
| | output_hidden_states: Optional[bool] = None, |
| | return_dict: Optional[bool] = None, |
| | use_cache: Optional[bool] = False, |
| | ) -> Union[Tuple, BaseModelOutput]: |
| | output_attentions = ( |
| | output_attentions |
| | if output_attentions is not None |
| | else self.config.output_attentions |
| | ) |
| | output_hidden_states = ( |
| | output_hidden_states |
| | if output_hidden_states is not None |
| | else self.config.output_hidden_states |
| | ) |
| | return_dict = ( |
| | return_dict if return_dict is not None else self.config.use_return_dict |
| | ) |
| |
|
| | if input_ids is not None and inputs_embeds is not None: |
| | raise ValueError( |
| | "You cannot specify both input_ids and inputs_embeds at the same time" |
| | ) |
| | elif input_ids is not None: |
| | self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) |
| | input_shape = input_ids.size() |
| | elif inputs_embeds is not None: |
| | input_shape = inputs_embeds.size()[:-1] |
| | else: |
| | raise ValueError("You have to specify either input_ids or inputs_embeds") |
| |
|
| | if past_key_values is not None: |
| | past_length = past_key_values[0][0].size(-2) |
| | else: |
| | past_length = 0 |
| |
|
| | batch_size, seq_length = input_shape |
| | device = input_ids.device if input_ids is not None else inputs_embeds.device |
| |
|
| | if attention_mask is None: |
| | attention_mask = torch.ones(((batch_size, seq_length)), device=device) |
| |
|
| | if token_type_ids is None: |
| | if hasattr(self.embeddings, "token_type_ids"): |
| | buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] |
| | buffered_token_type_ids_expanded = buffered_token_type_ids.expand( |
| | batch_size, seq_length |
| | ) |
| | token_type_ids = buffered_token_type_ids_expanded |
| | else: |
| | token_type_ids = torch.zeros( |
| | input_shape, dtype=torch.long, device=device |
| | ) |
| |
|
| | |
| | |
| | |
| | |
| | |
| | embedding_output = self.embeddings( |
| | input_ids=input_ids, |
| | position_ids=position_ids, |
| | token_type_ids=token_type_ids, |
| | inputs_embeds=inputs_embeds, |
| | ) |
| |
|
| | extended_attention_mask = _prepare_4d_causal_attention_mask( |
| | attention_mask=attention_mask, |
| | input_shape=(batch_size, input_shape[-1]), |
| | inputs_embeds=embedding_output, |
| | past_key_values_length=past_length, |
| | ) |
| | |
| |
|
| | decoder_outputs = self.decoder( |
| | embedding_output, |
| | attention_mask=extended_attention_mask, |
| | output_attentions=output_attentions, |
| | output_hidden_states=output_hidden_states, |
| | past_key_values=past_key_values, |
| | return_dict=return_dict, |
| | use_cache=use_cache, |
| | ) |
| |
|
| | sequence_output = decoder_outputs[0] |
| |
|
| | if not return_dict: |
| | return (sequence_output,) + decoder_outputs[1:] |
| |
|
| | return BaseModelOutputWithPast( |
| | last_hidden_state=sequence_output, |
| | past_key_values=decoder_outputs.past_key_values, |
| | hidden_states=decoder_outputs.hidden_states, |
| | attentions=decoder_outputs.attentions, |
| | ) |
| |
|
| |
|
@dataclass
class DeCodonForPreTrainingOutput(CausalLMOutputWithPast):
    """
    Output type of [`DeCodon`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Causal language modeling (next-token prediction) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(torch.FloatTensor)`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Cached key/value states of the self-attention layers that can be
            fed back in to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # These fields mirror CausalLMOutputWithPast; they are redeclared so the
    # @dataclass decorator registers them explicitly on this subclass.
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
| |
|
| |
|
class DeCodon(DeCodonPreTrainedModel):
    """DeCodon causal language model: `DeCodonModule` plus a vocab LM head."""

    config_class = DeCodonConfig
    _tied_weights_keys = []

    def __init__(self, config):
        super().__init__(config)

        self.gpt = DeCodonModule(config)

        # LM head: plain linear projection ("gpt") or a small MLP head.
        if config.lm_type == "gpt":
            self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
            # BUGFIX: assign a per-instance list. The original appended to the
            # shared class-level list on every instantiation, so duplicate
            # keys accumulated across instances.
            self._tied_weights_keys = ["lm_head.weight"]
        else:
            self.lm_head = nn.Sequential(
                OrderedDict(
                    [
                        ("dropout", nn.Dropout(config.hidden_dropout_prob)),
                        (
                            "transform",
                            nn.Linear(config.hidden_size, config.hidden_size),
                        ),
                        ("act", nn.ReLU()),
                        (
                            "norm",
                            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps),
                        ),
                        (
                            "pred",
                            nn.Linear(
                                config.hidden_size, config.vocab_size, bias=False
                            ),
                        ),
                    ]
                )
            )
            self._tied_weights_keys = ["lm_head.pred.weight"]

        # Initialize weights and apply final processing (HF convention).
        self.post_init()

    def get_input_embeddings(self):
        return self.gpt.embeddings.word_embeddings

    def get_output_embeddings(self):
        # BUGFIX: return the output projection *module*, not its weight
        # Parameter -- PreTrainedModel's weight tying assigns to `.weight`
        # on whatever this returns.
        if isinstance(self.lm_head, nn.Sequential):
            return self.lm_head.pred
        if self.config.lm_type == "gpt":
            return self.lm_head
        return None

    def set_output_embeddings(self, new_embeddings):
        # Symmetric with get_output_embeddings: `new_embeddings` is a module.
        if isinstance(self.lm_head, nn.Sequential):
            self.lm_head.pred = new_embeddings
        else:
            self.lm_head = new_embeddings

    def prepare_inputs_for_generation(
        self, input_ids, inputs_embeds=None, past_key_values=None, **kwargs
    ):
        """Slice the inputs down to the not-yet-cached suffix for generation."""
        token_type_ids = kwargs.get("token_type_ids", None)
        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)
        use_cache = kwargs.get("use_cache", True)

        if past_key_values is not None and use_cache:
            # NOTE(review): dim 1 is assumed to be the sequence axis of the
            # cached keys here, while the decoder stack reads size(-2) --
            # confirm the cache layout is consistent between the two.
            past_length = past_key_values[0][0].shape[1]

            # Keep only tokens the cache has not seen yet; if the prompt is
            # somehow fully cached, keep the final token so the model still
            # receives one input.
            if input_ids.shape[1] > past_length:
                remove_prefix_len = past_length
            else:
                remove_prefix_len = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_len:]

            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, remove_prefix_len:]

        if attention_mask is not None and position_ids is None:
            # Derive position ids from the mask so left-padding is handled.
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values is not None and use_cache:
                # BUGFIX: trim position ids to match the input_ids truncation
                # above (GPT-2 convention); otherwise the model receives
                # full-length position ids with a one-token input.
                position_ids = position_ids[:, -input_ids.shape[1]:]
        else:
            position_ids = None

        if inputs_embeds is not None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
            }
        )

        return model_inputs

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        return tuple(
            tuple(
                past_state.index_select(0, beam_idx.to(past_state.device))
                for past_state in layer_past
            )
            for layer_past in past_key_values
        )

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        organism: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        use_cache: Optional[bool] = False,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], DeCodonForPreTrainingOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the causal language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        organism (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Organism labels. Accepted for interface compatibility; not used in
            the loss computed here.
        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
            Used to hide legacy arguments that have been deprecated.

        Returns:
            `DeCodonForPreTrainingOutput` (or a tuple when `return_dict=False`)
            with the optional LM loss and the per-token vocabulary logits.
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        gpt_outputs = self.gpt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            use_cache=use_cache,
        )

        hidden_states = gpt_outputs[0]
        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            labels = labels.to(lm_logits.device)
            # Shift so that tokens < n predict token n.
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
            )

        if not return_dict:
            output = (lm_logits,) + gpt_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return DeCodonForPreTrainingOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=gpt_outputs.past_key_values,
            hidden_states=gpt_outputs.hidden_states,
            attentions=gpt_outputs.attentions,
        )

    def freeze(self, layer_indices: Optional[list] = None):
        """
        Freeze the backbone. With no `layer_indices`, everything in `self.gpt`
        is frozen; otherwise the embeddings plus every decoder block NOT in
        `layer_indices` are frozen (negative indices wrap around).
        """
        if layer_indices is None or len(layer_indices) == 0:
            for param in self.gpt.parameters():
                param.requires_grad = False
        else:
            for param in self.gpt.embeddings.parameters():
                param.requires_grad = False

            if isinstance(layer_indices, int):
                layer_indices = [layer_indices]

            # Resolve negative indices modulo the number of blocks.
            layer_indices = [i % len(self.gpt.decoder.blocks) for i in layer_indices]

            for i in range(len(self.gpt.decoder.blocks)):
                if i not in layer_indices:
                    for param in self.gpt.decoder.blocks[i].parameters():
                        param.requires_grad = False
| |
|
| |
|
| |
|
class DeCodonForSequenceTask(DeCodonPreTrainedModel):
    """
    DeCodon with a pooled classification/regression head on top.

    Pools the last-token hidden state from a configurable subset of decoder
    layers ("cls" pooling) and feeds the concatenation through an MLP head
    producing `num_labels * num_tasks` outputs.
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.gpt = DeCodonModule(config)

        if config.cls_type.lower() == "cls":
            # Normalize layer_indices to a list and resolve negative /
            # out-of-range values modulo the number of decoder blocks.
            layer_indices = config.layer_indices
            if layer_indices is None:
                layer_indices = []
            elif isinstance(layer_indices, int):
                layer_indices = [layer_indices]
            layer_indices = [i % len(self.gpt.decoder.blocks) for i in layer_indices]

            n_layers = len(layer_indices)
            self.layer_indices = layer_indices
            self.classifier = nn.Sequential(
                nn.LayerNorm(config.hidden_size * n_layers),
                nn.Linear(config.hidden_size * n_layers, config.hidden_size),
                ACT2CLS[config.cls_hidden_act](),
                nn.Dropout(config.cls_dropout_prob),
                nn.Linear(
                    config.hidden_size,
                    config.num_labels * config.num_tasks,
                ),
            )
        else:
            raise ValueError(f"Invalid cls_type: {config.cls_type}.")

        self.init_weights()

    def freeze(self, layers_idx: Optional[list] = None):
        """
        Freeze the backbone. With no `layers_idx`, everything in `self.gpt`
        is frozen; otherwise the embeddings plus every decoder block NOT in
        `layers_idx` are frozen (negative indices wrap around).
        """
        if layers_idx is None or len(layers_idx) == 0:
            for param in self.gpt.parameters():
                param.requires_grad = False
        else:
            for param in self.gpt.embeddings.parameters():
                param.requires_grad = False

            if isinstance(layers_idx, int):
                layers_idx = [layers_idx]

            layers_idx = [i % self.config.num_hidden_layers for i in layers_idx]

            for i in range(self.config.num_hidden_layers):
                if i not in layers_idx:
                    for param in self.gpt.decoder.blocks[i].parameters():
                        param.requires_grad = False

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        target: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ):
        """
        Args:
            input_ids: token ids of shape `(batch, seq_len)`; mutually
                exclusive with `inputs_embeds`.
            target: labels of shape `(batch, num_labels * num_tasks)` for
                regression (with `-500.0` as the missing-value sentinel) or
                class indices for classification.

        Returns:
            `SequenceClassifierOutput` (or a tuple when `return_dict=False`)
            with the optional loss and task logits.
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        # Locate the pooling index per sequence from the first pad position.
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # argmax over the pad-equality mask gives the first pad index;
                # minus one yields the last non-pad token.
                sequence_lengths = (
                    torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1
                ).to(input_ids.device)
            else:
                sequence_lengths = -1
                logger.warning(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        # BUGFIX: force return_dict=True on the backbone call -- the attribute
        # access `gpt_outputs.hidden_states` below crashed when the caller
        # passed return_dict=False (the backbone then returns a plain tuple).
        # ModelOutput still supports tuple-style slicing for the legacy path.
        gpt_outputs = self.gpt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=True,  # pooling reads intermediate layers
            return_dict=True,
        )

        all_hidden_states = gpt_outputs.hidden_states

        # NOTE: __init__ only ever constructs the "cls" head (any other
        # cls_type raises), so the former cross-attention branches were
        # unreachable and referenced undefined attributes (task_embeddings,
        # `ca`); they have been removed.
        pooled_hidden_states = [
            # NOTE(review): sequence_lengths already points at the last
            # non-pad token, so `- 1` selects the token before it -- confirm
            # this extra offset is intentional.
            h[torch.arange(batch_size, device=h.device), sequence_lengths - 1, :]
            for i, h in enumerate(all_hidden_states)
            if i in self.layer_indices
        ]
        # Concatenate the selected layers' pooled vectors along features.
        pooled_output = torch.cat(pooled_hidden_states, dim=-1)

        logits = self.classifier(pooled_output)

        loss = None
        if target is not None:
            if self.config.problem_type == "regression":
                logits = logits.view(-1, self.config.num_labels * self.config.num_tasks)
                target = target.view(-1, self.config.num_labels * self.config.num_tasks)

                # -500.0 is the sentinel for a missing label; masked out.
                mask = target != -500.0

                if self.config.loss_fn == "mse":
                    loss_fct = nn.MSELoss()
                elif self.config.loss_fn == "mae":
                    loss_fct = nn.L1Loss()
                elif self.config.loss_fn == "huber":
                    loss_fct = nn.SmoothL1Loss()
                else:
                    raise ValueError(f"Invalid loss_fn: {self.config.loss_fn}.")
                loss = loss_fct(logits[mask], target[mask])
            else:
                loss_fct = nn.CrossEntropyLoss()

                logits = logits.view(-1, self.config.num_labels * self.config.num_tasks)
                target = target.view(-1)

                loss = loss_fct(logits, target)

        if not return_dict:
            output = (logits,) + gpt_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # BUGFIX: the original referenced `ca` here, which was never assigned
        # on the reachable path, raising NameError when output_attentions=True.
        attentions = gpt_outputs.attentions if output_attentions else None

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=pooled_output,
            attentions=attentions,
        )