from transformers import PretrainedConfig


class ProteinLLMESMConfig(PretrainedConfig):
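    """
    Configuration for the ESM2 protein-encoder branch of [`ProteinLLMModel`]. The defaults describe a
    33-layer encoder with hidden size 1280 (matching the 650M-parameter ESM2 checkpoint).
    """
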
    model_type = "protein_llm"
    base_config_key = "esm_config"

    def __init__(
        self,
        esm_hidden_size=1280,
        esm_num_layers=33,
        esm_num_attention_heads=20,
        esm_vocab_size=33,
        esm_max_position_embeddings=1026,
        esm_layer_norm_eps=1e-5,
        esm_hidden_dropout_prob=0.1,
        esm_attention_probs_dropout_prob=0.1,
        esm_intermediate_size=5120,
        esm_hidden_act="gelu",
        esm_initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.esm_hidden_size = esm_hidden_size
        self.esm_num_layers = esm_num_layers
        self.esm_num_attention_heads = esm_num_attention_heads
        self.esm_vocab_size = esm_vocab_size
        self.esm_max_position_embeddings = esm_max_position_embeddings
        self.esm_layer_norm_eps = esm_layer_norm_eps
        self.esm_hidden_dropout_prob = esm_hidden_dropout_prob
        self.esm_attention_probs_dropout_prob = esm_attention_probs_dropout_prob
        self.esm_intermediate_size = esm_intermediate_size
        self.esm_hidden_act = esm_hidden_act
        self.esm_initializer_range = esm_initializer_range


class ProteinLLMQFormerConfig(PretrainedConfig):
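    """
    Configuration for the Q-Former that bridges the ESM2 protein encoder and the language model. The
    defaults follow a BERT-base-sized Q-Former with 32 learnable query tokens.
    """
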
    model_type = "protein_llm"
    base_config_key = "qformer_config"

    def __init__(
        self,
        qformer_hidden_size=768,
        qformer_num_hidden_layers=12,
        qformer_num_attention_heads=12,
        qformer_intermediate_size=3072,
        qformer_hidden_act="gelu",
        qformer_hidden_dropout_prob=0.1,
        qformer_attention_probs_dropout_prob=0.1,
        qformer_max_position_embeddings=512,
        qformer_layer_norm_eps=1e-12,
        qformer_initializer_range=0.02,
        qformer_vocab_size=30522,
        qformer_pad_token_id=0,
        qformer_position_embedding_type="absolute",
        qformer_use_cache=True,
        num_query_tokens=32,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.qformer_hidden_size = qformer_hidden_size
        self.qformer_num_hidden_layers = qformer_num_hidden_layers
        self.qformer_num_attention_heads = qformer_num_attention_heads
        self.qformer_intermediate_size = qformer_intermediate_size
        self.qformer_hidden_act = qformer_hidden_act
        self.qformer_hidden_dropout_prob = qformer_hidden_dropout_prob
        self.qformer_attention_probs_dropout_prob = qformer_attention_probs_dropout_prob
        self.qformer_max_position_embeddings = qformer_max_position_embeddings
        self.qformer_layer_norm_eps = qformer_layer_norm_eps
        self.qformer_initializer_range = qformer_initializer_range
        self.qformer_vocab_size = qformer_vocab_size
        self.qformer_pad_token_id = qformer_pad_token_id
        self.qformer_position_embedding_type = qformer_position_embedding_type
        self.qformer_use_cache = qformer_use_cache
        self.num_query_tokens = num_query_tokens


class ProteinLLMConfig(PretrainedConfig):
| r""" |
| This is the configuration class to store the configuration of a [`ProteinLLMModel`]. It is used to instantiate a |
| Protein-LLM model according to the specified arguments, defining the model architecture. The model combines |
| ESM2 protein encoder, Q-Former, and a language model for protein understanding and generation. |
| |
| Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the |
| documentation from [`PretrainedConfig`] for more information. |
| |
| Args: |
| vocab_size (`int`, *optional*, defaults to 152064): |
| Vocabulary size of the language model. Defines the number of different tokens that can be represented by the |
| `inputs_ids` passed when calling the model. |
| hidden_size (`int`, *optional*, defaults to 8192): |
| Dimension of the hidden representations in the language model. |
| intermediate_size (`int`, *optional*, defaults to 29568): |
| Dimension of the MLP representations in the language model. |
| num_hidden_layers (`int`, *optional*, defaults to 80): |
| Number of hidden layers in the Transformer encoder of the language model. |
| num_attention_heads (`int`, *optional*, defaults to 64): |
| Number of attention heads for each attention layer in the Transformer encoder. |
| num_key_value_heads (`int`, *optional*, defaults to 8): |
| This is the number of key_value heads that should be used to implement Grouped Query Attention. |
| hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): |
| The non-linear activation function (function or string) in the decoder. |
| max_position_embeddings (`int`, *optional*, defaults to 32768): |
| The maximum sequence length that this model might ever be used with. |
| initializer_range (`float`, *optional*, defaults to 0.02): |
| The standard deviation of the truncated_normal_initializer for initializing all weight matrices. |
| rms_norm_eps (`float`, *optional*, defaults to 1e-05): |
| The epsilon used by the rms normalization layers. |
| use_cache (`bool`, *optional*, defaults to `True`): |
| Whether or not the model should return the last key/values attentions. |
| tie_word_embeddings (`bool`, *optional*, defaults to `False`): |
| Whether the model's input and output word embeddings should be tied. |
| rope_theta (`float`, *optional*, defaults to 1000000.0): |
| The base period of the RoPE embeddings. |
| use_sliding_window (`bool`, *optional*, defaults to `False`): |
| Whether to use sliding window attention. |
| sliding_window (`int`, *optional*, defaults to 4096): |
| Sliding window attention (SWA) window size. |
| max_window_layers (`int`, *optional*, defaults to 80): |
| The number of layers that use SWA. |
| attention_dropout (`float`, *optional*, defaults to 0.0): |
| The dropout ratio for the attention probabilities. |
| esm_config (`Dict`, *optional*): |
| The config for the ESM2 protein encoder initialization. |
| qformer_config (`Dict`, *optional*): |
| The config for the Q-Former initialization. |
| rope_scaling (`Dict`, *optional*): |
| Dictionary containing the scaling configuration for the RoPE embeddings. |
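
    Example (a minimal usage sketch; `ProteinLLMModel` and the `transformers` import path shown below are assumed to
    be available wherever this configuration ships):

    ```python
    >>> from transformers import ProteinLLMConfig, ProteinLLMESMConfig, ProteinLLMQFormerConfig

    >>> # Initializing a ProteinLLMConfig with default ESM2, Q-Former, and language-model settings
    >>> configuration = ProteinLLMConfig()

    >>> # Passing the sub-configurations explicitly (plain dicts are accepted as well)
    >>> esm_config = ProteinLLMESMConfig()
    >>> qformer_config = ProteinLLMQFormerConfig(num_query_tokens=32)
    >>> configuration = ProteinLLMConfig(esm_config=esm_config, qformer_config=qformer_config)

    >>> # The sub-configurations are exposed as attributes
    >>> configuration.qformer_config.num_query_tokens
    32
    ```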
| """ |
|
|
| model_type = "protein_llm" |
| sub_configs = { |
| "esm_config": ProteinLLMESMConfig, |
| "qformer_config": ProteinLLMQFormerConfig |
| } |
    keys_to_ignore_at_inference = ["past_key_values"]
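
    # Tensor-parallel / pipeline-parallel layout hints for the language-model backbone:
    # attention and MLP projections are sharded column-wise or row-wise, and the embedding,
    # decoder layers, and final norm form the pipeline stages.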
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        attention_dropout=0.0,
        esm_config=None,
        qformer_config=None,
        rope_scaling=None,
        protein_token_id=None,
        **kwargs,
    ):
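        # Sub-configs may be passed as plain dicts (e.g. when loaded from a serialized config),
        # as ready-made config objects, or omitted entirely, in which case defaults are used.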
        if isinstance(esm_config, dict):
            self.esm_config = self.sub_configs["esm_config"](**esm_config)
        elif esm_config is None:
            self.esm_config = self.sub_configs["esm_config"]()
        else:
            self.esm_config = esm_config

        if isinstance(qformer_config, dict):
            self.qformer_config = self.sub_configs["qformer_config"](**qformer_config)
        elif qformer_config is None:
            self.qformer_config = self.sub_configs["qformer_config"]()
        else:
            self.qformer_config = qformer_config

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers
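
        # Fall back to standard multi-head attention when no key/value head count is given.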
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling

        self.protein_token_id = protein_token_id
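
        # Backwards compatibility for RoPE scaling dicts that use the legacy "type" key:
        # "mrope" is treated as "default", and the value is mirrored into "rope_type".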
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            if self.rope_scaling["type"] == "mrope":
                self.rope_scaling["type"] = "default"
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)


__all__ = ["ProteinLLMConfig"]