| | """ Transnormer configuration""" |
| |
|
| | from transformers.configuration_utils import PretrainedConfig |
| | from transformers.utils import logging |
| |
|
| | logger = logging.get_logger(__name__) |
| |
|
| |
|
class TransnormerConfig(PretrainedConfig):
    """Configuration class for the Transnormer model."""

    model_type = "transnormer"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        # special token ids and general settings
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        vocab_size=64000,
        use_cache=True,
        init_std=0.02,
        # decoder architecture
        decoder_embed_dim=1024,
        decoder_layers=24,
        decoder_attention_heads=8,
        no_scale_embedding=False,
        add_bos_token=False,
        norm_type="simplermsnorm",
        linear_use_lrpe_list=None,  # None instead of [] to avoid a shared mutable default
        hidden_dim=1024,
        linear_act_fun="silu",
        glu_dim=2816,
        bias=False,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.use_cache = use_cache
        self.init_std = init_std

        self.decoder_embed_dim = decoder_embed_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.no_scale_embedding = no_scale_embedding
        self.add_bos_token = add_bos_token
        self.norm_type = norm_type
        # per-layer flags for LRPE (linearized relative positional encoding)
        self.linear_use_lrpe_list = linear_use_lrpe_list if linear_use_lrpe_list is not None else []
        self.hidden_dim = hidden_dim
        self.linear_act_fun = linear_act_fun
        self.glu_dim = glu_dim
        self.bias = bias
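

# A minimal usage sketch (illustrative addition, not part of the original
# module): builds a config with defaults, overrides a few hyperparameters,
# and round-trips it through a dict via the PretrainedConfig helpers. The
# override values below are assumptions chosen for demonstration only.
if __name__ == "__main__":
    config = TransnormerConfig()
    print(config.decoder_layers)  # -> 24

    # A hypothetical smaller variant; only arguments defined above are
    # overridden here, any unknown kwargs would be forwarded to
    # PretrainedConfig.
    small = TransnormerConfig(decoder_layers=12, decoder_embed_dim=768)
    restored = TransnormerConfig.from_dict(small.to_dict())
    assert restored.decoder_layers == 12
    print(restored.to_json_string())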