| |
"""Hgrn configuration."""
|
|
| from transformers.configuration_utils import PretrainedConfig |
| from transformers.utils import logging |
|
|
|
|
# Module-level logger created through the transformers logging utilities.
logger = logging.get_logger(__name__)


# Empty in this file; presumably meant to map pretrained checkpoint names to
# their configuration locations -- TODO confirm against the callers.
# NOTE(review): the ``LLAMA_`` prefix looks inherited from another config
# module. The name is left unchanged because other modules may import it.
LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
|
|
|
|
class HgrnConfig(PretrainedConfig):
    """Configuration container for models whose ``model_type`` is ``"hgrn"``.

    The constructor performs no validation. The three special-token ids and
    any extra keyword arguments are forwarded to
    ``PretrainedConfig.__init__``; every other argument is stored unchanged
    on the instance under an attribute of the same name.

    Args:
        pad_token_id: Forwarded to the base class. Defaults to ``1``.
        bos_token_id: Forwarded to the base class. Defaults to ``0``.
        eos_token_id: Forwarded to the base class. Defaults to ``2``.
        vocab_size: Stored as ``self.vocab_size``. Defaults to ``50272``.
        use_cache: Stored as ``self.use_cache``. Defaults to ``True``.
        init_std: Stored as ``self.init_std``; presumably the standard
            deviation used for weight initialisation (confirm in the model
            code). Defaults to ``0.02``.
        decoder_embed_dim: Stored as ``self.decoder_embed_dim``. Defaults to
            ``1024``.
        decoder_layers: Stored as ``self.decoder_layers``. Defaults to ``24``.
        add_bos_token: Stored as ``self.add_bos_token``. Defaults to
            ``False``.
        act_fun: Stored as ``self.act_fun``; presumably an activation name
            resolved by the model code. Defaults to ``"swish"``.
        causal: Stored as ``self.causal``. Defaults to ``True``.
        use_triton: Stored as ``self.use_triton``. Defaults to ``False``.
        glu_act: Stored as ``self.glu_act``. Defaults to ``"swish"``.
        glu_dim: Stored as ``self.glu_dim``. Defaults to ``2816``.
        bias: Stored as ``self.bias``. Defaults to ``False``.
        norm_type: Stored as ``self.norm_type``. Defaults to
            ``"layernorm"``.
        no_scale_embedding: Stored as ``self.no_scale_embedding``. Defaults
            to ``False``.
        **kwargs: Passed through untouched to ``PretrainedConfig.__init__``.
    """

    model_type = "hgrn"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        vocab_size=50272,
        use_cache=True,
        init_std=0.02,
        decoder_embed_dim=1024,
        decoder_layers=24,
        add_bos_token=False,
        act_fun="swish",
        causal=True,
        use_triton=False,
        glu_act="swish",
        glu_dim=2816,
        bias=False,
        norm_type="layernorm",
        no_scale_embedding=False,
        **kwargs,
    ):
        # The base class receives the special-token ids together with every
        # keyword argument this class does not name explicitly.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        # First group of settings, stored verbatim.
        self.vocab_size = vocab_size
        self.use_cache = use_cache
        self.init_std = init_std

        # Second group of settings, stored verbatim.
        self.decoder_embed_dim = decoder_embed_dim
        self.decoder_layers = decoder_layers
        self.add_bos_token = add_bos_token
        self.act_fun = act_fun
        self.causal = causal
        self.use_triton = use_triton
        self.glu_act = glu_act
        self.glu_dim = glu_dim
        self.bias = bias
        self.norm_type = norm_type
        self.no_scale_embedding = no_scale_embedding