from transformers import PretrainedConfig


class RetNetConfig(PretrainedConfig):
    """Configuration for a RetNet (Retentive Network) model, following the
    Hugging Face ``PretrainedConfig`` conventions."""

    model_type = "retnet"

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=512,
        num_hidden_layers=6,
        num_retention_heads=8,
        intermediate_size=2048,
        hidden_act="gelu",
        max_position_embeddings=512,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        dropout=0.1,
        activation_dropout=0.0,
        normalize_before=False,
        attention_type="parallel",
        recurrent_chunk_size=512,
        output_retentions=False,
        output_hidden_states=False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_retention_heads = num_retention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        # How retention is evaluated at run time (e.g. parallel vs. recurrent form).
        self.attention_type = attention_type
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.dropout = dropout
        self.normalize_before = normalize_before
        self.activation_dropout = activation_dropout
        # Chunk length used when retention is computed in chunkwise-recurrent form.
        self.recurrent_chunk_size = recurrent_chunk_size
        self.output_retentions = output_retentions
        self.output_hidden_states = output_hidden_states
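

# A minimal usage sketch, not part of the original source: it instantiates the
# config with a couple of overrides and round-trips it through the standard
# PretrainedConfig serialization helpers. The "./retnet-config" directory is
# an illustrative path, not one referenced by the original code.
if __name__ == "__main__":
    config = RetNetConfig(hidden_size=1024, num_hidden_layers=12)
    config.save_pretrained("./retnet-config")  # writes config.json
    reloaded = RetNetConfig.from_pretrained("./retnet-config")
    print(reloaded.hidden_size, reloaded.num_retention_heads)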