"""Configuration class for the Veronica model."""

from typing import Optional

from transformers import PretrainedConfig


class VeronicaConfig(PretrainedConfig):
    model_type = "veronica"

    def __init__(
        self,
        vocab_size: int = 50257,
        n_layer: int = 24,
        n_head: int = 12,
        n_embd: int = 768,
        mlp_mult: float = 4.0,
        num_funcs: int = 3,
        router_dim: Optional[int] = None,
        dropout: float = 0.0,
        use_channel_attention: bool = False,
        max_position_embeddings: int = 4096,
        layer_norm_epsilon: float = 1e-5,
        gradient_checkpointing: bool = False,
        # Weight of the router's auxiliary loss.
        router_aux_weight: float = 0.02,
        # Router temperature.
        router_tau: float = 1.0,
        # Rotary position embedding (RoPE) base frequency.
        rope_theta: float = 10000.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Core architecture hyperparameters.
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd
        self.mlp_mult = mlp_mult
        self.num_funcs = num_funcs
        self.router_dim = router_dim
        self.dropout = dropout
        self.use_channel_attention = use_channel_attention
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_epsilon = layer_norm_epsilon
        self.gradient_checkpointing = gradient_checkpointing

        # Aliases under the standard transformers attribute names, so generic
        # utilities that read hidden_size / num_hidden_layers /
        # num_attention_heads keep working.
        self.num_hidden_layers = n_layer
        self.num_attention_heads = n_head
        self.hidden_size = n_embd

        # Router settings.
        self.router_aux_weight = router_aux_weight
        self.router_tau = router_tau

        # RoPE base frequency.
        self.rope_theta = rope_theta
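

# A minimal usage sketch, assuming nothing beyond the transformers API used
# above: build a config, round-trip it through save_pretrained /
# from_pretrained, and check that fields (including the HF-style aliases)
# survive serialization. The "./veronica-config" path and the overridden
# values are illustrative choices, not part of the module.
if __name__ == "__main__":
    config = VeronicaConfig(n_layer=12, router_tau=0.7)
    config.save_pretrained("./veronica-config")

    reloaded = VeronicaConfig.from_pretrained("./veronica-config")
    assert reloaded.n_layer == reloaded.num_hidden_layers == 12
    assert reloaded.router_tau == 0.7
    print(reloaded)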