MhaWay committed on
Commit 2c26b1e · verified · 1 Parent(s): b69b7b7

Create configuration_veronica.py

src/veronica/configuration_veronica.py ADDED
@@ -0,0 +1,57 @@
+ from typing import Optional
+
+ from transformers import PretrainedConfig
+
+
+ class VeronicaConfig(PretrainedConfig):
+     model_type = "veronica"
+
+     def __init__(
+         self,
+         vocab_size: int = 50257,
+         n_layer: int = 24,
+         n_head: int = 12,
+         n_embd: int = 768,
+         mlp_mult: float = 4.0,
+         num_funcs: int = 3,
+         router_dim: Optional[int] = None,
+         dropout: float = 0.0,
+         use_channel_attention: bool = False,
+         max_position_embeddings: int = 4096,
+         layer_norm_epsilon: float = 1e-5,
+         gradient_checkpointing: bool = False,
+         # router aux-loss weight (entropy regularizer)
+         router_aux_weight: float = 0.02,
+         # router temperature (softmax(logits / tau))
+         router_tau: float = 1.0,
+         # RoPE theta (base for frequency computation)
+         rope_theta: float = 10000.0,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         # Base dimensions
+         self.vocab_size = vocab_size
+         self.n_layer = n_layer
+         self.n_head = n_head
+         self.n_embd = n_embd
+         self.mlp_mult = mlp_mult
+         self.num_funcs = num_funcs
+         self.router_dim = router_dim
+         self.dropout = dropout
+         self.use_channel_attention = use_channel_attention
+         self.max_position_embeddings = max_position_embeddings
+         self.layer_norm_epsilon = layer_norm_epsilon
+         self.gradient_checkpointing = gradient_checkpointing
+
+         # HF standard field names (aliases for the dimensions above)
+         self.num_hidden_layers = n_layer
+         self.num_attention_heads = n_head
+         self.hidden_size = n_embd
+
+         # Router
+         self.router_aux_weight = router_aux_weight
+         self.router_tau = router_tau
+
+         # RoPE
+         self.rope_theta = rope_theta
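
Because VeronicaConfig subclasses PretrainedConfig, it inherits JSON serialization and the standard save_pretrained/from_pretrained round trip. A minimal usage sketch follows; the import path is assumed from the file's location, the AutoConfig.register call is not part of this commit, and the directory name is illustrative.

from transformers import AutoConfig

# Assumed import path, given the file lives at src/veronica/configuration_veronica.py
from veronica.configuration_veronica import VeronicaConfig

# Hypothetical registration so AutoConfig can resolve model_type "veronica";
# this commit does not include it.
AutoConfig.register("veronica", VeronicaConfig)

# Build a config, overriding a few defaults.
config = VeronicaConfig(n_layer=12, n_head=8, n_embd=512)

# PretrainedConfig handles the JSON round trip.
config.save_pretrained("veronica-small")  # writes veronica-small/config.json
reloaded = AutoConfig.from_pretrained("veronica-small")

assert reloaded.hidden_size == 512  # HF-standard alias set in __init__
assert reloaded.router_tau == 1.0   # default survives the round trip

The HF-standard aliases matter for exactly this kind of interoperability: downstream transformers utilities read hidden_size, num_hidden_layers, and num_attention_heads rather than n_embd, n_layer, and n_head, so setting both families of names in __init__ is what keeps the config drop-in compatible.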