Trouter-Library commited on
Commit
3d5c120
·
verified ·
1 Parent(s): 0e49e6c

Create configuration_helion.py

Browse files
Files changed (1) hide show
  1. configuration_helion.py +118 -0
configuration_helion.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Helion Model Configuration
3
+ """
4
+
5
+ from transformers import PretrainedConfig
6
+
7
+
8
class HelionConfig(PretrainedConfig):
    """
    Configuration class for Helion model.

    Args:
        vocab_size (`int`, *optional*, defaults to 100000):
            Vocabulary size of the Helion model.
        hidden_size (`int`, *optional*, defaults to 6144):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 24576):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            Number of key-value heads for Grouped Query Attention.
            If `None`, falls back to `num_attention_heads` (standard MHA).
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function.
        max_position_embeddings (`int`, *optional*, defaults to 16384):
            Maximum sequence length that the model can handle.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated_normal_initializer.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            Epsilon value for RMSNorm layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether to use cache for faster decoding.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie input and output embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for RoPE.
            Must contain the keys `"type"` (one of `"linear"` or
            `"dynamic"`) and `"factor"` (a number strictly greater
            than 1.0).
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in attention layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for attention weights.

    Raises:
        ValueError: If `rope_scaling` is provided but is not a dict,
            is missing required keys, has an unsupported type, or has
            a factor that is not a number > 1.0.
    """

    model_type = "helion"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=100000,
        hidden_size=6144,
        intermediate_size=24576,
        num_hidden_layers=48,
        num_attention_heads=32,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=16384,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # GQA parameters: a None value means no grouping, i.e. one KV
        # head per attention head (standard multi-head attention).
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads

        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        # Fail fast on a malformed rope_scaling dict before the config
        # is handed to a model.
        self._validate_rope_scaling()

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _validate_rope_scaling(self):
        """Validate `self.rope_scaling`; no-op when it is None.

        Checks that it is a dict containing `type` (one of
        "linear"/"dynamic") and `factor` (a number > 1.0), raising
        ValueError otherwise.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict):
            raise ValueError("`rope_scaling` must be a dictionary")

        required_keys = {"type", "factor"}
        if not all(key in self.rope_scaling for key in required_keys):
            raise ValueError(f"`rope_scaling` must contain keys {required_keys}")

        if self.rope_scaling["type"] not in ["linear", "dynamic"]:
            raise ValueError("`rope_scaling.type` must be 'linear' or 'dynamic'")

        # A scaling factor <= 1.0 (or a non-number) would silently
        # misconfigure RoPE; reject it explicitly, matching the
        # validation done by upstream Llama-style configs.
        factor = self.rope_scaling["factor"]
        if not isinstance(factor, (int, float)) or isinstance(factor, bool) or factor <= 1.0:
            raise ValueError("`rope_scaling.factor` must be a number greater than 1.0")