kgrabko committed · Commit 4101d6e · verified · Parent(s): 90f6903

Update README.md

Files changed (1): README.md (+63, -41)
README.md CHANGED
@@ -72,27 +72,29 @@ My LLMs
# ========================================
# Model Configuration (1B-class model)
# ========================================
- VOCAB_SIZE = 50257
- MODEL_DIM = 2048
- NUM_HEADS = 32
- NUM_LAYERS = 16
- MAX_SEQ_LEN = 2048
- # POS_EMB_MAX_LEN is no longer used; RoPE uses MAX_SEQ_LEN
- FFN_HIDDEN_DIM = int(MODEL_DIM * 4)
- HEAD_DIM = MODEL_DIM // NUM_HEADS # 128

---

# ========================================
# Model Configuration (31B-class model)
# ========================================
- VOCAB_SIZE = 50257
- MODEL_DIM = 2560
- NUM_HEADS = 32
- NUM_LAYERS = 32
- MAX_SEQ_LEN = 2048
- # POS_EMB_MAX_LEN is no longer used; RoPE uses MAX_SEQ_LEN
- FFN_HIDDEN_DIM = int(MODEL_DIM * 4)
- HEAD_DIM = MODEL_DIM // NUM_HEADS # 128

---

@@ -100,27 +102,44 @@ HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
# Model Configuration (8B-class model)
# ========================================
- VOCAB_SIZE = 50257
- - MODEL_DIM = 2048
- NUM_HEADS = 32
- - NUM_LAYERS = 24
- MAX_SEQ_LEN = 2048
- # POS_EMB_MAX_LEN is no longer used; RoPE uses MAX_SEQ_LEN
- - FFN_HIDDEN_DIM = int(MODEL_DIM * 8 / 3)
- - HEAD_DIM = MODEL_DIM // NUM_HEADS # 128

---

# =====================================================================
- # Model Configuration (33B-class model), available by request, 135 GB
# =====================================================================
- VOCAB_SIZE = 50257
- - MODEL_DIM = 8192
- NUM_HEADS = 64
- NUM_LAYERS = 32
- - MAX_SEQ_LEN = 8192
- - POS_EMB_MAX_LEN = 32768
- - FFN_HIDDEN_DIM = 4 * MODEL_DIM
- - HEAD_DIM = MODEL_DIM // NUM_HEADS # 128

---

@@ -129,13 +148,16 @@ HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
# =======================================================================
- VOCAB_SIZE = 50257
- MODEL_DIM = 8192 # Hidden size (d_model)
- - NUM_HEADS = 64 # Attention heads → head_dim = 128
- - NUM_KV_HEADS = 8 # GQA: 8 KV heads (like LLaMA-70B), 64 Q heads
- - NUM_LAYERS = 80 # 80 layers → ~71B params
- - MAX_SEQ_LEN = 8192 # Training context
- - POS_EMB_MAX_LEN = 32768 # Safe for long generation
- - FFN_HIDDEN_DIM = 32768 # 4 × MODEL_DIM (32,768) matches LLaMA-70B exactly
- HEAD_DIM = MODEL_DIM // NUM_HEADS

---
#
@@ -143,17 +165,17 @@ HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
# Designed for military purposes and for discovering worlds and learning about space and science
#
# =======================================================================
- # 120B Configuration (real numbers), available by request: JiRack Super Brain
# =======================================================================
- - VOCAB_SIZE = 32000 # Modern tokenizer size (can be changed later)
- - MODEL_DIM = 12288 # d_model = 12288 → matches 120B+ scale
- NUM_HEADS = 96 # Query heads
- - NUM_KV_HEADS = 12 # GQA: 8× groups (12 KV heads → 96/12 = 8)
- - NUM_LAYERS = 80 # 80 layers
- HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
- - FFN_HIDDEN_DIM = int(4 * MODEL_DIM * 1.3) # ~4.3× expansion (DeepSeek/Qwen style) → 53248
- - MAX_SEQ_LEN = 131072 # Training on 128k context
- - POS_EMB_MAX_LEN = 262144 # Generation up to 256k+ tokens safely

# ========================================
# Model Configuration (1B-class model)
# ========================================
+ - VOCAB_SIZE = 50257
+ - MODEL_DIM = 2048
+ - NUM_HEADS = 32
+ - NUM_LAYERS = 16
+ - MAX_SEQ_LEN = 2048
+ # RoPE
+ - FFN_HIDDEN_DIM = int(MODEL_DIM * 4) # Non-standard FFN (4D)
+ - HEAD_DIM = MODEL_DIM // NUM_HEADS # 64
+ - EPSILON = 1e-6
---
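The "1B-class" label can be sanity-checked from the constants alone. A minimal sketch (not from this repo; it assumes biasless linear layers, a two-matrix FFN as implied by `int(MODEL_DIM * 4)`, untied input/output embeddings, and ignores norm parameters):

```python
# Rough decoder-only parameter count; norm weights ignored as negligible.
def approx_params(vocab, d_model, n_layers, ffn_hidden, tied=False):
    attn = 4 * d_model * d_model               # Wq, Wk, Wv, Wo
    ffn = 2 * d_model * ffn_hidden             # up- and down-projection
    emb = vocab * d_model * (1 if tied else 2)
    return n_layers * (attn + ffn) + emb

print(approx_params(50257, 2048, 16, 2048 * 4) / 1e9)  # ~1.01 -> "1B-class"
```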
# ========================================
# Model Configuration (31B-class model)
# ========================================
+ - VOCAB_SIZE = 50257
+ - MODEL_DIM = 8192 # Large hidden size (like Llama 2 70B)
+ - NUM_HEADS = 64
+ - NUM_LAYERS = 32
+ - MAX_SEQ_LEN = 8192 # Long context length
+ # RoPE
+ - FFN_HIDDEN_DIM = int(MODEL_DIM * 4) # Non-standard FFN (4D) - 32768
+ - HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
+ - EPSILON = 1e-6

---

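The `# RoPE` markers replace the old `POS_EMB_MAX_LEN` constant: rotary embeddings compute angles from the position index, so there is no learned position table to pre-allocate and only `MAX_SEQ_LEN` matters. A sketch of the angle computation, assuming the common base of 10000 (the README does not state the base):

```python
import torch

def rope_angles(seq_len: int, head_dim: int, base: float = 10000.0):
    # One frequency per pair of channels; positions are plain indices,
    # so no table of size POS_EMB_MAX_LEN ever has to be materialized.
    inv_freq = 1.0 / base ** (torch.arange(0, head_dim, 2) / head_dim)
    return torch.outer(torch.arange(seq_len), inv_freq)

angles = rope_angles(8192, 128)        # MAX_SEQ_LEN, HEAD_DIM for this config
cos, sin = angles.cos(), angles.sin()  # applied to Q and K inside attention
```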
# Model Configuration (8B-class model)
# ========================================
- VOCAB_SIZE = 50257
+ - MODEL_DIM = 4096 # Increased for the 8.5B class (standard, highly efficient)
+ - NUM_HEADS = 32
+ - NUM_LAYERS = 40 # Increased to 40 (as in Llama 13B)
+ - MAX_SEQ_LEN = 2048
+ # RoPE
+ - FFN_HIDDEN_DIM = int(MODEL_DIM * 8 / 3) # 10922 (Llama standard)
+ - HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
+ - EPSILON = 1e-6
+
+ ---
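The `8 / 3` factor is the usual SwiGLU sizing rule: a gated FFN has three weight matrices instead of two, so scaling the hidden width by 2/3 keeps its parameter count equal to a classic `4 * MODEL_DIM` FFN (3 · d · 8d/3 = 2 · d · 4d = 8d²). A sketch of the block this comment implies; the actual module code is not shown in the README:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SwiGLU(nn.Module):
    def __init__(self, d_model: int):
        super().__init__()
        hidden = int(d_model * 8 / 3)            # 10922 for d_model = 4096
        self.w_gate = nn.Linear(d_model, hidden, bias=False)
        self.w_up = nn.Linear(d_model, hidden, bias=False)
        self.w_down = nn.Linear(hidden, d_model, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.w_down(F.silu(self.w_gate(x)) * self.w_up(x))
```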
+ # ========================================
+ # Model Configuration (10B-class model)
+ # ========================================
+ - VOCAB_SIZE = 50257
+ - MODEL_DIM = 4096
- NUM_HEADS = 32
+ - NUM_LAYERS = 48 # Increased depth
- MAX_SEQ_LEN = 2048
+ # RoPE
+ - FFN_HIDDEN_DIM = int(MODEL_DIM * 8 / 3) # 10922 (Llama standard)
+ - HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
+ - EPSILON = 1e-6

---
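The 10B block is the 8B block with depth raised from 40 to 48 layers, all else equal. A quick check that both labels follow, under the same assumptions as the earlier sketch (gated FFN per the 8/3 comment, untied embeddings, biasless linears):

```python
d, ffn, vocab = 4096, int(4096 * 8 / 3), 50257
per_layer = 4 * d * d + 3 * d * ffn     # attention + three SwiGLU matrices
emb = 2 * vocab * d
print((40 * per_layer + emb) / 1e9)     # ~8.5 -> the "8.5B class" comment
print((48 * per_layer + emb) / 1e9)     # ~10.1 -> "10B-class"
```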
# =====================================================================
+ # Model Configuration (33B-class model), available by request
# =====================================================================
- VOCAB_SIZE = 50257
+ - MODEL_DIM = 8192 # Large hidden size (like Llama 2 70B)
- NUM_HEADS = 64
- NUM_LAYERS = 32
+ - MAX_SEQ_LEN = 8192 # Long context length
+ # RoPE
+ - FFN_HIDDEN_DIM = int(MODEL_DIM * 4) # Non-standard FFN (4D) - 32768
+ - HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
+ - EPSILON = 1e-6

---

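For the 33B-class checkpoint available by request, the previous revision of this heading quoted ~135 GB, which lines up with an fp32 checkpoint at 4 bytes per parameter. A rough conversion (the nominal 33B figure is taken from the label, not computed):

```python
n_params = 33e9                 # "33B-class" (nominal)
print(n_params * 4 / 1e9)       # ~132 GB in fp32 -> matches the ~135 GB note
print(n_params * 2 / 1e9)       # ~66 GB in fp16/bf16
```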
# =======================================================================
- VOCAB_SIZE = 50257
- MODEL_DIM = 8192 # Hidden size (d_model)
+ - NUM_HEADS = 64 # Q heads
+ - NUM_KV_HEADS = 8 # KV heads (GQA ratio = 8)
+ - NUM_LAYERS = 80 # 80 layers
+ - MAX_SEQ_LEN = 8192 # Max context (RoPE)
+ # LLaMA-70B FFN hidden dim: int(4 * MODEL_DIM * 2/3 * 1.3) = 28398,
+ # rounded up to a multiple of 4096 → 28672
+ # Using the standard LLaMA-70B FFN size for exactness
+ - FFN_HIDDEN_DIM = 28672
- HEAD_DIM = MODEL_DIM // NUM_HEADS
+ - EPSILON = 1e-6

---
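With `NUM_KV_HEADS = 8` against 64 query heads, each KV head serves 64 / 8 = 8 query heads, shrinking the KV cache eightfold versus full multi-head attention. A shape-only sketch; the projection code is assumed, not taken from the repo:

```python
import torch
import torch.nn as nn

MODEL_DIM, NUM_HEADS, NUM_KV_HEADS = 8192, 64, 8
HEAD_DIM = MODEL_DIM // NUM_HEADS                       # 128

wq = nn.Linear(MODEL_DIM, NUM_HEADS * HEAD_DIM, bias=False)
wk = nn.Linear(MODEL_DIM, NUM_KV_HEADS * HEAD_DIM, bias=False)
wv = nn.Linear(MODEL_DIM, NUM_KV_HEADS * HEAD_DIM, bias=False)

x = torch.randn(1, 16, MODEL_DIM)                       # (batch, seq, dim)
q = wq(x).view(1, 16, NUM_HEADS, HEAD_DIM)
k = wk(x).view(1, 16, NUM_KV_HEADS, HEAD_DIM)           # 8x smaller KV cache
v = wv(x).view(1, 16, NUM_KV_HEADS, HEAD_DIM)
k = k.repeat_interleave(NUM_HEADS // NUM_KV_HEADS, dim=2)  # expand for attention
```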
#
# Designed for military purposes and for discovering worlds and learning about space and science
#
# =======================================================================
+ # 140B Configuration (real numbers), available by request: JiRack Super Brain
# =======================================================================
+ - VOCAB_SIZE = 32000
+ - MODEL_DIM = 12288 # d_model
- NUM_HEADS = 96 # Query heads
+ - NUM_KV_HEADS = 12 # GQA: 8× groups
+ - NUM_LAYERS = 80
- HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
+ - FFN_HIDDEN_DIM = int(MODEL_DIM * 13 / 3) # 53248 (~4.33× expansion)
+ - MAX_SEQ_LEN = 131072 # Max context
+ - EPSILON = 1e-6
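As with the smaller blocks, the 140B label can be bounded from the constants alone. A sketch assuming biasless linears, GQA-sized K/V projections, a non-gated two-matrix FFN, and untied embeddings (none of these are stated in the README, so treat it as an order-of-magnitude check):

```python
d, layers, vocab = 12288, 80, 32000
q_heads, kv_heads, head_dim = 96, 12, 128
ffn = int(d * 13 / 3)                        # 53248, ~4.33x expansion

attn = (d * q_heads * head_dim               # Wq
        + 2 * d * kv_heads * head_dim        # Wk, Wv (GQA-sized)
        + q_heads * head_dim * d)            # Wo
total = layers * (attn + 2 * d * ffn) + 2 * vocab * d
print(f"{total / 1e9:.0f}B")                 # ~133B; a gated FFN lands higher
```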