Update README.md
Browse files
README.md
CHANGED
|
@@ -72,27 +72,29 @@ My LLMs
|
|
| 72 |
# ========================================
|
| 73 |
# Model Configuration (1B-class model)
|
| 74 |
# ========================================
|
| 75 |
-
VOCAB_SIZE = 50257
|
| 76 |
-
MODEL_DIM = 2048
|
| 77 |
-
NUM_HEADS = 32
|
| 78 |
-
NUM_LAYERS = 16
|
| 79 |
-
MAX_SEQ_LEN = 2048
|
| 80 |
-
#
|
| 81 |
-
FFN_HIDDEN_DIM = int(MODEL_DIM * 4)
|
| 82 |
-
HEAD_DIM = MODEL_DIM // NUM_HEADS
|
|
|
|
| 83 |
---
|
| 84 |
|
| 85 |
# ========================================
|
| 86 |
# Model Configuration (31B-class model)
|
| 87 |
# ========================================
|
| 88 |
-
VOCAB_SIZE = 50257
|
| 89 |
-
MODEL_DIM =
|
| 90 |
-
NUM_HEADS =
|
| 91 |
-
NUM_LAYERS = 32
|
| 92 |
-
MAX_SEQ_LEN =
|
| 93 |
-
#
|
| 94 |
-
FFN_HIDDEN_DIM = int(MODEL_DIM * 4)
|
| 95 |
-
HEAD_DIM = MODEL_DIM // NUM_HEADS
|
|
|
|
| 96 |
|
| 97 |
---
|
| 98 |
|
|
@@ -100,27 +102,44 @@ HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
|
|
| 100 |
# Model Configuration (8B-class model)
|
| 101 |
# ========================================
|
| 102 |
- VOCAB_SIZE = 50257
|
| 103 |
-
- MODEL_DIM =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
- NUM_HEADS = 32
|
| 105 |
-
- NUM_LAYERS =
|
| 106 |
- MAX_SEQ_LEN = 2048
|
| 107 |
-
#
|
| 108 |
-
- FFN_HIDDEN_DIM = int(MODEL_DIM * 8 / 3)
|
| 109 |
-
- HEAD_DIM = MODEL_DIM // NUM_HEADS
|
|
|
|
| 110 |
|
| 111 |
---
|
| 112 |
|
| 113 |
# =====================================================================
|
| 114 |
-
# Model Configuration (33B-class model), available by request
|
| 115 |
# =====================================================================
|
| 116 |
- VOCAB_SIZE = 50257
|
| 117 |
-
- MODEL_DIM = 8192
|
| 118 |
- NUM_HEADS = 64
|
| 119 |
- NUM_LAYERS = 32
|
| 120 |
-
- MAX_SEQ_LEN = 8192
|
| 121 |
-
|
| 122 |
-
- FFN_HIDDEN_DIM =
|
| 123 |
-
- HEAD_DIM = MODEL_DIM // NUM_HEADS
|
|
|
|
| 124 |
|
| 125 |
---
|
| 126 |
|
|
@@ -129,13 +148,16 @@ HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
|
|
| 129 |
# =======================================================================
|
| 130 |
- VOCAB_SIZE = 50257
|
| 131 |
- MODEL_DIM = 8192 # Hidden size (d_model)
|
| 132 |
-
- NUM_HEADS = 64 #
|
| 133 |
-
- NUM_KV_HEADS = 8 #
|
| 134 |
-
- NUM_LAYERS = 80 # 80 layers
|
| 135 |
-
- MAX_SEQ_LEN = 8192 #
|
| 136 |
-
-
|
| 137 |
-
|
|
|
|
|
|
|
| 138 |
- HEAD_DIM = MODEL_DIM // NUM_HEADS
|
|
|
|
| 139 |
|
| 140 |
---
|
| 141 |
#
|
|
@@ -143,17 +165,17 @@ HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
|
|
| 143 |
# It was designed for military design work, for discovering worlds, and for space and science research goals
|
| 144 |
#
|
| 145 |
# =======================================================================
|
| 146 |
-
#
|
| 147 |
# =======================================================================
|
| 148 |
-
- VOCAB_SIZE = 32000
|
| 149 |
-
- MODEL_DIM = 12288 # d_model
|
| 150 |
- NUM_HEADS = 96 # Query heads
|
| 151 |
-
- NUM_KV_HEADS = 12 # GQA: 8× groups
|
| 152 |
-
- NUM_LAYERS = 80
|
| 153 |
- HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
|
| 154 |
-
- FFN_HIDDEN_DIM = int(4 * MODEL_DIM * 1.3) #
|
| 155 |
-
- MAX_SEQ_LEN = 131072 #
|
| 156 |
-
-
|
| 157 |
|
| 158 |
|
| 159 |
|
|
|
|
| 72 |
# ========================================
|
| 73 |
# Model Configuration (1B-class model)
|
| 74 |
# ========================================
|
| 75 |
+
- VOCAB_SIZE = 50257
|
| 76 |
+
- MODEL_DIM = 2048
|
| 77 |
+
- NUM_HEADS = 32
|
| 78 |
+
- NUM_LAYERS = 16
|
| 79 |
+
- MAX_SEQ_LEN = 2048
|
| 80 |
+
# RoPE
|
| 81 |
+
FFN_HIDDEN_DIM = int(MODEL_DIM * 4) # Non-standard FFN (4D)
|
| 82 |
+
HEAD_DIM = MODEL_DIM // NUM_HEADS # 64
|
| 83 |
+
EPSILON = 1e-6
|
| 84 |
---
|
| 85 |
|
| 86 |
# ========================================
|
| 87 |
# Model Configuration (31B-class model)
|
| 88 |
# ========================================
|
| 89 |
+
- VOCAB_SIZE = 50257
|
| 90 |
+
- MODEL_DIM = 8192 # Large hidden size (as in Llama 2 70B)
|
| 91 |
+
- NUM_HEADS = 64
|
| 92 |
+
- NUM_LAYERS = 32
|
| 93 |
+
- MAX_SEQ_LEN = 8192 # Long context length
|
| 94 |
+
# RoPE
|
| 95 |
+
- FFN_HIDDEN_DIM = int(MODEL_DIM * 4) # Non-standard FFN (4D) - 32768
|
| 96 |
+
- HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
|
| 97 |
+
- EPSILON = 1e-6
|
| 98 |
|
| 99 |
---
|
| 100 |
|
|
|
|
| 102 |
# Model Configuration (8B-class model)
|
| 103 |
# ========================================
|
| 104 |
- VOCAB_SIZE = 50257
|
| 105 |
+
- MODEL_DIM = 4096 # Increased for the 8.5B class (standard, highly efficient)
|
| 106 |
+
- NUM_HEADS = 32
|
| 107 |
+
- NUM_LAYERS = 40 # Increased to 40 (as in Llama 13B)
|
| 108 |
+
- MAX_SEQ_LEN = 2048
|
| 109 |
+
# RoPE
|
| 110 |
+
- FFN_HIDDEN_DIM = int(MODEL_DIM * 8 / 3) # 10922 (Llama standard)
|
| 111 |
+
- HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
|
| 112 |
+
- EPSILON = 1e-6
|
| 113 |
+
|
| 114 |
+
---
|
| 115 |
+
|
| 116 |
+
# ========================================
|
| 117 |
+
# Model Configuration (10B-class model)
|
| 118 |
+
# ========================================
|
| 119 |
+
- VOCAB_SIZE = 50257
|
| 120 |
+
- MODEL_DIM = 4096
|
| 121 |
- NUM_HEADS = 32
|
| 122 |
+
- NUM_LAYERS = 48 # Increased depth
|
| 123 |
- MAX_SEQ_LEN = 2048
|
| 124 |
+
# RoPE
|
| 125 |
+
- FFN_HIDDEN_DIM = int(MODEL_DIM * 8 / 3) # 10922 (Llama standard)
|
| 126 |
+
- HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
|
| 127 |
+
- EPSILON = 1e-6
|
| 128 |
|
| 129 |
---
|
| 130 |
|
| 131 |
# =====================================================================
|
| 132 |
+
# Model Configuration (33B-class model), available by request
|
| 133 |
# =====================================================================
|
| 134 |
- VOCAB_SIZE = 50257
|
| 135 |
+
- MODEL_DIM = 8192 # Large hidden size (as in Llama 2 70B)
|
| 136 |
- NUM_HEADS = 64
|
| 137 |
- NUM_LAYERS = 32
|
| 138 |
+
- MAX_SEQ_LEN = 8192 # Long context length
|
| 139 |
+
# RoPE
|
| 140 |
+
- FFN_HIDDEN_DIM = int(MODEL_DIM * 4) # Non-standard FFN (4D) - 32768
|
| 141 |
+
- HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
|
| 142 |
+
- EPSILON = 1e-6
|
| 143 |
|
| 144 |
---
|
| 145 |
|
|
|
|
| 148 |
# =======================================================================
|
| 149 |
- VOCAB_SIZE = 50257
|
| 150 |
- MODEL_DIM = 8192 # Hidden size (d_model)
|
| 151 |
+
- NUM_HEADS = 64 # Q Heads
|
| 152 |
+
- NUM_KV_HEADS = 8 # KV Heads (GQA ratio = 8)
|
| 153 |
+
- NUM_LAYERS = 80 # 80 layers
|
| 154 |
+
- MAX_SEQ_LEN = 8192 # Max context (RoPE)
|
| 155 |
+
# FFN hidden dim for LLaMA-70B: 28672
|
| 156 |
+
# LLaMA derivation: int(2/3 * 4 * D) = 21845 for D = 8192, scaled by ffn_dim_multiplier 1.3 -> 28398, rounded up to a multiple of 4096 -> 28672
|
| 157 |
+
# Use the standard LLaMA-70B FFN size for accuracy
|
| 158 |
+
- FFN_HIDDEN_DIM = 28672
|
| 159 |
- HEAD_DIM = MODEL_DIM // NUM_HEADS
|
| 160 |
+
- EPSILON = 1e-6
|
| 161 |
|
| 162 |
---
|
| 163 |
#
|
|
|
|
| 165 |
# It was designed for military design work, for discovering worlds, and for space and science research goals
|
| 166 |
#
|
| 167 |
# =======================================================================
|
| 168 |
+
# 140B Configuration (real numbers), available by request - JiRack Super Brain
|
| 169 |
# =======================================================================
|
| 170 |
+
- VOCAB_SIZE = 32000
|
| 171 |
+
- MODEL_DIM = 12288 # d_model
|
| 172 |
- NUM_HEADS = 96 # Query heads
|
| 173 |
+
- NUM_KV_HEADS = 12 # GQA: 8× groups
|
| 174 |
+
- NUM_LAYERS = 80
|
| 175 |
- HEAD_DIM = MODEL_DIM // NUM_HEADS # 128
|
| 176 |
+
- FFN_HIDDEN_DIM = int(4 * MODEL_DIM * 1.3) # 63897 (4 * 12288 * 1.3 = 63897.6, truncated by int)
|
| 177 |
+
- MAX_SEQ_LEN = 131072 # Max context
|
| 178 |
+
- EPSILON = 1e-6
|
| 179 |
|
| 180 |
|
| 181 |
|