Update README.md
Browse files
README.md
CHANGED
|
@@ -10,6 +10,8 @@ library_name: transformers
|
|
| 10 |
---
|
| 11 |
# **NanoTranslator-Experimental**
|
| 12 |
|
|
|
|
|
|
|
| 13 |
| Arch. | Act. | V. | H. | I. | L. | A. | K. | Tie |
|
| 14 |
| :--: | :--: | :--: | :-----: | :---: | :------: | :--: | :--: | :--: |
|
| 15 |
| LLaMA | SwiGLU | 2K | 256 | 768 | 2 | 8 | 4 | True |
|
|
@@ -24,13 +26,13 @@ library_name: transformers
|
|
| 24 |
| StableLM | SwiGLU | 2K | 256 | 768 | 2 | 8 | 4 | True |
|
| 25 |
| GPT2 | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
|
| 26 |
| GPT-J | GeGLU | 2K | 256 | 1024 | 2 | 4 | 4 | True |
|
| 27 |
-
| GPT-Neo | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
|
| 28 |
| GPT-NeoX | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
|
| 29 |
| Bloom | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
|
| 30 |
| MPT | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
|
| 31 |
| RWKV | - | 2K | 256 | 1024 | 2 | - | - | True |
|
| 32 |
|
| 33 |
|
|
|
|
| 34 |
|
| 35 |
| | Value |
|
| 36 |
| :------------: | :------------------------: |
|
|
@@ -51,6 +53,9 @@ library_name: transformers
|
|
| 51 |
| Seed | 3407 |
|
| 52 |
|
| 53 |
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
<table border="1" cellpadding="10" cellspacing="0" style="margin: 0 auto; border-collapse: collapse; text-align: center;">
|
| 56 |
<thead>
|
|
@@ -205,4 +210,60 @@ library_name: transformers
|
|
| 205 |
<td></td>
|
| 206 |
</tr>
|
| 207 |
</tbody>
|
| 208 |
-
</table>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
# **NanoTranslator-Experimental**
|
| 12 |
|
| 13 |
+
## Models
|
| 14 |
+
|
| 15 |
| Arch. | Act. | V. | H. | I. | L. | A. | K. | Tie |
|
| 16 |
| :--: | :--: | :--: | :-----: | :---: | :------: | :--: | :--: | :--: |
|
| 17 |
| LLaMA | SwiGLU | 2K | 256 | 768 | 2 | 8 | 4 | True |
|
|
|
|
| 26 |
| StableLM | SwiGLU | 2K | 256 | 768 | 2 | 8 | 4 | True |
|
| 27 |
| GPT2 | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
|
| 28 |
| GPT-J | GeGLU | 2K | 256 | 1024 | 2 | 4 | 4 | True |
|
|
|
|
| 29 |
| GPT-NeoX | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
|
| 30 |
| Bloom | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
|
| 31 |
| MPT | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
|
| 32 |
| RWKV | - | 2K | 256 | 1024 | 2 | - | - | True |
|
| 33 |
|
| 34 |
|
| 35 |
+
## Experimental Setup
|
| 36 |
|
| 37 |
| | Value |
|
| 38 |
| :------------: | :------------------------: |
|
|
|
|
| 53 |
| Seed | 3407 |
|
| 54 |
|
| 55 |
|
| 56 |
+
## Results
|
| 57 |
+
|
| 58 |
+
### Trapezoidal vs. Cosine
|
| 59 |
|
| 60 |
<table border="1" cellpadding="10" cellspacing="0" style="margin: 0 auto; border-collapse: collapse; text-align: center;">
|
| 61 |
<thead>
|
|
|
|
| 210 |
<td></td>
|
| 211 |
</tr>
|
| 212 |
</tbody>
|
| 213 |
+
</table>
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
### BF16 & FP16
|
| 217 |
+
|
| 218 |
+
<table border="1" cellpadding="10" cellspacing="0" style="margin: 0 auto; border-collapse: collapse; text-align: center;">
|
| 219 |
+
<thead>
|
| 220 |
+
<tr>
|
| 221 |
+
<th rowspan="2">Arch.</th>
|
| 222 |
+
<th colspan="2">Total Loss</th>
|
| 223 |
+
<th colspan="2">Final Loss (Last 10 steps Avg.)</th>
|
| 224 |
+
</tr>
|
| 225 |
+
<tr>
|
| 226 |
+
<th>FP16</th>
|
| 227 |
+
<th>BF16</th>
|
| 228 |
+
<th>FP16</th>
|
| 229 |
+
<th>BF16</th>
|
| 230 |
+
</tr>
|
| 231 |
+
</thead>
|
| 232 |
+
<tbody>
|
| 233 |
+
<tr>
|
| 234 |
+
<td>LLaMA</td>
|
| 235 |
+
<td>1.5734</td>
|
| 236 |
+
<td>1.5714</td>
|
| 237 |
+
<td>1.2784</td>
|
| 238 |
+
<td>1.2758</td>
|
| 239 |
+
</tr>
|
| 240 |
+
<tr>
|
| 241 |
+
<td>Qwen2</td>
|
| 242 |
+
<td>1.5735</td>
|
| 243 |
+
<td>1.5675</td>
|
| 244 |
+
<td>1.2760</td>
|
| 245 |
+
<td>1.2764</td>
|
| 246 |
+
</tr>
|
| 247 |
+
<tr>
|
| 248 |
+
<td>Mistral</td>
|
| 249 |
+
<td>1.5756</td>
|
| 250 |
+
<td>1.5694</td>
|
| 251 |
+
<td>1.2787</td>
|
| 252 |
+
<td>1.2740</td>
|
| 253 |
+
</tr>
|
| 254 |
+
<tr>
|
| 255 |
+
<td>OLMo</td>
|
| 256 |
+
<td>1.6011</td>
|
| 257 |
+
<td>1.6059</td>
|
| 258 |
+
<td>1.2857</td>
|
| 259 |
+
<td>1.2901</td>
|
| 260 |
+
</tr>
|
| 261 |
+
<tr>
|
| 262 |
+
<td>Cohere</td>
|
| 263 |
+
<td>2.1327</td>
|
| 264 |
+
<td>2.1112</td>
|
| 265 |
+
<td>1.6244</td>
|
| 266 |
+
<td>1.6346</td>
|
| 267 |
+
</tr>
|
| 268 |
+
</tbody>
|
| 269 |
+
</table>
|