Mxode committed on
Commit
c49ce1d
·
verified ·
1 Parent(s): e2bc8ea

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +63 -2
README.md CHANGED
@@ -10,6 +10,8 @@ library_name: transformers
10
  ---
11
  # **NanoTranslator-Experimental**
12
 
 
 
13
  | Arch. | Act. | V. | H. | I. | L. | A. | K. | Tie |
14
  | :--: | :--: | :--: | :-----: | :---: | :------: | :--: | :--: | :--: |
15
  | LLaMA | SwiGLU | 2K | 256 | 768 | 2 | 8 | 4 | True |
@@ -24,13 +26,13 @@ library_name: transformers
24
  | StableLM | SwiGLU | 2K | 256 | 768 | 2 | 8 | 4 | True |
25
  | GPT2 | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
26
  | GPT-J | GeGLU | 2K | 256 | 1024 | 2 | 4 | 4 | True |
27
- | GPT-Neo | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
28
  | GPT-NeoX | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
29
  | Bloom | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
30
  | MPT | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
31
  | RWKV | - | 2K | 256 | 1024 | 2 | - | - | True |
32
 
33
 
 
34
 
35
  | | Value |
36
  | :------------: | :------------------------: |
@@ -51,6 +53,9 @@ library_name: transformers
51
  | Seed | 3407 |
52
 
53
 
 
 
 
54
 
55
  <table border="1" cellpadding="10" cellspacing="0" style="margin: 0 auto; border-collapse: collapse; text-align: center;">
56
  <thead>
@@ -205,4 +210,60 @@ library_name: transformers
205
  <td></td>
206
  </tr>
207
  </tbody>
208
- </table>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  ---
11
  # **NanoTranslator-Experimental**
12
 
13
+ ## Models
14
+
15
  | Arch. | Act. | V. | H. | I. | L. | A. | K. | Tie |
16
  | :--: | :--: | :--: | :-----: | :---: | :------: | :--: | :--: | :--: |
17
  | LLaMA | SwiGLU | 2K | 256 | 768 | 2 | 8 | 4 | True |
 
26
  | StableLM | SwiGLU | 2K | 256 | 768 | 2 | 8 | 4 | True |
27
  | GPT2 | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
28
  | GPT-J | GeGLU | 2K | 256 | 1024 | 2 | 4 | 4 | True |
 
29
  | GPT-NeoX | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
30
  | Bloom | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
31
  | MPT | GeGLU | 2K | 256 | 1024 | 2 | 8 | 8 | True |
32
  | RWKV | - | 2K | 256 | 1024 | 2 | - | - | True |
33
 
34
 
35
+ ## Experimental Setup
36
 
37
  | | Value |
38
  | :------------: | :------------------------: |
 
53
  | Seed | 3407 |
54
 
55
 
56
+ ## Results
57
+
58
+ ### Trapezoidal vs. Cosine
59
 
60
  <table border="1" cellpadding="10" cellspacing="0" style="margin: 0 auto; border-collapse: collapse; text-align: center;">
61
  <thead>
 
210
  <td></td>
211
  </tr>
212
  </tbody>
213
+ </table>
214
+
215
+
216
+ ### BF16 & FP16
217
+
218
+ <table border="1" cellpadding="10" cellspacing="0" style="margin: 0 auto; border-collapse: collapse; text-align: center;">
219
+ <thead>
220
+ <tr>
221
+ <th rowspan="2">Arch.</th>
222
+ <th colspan="2">Total Loss</th>
223
+ <th colspan="2">Final Loss (Avg. of Last 10 Steps)</th>
224
+ </tr>
225
+ <tr>
226
+ <th>FP16</th>
227
+ <th>BF16</th>
228
+ <th>FP16</th>
229
+ <th>BF16</th>
230
+ </tr>
231
+ </thead>
232
+ <tbody>
233
+ <tr>
234
+ <td>LLaMA</td>
235
+ <td>1.5734</td>
236
+ <td>1.5714</td>
237
+ <td>1.2784</td>
238
+ <td>1.2758</td>
239
+ </tr>
240
+ <tr>
241
+ <td>Qwen2</td>
242
+ <td>1.5735</td>
243
+ <td>1.5675</td>
244
+ <td>1.2760</td>
245
+ <td>1.2764</td>
246
+ </tr>
247
+ <tr>
248
+ <td>Mistral</td>
249
+ <td>1.5756</td>
250
+ <td>1.5694</td>
251
+ <td>1.2787</td>
252
+ <td>1.2740</td>
253
+ </tr>
254
+ <tr>
255
+ <td>OLMo</td>
256
+ <td>1.6011</td>
257
+ <td>1.6059</td>
258
+ <td>1.2857</td>
259
+ <td>1.2901</td>
260
+ </tr>
261
+ <tr>
262
+ <td>Cohere</td>
263
+ <td>2.1327</td>
264
+ <td>2.1112</td>
265
+ <td>1.6244</td>
266
+ <td>1.6346</td>
267
+ </tr>
268
+ </tbody>
269
+ </table>