Mxode committed on
Commit
69867f4
·
verified ·
1 Parent(s): c49ce1d

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +61 -61
README.md CHANGED
@@ -63,7 +63,7 @@ library_name: transformers
63
  <th rowspan="2">Arch.</th>
64
  <th rowspan="2">Training Speed (it/s)</th>
65
  <th colspan="2">Total Loss</th>
66
- <th colspan="2">Final Loss</th>
67
  </tr>
68
  <tr>
69
  <th>Trapezoidal</th>
@@ -77,137 +77,137 @@ library_name: transformers
77
  <td>LLaMA</td>
78
  <td>4.35</td>
79
  <td>1.5734</td>
80
- <td></td>
81
- <td>1.2740</td>
82
- <td></td>
83
  </tr>
84
  <tr>
85
  <td>Qwen2</td>
86
  <td>4.41</td>
87
  <td>1.5735</td>
88
- <td></td>
89
- <td>1.2731</td>
90
- <td></td>
91
  </tr>
92
  <tr>
93
  <td>Mistral</td>
94
  <td>4.44</td>
95
  <td>1.5756</td>
96
- <td></td>
97
- <td>1.2754</td>
98
- <td></td>
99
  </tr>
100
  <tr>
101
  <td>Gemma</td>
102
  <td>1.79</td>
103
  <td>1.3894</td>
104
- <td></td>
105
- <td>1.0781</td>
106
- <td></td>
107
  </tr>
108
  <tr>
109
  <td>Gemma2</td>
110
  <td>1.59</td>
111
  <td>1.3754</td>
112
- <td></td>
113
- <td>1.0556</td>
114
- <td></td>
115
  </tr>
116
  <tr>
117
  <td>OLMo</td>
118
  <td>5.00</td>
119
  <td>1.6011</td>
120
- <td></td>
121
- <td>1.2821</td>
122
- <td></td>
123
  </tr>
124
  <tr>
125
  <td>Cohere</td>
126
  <td>4.04</td>
127
  <td>2.1327</td>
128
- <td>2.1152</td>
129
- <td>1.6205</td>
130
- <td>1.6546</td>
131
  </tr>
132
  <tr>
133
  <td>Phi</td>
134
  <td>5.78</td>
135
  <td>1.7525</td>
136
- <td>1.7419</td>
137
- <td>1.4378</td>
138
- <td>1.4537</td>
139
  </tr>
140
  <tr>
141
  <td>StarCoder2</td>
142
  <td>3.01</td>
143
- <td>1.6125</td>
144
- <td></td>
145
- <td>1.2996</td>
146
- <td></td>
147
  </tr>
148
  <tr>
149
  <td>StableLM</td>
150
  <td>5.06</td>
151
- <td>1.5835</td>
152
- <td></td>
153
- <td>1.2623</td>
154
- <td></td>
155
  </tr>
156
  <tr>
157
  <td>GPT2</td>
158
  <td>3.53</td>
159
  <td>2.1100</td>
160
- <td></td>
161
- <td>1.8190</td>
162
- <td></td>
163
  </tr>
164
  <tr>
165
  <td>GPT-J</td>
166
  <td>3.06</td>
167
  <td>1.7198</td>
168
- <td></td>
169
- <td>1.4475</td>
170
- <td></td>
171
- </tr>
172
- <tr>
173
- <td>GPT-Neo</td>
174
- <td>3.08</td>
175
- <td>1.6465</td>
176
- <td></td>
177
- <td>1.2917</td>
178
- <td></td>
179
  </tr>
180
  <tr>
181
  <td>GPT-NeoX</td>
182
  <td>5.06</td>
183
  <td>1.7233</td>
184
- <td></td>
185
- <td>1.4350</td>
186
- <td></td>
187
  </tr>
188
  <tr>
189
  <td>Bloom</td>
190
  <td>3.33</td>
191
  <td>1.6910</td>
192
- <td></td>
193
- <td>1.3640</td>
194
- <td></td>
195
  </tr>
196
  <tr>
197
  <td>MPT</td>
198
  <td>4.39</td>
199
  <td>1.6466</td>
200
- <td></td>
201
- <td>1.3386</td>
202
- <td></td>
203
  </tr>
204
  <tr>
205
  <td>RWKV</td>
206
- <td></td>
207
- <td></td>
208
- <td></td>
209
- <td></td>
210
- <td></td>
 
 
 
 
 
 
 
 
211
  </tr>
212
  </tbody>
213
  </table>
 
63
  <th rowspan="2">Arch.</th>
64
  <th rowspan="2">Training Speed (it/s)</th>
65
  <th colspan="2">Total Loss</th>
66
+ <th colspan="2">Final Loss (Last 10 steps Avg.)</th>
67
  </tr>
68
  <tr>
69
  <th>Trapezoidal</th>
 
77
  <td>LLaMA</td>
78
  <td>4.35</td>
79
  <td>1.5734</td>
80
+ <td><b>1.5626</b></td>
81
+ <td><b>1.2784</b></td>
82
+ <td>1.2855</td>
83
  </tr>
84
  <tr>
85
  <td>Qwen2</td>
86
  <td>4.41</td>
87
  <td>1.5735</td>
88
+ <td><b>1.5565</b></td>
89
+ <td><b>1.2760</b></td>
90
+ <td>1.2943</td>
91
  </tr>
92
  <tr>
93
  <td>Mistral</td>
94
  <td>4.44</td>
95
  <td>1.5756</td>
96
+ <td><b>1.5645</b></td>
97
+ <td><b>1.2787</b></td>
98
+ <td>1.3004</td>
99
  </tr>
100
  <tr>
101
  <td>Gemma</td>
102
  <td>1.79</td>
103
  <td>1.3894</td>
104
+ <td><b>1.3737</b></td>
105
+ <td><b>1.0841</b></td>
106
+ <td>1.1010</td>
107
  </tr>
108
  <tr>
109
  <td>Gemma2</td>
110
  <td>1.59</td>
111
  <td>1.3754</td>
112
+ <td><b>1.3597</b></td>
113
+ <td><b>1.0601</b></td>
114
+ <td>1.0752</td>
115
  </tr>
116
  <tr>
117
  <td>OLMo</td>
118
  <td>5.00</td>
119
  <td>1.6011</td>
120
+ <td><b>1.5855</b></td>
121
+ <td><b>1.2857</b></td>
122
+ <td>1.3039</td>
123
  </tr>
124
  <tr>
125
  <td>Cohere</td>
126
  <td>4.04</td>
127
  <td>2.1327</td>
128
+ <td><b>2.1152</b></td>
129
+ <td><b>1.6244</b></td>
130
+ <td>1.6593</td>
131
  </tr>
132
  <tr>
133
  <td>Phi</td>
134
  <td>5.78</td>
135
  <td>1.7525</td>
136
+ <td><b>1.7419</b></td>
137
+ <td><b>1.4770</b></td>
138
+ <td>1.4876</td>
139
  </tr>
140
  <tr>
141
  <td>StarCoder2</td>
142
  <td>3.01</td>
143
+ <td><b>1.6125</b></td>
144
+ <td>1.6498</td>
145
+ <td><b>1.3044</b></td>
146
+ <td>1.3718</td>
147
  </tr>
148
  <tr>
149
  <td>StableLM</td>
150
  <td>5.06</td>
151
+ <td><b>1.5835</b></td>
152
+ <td>1.5905</td>
153
+ <td><b>1.2662</b></td>
154
+ <td>1.2998</td>
155
  </tr>
156
  <tr>
157
  <td>GPT2</td>
158
  <td>3.53</td>
159
  <td>2.1100</td>
160
+ <td><b>2.1081</b></td>
161
+ <td><b>1.8236</b></td>
162
+ <td>1.8508</td>
163
  </tr>
164
  <tr>
165
  <td>GPT-J</td>
166
  <td>3.06</td>
167
  <td>1.7198</td>
168
+ <td><b>1.6976</b></td>
169
+ <td><b>1.4503</b></td>
170
+ <td>1.4541</td>
 
 
 
 
 
 
 
 
171
  </tr>
172
  <tr>
173
  <td>GPT-NeoX</td>
174
  <td>5.06</td>
175
  <td>1.7233</td>
176
+ <td><b>1.6981</b></td>
177
+ <td>1.4400</td>
178
+ <td><b>1.4303</b></td>
179
  </tr>
180
  <tr>
181
  <td>Bloom</td>
182
  <td>3.33</td>
183
  <td>1.6910</td>
184
+ <td><b>1.6704</b></td>
185
+ <td><b>1.3690</b></td>
186
+ <td>1.3774</td>
187
  </tr>
188
  <tr>
189
  <td>MPT</td>
190
  <td>4.39</td>
191
  <td>1.6466</td>
192
+ <td><b>1.6317</b></td>
193
+ <td><b>1.3443</b></td>
194
+ <td>1.3550</td>
195
  </tr>
196
  <tr>
197
  <td>RWKV</td>
198
+ <td>0.72</td>
199
+ <td><b>3.0151</b></td>
200
+ <td>3.0810</td>
201
+ <td><b>1.8569</b></td>
202
+ <td>1.9628</td>
203
+ </tr>
204
+ <tr>
205
+ <td>Avg.</td>
206
+ <td>-</td>
207
+ <td>1.755</td>
208
+ <td><b>1.749</b></td>
209
+ <td><b>1.389</b></td>
210
+ <td>1.413</td>
211
  </tr>
212
  </tbody>
213
  </table>