irodkin commited on
Commit
66864fa
·
verified ·
1 Parent(s): 791081f

Training checkpoint at step 4000

Browse files
Files changed (1) hide show
  1. trainer_state.json +1206 -126
trainer_state.json CHANGED
@@ -1,373 +1,1453 @@
1
  {
2
- "best_global_step": 1000,
3
- "best_metric": 2.4606094360351562,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_20/checkpoint-1000",
5
- "epoch": 0.02,
6
  "eval_steps": 100,
7
- "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.0005,
14
- "grad_norm": 23.921196999824634,
15
  "learning_rate": 4.8e-08,
16
- "loss": 3.5322,
17
  "step": 25
18
  },
19
  {
20
  "epoch": 0.001,
21
- "grad_norm": 25.294482425488255,
22
  "learning_rate": 9.8e-08,
23
- "loss": 3.447,
24
  "step": 50
25
  },
26
  {
27
  "epoch": 0.0015,
28
- "grad_norm": 15.01769397770396,
29
  "learning_rate": 1.4800000000000003e-07,
30
- "loss": 3.3089,
31
  "step": 75
32
  },
33
  {
34
  "epoch": 0.002,
35
- "grad_norm": 16.008940137598024,
36
  "learning_rate": 1.9800000000000003e-07,
37
- "loss": 3.1511,
38
  "step": 100
39
  },
40
  {
41
  "epoch": 0.002,
42
- "eval_loss": 2.927788257598877,
43
- "eval_runtime": 32.2151,
44
- "eval_samples_per_second": 3.166,
45
- "eval_steps_per_second": 1.583,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.0025,
50
- "grad_norm": 5.1785225673855315,
51
  "learning_rate": 2.48e-07,
52
- "loss": 2.917,
53
  "step": 125
54
  },
55
  {
56
  "epoch": 0.003,
57
- "grad_norm": 2.686420253223822,
58
  "learning_rate": 2.9800000000000005e-07,
59
- "loss": 2.7866,
60
  "step": 150
61
  },
62
  {
63
  "epoch": 0.0035,
64
- "grad_norm": 2.131122700028578,
65
  "learning_rate": 3.48e-07,
66
- "loss": 2.6983,
67
  "step": 175
68
  },
69
  {
70
  "epoch": 0.004,
71
- "grad_norm": 1.156232330871528,
72
  "learning_rate": 3.9800000000000004e-07,
73
- "loss": 2.6605,
74
  "step": 200
75
  },
76
  {
77
  "epoch": 0.004,
78
- "eval_loss": 2.622080087661743,
79
- "eval_runtime": 32.478,
80
- "eval_samples_per_second": 3.141,
81
- "eval_steps_per_second": 1.57,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.0045,
86
- "grad_norm": 1.0450250892253987,
87
  "learning_rate": 4.4800000000000004e-07,
88
- "loss": 2.622,
89
  "step": 225
90
  },
91
  {
92
  "epoch": 0.005,
93
- "grad_norm": 1.282826275999948,
94
  "learning_rate": 4.98e-07,
95
- "loss": 2.5969,
96
  "step": 250
97
  },
98
  {
99
  "epoch": 0.0055,
100
- "grad_norm": 1.0689503555811006,
101
  "learning_rate": 5.480000000000001e-07,
102
- "loss": 2.5696,
103
  "step": 275
104
  },
105
  {
106
  "epoch": 0.006,
107
- "grad_norm": 1.1295113329856576,
108
  "learning_rate": 5.98e-07,
109
- "loss": 2.5586,
110
  "step": 300
111
  },
112
  {
113
  "epoch": 0.006,
114
- "eval_loss": 2.542414903640747,
115
- "eval_runtime": 32.3741,
116
- "eval_samples_per_second": 3.151,
117
- "eval_steps_per_second": 1.575,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.0065,
122
- "grad_norm": 0.9903737262138339,
123
  "learning_rate": 6.48e-07,
124
- "loss": 2.5411,
125
  "step": 325
126
  },
127
  {
128
  "epoch": 0.007,
129
- "grad_norm": 0.8804340297720566,
130
  "learning_rate": 6.98e-07,
131
- "loss": 2.5381,
132
  "step": 350
133
  },
134
  {
135
  "epoch": 0.0075,
136
- "grad_norm": 0.9455594729886727,
137
  "learning_rate": 7.480000000000001e-07,
138
- "loss": 2.5281,
139
  "step": 375
140
  },
141
  {
142
  "epoch": 0.008,
143
- "grad_norm": 1.0362797793655913,
144
  "learning_rate": 7.98e-07,
145
- "loss": 2.516,
146
  "step": 400
147
  },
148
  {
149
  "epoch": 0.008,
150
- "eval_loss": 2.5159339904785156,
151
- "eval_runtime": 32.5645,
152
- "eval_samples_per_second": 3.132,
153
- "eval_steps_per_second": 1.566,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 0.0085,
158
- "grad_norm": 0.9118188709796304,
159
  "learning_rate": 8.480000000000001e-07,
160
- "loss": 2.5094,
161
  "step": 425
162
  },
163
  {
164
  "epoch": 0.009,
165
- "grad_norm": 0.9386092151497886,
166
  "learning_rate": 8.980000000000001e-07,
167
- "loss": 2.5168,
168
  "step": 450
169
  },
170
  {
171
  "epoch": 0.0095,
172
- "grad_norm": 0.9462657509886757,
173
  "learning_rate": 9.480000000000001e-07,
174
- "loss": 2.5026,
175
  "step": 475
176
  },
177
  {
178
  "epoch": 0.01,
179
- "grad_norm": 0.9797179393690154,
180
  "learning_rate": 9.98e-07,
181
- "loss": 2.5106,
182
  "step": 500
183
  },
184
  {
185
  "epoch": 0.01,
186
- "eval_loss": 2.4997920989990234,
187
- "eval_runtime": 32.6441,
188
- "eval_samples_per_second": 3.125,
189
- "eval_steps_per_second": 1.562,
190
  "step": 500
191
  },
192
  {
193
  "epoch": 0.0105,
194
- "grad_norm": 0.8200361058649662,
195
  "learning_rate": 1.0480000000000002e-06,
196
- "loss": 2.503,
197
  "step": 525
198
  },
199
  {
200
  "epoch": 0.011,
201
- "grad_norm": 0.8833572046578635,
202
  "learning_rate": 1.0980000000000001e-06,
203
- "loss": 2.4886,
204
  "step": 550
205
  },
206
  {
207
  "epoch": 0.0115,
208
- "grad_norm": 0.9703276038236935,
209
  "learning_rate": 1.148e-06,
210
- "loss": 2.4868,
211
  "step": 575
212
  },
213
  {
214
  "epoch": 0.012,
215
- "grad_norm": 1.3485829978019601,
216
  "learning_rate": 1.1980000000000002e-06,
217
- "loss": 2.4876,
218
  "step": 600
219
  },
220
  {
221
  "epoch": 0.012,
222
- "eval_loss": 2.486616849899292,
223
- "eval_runtime": 32.7645,
224
- "eval_samples_per_second": 3.113,
225
- "eval_steps_per_second": 1.557,
226
  "step": 600
227
  },
228
  {
229
  "epoch": 0.0125,
230
- "grad_norm": 0.9960360682218028,
231
  "learning_rate": 1.248e-06,
232
- "loss": 2.4846,
233
  "step": 625
234
  },
235
  {
236
  "epoch": 0.013,
237
- "grad_norm": 1.2959733905656832,
238
  "learning_rate": 1.2980000000000001e-06,
239
- "loss": 2.4746,
240
  "step": 650
241
  },
242
  {
243
  "epoch": 0.0135,
244
- "grad_norm": 1.395691356694074,
245
  "learning_rate": 1.348e-06,
246
- "loss": 2.4774,
247
  "step": 675
248
  },
249
  {
250
  "epoch": 0.014,
251
- "grad_norm": 1.1519407894265516,
252
  "learning_rate": 1.3980000000000002e-06,
253
- "loss": 2.47,
254
  "step": 700
255
  },
256
  {
257
  "epoch": 0.014,
258
- "eval_loss": 2.4779043197631836,
259
- "eval_runtime": 32.5852,
260
- "eval_samples_per_second": 3.13,
261
- "eval_steps_per_second": 1.565,
262
  "step": 700
263
  },
264
  {
265
  "epoch": 0.0145,
266
- "grad_norm": 1.3066599625373616,
267
  "learning_rate": 1.4480000000000002e-06,
268
- "loss": 2.4635,
269
  "step": 725
270
  },
271
  {
272
  "epoch": 0.015,
273
- "grad_norm": 0.9202041250565723,
274
  "learning_rate": 1.498e-06,
275
- "loss": 2.4738,
276
  "step": 750
277
  },
278
  {
279
  "epoch": 0.0155,
280
- "grad_norm": 1.271428818834587,
281
  "learning_rate": 1.548e-06,
282
- "loss": 2.4587,
283
  "step": 775
284
  },
285
  {
286
  "epoch": 0.016,
287
- "grad_norm": 1.124713525693466,
288
  "learning_rate": 1.5980000000000002e-06,
289
- "loss": 2.4683,
290
  "step": 800
291
  },
292
  {
293
  "epoch": 0.016,
294
- "eval_loss": 2.4715065956115723,
295
- "eval_runtime": 32.6647,
296
- "eval_samples_per_second": 3.123,
297
- "eval_steps_per_second": 1.561,
298
  "step": 800
299
  },
300
  {
301
  "epoch": 0.0165,
302
- "grad_norm": 1.3342531101252784,
303
  "learning_rate": 1.6480000000000001e-06,
304
- "loss": 2.4536,
305
  "step": 825
306
  },
307
  {
308
  "epoch": 0.017,
309
- "grad_norm": 1.3387154694191934,
310
  "learning_rate": 1.6980000000000003e-06,
311
- "loss": 2.4652,
312
  "step": 850
313
  },
314
  {
315
  "epoch": 0.0175,
316
- "grad_norm": 1.0351828691489098,
317
  "learning_rate": 1.7480000000000002e-06,
318
- "loss": 2.4607,
319
  "step": 875
320
  },
321
  {
322
  "epoch": 0.018,
323
- "grad_norm": 0.9834835131770852,
324
  "learning_rate": 1.798e-06,
325
- "loss": 2.4433,
326
  "step": 900
327
  },
328
  {
329
  "epoch": 0.018,
330
- "eval_loss": 2.4658470153808594,
331
- "eval_runtime": 32.359,
332
- "eval_samples_per_second": 3.152,
333
- "eval_steps_per_second": 1.576,
334
  "step": 900
335
  },
336
  {
337
  "epoch": 0.0185,
338
- "grad_norm": 1.0228623820072862,
339
  "learning_rate": 1.8480000000000001e-06,
340
- "loss": 2.4615,
341
  "step": 925
342
  },
343
  {
344
  "epoch": 0.019,
345
- "grad_norm": 1.3007839596114972,
346
  "learning_rate": 1.898e-06,
347
- "loss": 2.4651,
348
  "step": 950
349
  },
350
  {
351
  "epoch": 0.0195,
352
- "grad_norm": 1.0925217245981393,
353
  "learning_rate": 1.9480000000000002e-06,
354
- "loss": 2.4581,
355
  "step": 975
356
  },
357
  {
358
  "epoch": 0.02,
359
- "grad_norm": 1.0790549218851497,
360
  "learning_rate": 1.998e-06,
361
- "loss": 2.4509,
362
  "step": 1000
363
  },
364
  {
365
  "epoch": 0.02,
366
- "eval_loss": 2.4606094360351562,
367
- "eval_runtime": 32.4413,
368
- "eval_samples_per_second": 3.144,
369
- "eval_steps_per_second": 1.572,
370
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
  }
372
  ],
373
  "logging_steps": 25,
@@ -387,7 +1467,7 @@
387
  "attributes": {}
388
  }
389
  },
390
- "total_flos": 3.183202298327204e+18,
391
  "train_batch_size": 1,
392
  "trial_name": null,
393
  "trial_params": null
 
1
  {
2
+ "best_global_step": 4000,
3
+ "best_metric": 2.4323527812957764,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_20/checkpoint-4000",
5
+ "epoch": 0.08,
6
  "eval_steps": 100,
7
+ "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.0005,
14
+ "grad_norm": 39.75564521032967,
15
  "learning_rate": 4.8e-08,
16
+ "loss": 3.6517,
17
  "step": 25
18
  },
19
  {
20
  "epoch": 0.001,
21
+ "grad_norm": 28.937531835097435,
22
  "learning_rate": 9.8e-08,
23
+ "loss": 3.5931,
24
  "step": 50
25
  },
26
  {
27
  "epoch": 0.0015,
28
+ "grad_norm": 21.922720332659644,
29
  "learning_rate": 1.4800000000000003e-07,
30
+ "loss": 3.3397,
31
  "step": 75
32
  },
33
  {
34
  "epoch": 0.002,
35
+ "grad_norm": 8.739610199908325,
36
  "learning_rate": 1.9800000000000003e-07,
37
+ "loss": 3.1289,
38
  "step": 100
39
  },
40
  {
41
  "epoch": 0.002,
42
+ "eval_loss": 2.9243295192718506,
43
+ "eval_runtime": 264.3302,
44
+ "eval_samples_per_second": 3.11,
45
+ "eval_steps_per_second": 1.555,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.0025,
50
+ "grad_norm": 4.433912600039677,
51
  "learning_rate": 2.48e-07,
52
+ "loss": 2.8957,
53
  "step": 125
54
  },
55
  {
56
  "epoch": 0.003,
57
+ "grad_norm": 3.2874790066620303,
58
  "learning_rate": 2.9800000000000005e-07,
59
+ "loss": 2.763,
60
  "step": 150
61
  },
62
  {
63
  "epoch": 0.0035,
64
+ "grad_norm": 1.5203472215469231,
65
  "learning_rate": 3.48e-07,
66
+ "loss": 2.676,
67
  "step": 175
68
  },
69
  {
70
  "epoch": 0.004,
71
+ "grad_norm": 1.1945541683905954,
72
  "learning_rate": 3.9800000000000004e-07,
73
+ "loss": 2.635,
74
  "step": 200
75
  },
76
  {
77
  "epoch": 0.004,
78
+ "eval_loss": 2.6094932556152344,
79
+ "eval_runtime": 265.7702,
80
+ "eval_samples_per_second": 3.093,
81
+ "eval_steps_per_second": 1.546,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.0045,
86
+ "grad_norm": 1.0852713304633745,
87
  "learning_rate": 4.4800000000000004e-07,
88
+ "loss": 2.6016,
89
  "step": 225
90
  },
91
  {
92
  "epoch": 0.005,
93
+ "grad_norm": 1.0733940346699529,
94
  "learning_rate": 4.98e-07,
95
+ "loss": 2.5797,
96
  "step": 250
97
  },
98
  {
99
  "epoch": 0.0055,
100
+ "grad_norm": 0.9273949035031271,
101
  "learning_rate": 5.480000000000001e-07,
102
+ "loss": 2.5607,
103
  "step": 275
104
  },
105
  {
106
  "epoch": 0.006,
107
+ "grad_norm": 0.9289300678591714,
108
  "learning_rate": 5.98e-07,
109
+ "loss": 2.552,
110
  "step": 300
111
  },
112
  {
113
  "epoch": 0.006,
114
+ "eval_loss": 2.541522264480591,
115
+ "eval_runtime": 266.7478,
116
+ "eval_samples_per_second": 3.082,
117
+ "eval_steps_per_second": 1.541,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.0065,
122
+ "grad_norm": 1.1328584507449984,
123
  "learning_rate": 6.48e-07,
124
+ "loss": 2.5402,
125
  "step": 325
126
  },
127
  {
128
  "epoch": 0.007,
129
+ "grad_norm": 0.8593307029257858,
130
  "learning_rate": 6.98e-07,
131
+ "loss": 2.5286,
132
  "step": 350
133
  },
134
  {
135
  "epoch": 0.0075,
136
+ "grad_norm": 0.895615604067586,
137
  "learning_rate": 7.480000000000001e-07,
138
+ "loss": 2.5311,
139
  "step": 375
140
  },
141
  {
142
  "epoch": 0.008,
143
+ "grad_norm": 0.912306580242149,
144
  "learning_rate": 7.98e-07,
145
+ "loss": 2.5037,
146
  "step": 400
147
  },
148
  {
149
  "epoch": 0.008,
150
+ "eval_loss": 2.514389991760254,
151
+ "eval_runtime": 266.4899,
152
+ "eval_samples_per_second": 3.085,
153
+ "eval_steps_per_second": 1.542,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 0.0085,
158
+ "grad_norm": 1.1866535514670034,
159
  "learning_rate": 8.480000000000001e-07,
160
+ "loss": 2.5011,
161
  "step": 425
162
  },
163
  {
164
  "epoch": 0.009,
165
+ "grad_norm": 1.211342504193914,
166
  "learning_rate": 8.980000000000001e-07,
167
+ "loss": 2.503,
168
  "step": 450
169
  },
170
  {
171
  "epoch": 0.0095,
172
+ "grad_norm": 1.113763817383069,
173
  "learning_rate": 9.480000000000001e-07,
174
+ "loss": 2.4999,
175
  "step": 475
176
  },
177
  {
178
  "epoch": 0.01,
179
+ "grad_norm": 1.2585585589647226,
180
  "learning_rate": 9.98e-07,
181
+ "loss": 2.4872,
182
  "step": 500
183
  },
184
  {
185
  "epoch": 0.01,
186
+ "eval_loss": 2.497868061065674,
187
+ "eval_runtime": 265.7962,
188
+ "eval_samples_per_second": 3.093,
189
+ "eval_steps_per_second": 1.546,
190
  "step": 500
191
  },
192
  {
193
  "epoch": 0.0105,
194
+ "grad_norm": 1.2585825718084245,
195
  "learning_rate": 1.0480000000000002e-06,
196
+ "loss": 2.4852,
197
  "step": 525
198
  },
199
  {
200
  "epoch": 0.011,
201
+ "grad_norm": 1.4101257437846046,
202
  "learning_rate": 1.0980000000000001e-06,
203
+ "loss": 2.4892,
204
  "step": 550
205
  },
206
  {
207
  "epoch": 0.0115,
208
+ "grad_norm": 1.1975234150707363,
209
  "learning_rate": 1.148e-06,
210
+ "loss": 2.4861,
211
  "step": 575
212
  },
213
  {
214
  "epoch": 0.012,
215
+ "grad_norm": 1.3662769225582332,
216
  "learning_rate": 1.1980000000000002e-06,
217
+ "loss": 2.4882,
218
  "step": 600
219
  },
220
  {
221
  "epoch": 0.012,
222
+ "eval_loss": 2.4879231452941895,
223
+ "eval_runtime": 267.0005,
224
+ "eval_samples_per_second": 3.079,
225
+ "eval_steps_per_second": 1.539,
226
  "step": 600
227
  },
228
  {
229
  "epoch": 0.0125,
230
+ "grad_norm": 1.3086724275194024,
231
  "learning_rate": 1.248e-06,
232
+ "loss": 2.4745,
233
  "step": 625
234
  },
235
  {
236
  "epoch": 0.013,
237
+ "grad_norm": 1.317023206802888,
238
  "learning_rate": 1.2980000000000001e-06,
239
+ "loss": 2.4727,
240
  "step": 650
241
  },
242
  {
243
  "epoch": 0.0135,
244
+ "grad_norm": 1.5284967544483212,
245
  "learning_rate": 1.348e-06,
246
+ "loss": 2.469,
247
  "step": 675
248
  },
249
  {
250
  "epoch": 0.014,
251
+ "grad_norm": 1.1047595217316941,
252
  "learning_rate": 1.3980000000000002e-06,
253
+ "loss": 2.4695,
254
  "step": 700
255
  },
256
  {
257
  "epoch": 0.014,
258
+ "eval_loss": 2.480103015899658,
259
+ "eval_runtime": 263.5022,
260
+ "eval_samples_per_second": 3.12,
261
+ "eval_steps_per_second": 1.56,
262
  "step": 700
263
  },
264
  {
265
  "epoch": 0.0145,
266
+ "grad_norm": 1.2077328209863791,
267
  "learning_rate": 1.4480000000000002e-06,
268
+ "loss": 2.4654,
269
  "step": 725
270
  },
271
  {
272
  "epoch": 0.015,
273
+ "grad_norm": 1.209220841771836,
274
  "learning_rate": 1.498e-06,
275
+ "loss": 2.4663,
276
  "step": 750
277
  },
278
  {
279
  "epoch": 0.0155,
280
+ "grad_norm": 1.3063169829879686,
281
  "learning_rate": 1.548e-06,
282
+ "loss": 2.4704,
283
  "step": 775
284
  },
285
  {
286
  "epoch": 0.016,
287
+ "grad_norm": 1.3180183352683195,
288
  "learning_rate": 1.5980000000000002e-06,
289
+ "loss": 2.4583,
290
  "step": 800
291
  },
292
  {
293
  "epoch": 0.016,
294
+ "eval_loss": 2.473590850830078,
295
+ "eval_runtime": 305.9875,
296
+ "eval_samples_per_second": 2.686,
297
+ "eval_steps_per_second": 1.343,
298
  "step": 800
299
  },
300
  {
301
  "epoch": 0.0165,
302
+ "grad_norm": 1.1674852380778837,
303
  "learning_rate": 1.6480000000000001e-06,
304
+ "loss": 2.467,
305
  "step": 825
306
  },
307
  {
308
  "epoch": 0.017,
309
+ "grad_norm": 1.2497656349941002,
310
  "learning_rate": 1.6980000000000003e-06,
311
+ "loss": 2.4612,
312
  "step": 850
313
  },
314
  {
315
  "epoch": 0.0175,
316
+ "grad_norm": 1.3358614980967494,
317
  "learning_rate": 1.7480000000000002e-06,
318
+ "loss": 2.4636,
319
  "step": 875
320
  },
321
  {
322
  "epoch": 0.018,
323
+ "grad_norm": 1.252489857653356,
324
  "learning_rate": 1.798e-06,
325
+ "loss": 2.454,
326
  "step": 900
327
  },
328
  {
329
  "epoch": 0.018,
330
+ "eval_loss": 2.4681763648986816,
331
+ "eval_runtime": 264.702,
332
+ "eval_samples_per_second": 3.105,
333
+ "eval_steps_per_second": 1.553,
334
  "step": 900
335
  },
336
  {
337
  "epoch": 0.0185,
338
+ "grad_norm": 1.2815437998994337,
339
  "learning_rate": 1.8480000000000001e-06,
340
+ "loss": 2.4571,
341
  "step": 925
342
  },
343
  {
344
  "epoch": 0.019,
345
+ "grad_norm": 1.0902475329451575,
346
  "learning_rate": 1.898e-06,
347
+ "loss": 2.451,
348
  "step": 950
349
  },
350
  {
351
  "epoch": 0.0195,
352
+ "grad_norm": 1.1502696024965324,
353
  "learning_rate": 1.9480000000000002e-06,
354
+ "loss": 2.4527,
355
  "step": 975
356
  },
357
  {
358
  "epoch": 0.02,
359
+ "grad_norm": 1.2336661855806117,
360
  "learning_rate": 1.998e-06,
361
+ "loss": 2.4496,
362
  "step": 1000
363
  },
364
  {
365
  "epoch": 0.02,
366
+ "eval_loss": 2.463880777359009,
367
+ "eval_runtime": 275.7426,
368
+ "eval_samples_per_second": 2.981,
369
+ "eval_steps_per_second": 1.491,
370
  "step": 1000
371
+ },
372
+ {
373
+ "epoch": 0.0205,
374
+ "grad_norm": 1.2680742209094296,
375
+ "learning_rate": 2.048e-06,
376
+ "loss": 2.4494,
377
+ "step": 1025
378
+ },
379
+ {
380
+ "epoch": 0.021,
381
+ "grad_norm": 1.0341778808278126,
382
+ "learning_rate": 2.098e-06,
383
+ "loss": 2.4467,
384
+ "step": 1050
385
+ },
386
+ {
387
+ "epoch": 0.0215,
388
+ "grad_norm": 0.9860490736001175,
389
+ "learning_rate": 2.148e-06,
390
+ "loss": 2.4473,
391
+ "step": 1075
392
+ },
393
+ {
394
+ "epoch": 0.022,
395
+ "grad_norm": 0.9419267295275278,
396
+ "learning_rate": 2.198e-06,
397
+ "loss": 2.443,
398
+ "step": 1100
399
+ },
400
+ {
401
+ "epoch": 0.022,
402
+ "eval_loss": 2.4598941802978516,
403
+ "eval_runtime": 265.0502,
404
+ "eval_samples_per_second": 3.101,
405
+ "eval_steps_per_second": 1.551,
406
+ "step": 1100
407
+ },
408
+ {
409
+ "epoch": 0.0225,
410
+ "grad_norm": 1.3280720471027394,
411
+ "learning_rate": 2.2480000000000003e-06,
412
+ "loss": 2.4515,
413
+ "step": 1125
414
+ },
415
+ {
416
+ "epoch": 0.023,
417
+ "grad_norm": 1.053570785582915,
418
+ "learning_rate": 2.2980000000000003e-06,
419
+ "loss": 2.4396,
420
+ "step": 1150
421
+ },
422
+ {
423
+ "epoch": 0.0235,
424
+ "grad_norm": 0.9108119839585552,
425
+ "learning_rate": 2.3480000000000002e-06,
426
+ "loss": 2.4442,
427
+ "step": 1175
428
+ },
429
+ {
430
+ "epoch": 0.024,
431
+ "grad_norm": 1.0062346367900277,
432
+ "learning_rate": 2.398e-06,
433
+ "loss": 2.4443,
434
+ "step": 1200
435
+ },
436
+ {
437
+ "epoch": 0.024,
438
+ "eval_loss": 2.456455945968628,
439
+ "eval_runtime": 264.5888,
440
+ "eval_samples_per_second": 3.107,
441
+ "eval_steps_per_second": 1.553,
442
+ "step": 1200
443
+ },
444
+ {
445
+ "epoch": 0.0245,
446
+ "grad_norm": 1.0264127705426926,
447
+ "learning_rate": 2.448e-06,
448
+ "loss": 2.4351,
449
+ "step": 1225
450
+ },
451
+ {
452
+ "epoch": 0.025,
453
+ "grad_norm": 0.8015249588347212,
454
+ "learning_rate": 2.498e-06,
455
+ "loss": 2.4406,
456
+ "step": 1250
457
+ },
458
+ {
459
+ "epoch": 0.0255,
460
+ "grad_norm": 1.1105649485540114,
461
+ "learning_rate": 2.5480000000000004e-06,
462
+ "loss": 2.4377,
463
+ "step": 1275
464
+ },
465
+ {
466
+ "epoch": 0.026,
467
+ "grad_norm": 0.9701758426012801,
468
+ "learning_rate": 2.598e-06,
469
+ "loss": 2.4341,
470
+ "step": 1300
471
+ },
472
+ {
473
+ "epoch": 0.026,
474
+ "eval_loss": 2.453026056289673,
475
+ "eval_runtime": 264.7653,
476
+ "eval_samples_per_second": 3.105,
477
+ "eval_steps_per_second": 1.552,
478
+ "step": 1300
479
+ },
480
+ {
481
+ "epoch": 0.0265,
482
+ "grad_norm": 0.9587254891845429,
483
+ "learning_rate": 2.648e-06,
484
+ "loss": 2.4303,
485
+ "step": 1325
486
+ },
487
+ {
488
+ "epoch": 0.027,
489
+ "grad_norm": 0.8135883960763247,
490
+ "learning_rate": 2.6980000000000003e-06,
491
+ "loss": 2.4363,
492
+ "step": 1350
493
+ },
494
+ {
495
+ "epoch": 0.0275,
496
+ "grad_norm": 0.9192860127847176,
497
+ "learning_rate": 2.748e-06,
498
+ "loss": 2.4257,
499
+ "step": 1375
500
+ },
501
+ {
502
+ "epoch": 0.028,
503
+ "grad_norm": 0.947465928893444,
504
+ "learning_rate": 2.798e-06,
505
+ "loss": 2.4353,
506
+ "step": 1400
507
+ },
508
+ {
509
+ "epoch": 0.028,
510
+ "eval_loss": 2.450345993041992,
511
+ "eval_runtime": 265.6266,
512
+ "eval_samples_per_second": 3.095,
513
+ "eval_steps_per_second": 1.547,
514
+ "step": 1400
515
+ },
516
+ {
517
+ "epoch": 0.0285,
518
+ "grad_norm": 0.9270137901066681,
519
+ "learning_rate": 2.848e-06,
520
+ "loss": 2.4347,
521
+ "step": 1425
522
+ },
523
+ {
524
+ "epoch": 0.029,
525
+ "grad_norm": 0.8839980710491563,
526
+ "learning_rate": 2.8980000000000005e-06,
527
+ "loss": 2.4213,
528
+ "step": 1450
529
+ },
530
+ {
531
+ "epoch": 0.0295,
532
+ "grad_norm": 0.913196005454606,
533
+ "learning_rate": 2.9480000000000004e-06,
534
+ "loss": 2.4232,
535
+ "step": 1475
536
+ },
537
+ {
538
+ "epoch": 0.03,
539
+ "grad_norm": 0.8139623858623861,
540
+ "learning_rate": 2.9980000000000003e-06,
541
+ "loss": 2.4254,
542
+ "step": 1500
543
+ },
544
+ {
545
+ "epoch": 0.03,
546
+ "eval_loss": 2.447662830352783,
547
+ "eval_runtime": 263.4353,
548
+ "eval_samples_per_second": 3.12,
549
+ "eval_steps_per_second": 1.56,
550
+ "step": 1500
551
+ },
552
+ {
553
+ "epoch": 0.0305,
554
+ "grad_norm": 0.8422198221554755,
555
+ "learning_rate": 3.0480000000000003e-06,
556
+ "loss": 2.4196,
557
+ "step": 1525
558
+ },
559
+ {
560
+ "epoch": 0.031,
561
+ "grad_norm": 0.8542957579365906,
562
+ "learning_rate": 3.0980000000000007e-06,
563
+ "loss": 2.4294,
564
+ "step": 1550
565
+ },
566
+ {
567
+ "epoch": 0.0315,
568
+ "grad_norm": 1.149263137594797,
569
+ "learning_rate": 3.1480000000000006e-06,
570
+ "loss": 2.4265,
571
+ "step": 1575
572
+ },
573
+ {
574
+ "epoch": 0.032,
575
+ "grad_norm": 0.811470126240392,
576
+ "learning_rate": 3.198e-06,
577
+ "loss": 2.4105,
578
+ "step": 1600
579
+ },
580
+ {
581
+ "epoch": 0.032,
582
+ "eval_loss": 2.4456679821014404,
583
+ "eval_runtime": 264.056,
584
+ "eval_samples_per_second": 3.113,
585
+ "eval_steps_per_second": 1.556,
586
+ "step": 1600
587
+ },
588
+ {
589
+ "epoch": 0.0325,
590
+ "grad_norm": 2.3928975221881434,
591
+ "learning_rate": 3.248e-06,
592
+ "loss": 2.4208,
593
+ "step": 1625
594
+ },
595
+ {
596
+ "epoch": 0.033,
597
+ "grad_norm": 0.8031315125360012,
598
+ "learning_rate": 3.298e-06,
599
+ "loss": 2.4224,
600
+ "step": 1650
601
+ },
602
+ {
603
+ "epoch": 0.0335,
604
+ "grad_norm": 0.835567276692195,
605
+ "learning_rate": 3.348e-06,
606
+ "loss": 2.4188,
607
+ "step": 1675
608
+ },
609
+ {
610
+ "epoch": 0.034,
611
+ "grad_norm": 0.8894325175719718,
612
+ "learning_rate": 3.3980000000000003e-06,
613
+ "loss": 2.4206,
614
+ "step": 1700
615
+ },
616
+ {
617
+ "epoch": 0.034,
618
+ "eval_loss": 2.4437851905822754,
619
+ "eval_runtime": 264.6455,
620
+ "eval_samples_per_second": 3.106,
621
+ "eval_steps_per_second": 1.553,
622
+ "step": 1700
623
+ },
624
+ {
625
+ "epoch": 0.0345,
626
+ "grad_norm": 0.802724390649243,
627
+ "learning_rate": 3.4480000000000003e-06,
628
+ "loss": 2.4241,
629
+ "step": 1725
630
+ },
631
+ {
632
+ "epoch": 0.035,
633
+ "grad_norm": 0.8206312612014312,
634
+ "learning_rate": 3.4980000000000002e-06,
635
+ "loss": 2.4157,
636
+ "step": 1750
637
+ },
638
+ {
639
+ "epoch": 0.0355,
640
+ "grad_norm": 0.8653789917535344,
641
+ "learning_rate": 3.548e-06,
642
+ "loss": 2.412,
643
+ "step": 1775
644
+ },
645
+ {
646
+ "epoch": 0.036,
647
+ "grad_norm": 0.7816319078215015,
648
+ "learning_rate": 3.5980000000000005e-06,
649
+ "loss": 2.4179,
650
+ "step": 1800
651
+ },
652
+ {
653
+ "epoch": 0.036,
654
+ "eval_loss": 2.4423036575317383,
655
+ "eval_runtime": 264.5578,
656
+ "eval_samples_per_second": 3.107,
657
+ "eval_steps_per_second": 1.554,
658
+ "step": 1800
659
+ },
660
+ {
661
+ "epoch": 0.0365,
662
+ "grad_norm": 0.707594544466941,
663
+ "learning_rate": 3.6480000000000005e-06,
664
+ "loss": 2.416,
665
+ "step": 1825
666
+ },
667
+ {
668
+ "epoch": 0.037,
669
+ "grad_norm": 0.7481066913011816,
670
+ "learning_rate": 3.6980000000000004e-06,
671
+ "loss": 2.4242,
672
+ "step": 1850
673
+ },
674
+ {
675
+ "epoch": 0.0375,
676
+ "grad_norm": 0.7612014979445353,
677
+ "learning_rate": 3.7480000000000004e-06,
678
+ "loss": 2.4173,
679
+ "step": 1875
680
+ },
681
+ {
682
+ "epoch": 0.038,
683
+ "grad_norm": 0.772750918048857,
684
+ "learning_rate": 3.7980000000000007e-06,
685
+ "loss": 2.4134,
686
+ "step": 1900
687
+ },
688
+ {
689
+ "epoch": 0.038,
690
+ "eval_loss": 2.440969228744507,
691
+ "eval_runtime": 274.3624,
692
+ "eval_samples_per_second": 2.996,
693
+ "eval_steps_per_second": 1.498,
694
+ "step": 1900
695
+ },
696
+ {
697
+ "epoch": 0.0385,
698
+ "grad_norm": 0.7927966042188935,
699
+ "learning_rate": 3.848e-06,
700
+ "loss": 2.4131,
701
+ "step": 1925
702
+ },
703
+ {
704
+ "epoch": 0.039,
705
+ "grad_norm": 0.7664274167276341,
706
+ "learning_rate": 3.898e-06,
707
+ "loss": 2.4133,
708
+ "step": 1950
709
+ },
710
+ {
711
+ "epoch": 0.0395,
712
+ "grad_norm": 0.7038638213491795,
713
+ "learning_rate": 3.948e-06,
714
+ "loss": 2.4135,
715
+ "step": 1975
716
+ },
717
+ {
718
+ "epoch": 0.04,
719
+ "grad_norm": 0.7231696877425319,
720
+ "learning_rate": 3.9980000000000005e-06,
721
+ "loss": 2.4169,
722
+ "step": 2000
723
+ },
724
+ {
725
+ "epoch": 0.04,
726
+ "eval_loss": 2.439641237258911,
727
+ "eval_runtime": 282.4449,
728
+ "eval_samples_per_second": 2.91,
729
+ "eval_steps_per_second": 1.455,
730
+ "step": 2000
731
+ },
732
+ {
733
+ "epoch": 0.0405,
734
+ "grad_norm": 0.7184393791203537,
735
+ "learning_rate": 4.048e-06,
736
+ "loss": 2.4071,
737
+ "step": 2025
738
+ },
739
+ {
740
+ "epoch": 0.041,
741
+ "grad_norm": 0.7366813467336683,
742
+ "learning_rate": 4.098e-06,
743
+ "loss": 2.4113,
744
+ "step": 2050
745
+ },
746
+ {
747
+ "epoch": 0.0415,
748
+ "grad_norm": 0.7081408763220511,
749
+ "learning_rate": 4.148000000000001e-06,
750
+ "loss": 2.4168,
751
+ "step": 2075
752
+ },
753
+ {
754
+ "epoch": 0.042,
755
+ "grad_norm": 0.6912835983850483,
756
+ "learning_rate": 4.198e-06,
757
+ "loss": 2.4105,
758
+ "step": 2100
759
+ },
760
+ {
761
+ "epoch": 0.042,
762
+ "eval_loss": 2.438904047012329,
763
+ "eval_runtime": 277.7481,
764
+ "eval_samples_per_second": 2.96,
765
+ "eval_steps_per_second": 1.48,
766
+ "step": 2100
767
+ },
768
+ {
769
+ "epoch": 0.0425,
770
+ "grad_norm": 0.7745538733736145,
771
+ "learning_rate": 4.248000000000001e-06,
772
+ "loss": 2.4131,
773
+ "step": 2125
774
+ },
775
+ {
776
+ "epoch": 0.043,
777
+ "grad_norm": 0.6897576190091962,
778
+ "learning_rate": 4.298e-06,
779
+ "loss": 2.4084,
780
+ "step": 2150
781
+ },
782
+ {
783
+ "epoch": 0.0435,
784
+ "grad_norm": 0.7020994032566351,
785
+ "learning_rate": 4.3480000000000006e-06,
786
+ "loss": 2.4125,
787
+ "step": 2175
788
+ },
789
+ {
790
+ "epoch": 0.044,
791
+ "grad_norm": 0.6668651869738377,
792
+ "learning_rate": 4.398000000000001e-06,
793
+ "loss": 2.4034,
794
+ "step": 2200
795
+ },
796
+ {
797
+ "epoch": 0.044,
798
+ "eval_loss": 2.4380908012390137,
799
+ "eval_runtime": 268.2252,
800
+ "eval_samples_per_second": 3.065,
801
+ "eval_steps_per_second": 1.532,
802
+ "step": 2200
803
+ },
804
+ {
805
+ "epoch": 0.0445,
806
+ "grad_norm": 0.6547759047620061,
807
+ "learning_rate": 4.4480000000000004e-06,
808
+ "loss": 2.4099,
809
+ "step": 2225
810
+ },
811
+ {
812
+ "epoch": 0.045,
813
+ "grad_norm": 0.6865815945777785,
814
+ "learning_rate": 4.498e-06,
815
+ "loss": 2.412,
816
+ "step": 2250
817
+ },
818
+ {
819
+ "epoch": 0.0455,
820
+ "grad_norm": 0.6878267781655092,
821
+ "learning_rate": 4.548e-06,
822
+ "loss": 2.4137,
823
+ "step": 2275
824
+ },
825
+ {
826
+ "epoch": 0.046,
827
+ "grad_norm": 0.8314813616644483,
828
+ "learning_rate": 4.598e-06,
829
+ "loss": 2.4097,
830
+ "step": 2300
831
+ },
832
+ {
833
+ "epoch": 0.046,
834
+ "eval_loss": 2.4374496936798096,
835
+ "eval_runtime": 263.1701,
836
+ "eval_samples_per_second": 3.123,
837
+ "eval_steps_per_second": 1.562,
838
+ "step": 2300
839
+ },
840
+ {
841
+ "epoch": 0.0465,
842
+ "grad_norm": 0.6723966792931375,
843
+ "learning_rate": 4.648e-06,
844
+ "loss": 2.4051,
845
+ "step": 2325
846
+ },
847
+ {
848
+ "epoch": 0.047,
849
+ "grad_norm": 0.7003756914046538,
850
+ "learning_rate": 4.698000000000001e-06,
851
+ "loss": 2.4032,
852
+ "step": 2350
853
+ },
854
+ {
855
+ "epoch": 0.0475,
856
+ "grad_norm": 0.6747085415631567,
857
+ "learning_rate": 4.748e-06,
858
+ "loss": 2.4096,
859
+ "step": 2375
860
+ },
861
+ {
862
+ "epoch": 0.048,
863
+ "grad_norm": 0.6571218540079207,
864
+ "learning_rate": 4.7980000000000005e-06,
865
+ "loss": 2.4165,
866
+ "step": 2400
867
+ },
868
+ {
869
+ "epoch": 0.048,
870
+ "eval_loss": 2.4365923404693604,
871
+ "eval_runtime": 264.2268,
872
+ "eval_samples_per_second": 3.111,
873
+ "eval_steps_per_second": 1.555,
874
+ "step": 2400
875
+ },
876
+ {
877
+ "epoch": 0.0485,
878
+ "grad_norm": 0.7464314980483315,
879
+ "learning_rate": 4.848000000000001e-06,
880
+ "loss": 2.4098,
881
+ "step": 2425
882
+ },
883
+ {
884
+ "epoch": 0.049,
885
+ "grad_norm": 0.6267266619200393,
886
+ "learning_rate": 4.898e-06,
887
+ "loss": 2.4019,
888
+ "step": 2450
889
+ },
890
+ {
891
+ "epoch": 0.0495,
892
+ "grad_norm": 0.6650772680412506,
893
+ "learning_rate": 4.948000000000001e-06,
894
+ "loss": 2.405,
895
+ "step": 2475
896
+ },
897
+ {
898
+ "epoch": 0.05,
899
+ "grad_norm": 0.7197173899674899,
900
+ "learning_rate": 4.998e-06,
901
+ "loss": 2.4095,
902
+ "step": 2500
903
+ },
904
+ {
905
+ "epoch": 0.05,
906
+ "eval_loss": 2.4358348846435547,
907
+ "eval_runtime": 266.7682,
908
+ "eval_samples_per_second": 3.081,
909
+ "eval_steps_per_second": 1.541,
910
+ "step": 2500
911
+ },
912
+ {
913
+ "epoch": 0.0505,
914
+ "grad_norm": 0.6249572472256157,
915
+ "learning_rate": 5.048000000000001e-06,
916
+ "loss": 2.4058,
917
+ "step": 2525
918
+ },
919
+ {
920
+ "epoch": 0.051,
921
+ "grad_norm": 0.7429228032719255,
922
+ "learning_rate": 5.098000000000001e-06,
923
+ "loss": 2.4084,
924
+ "step": 2550
925
+ },
926
+ {
927
+ "epoch": 0.0515,
928
+ "grad_norm": 0.6320325962693778,
929
+ "learning_rate": 5.1480000000000005e-06,
930
+ "loss": 2.4015,
931
+ "step": 2575
932
+ },
933
+ {
934
+ "epoch": 0.052,
935
+ "grad_norm": 0.672581755106835,
936
+ "learning_rate": 5.198000000000001e-06,
937
+ "loss": 2.4051,
938
+ "step": 2600
939
+ },
940
+ {
941
+ "epoch": 0.052,
942
+ "eval_loss": 2.4351842403411865,
943
+ "eval_runtime": 264.9149,
944
+ "eval_samples_per_second": 3.103,
945
+ "eval_steps_per_second": 1.551,
946
+ "step": 2600
947
+ },
948
+ {
949
+ "epoch": 0.0525,
950
+ "grad_norm": 0.7086480776921088,
951
+ "learning_rate": 5.248000000000001e-06,
952
+ "loss": 2.3988,
953
+ "step": 2625
954
+ },
955
+ {
956
+ "epoch": 0.053,
957
+ "grad_norm": 0.6774201154936552,
958
+ "learning_rate": 5.298000000000001e-06,
959
+ "loss": 2.394,
960
+ "step": 2650
961
+ },
962
+ {
963
+ "epoch": 0.0535,
964
+ "grad_norm": 0.6661104910300973,
965
+ "learning_rate": 5.348000000000001e-06,
966
+ "loss": 2.4034,
967
+ "step": 2675
968
+ },
969
+ {
970
+ "epoch": 0.054,
971
+ "grad_norm": 0.6224421593448741,
972
+ "learning_rate": 5.398e-06,
973
+ "loss": 2.3939,
974
+ "step": 2700
975
+ },
976
+ {
977
+ "epoch": 0.054,
978
+ "eval_loss": 2.434826374053955,
979
+ "eval_runtime": 264.1641,
980
+ "eval_samples_per_second": 3.112,
981
+ "eval_steps_per_second": 1.556,
982
+ "step": 2700
983
+ },
984
+ {
985
+ "epoch": 0.0545,
986
+ "grad_norm": 0.6944661408419767,
987
+ "learning_rate": 5.448e-06,
988
+ "loss": 2.4064,
989
+ "step": 2725
990
+ },
991
+ {
992
+ "epoch": 0.055,
993
+ "grad_norm": 0.6597297955298902,
994
+ "learning_rate": 5.498e-06,
995
+ "loss": 2.4051,
996
+ "step": 2750
997
+ },
998
+ {
999
+ "epoch": 0.0555,
1000
+ "grad_norm": 0.6526109506522182,
1001
+ "learning_rate": 5.548e-06,
1002
+ "loss": 2.4124,
1003
+ "step": 2775
1004
+ },
1005
+ {
1006
+ "epoch": 0.056,
1007
+ "grad_norm": 0.6528041780055424,
1008
+ "learning_rate": 5.5980000000000004e-06,
1009
+ "loss": 2.3979,
1010
+ "step": 2800
1011
+ },
1012
+ {
1013
+ "epoch": 0.056,
1014
+ "eval_loss": 2.4344167709350586,
1015
+ "eval_runtime": 264.2924,
1016
+ "eval_samples_per_second": 3.11,
1017
+ "eval_steps_per_second": 1.555,
1018
+ "step": 2800
1019
+ },
1020
+ {
1021
+ "epoch": 0.0565,
1022
+ "grad_norm": 0.7067565611523313,
1023
+ "learning_rate": 5.648e-06,
1024
+ "loss": 2.398,
1025
+ "step": 2825
1026
+ },
1027
+ {
1028
+ "epoch": 0.057,
1029
+ "grad_norm": 0.6416666495903947,
1030
+ "learning_rate": 5.698e-06,
1031
+ "loss": 2.3991,
1032
+ "step": 2850
1033
+ },
1034
+ {
1035
+ "epoch": 0.0575,
1036
+ "grad_norm": 0.6605105424774851,
1037
+ "learning_rate": 5.748e-06,
1038
+ "loss": 2.3962,
1039
+ "step": 2875
1040
+ },
1041
+ {
1042
+ "epoch": 0.058,
1043
+ "grad_norm": 0.6308761264530915,
1044
+ "learning_rate": 5.798e-06,
1045
+ "loss": 2.4058,
1046
+ "step": 2900
1047
+ },
1048
+ {
1049
+ "epoch": 0.058,
1050
+ "eval_loss": 2.434436082839966,
1051
+ "eval_runtime": 265.0112,
1052
+ "eval_samples_per_second": 3.102,
1053
+ "eval_steps_per_second": 1.551,
1054
+ "step": 2900
1055
+ },
1056
+ {
1057
+ "epoch": 0.0585,
1058
+ "grad_norm": 0.6363649329289001,
1059
+ "learning_rate": 5.848000000000001e-06,
1060
+ "loss": 2.3943,
1061
+ "step": 2925
1062
+ },
1063
+ {
1064
+ "epoch": 0.059,
1065
+ "grad_norm": 0.6147983139117156,
1066
+ "learning_rate": 5.898e-06,
1067
+ "loss": 2.3982,
1068
+ "step": 2950
1069
+ },
1070
+ {
1071
+ "epoch": 0.0595,
1072
+ "grad_norm": 0.611354772141602,
1073
+ "learning_rate": 5.9480000000000005e-06,
1074
+ "loss": 2.3921,
1075
+ "step": 2975
1076
+ },
1077
+ {
1078
+ "epoch": 0.06,
1079
+ "grad_norm": 0.6269054680170398,
1080
+ "learning_rate": 5.998000000000001e-06,
1081
+ "loss": 2.392,
1082
+ "step": 3000
1083
+ },
1084
+ {
1085
+ "epoch": 0.06,
1086
+ "eval_loss": 2.433990955352783,
1087
+ "eval_runtime": 264.2169,
1088
+ "eval_samples_per_second": 3.111,
1089
+ "eval_steps_per_second": 1.556,
1090
+ "step": 3000
1091
+ },
1092
+ {
1093
+ "epoch": 0.0605,
1094
+ "grad_norm": 0.6248207448228328,
1095
+ "learning_rate": 6.048e-06,
1096
+ "loss": 2.3858,
1097
+ "step": 3025
1098
+ },
1099
+ {
1100
+ "epoch": 0.061,
1101
+ "grad_norm": 0.6275258656299642,
1102
+ "learning_rate": 6.098000000000001e-06,
1103
+ "loss": 2.4015,
1104
+ "step": 3050
1105
+ },
1106
+ {
1107
+ "epoch": 0.0615,
1108
+ "grad_norm": 1.0457401571274152,
1109
+ "learning_rate": 6.148e-06,
1110
+ "loss": 2.3909,
1111
+ "step": 3075
1112
+ },
1113
+ {
1114
+ "epoch": 0.062,
1115
+ "grad_norm": 0.6551230863319748,
1116
+ "learning_rate": 6.198000000000001e-06,
1117
+ "loss": 2.3983,
1118
+ "step": 3100
1119
+ },
1120
+ {
1121
+ "epoch": 0.062,
1122
+ "eval_loss": 2.433279275894165,
1123
+ "eval_runtime": 264.1521,
1124
+ "eval_samples_per_second": 3.112,
1125
+ "eval_steps_per_second": 1.556,
1126
+ "step": 3100
1127
+ },
1128
+ {
1129
+ "epoch": 0.0625,
1130
+ "grad_norm": 0.6306746226297937,
1131
+ "learning_rate": 6.248000000000001e-06,
1132
+ "loss": 2.397,
1133
+ "step": 3125
1134
+ },
1135
+ {
1136
+ "epoch": 0.063,
1137
+ "grad_norm": 0.6299802316587856,
1138
+ "learning_rate": 6.2980000000000005e-06,
1139
+ "loss": 2.4018,
1140
+ "step": 3150
1141
+ },
1142
+ {
1143
+ "epoch": 0.0635,
1144
+ "grad_norm": 0.6265424590222634,
1145
+ "learning_rate": 6.348000000000001e-06,
1146
+ "loss": 2.4065,
1147
+ "step": 3175
1148
+ },
1149
+ {
1150
+ "epoch": 0.064,
1151
+ "grad_norm": 0.6717273211615455,
1152
+ "learning_rate": 6.398000000000001e-06,
1153
+ "loss": 2.3906,
1154
+ "step": 3200
1155
+ },
1156
+ {
1157
+ "epoch": 0.064,
1158
+ "eval_loss": 2.4333276748657227,
1159
+ "eval_runtime": 263.9592,
1160
+ "eval_samples_per_second": 3.114,
1161
+ "eval_steps_per_second": 1.557,
1162
+ "step": 3200
1163
+ },
1164
+ {
1165
+ "epoch": 0.0645,
1166
+ "grad_norm": 0.6159924635031793,
1167
+ "learning_rate": 6.448000000000001e-06,
1168
+ "loss": 2.3947,
1169
+ "step": 3225
1170
+ },
1171
+ {
1172
+ "epoch": 0.065,
1173
+ "grad_norm": 0.6124462043712093,
1174
+ "learning_rate": 6.498000000000001e-06,
1175
+ "loss": 2.3963,
1176
+ "step": 3250
1177
+ },
1178
+ {
1179
+ "epoch": 0.0655,
1180
+ "grad_norm": 0.6144378183602921,
1181
+ "learning_rate": 6.548000000000001e-06,
1182
+ "loss": 2.402,
1183
+ "step": 3275
1184
+ },
1185
+ {
1186
+ "epoch": 0.066,
1187
+ "grad_norm": 0.6295732934678283,
1188
+ "learning_rate": 6.598000000000001e-06,
1189
+ "loss": 2.3877,
1190
+ "step": 3300
1191
+ },
1192
+ {
1193
+ "epoch": 0.066,
1194
+ "eval_loss": 2.4331116676330566,
1195
+ "eval_runtime": 263.4524,
1196
+ "eval_samples_per_second": 3.12,
1197
+ "eval_steps_per_second": 1.56,
1198
+ "step": 3300
1199
+ },
1200
+ {
1201
+ "epoch": 0.0665,
1202
+ "grad_norm": 0.5938287129149346,
1203
+ "learning_rate": 6.648e-06,
1204
+ "loss": 2.389,
1205
+ "step": 3325
1206
+ },
1207
+ {
1208
+ "epoch": 0.067,
1209
+ "grad_norm": 0.6194783667871923,
1210
+ "learning_rate": 6.698e-06,
1211
+ "loss": 2.39,
1212
+ "step": 3350
1213
+ },
1214
+ {
1215
+ "epoch": 0.0675,
1216
+ "grad_norm": 0.60927231594853,
1217
+ "learning_rate": 6.7480000000000004e-06,
1218
+ "loss": 2.3968,
1219
+ "step": 3375
1220
+ },
1221
+ {
1222
+ "epoch": 0.068,
1223
+ "grad_norm": 0.6386175333576501,
1224
+ "learning_rate": 6.798e-06,
1225
+ "loss": 2.3861,
1226
+ "step": 3400
1227
+ },
1228
+ {
1229
+ "epoch": 0.068,
1230
+ "eval_loss": 2.4328911304473877,
1231
+ "eval_runtime": 264.2923,
1232
+ "eval_samples_per_second": 3.11,
1233
+ "eval_steps_per_second": 1.555,
1234
+ "step": 3400
1235
+ },
1236
+ {
1237
+ "epoch": 0.0685,
1238
+ "grad_norm": 0.6092295027577579,
1239
+ "learning_rate": 6.848e-06,
1240
+ "loss": 2.3827,
1241
+ "step": 3425
1242
+ },
1243
+ {
1244
+ "epoch": 0.069,
1245
+ "grad_norm": 0.5914846449422462,
1246
+ "learning_rate": 6.898e-06,
1247
+ "loss": 2.3894,
1248
+ "step": 3450
1249
+ },
1250
+ {
1251
+ "epoch": 0.0695,
1252
+ "grad_norm": 0.5927461214526666,
1253
+ "learning_rate": 6.948e-06,
1254
+ "loss": 2.3858,
1255
+ "step": 3475
1256
+ },
1257
+ {
1258
+ "epoch": 0.07,
1259
+ "grad_norm": 0.5992194088197265,
1260
+ "learning_rate": 6.998000000000001e-06,
1261
+ "loss": 2.3941,
1262
+ "step": 3500
1263
+ },
1264
+ {
1265
+ "epoch": 0.07,
1266
+ "eval_loss": 2.432774543762207,
1267
+ "eval_runtime": 263.8546,
1268
+ "eval_samples_per_second": 3.115,
1269
+ "eval_steps_per_second": 1.558,
1270
+ "step": 3500
1271
+ },
1272
+ {
1273
+ "epoch": 0.0705,
1274
+ "grad_norm": 0.6119297158568089,
1275
+ "learning_rate": 7.048e-06,
1276
+ "loss": 2.3897,
1277
+ "step": 3525
1278
+ },
1279
+ {
1280
+ "epoch": 0.071,
1281
+ "grad_norm": 0.6040666217758901,
1282
+ "learning_rate": 7.0980000000000005e-06,
1283
+ "loss": 2.3966,
1284
+ "step": 3550
1285
+ },
1286
+ {
1287
+ "epoch": 0.0715,
1288
+ "grad_norm": 0.6142925813030266,
1289
+ "learning_rate": 7.148000000000001e-06,
1290
+ "loss": 2.3953,
1291
+ "step": 3575
1292
+ },
1293
+ {
1294
+ "epoch": 0.072,
1295
+ "grad_norm": 0.5857079248330344,
1296
+ "learning_rate": 7.198e-06,
1297
+ "loss": 2.3854,
1298
+ "step": 3600
1299
+ },
1300
+ {
1301
+ "epoch": 0.072,
1302
+ "eval_loss": 2.432868719100952,
1303
+ "eval_runtime": 264.1849,
1304
+ "eval_samples_per_second": 3.111,
1305
+ "eval_steps_per_second": 1.556,
1306
+ "step": 3600
1307
+ },
1308
+ {
1309
+ "epoch": 0.0725,
1310
+ "grad_norm": 0.6075613052530382,
1311
+ "learning_rate": 7.248000000000001e-06,
1312
+ "loss": 2.3798,
1313
+ "step": 3625
1314
+ },
1315
+ {
1316
+ "epoch": 0.073,
1317
+ "grad_norm": 0.6146043204282547,
1318
+ "learning_rate": 7.298e-06,
1319
+ "loss": 2.3894,
1320
+ "step": 3650
1321
+ },
1322
+ {
1323
+ "epoch": 0.0735,
1324
+ "grad_norm": 0.613284002341936,
1325
+ "learning_rate": 7.348000000000001e-06,
1326
+ "loss": 2.3897,
1327
+ "step": 3675
1328
+ },
1329
+ {
1330
+ "epoch": 0.074,
1331
+ "grad_norm": 0.6694404263159593,
1332
+ "learning_rate": 7.398000000000001e-06,
1333
+ "loss": 2.3925,
1334
+ "step": 3700
1335
+ },
1336
+ {
1337
+ "epoch": 0.074,
1338
+ "eval_loss": 2.4324021339416504,
1339
+ "eval_runtime": 263.3107,
1340
+ "eval_samples_per_second": 3.122,
1341
+ "eval_steps_per_second": 1.561,
1342
+ "step": 3700
1343
+ },
1344
+ {
1345
+ "epoch": 0.0745,
1346
+ "grad_norm": 0.5756401973694445,
1347
+ "learning_rate": 7.4480000000000005e-06,
1348
+ "loss": 2.3894,
1349
+ "step": 3725
1350
+ },
1351
+ {
1352
+ "epoch": 0.075,
1353
+ "grad_norm": 0.5945783703417461,
1354
+ "learning_rate": 7.498000000000001e-06,
1355
+ "loss": 2.3928,
1356
+ "step": 3750
1357
+ },
1358
+ {
1359
+ "epoch": 0.0755,
1360
+ "grad_norm": 0.5935750222986942,
1361
+ "learning_rate": 7.548000000000001e-06,
1362
+ "loss": 2.3774,
1363
+ "step": 3775
1364
+ },
1365
+ {
1366
+ "epoch": 0.076,
1367
+ "grad_norm": 0.5938734543073783,
1368
+ "learning_rate": 7.598000000000001e-06,
1369
+ "loss": 2.3776,
1370
+ "step": 3800
1371
+ },
1372
+ {
1373
+ "epoch": 0.076,
1374
+ "eval_loss": 2.432751178741455,
1375
+ "eval_runtime": 263.8929,
1376
+ "eval_samples_per_second": 3.115,
1377
+ "eval_steps_per_second": 1.557,
1378
+ "step": 3800
1379
+ },
1380
+ {
1381
+ "epoch": 0.0765,
1382
+ "grad_norm": 0.595820899700728,
1383
+ "learning_rate": 7.648e-06,
1384
+ "loss": 2.3804,
1385
+ "step": 3825
1386
+ },
1387
+ {
1388
+ "epoch": 0.077,
1389
+ "grad_norm": 0.6079304106413467,
1390
+ "learning_rate": 7.698000000000002e-06,
1391
+ "loss": 2.3917,
1392
+ "step": 3850
1393
+ },
1394
+ {
1395
+ "epoch": 0.0775,
1396
+ "grad_norm": 0.6083448146618482,
1397
+ "learning_rate": 7.748000000000001e-06,
1398
+ "loss": 2.3842,
1399
+ "step": 3875
1400
+ },
1401
+ {
1402
+ "epoch": 0.078,
1403
+ "grad_norm": 0.6128893415605828,
1404
+ "learning_rate": 7.798e-06,
1405
+ "loss": 2.3806,
1406
+ "step": 3900
1407
+ },
1408
+ {
1409
+ "epoch": 0.078,
1410
+ "eval_loss": 2.4325239658355713,
1411
+ "eval_runtime": 263.6693,
1412
+ "eval_samples_per_second": 3.118,
1413
+ "eval_steps_per_second": 1.559,
1414
+ "step": 3900
1415
+ },
1416
+ {
1417
+ "epoch": 0.0785,
1418
+ "grad_norm": 0.6079041195191952,
1419
+ "learning_rate": 7.848000000000002e-06,
1420
+ "loss": 2.3801,
1421
+ "step": 3925
1422
+ },
1423
+ {
1424
+ "epoch": 0.079,
1425
+ "grad_norm": 0.6075689821557235,
1426
+ "learning_rate": 7.898e-06,
1427
+ "loss": 2.3797,
1428
+ "step": 3950
1429
+ },
1430
+ {
1431
+ "epoch": 0.0795,
1432
+ "grad_norm": 0.5882326737716994,
1433
+ "learning_rate": 7.948e-06,
1434
+ "loss": 2.3905,
1435
+ "step": 3975
1436
+ },
1437
+ {
1438
+ "epoch": 0.08,
1439
+ "grad_norm": 0.5828476462223788,
1440
+ "learning_rate": 7.998e-06,
1441
+ "loss": 2.3806,
1442
+ "step": 4000
1443
+ },
1444
+ {
1445
+ "epoch": 0.08,
1446
+ "eval_loss": 2.4323527812957764,
1447
+ "eval_runtime": 263.9786,
1448
+ "eval_samples_per_second": 3.114,
1449
+ "eval_steps_per_second": 1.557,
1450
+ "step": 4000
1451
  }
1452
  ],
1453
  "logging_steps": 25,
 
1467
  "attributes": {}
1468
  }
1469
  },
1470
+ "total_flos": 1.2732809193308815e+19,
1471
  "train_batch_size": 1,
1472
  "trial_name": null,
1473
  "trial_params": null