C10X commited on
Commit
477bb2a
·
verified ·
1 Parent(s): a0ef9ad

Upload trainer_state.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. trainer_state.json +1023 -0
trainer_state.json ADDED
@@ -0,0 +1,1023 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.992969696969697,
6
+ "eval_steps": 50,
7
+ "global_step": 1158,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02585858585858586,
14
+ "grad_norm": 129761.5390625,
15
+ "learning_rate": 7.758620689655173e-06,
16
+ "loss": 5.1637,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.05171717171717172,
21
+ "grad_norm": 85007.0390625,
22
+ "learning_rate": 1.6379310344827585e-05,
23
+ "loss": 4.8462,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.07757575757575758,
28
+ "grad_norm": 77711.90625,
29
+ "learning_rate": 2.5e-05,
30
+ "loss": 4.6229,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.10343434343434343,
35
+ "grad_norm": 75926.6015625,
36
+ "learning_rate": 3.3620689655172414e-05,
37
+ "loss": 4.4402,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.1292929292929293,
42
+ "grad_norm": 81001.3046875,
43
+ "learning_rate": 4.224137931034483e-05,
44
+ "loss": 4.186,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.1292929292929293,
49
+ "eval_loss": 8.234293937683105,
50
+ "eval_runtime": 21.2938,
51
+ "eval_samples_per_second": 46.962,
52
+ "eval_steps_per_second": 2.959,
53
+ "step": 50
54
+ },
55
+ {
56
+ "epoch": 0.15515151515151515,
57
+ "grad_norm": 77491.1875,
58
+ "learning_rate": 4.9999898041346155e-05,
59
+ "loss": 3.9984,
60
+ "step": 60
61
+ },
62
+ {
63
+ "epoch": 0.181010101010101,
64
+ "grad_norm": 70187.9375,
65
+ "learning_rate": 4.998766400914329e-05,
66
+ "loss": 3.7669,
67
+ "step": 70
68
+ },
69
+ {
70
+ "epoch": 0.20686868686868687,
71
+ "grad_norm": 78757.3671875,
72
+ "learning_rate": 4.995504967976098e-05,
73
+ "loss": 3.4933,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 0.23272727272727273,
78
+ "grad_norm": 93011.2109375,
79
+ "learning_rate": 4.9902081653914056e-05,
80
+ "loss": 3.3346,
81
+ "step": 90
82
+ },
83
+ {
84
+ "epoch": 0.2585858585858586,
85
+ "grad_norm": 82705.953125,
86
+ "learning_rate": 4.982880313308417e-05,
87
+ "loss": 3.2104,
88
+ "step": 100
89
+ },
90
+ {
91
+ "epoch": 0.2585858585858586,
92
+ "eval_loss": 6.281063556671143,
93
+ "eval_runtime": 21.0455,
94
+ "eval_samples_per_second": 47.516,
95
+ "eval_steps_per_second": 2.994,
96
+ "step": 100
97
+ },
98
+ {
99
+ "epoch": 0.28444444444444444,
100
+ "grad_norm": 98789.734375,
101
+ "learning_rate": 4.973527388428399e-05,
102
+ "loss": 3.0765,
103
+ "step": 110
104
+ },
105
+ {
106
+ "epoch": 0.3103030303030303,
107
+ "grad_norm": 88273.296875,
108
+ "learning_rate": 4.962157019131037e-05,
109
+ "loss": 2.9833,
110
+ "step": 120
111
+ },
112
+ {
113
+ "epoch": 0.33616161616161616,
114
+ "grad_norm": 78808.640625,
115
+ "learning_rate": 4.948778479252625e-05,
116
+ "loss": 2.8664,
117
+ "step": 130
118
+ },
119
+ {
120
+ "epoch": 0.362020202020202,
121
+ "grad_norm": 68227.828125,
122
+ "learning_rate": 4.933402680522181e-05,
123
+ "loss": 2.7739,
124
+ "step": 140
125
+ },
126
+ {
127
+ "epoch": 0.3878787878787879,
128
+ "grad_norm": 70486.9453125,
129
+ "learning_rate": 4.916042163661697e-05,
130
+ "loss": 2.6577,
131
+ "step": 150
132
+ },
133
+ {
134
+ "epoch": 0.3878787878787879,
135
+ "eval_loss": 5.267953872680664,
136
+ "eval_runtime": 21.2432,
137
+ "eval_samples_per_second": 47.074,
138
+ "eval_steps_per_second": 2.966,
139
+ "step": 150
140
+ },
141
+ {
142
+ "epoch": 0.41373737373737374,
143
+ "grad_norm": 129737.7421875,
144
+ "learning_rate": 4.896711088157736e-05,
145
+ "loss": 2.5488,
146
+ "step": 160
147
+ },
148
+ {
149
+ "epoch": 0.4395959595959596,
150
+ "grad_norm": 92119.6328125,
151
+ "learning_rate": 4.875425220712757e-05,
152
+ "loss": 2.5209,
153
+ "step": 170
154
+ },
155
+ {
156
+ "epoch": 0.46545454545454545,
157
+ "grad_norm": 76282.8046875,
158
+ "learning_rate": 4.852201922385564e-05,
159
+ "loss": 2.496,
160
+ "step": 180
161
+ },
162
+ {
163
+ "epoch": 0.4913131313131313,
164
+ "grad_norm": 64090.921875,
165
+ "learning_rate": 4.827060134431379e-05,
166
+ "loss": 2.4329,
167
+ "step": 190
168
+ },
169
+ {
170
+ "epoch": 0.5171717171717172,
171
+ "grad_norm": 62604.421875,
172
+ "learning_rate": 4.800020362853082e-05,
173
+ "loss": 2.3479,
174
+ "step": 200
175
+ },
176
+ {
177
+ "epoch": 0.5171717171717172,
178
+ "eval_loss": 4.751099109649658,
179
+ "eval_runtime": 21.4328,
180
+ "eval_samples_per_second": 46.658,
181
+ "eval_steps_per_second": 2.939,
182
+ "step": 200
183
+ },
184
+ {
185
+ "epoch": 0.5430303030303031,
186
+ "grad_norm": 89043.59375,
187
+ "learning_rate": 4.7711046616762206e-05,
188
+ "loss": 2.331,
189
+ "step": 210
190
+ },
191
+ {
192
+ "epoch": 0.5688888888888889,
193
+ "grad_norm": 70265.078125,
194
+ "learning_rate": 4.7403366149614304e-05,
195
+ "loss": 2.2867,
196
+ "step": 220
197
+ },
198
+ {
199
+ "epoch": 0.5947474747474748,
200
+ "grad_norm": 66474.015625,
201
+ "learning_rate": 4.7077413175689365e-05,
202
+ "loss": 2.2422,
203
+ "step": 230
204
+ },
205
+ {
206
+ "epoch": 0.6206060606060606,
207
+ "grad_norm": 72337.0703125,
208
+ "learning_rate": 4.6733453546908277e-05,
209
+ "loss": 2.2279,
210
+ "step": 240
211
+ },
212
+ {
213
+ "epoch": 0.6464646464646465,
214
+ "grad_norm": 77678.4921875,
215
+ "learning_rate": 4.63717678016779e-05,
216
+ "loss": 2.1829,
217
+ "step": 250
218
+ },
219
+ {
220
+ "epoch": 0.6464646464646465,
221
+ "eval_loss": 4.369379043579102,
222
+ "eval_runtime": 21.3413,
223
+ "eval_samples_per_second": 46.857,
224
+ "eval_steps_per_second": 2.952,
225
+ "step": 250
226
+ },
227
+ {
228
+ "epoch": 0.6723232323232323,
229
+ "grad_norm": 105560.984375,
230
+ "learning_rate": 4.599265093607993e-05,
231
+ "loss": 2.1284,
232
+ "step": 260
233
+ },
234
+ {
235
+ "epoch": 0.6981818181818182,
236
+ "grad_norm": 94847.96875,
237
+ "learning_rate": 4.5596412163267846e-05,
238
+ "loss": 2.1122,
239
+ "step": 270
240
+ },
241
+ {
242
+ "epoch": 0.724040404040404,
243
+ "grad_norm": 80977.328125,
244
+ "learning_rate": 4.518337466126826e-05,
245
+ "loss": 2.1004,
246
+ "step": 280
247
+ },
248
+ {
249
+ "epoch": 0.74989898989899,
250
+ "grad_norm": 97450.6796875,
251
+ "learning_rate": 4.4753875309392266e-05,
252
+ "loss": 2.0487,
253
+ "step": 290
254
+ },
255
+ {
256
+ "epoch": 0.7757575757575758,
257
+ "grad_norm": 69245.8125,
258
+ "learning_rate": 4.4308264413471814e-05,
259
+ "loss": 2.0288,
260
+ "step": 300
261
+ },
262
+ {
263
+ "epoch": 0.7757575757575758,
264
+ "eval_loss": 4.115316867828369,
265
+ "eval_runtime": 21.4236,
266
+ "eval_samples_per_second": 46.678,
267
+ "eval_steps_per_second": 2.941,
268
+ "step": 300
269
+ },
270
+ {
271
+ "epoch": 0.8016161616161617,
272
+ "grad_norm": 78553.4296875,
273
+ "learning_rate": 4.384690542014529e-05,
274
+ "loss": 2.0385,
275
+ "step": 310
276
+ },
277
+ {
278
+ "epoch": 0.8274747474747475,
279
+ "grad_norm": 68862.0859375,
280
+ "learning_rate": 4.3370174620425216e-05,
281
+ "loss": 2.0088,
282
+ "step": 320
283
+ },
284
+ {
285
+ "epoch": 0.8533333333333334,
286
+ "grad_norm": 96176.8515625,
287
+ "learning_rate": 4.2878460842789905e-05,
288
+ "loss": 2.0057,
289
+ "step": 330
290
+ },
291
+ {
292
+ "epoch": 0.8791919191919192,
293
+ "grad_norm": 109879.5859375,
294
+ "learning_rate": 4.237216513604933e-05,
295
+ "loss": 1.978,
296
+ "step": 340
297
+ },
298
+ {
299
+ "epoch": 0.9050505050505051,
300
+ "grad_norm": 111969.0234375,
301
+ "learning_rate": 4.185170044224404e-05,
302
+ "loss": 1.969,
303
+ "step": 350
304
+ },
305
+ {
306
+ "epoch": 0.9050505050505051,
307
+ "eval_loss": 3.927480936050415,
308
+ "eval_runtime": 21.4891,
309
+ "eval_samples_per_second": 46.535,
310
+ "eval_steps_per_second": 2.932,
311
+ "step": 350
312
+ },
313
+ {
314
+ "epoch": 0.9309090909090909,
315
+ "grad_norm": 94754.984375,
316
+ "learning_rate": 4.131749125984362e-05,
317
+ "loss": 1.9188,
318
+ "step": 360
319
+ },
320
+ {
321
+ "epoch": 0.9567676767676768,
322
+ "grad_norm": 96219.0234375,
323
+ "learning_rate": 4.076997329751977e-05,
324
+ "loss": 1.9077,
325
+ "step": 370
326
+ },
327
+ {
328
+ "epoch": 0.9826262626262626,
329
+ "grad_norm": 84781.5234375,
330
+ "learning_rate": 4.0209593118775937e-05,
331
+ "loss": 1.8951,
332
+ "step": 380
333
+ },
334
+ {
335
+ "epoch": 1.0077575757575759,
336
+ "grad_norm": 91395.078125,
337
+ "learning_rate": 3.963680777772377e-05,
338
+ "loss": 1.8716,
339
+ "step": 390
340
+ },
341
+ {
342
+ "epoch": 1.0336161616161617,
343
+ "grad_norm": 106231.5078125,
344
+ "learning_rate": 3.905208444630327e-05,
345
+ "loss": 1.8927,
346
+ "step": 400
347
+ },
348
+ {
349
+ "epoch": 1.0336161616161617,
350
+ "eval_loss": 3.7736313343048096,
351
+ "eval_runtime": 21.5204,
352
+ "eval_samples_per_second": 46.468,
353
+ "eval_steps_per_second": 2.927,
354
+ "step": 400
355
+ },
356
+ {
357
+ "epoch": 1.0594747474747475,
358
+ "grad_norm": 106825.3359375,
359
+ "learning_rate": 3.845590003325058e-05,
360
+ "loss": 1.8803,
361
+ "step": 410
362
+ },
363
+ {
364
+ "epoch": 1.0853333333333333,
365
+ "grad_norm": 80191.515625,
366
+ "learning_rate": 3.7848740795124436e-05,
367
+ "loss": 1.8041,
368
+ "step": 420
369
+ },
370
+ {
371
+ "epoch": 1.1111919191919193,
372
+ "grad_norm": 113625.0234375,
373
+ "learning_rate": 3.7231101939708305e-05,
374
+ "loss": 1.8161,
375
+ "step": 430
376
+ },
377
+ {
378
+ "epoch": 1.137050505050505,
379
+ "grad_norm": 91820.0546875,
380
+ "learning_rate": 3.660348722211186e-05,
381
+ "loss": 1.7924,
382
+ "step": 440
383
+ },
384
+ {
385
+ "epoch": 1.162909090909091,
386
+ "grad_norm": 108857.2578125,
387
+ "learning_rate": 3.596640853390103e-05,
388
+ "loss": 1.813,
389
+ "step": 450
390
+ },
391
+ {
392
+ "epoch": 1.162909090909091,
393
+ "eval_loss": 3.652933359146118,
394
+ "eval_runtime": 21.5614,
395
+ "eval_samples_per_second": 46.379,
396
+ "eval_steps_per_second": 2.922,
397
+ "step": 450
398
+ },
399
+ {
400
+ "epoch": 1.1887676767676767,
401
+ "grad_norm": 107753.078125,
402
+ "learning_rate": 3.532038548559193e-05,
403
+ "loss": 1.7731,
404
+ "step": 460
405
+ },
406
+ {
407
+ "epoch": 1.2146262626262627,
408
+ "grad_norm": 99259.5,
409
+ "learning_rate": 3.4665944982849086e-05,
410
+ "loss": 1.7662,
411
+ "step": 470
412
+ },
413
+ {
414
+ "epoch": 1.2404848484848485,
415
+ "grad_norm": 96071.25,
416
+ "learning_rate": 3.400362079673357e-05,
417
+ "loss": 1.8034,
418
+ "step": 480
419
+ },
420
+ {
421
+ "epoch": 1.2663434343434343,
422
+ "grad_norm": 100736.2890625,
423
+ "learning_rate": 3.33339531283517e-05,
424
+ "loss": 1.7841,
425
+ "step": 490
426
+ },
427
+ {
428
+ "epoch": 1.2922020202020201,
429
+ "grad_norm": 93535.171875,
430
+ "learning_rate": 3.26574881682593e-05,
431
+ "loss": 1.7428,
432
+ "step": 500
433
+ },
434
+ {
435
+ "epoch": 1.2922020202020201,
436
+ "eval_loss": 3.5527358055114746,
437
+ "eval_runtime": 21.3139,
438
+ "eval_samples_per_second": 46.918,
439
+ "eval_steps_per_second": 2.956,
440
+ "step": 500
441
+ },
442
+ {
443
+ "epoch": 1.3180606060606062,
444
+ "grad_norm": 78968.21875,
445
+ "learning_rate": 3.1974777650980735e-05,
446
+ "loss": 1.7202,
447
+ "step": 510
448
+ },
449
+ {
450
+ "epoch": 1.343919191919192,
451
+ "grad_norm": 115748.171875,
452
+ "learning_rate": 3.1286378405006465e-05,
453
+ "loss": 1.7063,
454
+ "step": 520
455
+ },
456
+ {
457
+ "epoch": 1.3697777777777778,
458
+ "grad_norm": 90799.15625,
459
+ "learning_rate": 3.059285189863564e-05,
460
+ "loss": 1.7286,
461
+ "step": 530
462
+ },
463
+ {
464
+ "epoch": 1.3956363636363636,
465
+ "grad_norm": 108626.28125,
466
+ "learning_rate": 2.9894763782034542e-05,
467
+ "loss": 1.709,
468
+ "step": 540
469
+ },
470
+ {
471
+ "epoch": 1.4214949494949494,
472
+ "grad_norm": 102933.5234375,
473
+ "learning_rate": 2.9192683425884164e-05,
474
+ "loss": 1.7114,
475
+ "step": 550
476
+ },
477
+ {
478
+ "epoch": 1.4214949494949494,
479
+ "eval_loss": 3.471935510635376,
480
+ "eval_runtime": 21.2904,
481
+ "eval_samples_per_second": 46.97,
482
+ "eval_steps_per_second": 2.959,
483
+ "step": 550
484
+ },
485
+ {
486
+ "epoch": 1.4473535353535354,
487
+ "grad_norm": 93085.421875,
488
+ "learning_rate": 2.8487183456993378e-05,
489
+ "loss": 1.6962,
490
+ "step": 560
491
+ },
492
+ {
493
+ "epoch": 1.4732121212121212,
494
+ "grad_norm": 79838.0859375,
495
+ "learning_rate": 2.7778839291256232e-05,
496
+ "loss": 1.6844,
497
+ "step": 570
498
+ },
499
+ {
500
+ "epoch": 1.499070707070707,
501
+ "grad_norm": 91634.3984375,
502
+ "learning_rate": 2.706822866433451e-05,
503
+ "loss": 1.6929,
504
+ "step": 580
505
+ },
506
+ {
507
+ "epoch": 1.524929292929293,
508
+ "grad_norm": 91946.4375,
509
+ "learning_rate": 2.6355931160448272e-05,
510
+ "loss": 1.6786,
511
+ "step": 590
512
+ },
513
+ {
514
+ "epoch": 1.5507878787878788,
515
+ "grad_norm": 69164.7265625,
516
+ "learning_rate": 2.564252773965861e-05,
517
+ "loss": 1.6562,
518
+ "step": 600
519
+ },
520
+ {
521
+ "epoch": 1.5507878787878788,
522
+ "eval_loss": 3.403024911880493,
523
+ "eval_runtime": 22.0604,
524
+ "eval_samples_per_second": 45.33,
525
+ "eval_steps_per_second": 2.856,
526
+ "step": 600
527
+ },
528
+ {
529
+ "epoch": 1.5766464646464646,
530
+ "grad_norm": 91655.7109375,
531
+ "learning_rate": 2.4928600264028312e-05,
532
+ "loss": 1.676,
533
+ "step": 610
534
+ },
535
+ {
536
+ "epoch": 1.6025050505050507,
537
+ "grad_norm": 88343.1875,
538
+ "learning_rate": 2.4214731023046793e-05,
539
+ "loss": 1.6749,
540
+ "step": 620
541
+ },
542
+ {
543
+ "epoch": 1.6283636363636362,
544
+ "grad_norm": 92792.515625,
545
+ "learning_rate": 2.3501502258706492e-05,
546
+ "loss": 1.6358,
547
+ "step": 630
548
+ },
549
+ {
550
+ "epoch": 1.6542222222222223,
551
+ "grad_norm": 89480.8671875,
552
+ "learning_rate": 2.2789495690617857e-05,
553
+ "loss": 1.6461,
554
+ "step": 640
555
+ },
556
+ {
557
+ "epoch": 1.680080808080808,
558
+ "grad_norm": 88156.109375,
559
+ "learning_rate": 2.2079292041550607e-05,
560
+ "loss": 1.6462,
561
+ "step": 650
562
+ },
563
+ {
564
+ "epoch": 1.680080808080808,
565
+ "eval_loss": 3.3437397480010986,
566
+ "eval_runtime": 21.5027,
567
+ "eval_samples_per_second": 46.506,
568
+ "eval_steps_per_second": 2.93,
569
+ "step": 650
570
+ },
571
+ {
572
+ "epoch": 1.7059393939393939,
573
+ "grad_norm": 113831.8671875,
574
+ "learning_rate": 2.1371470563787788e-05,
575
+ "loss": 1.6563,
576
+ "step": 660
577
+ },
578
+ {
579
+ "epoch": 1.73179797979798,
580
+ "grad_norm": 90395.5,
581
+ "learning_rate": 2.06666085666794e-05,
582
+ "loss": 1.6484,
583
+ "step": 670
584
+ },
585
+ {
586
+ "epoch": 1.7576565656565657,
587
+ "grad_norm": 102168.75,
588
+ "learning_rate": 1.9965280945780552e-05,
589
+ "loss": 1.6427,
590
+ "step": 680
591
+ },
592
+ {
593
+ "epoch": 1.7835151515151515,
594
+ "grad_norm": 107093.0625,
595
+ "learning_rate": 1.9268059713958437e-05,
596
+ "loss": 1.6518,
597
+ "step": 690
598
+ },
599
+ {
600
+ "epoch": 1.8093737373737375,
601
+ "grad_norm": 91756.90625,
602
+ "learning_rate": 1.857551353485039e-05,
603
+ "loss": 1.6305,
604
+ "step": 700
605
+ },
606
+ {
607
+ "epoch": 1.8093737373737375,
608
+ "eval_loss": 3.2984097003936768,
609
+ "eval_runtime": 21.4916,
610
+ "eval_samples_per_second": 46.53,
611
+ "eval_steps_per_second": 2.931,
612
+ "step": 700
613
+ },
614
+ {
615
+ "epoch": 1.835232323232323,
616
+ "grad_norm": 91322.09375,
617
+ "learning_rate": 1.788820725905373e-05,
618
+ "loss": 1.6342,
619
+ "step": 710
620
+ },
621
+ {
622
+ "epoch": 1.8610909090909091,
623
+ "grad_norm": 99315.203125,
624
+ "learning_rate": 1.7206701463425446e-05,
625
+ "loss": 1.6008,
626
+ "step": 720
627
+ },
628
+ {
629
+ "epoch": 1.886949494949495,
630
+ "grad_norm": 89938.4921875,
631
+ "learning_rate": 1.6531551993867717e-05,
632
+ "loss": 1.6294,
633
+ "step": 730
634
+ },
635
+ {
636
+ "epoch": 1.9128080808080807,
637
+ "grad_norm": 91629.78125,
638
+ "learning_rate": 1.5863309511971998e-05,
639
+ "loss": 1.5608,
640
+ "step": 740
641
+ },
642
+ {
643
+ "epoch": 1.9386666666666668,
644
+ "grad_norm": 91213.9296875,
645
+ "learning_rate": 1.5202519045891558e-05,
646
+ "loss": 1.627,
647
+ "step": 750
648
+ },
649
+ {
650
+ "epoch": 1.9386666666666668,
651
+ "eval_loss": 3.259477138519287,
652
+ "eval_runtime": 21.4965,
653
+ "eval_samples_per_second": 46.519,
654
+ "eval_steps_per_second": 2.931,
655
+ "step": 750
656
+ },
657
+ {
658
+ "epoch": 1.9645252525252526,
659
+ "grad_norm": 86313.796875,
660
+ "learning_rate": 1.4549719545808632e-05,
661
+ "loss": 1.5955,
662
+ "step": 760
663
+ },
664
+ {
665
+ "epoch": 1.9903838383838384,
666
+ "grad_norm": 80786.09375,
667
+ "learning_rate": 1.3905443444359025e-05,
668
+ "loss": 1.5658,
669
+ "step": 770
670
+ },
671
+ {
672
+ "epoch": 2.0155151515151517,
673
+ "grad_norm": 87623.234375,
674
+ "learning_rate": 1.3270216222372323e-05,
675
+ "loss": 1.5961,
676
+ "step": 780
677
+ },
678
+ {
679
+ "epoch": 2.0413737373737373,
680
+ "grad_norm": 81999.125,
681
+ "learning_rate": 1.2644555980282219e-05,
682
+ "loss": 1.5716,
683
+ "step": 790
684
+ },
685
+ {
686
+ "epoch": 2.0672323232323233,
687
+ "grad_norm": 78692.296875,
688
+ "learning_rate": 1.2028973015556413e-05,
689
+ "loss": 1.5748,
690
+ "step": 800
691
+ },
692
+ {
693
+ "epoch": 2.0672323232323233,
694
+ "eval_loss": 3.231915235519409,
695
+ "eval_runtime": 21.4941,
696
+ "eval_samples_per_second": 46.524,
697
+ "eval_steps_per_second": 2.931,
698
+ "step": 800
699
+ },
700
+ {
701
+ "epoch": 2.093090909090909,
702
+ "grad_norm": 85943.3359375,
703
+ "learning_rate": 1.142396940649062e-05,
704
+ "loss": 1.5775,
705
+ "step": 810
706
+ },
707
+ {
708
+ "epoch": 2.118949494949495,
709
+ "grad_norm": 89760.4765625,
710
+ "learning_rate": 1.0830038602706346e-05,
711
+ "loss": 1.5867,
712
+ "step": 820
713
+ },
714
+ {
715
+ "epoch": 2.144808080808081,
716
+ "grad_norm": 88100.2109375,
717
+ "learning_rate": 1.0247665022686262e-05,
718
+ "loss": 1.5792,
719
+ "step": 830
720
+ },
721
+ {
722
+ "epoch": 2.1706666666666665,
723
+ "grad_norm": 83300.453125,
724
+ "learning_rate": 9.677323658675594e-06,
725
+ "loss": 1.5703,
726
+ "step": 840
727
+ },
728
+ {
729
+ "epoch": 2.1965252525252525,
730
+ "grad_norm": 92031.609375,
731
+ "learning_rate": 9.11947968927157e-06,
732
+ "loss": 1.5711,
733
+ "step": 850
734
+ },
735
+ {
736
+ "epoch": 2.1965252525252525,
737
+ "eval_loss": 3.204101085662842,
738
+ "eval_runtime": 21.5224,
739
+ "eval_samples_per_second": 46.463,
740
+ "eval_steps_per_second": 2.927,
741
+ "step": 850
742
+ },
743
+ {
744
+ "epoch": 2.2223838383838386,
745
+ "grad_norm": 86591.40625,
746
+ "learning_rate": 8.574588100017159e-06,
747
+ "loss": 1.5752,
748
+ "step": 860
749
+ },
750
+ {
751
+ "epoch": 2.248242424242424,
752
+ "grad_norm": 101210.9609375,
753
+ "learning_rate": 8.043093312308248e-06,
754
+ "loss": 1.5828,
755
+ "step": 870
756
+ },
757
+ {
758
+ "epoch": 2.27410101010101,
759
+ "grad_norm": 88097.3984375,
760
+ "learning_rate": 7.525428820917288e-06,
761
+ "loss": 1.5628,
762
+ "step": 880
763
+ },
764
+ {
765
+ "epoch": 2.2999595959595958,
766
+ "grad_norm": 80520.515625,
767
+ "learning_rate": 7.022016840428614e-06,
768
+ "loss": 1.5892,
769
+ "step": 890
770
+ },
771
+ {
772
+ "epoch": 2.325818181818182,
773
+ "grad_norm": 71811.4921875,
774
+ "learning_rate": 6.533267960874282e-06,
775
+ "loss": 1.5501,
776
+ "step": 900
777
+ },
778
+ {
779
+ "epoch": 2.325818181818182,
780
+ "eval_loss": 3.186400890350342,
781
+ "eval_runtime": 21.3699,
782
+ "eval_samples_per_second": 46.795,
783
+ "eval_steps_per_second": 2.948,
784
+ "step": 900
785
+ },
786
+ {
787
+ "epoch": 2.351676767676768,
788
+ "grad_norm": 81277.7578125,
789
+ "learning_rate": 6.059580812850868e-06,
790
+ "loss": 1.5404,
791
+ "step": 910
792
+ },
793
+ {
794
+ "epoch": 2.3775353535353534,
795
+ "grad_norm": 72905.59375,
796
+ "learning_rate": 5.601341742390706e-06,
797
+ "loss": 1.5642,
798
+ "step": 920
799
+ },
800
+ {
801
+ "epoch": 2.4033939393939394,
802
+ "grad_norm": 74492.1484375,
803
+ "learning_rate": 5.158924495852454e-06,
804
+ "loss": 1.5425,
805
+ "step": 930
806
+ },
807
+ {
808
+ "epoch": 2.4292525252525254,
809
+ "grad_norm": 88411.46875,
810
+ "learning_rate": 4.732689915088262e-06,
811
+ "loss": 1.5386,
812
+ "step": 940
813
+ },
814
+ {
815
+ "epoch": 2.455111111111111,
816
+ "grad_norm": 85327.9765625,
817
+ "learning_rate": 4.322985643135952e-06,
818
+ "loss": 1.5771,
819
+ "step": 950
820
+ },
821
+ {
822
+ "epoch": 2.455111111111111,
823
+ "eval_loss": 3.1719841957092285,
824
+ "eval_runtime": 21.5946,
825
+ "eval_samples_per_second": 46.308,
826
+ "eval_steps_per_second": 2.917,
827
+ "step": 950
828
+ },
829
+ {
830
+ "epoch": 2.480969696969697,
831
+ "grad_norm": 91103.5390625,
832
+ "learning_rate": 3.9301458406763746e-06,
833
+ "loss": 1.5707,
834
+ "step": 960
835
+ },
836
+ {
837
+ "epoch": 2.506828282828283,
838
+ "grad_norm": 81763.5546875,
839
+ "learning_rate": 3.5544909134871545e-06,
840
+ "loss": 1.5226,
841
+ "step": 970
842
+ },
843
+ {
844
+ "epoch": 2.5326868686868687,
845
+ "grad_norm": 72524.65625,
846
+ "learning_rate": 3.1963272511151555e-06,
847
+ "loss": 1.5312,
848
+ "step": 980
849
+ },
850
+ {
851
+ "epoch": 2.5585454545454547,
852
+ "grad_norm": 71866.671875,
853
+ "learning_rate": 2.855946976980775e-06,
854
+ "loss": 1.5427,
855
+ "step": 990
856
+ },
857
+ {
858
+ "epoch": 2.5844040404040403,
859
+ "grad_norm": 65724.171875,
860
+ "learning_rate": 2.533627710117889e-06,
861
+ "loss": 1.5305,
862
+ "step": 1000
863
+ },
864
+ {
865
+ "epoch": 2.5844040404040403,
866
+ "eval_loss": 3.1618518829345703,
867
+ "eval_runtime": 21.5264,
868
+ "eval_samples_per_second": 46.455,
869
+ "eval_steps_per_second": 2.927,
870
+ "step": 1000
871
+ },
872
+ {
873
+ "epoch": 2.6102626262626263,
874
+ "grad_norm": 72455.2890625,
875
+ "learning_rate": 2.229632338743773e-06,
876
+ "loss": 1.5533,
877
+ "step": 1010
878
+ },
879
+ {
880
+ "epoch": 2.6361212121212123,
881
+ "grad_norm": 74930.046875,
882
+ "learning_rate": 1.9442088058437113e-06,
883
+ "loss": 1.5636,
884
+ "step": 1020
885
+ },
886
+ {
887
+ "epoch": 2.661979797979798,
888
+ "grad_norm": 71263.109375,
889
+ "learning_rate": 1.6775899069451512e-06,
890
+ "loss": 1.5899,
891
+ "step": 1030
892
+ },
893
+ {
894
+ "epoch": 2.687838383838384,
895
+ "grad_norm": 64114.171875,
896
+ "learning_rate": 1.4299931002463047e-06,
897
+ "loss": 1.5348,
898
+ "step": 1040
899
+ },
900
+ {
901
+ "epoch": 2.7136969696969695,
902
+ "grad_norm": 67584.8671875,
903
+ "learning_rate": 1.2016203292541578e-06,
904
+ "loss": 1.5518,
905
+ "step": 1050
906
+ },
907
+ {
908
+ "epoch": 2.7136969696969695,
909
+ "eval_loss": 3.1565101146698,
910
+ "eval_runtime": 21.4913,
911
+ "eval_samples_per_second": 46.53,
912
+ "eval_steps_per_second": 2.931,
913
+ "step": 1050
914
+ },
915
+ {
916
+ "epoch": 2.7395555555555555,
917
+ "grad_norm": 65095.58984375,
918
+ "learning_rate": 9.926578580764234e-07,
919
+ "loss": 1.5439,
920
+ "step": 1060
921
+ },
922
+ {
923
+ "epoch": 2.7654141414141415,
924
+ "grad_norm": 55857.4296875,
925
+ "learning_rate": 8.032761195018723e-07,
926
+ "loss": 1.5378,
927
+ "step": 1070
928
+ },
929
+ {
930
+ "epoch": 2.791272727272727,
931
+ "grad_norm": 59888.99609375,
932
+ "learning_rate": 6.336295759929028e-07,
933
+ "loss": 1.5323,
934
+ "step": 1080
935
+ },
936
+ {
937
+ "epoch": 2.817131313131313,
938
+ "grad_norm": 58884.90234375,
939
+ "learning_rate": 4.83856593703752e-07,
940
+ "loss": 1.5375,
941
+ "step": 1090
942
+ },
943
+ {
944
+ "epoch": 2.8429898989898987,
945
+ "grad_norm": 54516.77734375,
946
+ "learning_rate": 3.540793296270578e-07,
947
+ "loss": 1.5388,
948
+ "step": 1100
949
+ },
950
+ {
951
+ "epoch": 2.8429898989898987,
952
+ "eval_loss": 3.153146505355835,
953
+ "eval_runtime": 21.5113,
954
+ "eval_samples_per_second": 46.487,
955
+ "eval_steps_per_second": 2.929,
956
+ "step": 1100
957
+ },
958
+ {
959
+ "epoch": 2.8688484848484848,
960
+ "grad_norm": 52652.86328125,
961
+ "learning_rate": 2.4440363196087945e-07,
962
+ "loss": 1.5775,
963
+ "step": 1110
964
+ },
965
+ {
966
+ "epoch": 2.894707070707071,
967
+ "grad_norm": 54156.91796875,
968
+ "learning_rate": 1.5491895377737764e-07,
969
+ "loss": 1.5358,
970
+ "step": 1120
971
+ },
972
+ {
973
+ "epoch": 2.920565656565657,
974
+ "grad_norm": 46885.69921875,
975
+ "learning_rate": 8.569828006361469e-08,
976
+ "loss": 1.5392,
977
+ "step": 1130
978
+ },
979
+ {
980
+ "epoch": 2.9464242424242424,
981
+ "grad_norm": 50899.3359375,
982
+ "learning_rate": 3.6798068193946714e-08,
983
+ "loss": 1.5469,
984
+ "step": 1140
985
+ },
986
+ {
987
+ "epoch": 2.9722828282828284,
988
+ "grad_norm": 53669.515625,
989
+ "learning_rate": 8.25820188258275e-09,
990
+ "loss": 1.5226,
991
+ "step": 1150
992
+ },
993
+ {
994
+ "epoch": 2.9722828282828284,
995
+ "eval_loss": 3.152524948120117,
996
+ "eval_runtime": 21.5012,
997
+ "eval_samples_per_second": 46.509,
998
+ "eval_steps_per_second": 2.93,
999
+ "step": 1150
1000
+ }
1001
+ ],
1002
+ "logging_steps": 10,
1003
+ "max_steps": 1158,
1004
+ "num_input_tokens_seen": 0,
1005
+ "num_train_epochs": 3,
1006
+ "save_steps": 50,
1007
+ "stateful_callbacks": {
1008
+ "TrainerControl": {
1009
+ "args": {
1010
+ "should_epoch_stop": false,
1011
+ "should_evaluate": false,
1012
+ "should_log": false,
1013
+ "should_save": true,
1014
+ "should_training_stop": true
1015
+ },
1016
+ "attributes": {}
1017
+ }
1018
+ },
1019
+ "total_flos": 4.35914399232e+16,
1020
+ "train_batch_size": 8,
1021
+ "trial_name": null,
1022
+ "trial_params": null
1023
+ }