File size: 13,544 Bytes
e30f3bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 20400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.24509803921568626,
      "grad_norm": 26.514015197753906,
      "learning_rate": 4.872549019607843e-06,
      "loss": 9.1121,
      "step": 500
    },
    {
      "epoch": 0.49019607843137253,
      "grad_norm": 17.301259994506836,
      "learning_rate": 9.764705882352942e-06,
      "loss": 4.6199,
      "step": 1000
    },
    {
      "epoch": 0.7352941176470589,
      "grad_norm": 19.435855865478516,
      "learning_rate": 1.4656862745098039e-05,
      "loss": 3.7886,
      "step": 1500
    },
    {
      "epoch": 0.9803921568627451,
      "grad_norm": 21.463775634765625,
      "learning_rate": 1.954901960784314e-05,
      "loss": 3.3042,
      "step": 2000
    },
    {
      "epoch": 1.0,
      "eval_HasAns_exact": 41.66596479407058,
      "eval_HasAns_f1": 45.75952971203001,
      "eval_HasAns_total": 11873,
      "eval_best_exact": 63.918133580392485,
      "eval_best_exact_thresh": 17.15625,
      "eval_best_f1": 68.01169849835183,
      "eval_best_f1_thresh": 17.15625,
      "eval_exact": 41.66596479407058,
      "eval_f1": 45.75952971203001,
      "eval_runtime": 84.8894,
      "eval_samples_per_second": 140.995,
      "eval_steps_per_second": 2.215,
      "eval_total": 11873,
      "step": 2040
    },
    {
      "epoch": 1.2254901960784315,
      "grad_norm": 22.533653259277344,
      "learning_rate": 1.9505446623093686e-05,
      "loss": 2.8594,
      "step": 2500
    },
    {
      "epoch": 1.4705882352941178,
      "grad_norm": 16.711044311523438,
      "learning_rate": 1.8960784313725492e-05,
      "loss": 2.7032,
      "step": 3000
    },
    {
      "epoch": 1.715686274509804,
      "grad_norm": 20.539369583129883,
      "learning_rate": 1.84161220043573e-05,
      "loss": 2.6147,
      "step": 3500
    },
    {
      "epoch": 1.9607843137254903,
      "grad_norm": 17.798688888549805,
      "learning_rate": 1.7871459694989108e-05,
      "loss": 2.565,
      "step": 4000
    },
    {
      "epoch": 2.0,
      "eval_HasAns_exact": 42.69350627474101,
      "eval_HasAns_f1": 46.51650253068049,
      "eval_HasAns_total": 11873,
      "eval_best_exact": 72.38271708919397,
      "eval_best_exact_thresh": 19.26171875,
      "eval_best_f1": 76.20571334513343,
      "eval_best_f1_thresh": 19.26171875,
      "eval_exact": 42.69350627474101,
      "eval_f1": 46.51650253068049,
      "eval_runtime": 85.6719,
      "eval_samples_per_second": 139.707,
      "eval_steps_per_second": 2.194,
      "eval_total": 11873,
      "step": 4080
    },
    {
      "epoch": 2.2058823529411766,
      "grad_norm": 15.679291725158691,
      "learning_rate": 1.7326797385620918e-05,
      "loss": 2.2395,
      "step": 4500
    },
    {
      "epoch": 2.450980392156863,
      "grad_norm": 15.367960929870605,
      "learning_rate": 1.6783224400871462e-05,
      "loss": 2.1524,
      "step": 5000
    },
    {
      "epoch": 2.696078431372549,
      "grad_norm": 13.34710693359375,
      "learning_rate": 1.623856209150327e-05,
      "loss": 2.1288,
      "step": 5500
    },
    {
      "epoch": 2.9411764705882355,
      "grad_norm": 30.506305694580078,
      "learning_rate": 1.5693899782135078e-05,
      "loss": 2.1232,
      "step": 6000
    },
    {
      "epoch": 3.0,
      "eval_HasAns_exact": 42.86195569780173,
      "eval_HasAns_f1": 46.30919897657347,
      "eval_HasAns_total": 11873,
      "eval_best_exact": 74.37884275246357,
      "eval_best_exact_thresh": 22.76953125,
      "eval_best_f1": 77.82608603123529,
      "eval_best_f1_thresh": 22.76953125,
      "eval_exact": 42.86195569780173,
      "eval_f1": 46.30919897657347,
      "eval_runtime": 84.8777,
      "eval_samples_per_second": 141.015,
      "eval_steps_per_second": 2.215,
      "eval_total": 11873,
      "step": 6120
    },
    {
      "epoch": 3.186274509803922,
      "grad_norm": 16.010583877563477,
      "learning_rate": 1.5149237472766886e-05,
      "loss": 1.8918,
      "step": 6500
    },
    {
      "epoch": 3.431372549019608,
      "grad_norm": 17.6584529876709,
      "learning_rate": 1.4604575163398694e-05,
      "loss": 1.8567,
      "step": 7000
    },
    {
      "epoch": 3.6764705882352944,
      "grad_norm": 16.007558822631836,
      "learning_rate": 1.4059912854030502e-05,
      "loss": 1.828,
      "step": 7500
    },
    {
      "epoch": 3.9215686274509802,
      "grad_norm": 13.604976654052734,
      "learning_rate": 1.3515250544662311e-05,
      "loss": 1.844,
      "step": 8000
    },
    {
      "epoch": 4.0,
      "eval_HasAns_exact": 41.497515371009854,
      "eval_HasAns_f1": 44.91728050965404,
      "eval_HasAns_total": 11873,
      "eval_best_exact": 76.1728291080603,
      "eval_best_exact_thresh": 23.875,
      "eval_best_f1": 79.59259424670454,
      "eval_best_f1_thresh": 23.875,
      "eval_exact": 41.497515371009854,
      "eval_f1": 44.91728050965404,
      "eval_runtime": 84.9529,
      "eval_samples_per_second": 140.89,
      "eval_steps_per_second": 2.213,
      "eval_total": 11873,
      "step": 8160
    },
    {
      "epoch": 4.166666666666667,
      "grad_norm": 19.50208282470703,
      "learning_rate": 1.297058823529412e-05,
      "loss": 1.6876,
      "step": 8500
    },
    {
      "epoch": 4.411764705882353,
      "grad_norm": 31.982032775878906,
      "learning_rate": 1.2425925925925927e-05,
      "loss": 1.626,
      "step": 9000
    },
    {
      "epoch": 4.6568627450980395,
      "grad_norm": 28.154998779296875,
      "learning_rate": 1.1881263616557735e-05,
      "loss": 1.6113,
      "step": 9500
    },
    {
      "epoch": 4.901960784313726,
      "grad_norm": 14.066104888916016,
      "learning_rate": 1.1336601307189543e-05,
      "loss": 1.6376,
      "step": 10000
    },
    {
      "epoch": 5.0,
      "eval_HasAns_exact": 41.337488419102165,
      "eval_HasAns_f1": 44.79650825632658,
      "eval_HasAns_total": 11873,
      "eval_best_exact": 76.0549145119178,
      "eval_best_exact_thresh": 24.96484375,
      "eval_best_f1": 79.51393434914232,
      "eval_best_f1_thresh": 24.96484375,
      "eval_exact": 41.337488419102165,
      "eval_f1": 44.79650825632658,
      "eval_runtime": 86.6553,
      "eval_samples_per_second": 138.122,
      "eval_steps_per_second": 2.17,
      "eval_total": 11873,
      "step": 10200
    },
    {
      "epoch": 5.147058823529412,
      "grad_norm": 10.620172500610352,
      "learning_rate": 1.0791938997821352e-05,
      "loss": 1.5461,
      "step": 10500
    },
    {
      "epoch": 5.392156862745098,
      "grad_norm": 8.425840377807617,
      "learning_rate": 1.024727668845316e-05,
      "loss": 1.4638,
      "step": 11000
    },
    {
      "epoch": 5.637254901960784,
      "grad_norm": 14.811394691467285,
      "learning_rate": 9.702614379084968e-06,
      "loss": 1.5059,
      "step": 11500
    },
    {
      "epoch": 5.882352941176471,
      "grad_norm": 24.639976501464844,
      "learning_rate": 9.157952069716776e-06,
      "loss": 1.4819,
      "step": 12000
    },
    {
      "epoch": 6.0,
      "eval_HasAns_exact": 41.48067042870378,
      "eval_HasAns_f1": 45.146223596426786,
      "eval_HasAns_total": 11873,
      "eval_best_exact": 74.41253263707571,
      "eval_best_exact_thresh": 26.44140625,
      "eval_best_f1": 78.0780858047988,
      "eval_best_f1_thresh": 26.44140625,
      "eval_exact": 41.48067042870378,
      "eval_f1": 45.146223596426786,
      "eval_runtime": 85.4707,
      "eval_samples_per_second": 140.036,
      "eval_steps_per_second": 2.2,
      "eval_total": 11873,
      "step": 12240
    },
    {
      "epoch": 6.127450980392156,
      "grad_norm": 15.370038032531738,
      "learning_rate": 8.613289760348584e-06,
      "loss": 1.4085,
      "step": 12500
    },
    {
      "epoch": 6.372549019607844,
      "grad_norm": 21.215662002563477,
      "learning_rate": 8.069716775599129e-06,
      "loss": 1.3618,
      "step": 13000
    },
    {
      "epoch": 6.617647058823529,
      "grad_norm": 21.488882064819336,
      "learning_rate": 7.5250544662309376e-06,
      "loss": 1.3808,
      "step": 13500
    },
    {
      "epoch": 6.862745098039216,
      "grad_norm": 20.903297424316406,
      "learning_rate": 6.9803921568627454e-06,
      "loss": 1.3846,
      "step": 14000
    },
    {
      "epoch": 7.0,
      "eval_HasAns_exact": 40.59631095763497,
      "eval_HasAns_f1": 44.0313300086635,
      "eval_HasAns_total": 11873,
      "eval_best_exact": 78.64061315589994,
      "eval_best_exact_thresh": 26.69140625,
      "eval_best_f1": 82.07563220692849,
      "eval_best_f1_thresh": 26.69140625,
      "eval_exact": 40.59631095763497,
      "eval_f1": 44.0313300086635,
      "eval_runtime": 88.9313,
      "eval_samples_per_second": 134.587,
      "eval_steps_per_second": 2.114,
      "eval_total": 11873,
      "step": 14280
    },
    {
      "epoch": 7.107843137254902,
      "grad_norm": 37.26835250854492,
      "learning_rate": 6.435729847494554e-06,
      "loss": 1.3405,
      "step": 14500
    },
    {
      "epoch": 7.352941176470588,
      "grad_norm": 15.88136100769043,
      "learning_rate": 5.891067538126363e-06,
      "loss": 1.2833,
      "step": 15000
    },
    {
      "epoch": 7.598039215686274,
      "grad_norm": 18.780271530151367,
      "learning_rate": 5.34640522875817e-06,
      "loss": 1.2905,
      "step": 15500
    },
    {
      "epoch": 7.8431372549019605,
      "grad_norm": 9.158843040466309,
      "learning_rate": 4.801742919389979e-06,
      "loss": 1.3058,
      "step": 16000
    },
    {
      "epoch": 8.0,
      "eval_HasAns_exact": 40.32679188073781,
      "eval_HasAns_f1": 44.093437026674906,
      "eval_HasAns_total": 11873,
      "eval_best_exact": 77.43619978101574,
      "eval_best_exact_thresh": 27.0859375,
      "eval_best_f1": 81.20284492695288,
      "eval_best_f1_thresh": 27.0859375,
      "eval_exact": 40.32679188073781,
      "eval_f1": 44.093437026674906,
      "eval_runtime": 85.3184,
      "eval_samples_per_second": 140.286,
      "eval_steps_per_second": 2.204,
      "eval_total": 11873,
      "step": 16320
    },
    {
      "epoch": 8.088235294117647,
      "grad_norm": 27.426488876342773,
      "learning_rate": 4.2570806100217874e-06,
      "loss": 1.2616,
      "step": 16500
    },
    {
      "epoch": 8.333333333333334,
      "grad_norm": 21.802486419677734,
      "learning_rate": 3.7135076252723314e-06,
      "loss": 1.2336,
      "step": 17000
    },
    {
      "epoch": 8.57843137254902,
      "grad_norm": 24.72883415222168,
      "learning_rate": 3.1699346405228758e-06,
      "loss": 1.2427,
      "step": 17500
    },
    {
      "epoch": 8.823529411764707,
      "grad_norm": 23.679203033447266,
      "learning_rate": 2.625272331154684e-06,
      "loss": 1.2367,
      "step": 18000
    },
    {
      "epoch": 9.0,
      "eval_HasAns_exact": 40.25941211151352,
      "eval_HasAns_f1": 43.84815270396611,
      "eval_HasAns_total": 11873,
      "eval_best_exact": 77.7730986271372,
      "eval_best_exact_thresh": 28.12890625,
      "eval_best_f1": 81.36183921958985,
      "eval_best_f1_thresh": 28.12890625,
      "eval_exact": 40.25941211151352,
      "eval_f1": 43.84815270396611,
      "eval_runtime": 85.5334,
      "eval_samples_per_second": 139.934,
      "eval_steps_per_second": 2.198,
      "eval_total": 11873,
      "step": 18360
    },
    {
      "epoch": 9.068627450980392,
      "grad_norm": 6.289318561553955,
      "learning_rate": 2.0806100217864924e-06,
      "loss": 1.202,
      "step": 18500
    },
    {
      "epoch": 9.313725490196079,
      "grad_norm": 13.79410457611084,
      "learning_rate": 1.535947712418301e-06,
      "loss": 1.1927,
      "step": 19000
    },
    {
      "epoch": 9.558823529411764,
      "grad_norm": 10.254155158996582,
      "learning_rate": 9.91285403050109e-07,
      "loss": 1.1843,
      "step": 19500
    },
    {
      "epoch": 9.803921568627452,
      "grad_norm": 8.800715446472168,
      "learning_rate": 4.466230936819173e-07,
      "loss": 1.1861,
      "step": 20000
    },
    {
      "epoch": 10.0,
      "eval_HasAns_exact": 40.50366377495157,
      "eval_HasAns_f1": 44.072135007775564,
      "eval_HasAns_total": 11873,
      "eval_best_exact": 77.4025098964036,
      "eval_best_exact_thresh": 28.15625,
      "eval_best_f1": 80.97098112922765,
      "eval_best_f1_thresh": 28.15625,
      "eval_exact": 40.50366377495157,
      "eval_f1": 44.072135007775564,
      "eval_runtime": 85.0565,
      "eval_samples_per_second": 140.718,
      "eval_steps_per_second": 2.21,
      "eval_total": 11873,
      "step": 20400
    },
    {
      "epoch": 10.0,
      "step": 20400,
      "total_flos": 3.4100627236540416e+17,
      "train_loss": 2.000769229963714,
      "train_runtime": 15111.8581,
      "train_samples_per_second": 86.358,
      "train_steps_per_second": 1.35
    }
  ],
  "logging_steps": 500,
  "max_steps": 20400,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 5000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.4100627236540416e+17,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}