kumapo commited on
Commit
910efd2
verified
1 Parent(s): 7072cbd

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -42,7 +42,7 @@ The following hyperparameters were used during training:
42
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
43
  - lr_scheduler_type: cosine
44
  - lr_scheduler_warmup_ratio: 0.1
45
- - num_epochs: 2.0
46
 
47
  ### Training results
48
 
 
42
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
43
  - lr_scheduler_type: cosine
44
  - lr_scheduler_warmup_ratio: 0.1
45
+ - num_epochs: 1.0
46
 
47
  ### Training results
48
 
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 2.0,
3
- "eval_exact_match": 87.9783881134624,
4
- "eval_f1": 88.03091700435239,
5
- "eval_runtime": 19.8311,
6
  "eval_samples": 4442,
7
- "eval_samples_per_second": 223.991,
8
- "eval_steps_per_second": 28.037,
9
- "total_flos": 4.284348397959168e+16,
10
- "train_loss": 0.5882660921596692,
11
- "train_runtime": 1480.7732,
12
  "train_samples": 62865,
13
- "train_samples_per_second": 84.908,
14
- "train_steps_per_second": 5.308
15
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "eval_exact_match": 86.65015758667268,
4
+ "eval_f1": 86.8584507193242,
5
+ "eval_runtime": 19.8335,
6
  "eval_samples": 4442,
7
+ "eval_samples_per_second": 223.965,
8
+ "eval_steps_per_second": 28.033,
9
+ "total_flos": 2.142174198979584e+16,
10
+ "train_loss": 0.7834983495658894,
11
+ "train_runtime": 745.8939,
12
  "train_samples": 62865,
13
+ "train_samples_per_second": 84.281,
14
+ "train_steps_per_second": 5.269
15
  }
eval_nbest_predictions.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e5ea5570d46ab43aa4b55dbcdef4e5c46cc6b5f03bdebd3ce17da6f63f0f6a0
3
- size 21926555
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9a9b62efae496dfc71e9c81d39544e29648310ddb58411d39274b07f8cd0c0c
3
+ size 22281954
eval_predictions.json CHANGED
The diff for this file is too large to render. See raw diff
 
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 2.0,
3
- "eval_exact_match": 87.9783881134624,
4
- "eval_f1": 88.03091700435239,
5
- "eval_runtime": 19.8311,
6
  "eval_samples": 4442,
7
- "eval_samples_per_second": 223.991,
8
- "eval_steps_per_second": 28.037
9
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "eval_exact_match": 86.65015758667268,
4
+ "eval_f1": 86.8584507193242,
5
+ "eval_runtime": 19.8335,
6
  "eval_samples": 4442,
7
+ "eval_samples_per_second": 223.965,
8
+ "eval_steps_per_second": 28.033
9
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2aee8f70682f9064c2325273a3b27b272b196c9b1a2e217ba2625ec70c6fb65e
3
  size 749600616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84b97f11383764341096211c16978a52dd2d5bb6ddb509fe7b1b30d4f171357e
3
  size 749600616
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 2.0,
3
- "total_flos": 4.284348397959168e+16,
4
- "train_loss": 0.5882660921596692,
5
- "train_runtime": 1480.7732,
6
  "train_samples": 62865,
7
- "train_samples_per_second": 84.908,
8
- "train_steps_per_second": 5.308
9
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "total_flos": 2.142174198979584e+16,
4
+ "train_loss": 0.7834983495658894,
5
+ "train_runtime": 745.8939,
6
  "train_samples": 62865,
7
+ "train_samples_per_second": 84.281,
8
+ "train_steps_per_second": 5.269
9
  }
trainer_state.json CHANGED
@@ -2,591 +2,85 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.0,
6
  "eval_steps": 500,
7
- "global_step": 7860,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
- {
13
- "epoch": 0.02544529262086514,
14
- "grad_norm": 14.018085479736328,
15
- "learning_rate": 5.038167938931297e-06,
16
- "loss": 5.495,
17
- "step": 100
18
- },
19
- {
20
- "epoch": 0.05089058524173028,
21
- "grad_norm": 17.718544006347656,
22
- "learning_rate": 1.0127226463104327e-05,
23
- "loss": 3.2381,
24
- "step": 200
25
- },
26
- {
27
- "epoch": 0.07633587786259542,
28
- "grad_norm": 34.95506286621094,
29
- "learning_rate": 1.5216284987277354e-05,
30
- "loss": 2.3499,
31
- "step": 300
32
- },
33
- {
34
- "epoch": 0.10178117048346055,
35
- "grad_norm": 18.550844192504883,
36
- "learning_rate": 2.0305343511450384e-05,
37
- "loss": 1.6151,
38
- "step": 400
39
- },
40
  {
41
  "epoch": 0.1272264631043257,
42
- "grad_norm": 19.01876449584961,
43
- "learning_rate": 2.5394402035623415e-05,
44
- "loss": 1.2363,
45
  "step": 500
46
  },
47
- {
48
- "epoch": 0.15267175572519084,
49
- "grad_norm": 10.618987083435059,
50
- "learning_rate": 3.048346055979644e-05,
51
- "loss": 1.1415,
52
- "step": 600
53
- },
54
- {
55
- "epoch": 0.178117048346056,
56
- "grad_norm": 15.46481990814209,
57
- "learning_rate": 3.557251908396947e-05,
58
- "loss": 0.9468,
59
- "step": 700
60
- },
61
- {
62
- "epoch": 0.2035623409669211,
63
- "grad_norm": 11.06924057006836,
64
- "learning_rate": 3.9999666684789474e-05,
65
- "loss": 0.8361,
66
- "step": 800
67
- },
68
- {
69
- "epoch": 0.22900763358778625,
70
- "grad_norm": 53.707664489746094,
71
- "learning_rate": 3.997482117988182e-05,
72
- "loss": 0.7264,
73
- "step": 900
74
- },
75
  {
76
  "epoch": 0.2544529262086514,
77
- "grad_norm": 9.740643501281738,
78
- "learning_rate": 3.99105861127605e-05,
79
- "loss": 0.7597,
80
  "step": 1000
81
  },
82
- {
83
- "epoch": 0.27989821882951654,
84
- "grad_norm": 16.776819229125977,
85
- "learning_rate": 3.980708815245299e-05,
86
- "loss": 0.7007,
87
- "step": 1100
88
- },
89
- {
90
- "epoch": 0.3053435114503817,
91
- "grad_norm": 8.06242847442627,
92
- "learning_rate": 3.9664531392868807e-05,
93
- "loss": 0.6887,
94
- "step": 1200
95
- },
96
- {
97
- "epoch": 0.33078880407124683,
98
- "grad_norm": 12.792263984680176,
99
- "learning_rate": 3.9483196950334345e-05,
100
- "loss": 0.6555,
101
- "step": 1300
102
- },
103
- {
104
- "epoch": 0.356234096692112,
105
- "grad_norm": 24.67995834350586,
106
- "learning_rate": 3.9263442409242555e-05,
107
- "loss": 0.6284,
108
- "step": 1400
109
- },
110
  {
111
  "epoch": 0.3816793893129771,
112
- "grad_norm": 10.594704627990723,
113
- "learning_rate": 3.9005701116910544e-05,
114
- "loss": 0.66,
115
  "step": 1500
116
  },
117
- {
118
- "epoch": 0.4071246819338422,
119
- "grad_norm": 16.927017211914062,
120
- "learning_rate": 3.871048132903571e-05,
121
- "loss": 0.6219,
122
- "step": 1600
123
- },
124
- {
125
- "epoch": 0.43256997455470736,
126
- "grad_norm": 11.569594383239746,
127
- "learning_rate": 3.8378365207435505e-05,
128
- "loss": 0.5795,
129
- "step": 1700
130
- },
131
- {
132
- "epoch": 0.4580152671755725,
133
- "grad_norm": 7.332423210144043,
134
- "learning_rate": 3.801000767204719e-05,
135
- "loss": 0.602,
136
- "step": 1800
137
- },
138
- {
139
- "epoch": 0.48346055979643765,
140
- "grad_norm": 15.542613983154297,
141
- "learning_rate": 3.7606135109451464e-05,
142
- "loss": 0.5607,
143
- "step": 1900
144
- },
145
  {
146
  "epoch": 0.5089058524173028,
147
- "grad_norm": 4.43936824798584,
148
- "learning_rate": 3.7167543940466696e-05,
149
- "loss": 0.6086,
150
  "step": 2000
151
  },
152
- {
153
- "epoch": 0.5343511450381679,
154
- "grad_norm": 12.216741561889648,
155
- "learning_rate": 3.6695099049638365e-05,
156
- "loss": 0.5765,
157
- "step": 2100
158
- },
159
- {
160
- "epoch": 0.5597964376590331,
161
- "grad_norm": 10.111679077148438,
162
- "learning_rate": 3.618973207972071e-05,
163
- "loss": 0.5982,
164
- "step": 2200
165
- },
166
- {
167
- "epoch": 0.5852417302798982,
168
- "grad_norm": 11.400161743164062,
169
- "learning_rate": 3.5652439594513904e-05,
170
- "loss": 0.5568,
171
- "step": 2300
172
- },
173
- {
174
- "epoch": 0.6106870229007634,
175
- "grad_norm": 5.88036584854126,
176
- "learning_rate": 3.508428111367932e-05,
177
- "loss": 0.5134,
178
- "step": 2400
179
- },
180
  {
181
  "epoch": 0.6361323155216285,
182
- "grad_norm": 5.014535903930664,
183
- "learning_rate": 3.44863770234085e-05,
184
- "loss": 0.4753,
185
  "step": 2500
186
  },
187
- {
188
- "epoch": 0.6615776081424937,
189
- "grad_norm": 7.856019496917725,
190
- "learning_rate": 3.385990636706554e-05,
191
- "loss": 0.5562,
192
- "step": 2600
193
- },
194
- {
195
- "epoch": 0.6870229007633588,
196
- "grad_norm": 2.1098134517669678,
197
- "learning_rate": 3.3206104520160004e-05,
198
- "loss": 0.5474,
199
- "step": 2700
200
- },
201
- {
202
- "epoch": 0.712468193384224,
203
- "grad_norm": 6.840382099151611,
204
- "learning_rate": 3.252626075423488e-05,
205
- "loss": 0.4687,
206
- "step": 2800
207
- },
208
- {
209
- "epoch": 0.7379134860050891,
210
- "grad_norm": 11.738897323608398,
211
- "learning_rate": 3.1821715694473885e-05,
212
- "loss": 0.548,
213
- "step": 2900
214
- },
215
  {
216
  "epoch": 0.7633587786259542,
217
- "grad_norm": 9.473503112792969,
218
- "learning_rate": 3.10938586760412e-05,
219
- "loss": 0.4915,
220
  "step": 3000
221
  },
222
- {
223
- "epoch": 0.7888040712468194,
224
- "grad_norm": 6.523691177368164,
225
- "learning_rate": 3.0344125004367205e-05,
226
- "loss": 0.4947,
227
- "step": 3100
228
- },
229
- {
230
- "epoch": 0.8142493638676844,
231
- "grad_norm": 5.054864883422852,
232
- "learning_rate": 2.9573993124782555e-05,
233
- "loss": 0.5223,
234
- "step": 3200
235
- },
236
- {
237
- "epoch": 0.8396946564885496,
238
- "grad_norm": 9.77381420135498,
239
- "learning_rate": 2.8784981707082113e-05,
240
- "loss": 0.4955,
241
- "step": 3300
242
- },
243
- {
244
- "epoch": 0.8651399491094147,
245
- "grad_norm": 4.1154584884643555,
246
- "learning_rate": 2.7978646650767803e-05,
247
- "loss": 0.4927,
248
- "step": 3400
249
- },
250
  {
251
  "epoch": 0.8905852417302799,
252
- "grad_norm": 16.85251235961914,
253
- "learning_rate": 2.7156578016875935e-05,
254
- "loss": 0.5009,
255
  "step": 3500
256
  },
257
- {
258
- "epoch": 0.916030534351145,
259
- "grad_norm": 10.117000579833984,
260
- "learning_rate": 2.632039689243941e-05,
261
- "loss": 0.4807,
262
- "step": 3600
263
- },
264
- {
265
- "epoch": 0.9414758269720102,
266
- "grad_norm": 6.332391262054443,
267
- "learning_rate": 2.5471752193767702e-05,
268
- "loss": 0.5069,
269
- "step": 3700
270
- },
271
- {
272
- "epoch": 0.9669211195928753,
273
- "grad_norm": 10.722579956054688,
274
- "learning_rate": 2.4612317414848804e-05,
275
- "loss": 0.5103,
276
- "step": 3800
277
- },
278
- {
279
- "epoch": 0.9923664122137404,
280
- "grad_norm": 3.0242695808410645,
281
- "learning_rate": 2.374378732728483e-05,
282
- "loss": 0.4559,
283
- "step": 3900
284
- },
285
  {
286
  "epoch": 1.0,
287
- "eval_exact_match": 86.28995947771274,
288
- "eval_f1": 86.33873630496772,
289
- "eval_runtime": 19.9463,
290
- "eval_samples_per_second": 222.698,
291
- "eval_steps_per_second": 27.875,
292
  "step": 3930
293
  },
294
  {
295
- "epoch": 1.0178117048346056,
296
- "grad_norm": 9.833579063415527,
297
- "learning_rate": 2.2867874638269023e-05,
298
- "loss": 0.3774,
299
- "step": 4000
300
- },
301
- {
302
- "epoch": 1.0432569974554706,
303
- "grad_norm": 3.0543391704559326,
304
- "learning_rate": 2.1986306613194482e-05,
305
- "loss": 0.3241,
306
- "step": 4100
307
- },
308
- {
309
- "epoch": 1.0687022900763359,
310
- "grad_norm": 1.1584059000015259,
311
- "learning_rate": 2.1100821669554764e-05,
312
- "loss": 0.3121,
313
- "step": 4200
314
- },
315
- {
316
- "epoch": 1.094147582697201,
317
- "grad_norm": 2.015965461730957,
318
- "learning_rate": 2.0213165948852832e-05,
319
- "loss": 0.3262,
320
- "step": 4300
321
- },
322
- {
323
- "epoch": 1.1195928753180662,
324
- "grad_norm": 2.360656499862671,
325
- "learning_rate": 1.9325089873278702e-05,
326
- "loss": 0.2852,
327
- "step": 4400
328
- },
329
- {
330
- "epoch": 1.1450381679389312,
331
- "grad_norm": 11.277798652648926,
332
- "learning_rate": 1.8438344693945684e-05,
333
- "loss": 0.2839,
334
- "step": 4500
335
- },
336
- {
337
- "epoch": 1.1704834605597965,
338
- "grad_norm": 14.375648498535156,
339
- "learning_rate": 1.7554679037491995e-05,
340
- "loss": 0.3317,
341
- "step": 4600
342
- },
343
- {
344
- "epoch": 1.1959287531806615,
345
- "grad_norm": 6.140357494354248,
346
- "learning_rate": 1.667583545785781e-05,
347
- "loss": 0.3063,
348
- "step": 4700
349
- },
350
- {
351
- "epoch": 1.2213740458015268,
352
- "grad_norm": 9.926220893859863,
353
- "learning_rate": 1.5803547000037324e-05,
354
- "loss": 0.3466,
355
- "step": 4800
356
- },
357
- {
358
- "epoch": 1.2468193384223918,
359
- "grad_norm": 1.1600242853164673,
360
- "learning_rate": 1.493953378258222e-05,
361
- "loss": 0.3031,
362
- "step": 4900
363
- },
364
- {
365
- "epoch": 1.272264631043257,
366
- "grad_norm": 8.765625953674316,
367
- "learning_rate": 1.4085499605595403e-05,
368
- "loss": 0.2756,
369
- "step": 5000
370
- },
371
- {
372
- "epoch": 1.297709923664122,
373
- "grad_norm": 10.345256805419922,
374
- "learning_rate": 1.3243128590904269e-05,
375
- "loss": 0.3013,
376
- "step": 5100
377
- },
378
- {
379
- "epoch": 1.3231552162849873,
380
- "grad_norm": 3.349202871322632,
381
- "learning_rate": 1.2414081861038584e-05,
382
- "loss": 0.2554,
383
- "step": 5200
384
- },
385
- {
386
- "epoch": 1.3486005089058524,
387
- "grad_norm": 7.207827091217041,
388
- "learning_rate": 1.1599994263562202e-05,
389
- "loss": 0.2758,
390
- "step": 5300
391
- },
392
- {
393
- "epoch": 1.3740458015267176,
394
- "grad_norm": 3.8416330814361572,
395
- "learning_rate": 1.0802471147217876e-05,
396
- "loss": 0.3062,
397
- "step": 5400
398
- },
399
- {
400
- "epoch": 1.3994910941475827,
401
- "grad_norm": 3.0882201194763184,
402
- "learning_rate": 1.0023085196242704e-05,
403
- "loss": 0.3213,
404
- "step": 5500
405
- },
406
- {
407
- "epoch": 1.424936386768448,
408
- "grad_norm": 3.281527280807495,
409
- "learning_rate": 9.263373329096565e-06,
410
- "loss": 0.2757,
411
- "step": 5600
412
- },
413
- {
414
- "epoch": 1.450381679389313,
415
- "grad_norm": 4.6003217697143555,
416
- "learning_rate": 8.524833667719368e-06,
417
- "loss": 0.2978,
418
- "step": 5700
419
- },
420
- {
421
- "epoch": 1.4758269720101782,
422
- "grad_norm": 2.5000410079956055,
423
- "learning_rate": 7.808922583293472e-06,
424
- "loss": 0.2816,
425
- "step": 5800
426
- },
427
- {
428
- "epoch": 1.5012722646310432,
429
- "grad_norm": 2.911534309387207,
430
- "learning_rate": 7.117051824336958e-06,
431
- "loss": 0.2674,
432
- "step": 5900
433
- },
434
- {
435
- "epoch": 1.5267175572519083,
436
- "grad_norm": 9.231036186218262,
437
- "learning_rate": 6.450585732791013e-06,
438
- "loss": 0.3186,
439
- "step": 6000
440
- },
441
- {
442
- "epoch": 1.5521628498727735,
443
- "grad_norm": 4.873745918273926,
444
- "learning_rate": 5.810838553591298e-06,
445
- "loss": 0.2834,
446
- "step": 6100
447
- },
448
- {
449
- "epoch": 1.5776081424936388,
450
- "grad_norm": 5.407022953033447,
451
- "learning_rate": 5.199071843028569e-06,
452
- "loss": 0.2999,
453
- "step": 6200
454
- },
455
- {
456
- "epoch": 1.6030534351145038,
457
- "grad_norm": 3.650712728500366,
458
- "learning_rate": 4.616491981009243e-06,
459
- "loss": 0.2596,
460
- "step": 6300
461
- },
462
- {
463
- "epoch": 1.6284987277353689,
464
- "grad_norm": 3.6443347930908203,
465
- "learning_rate": 4.064247792121691e-06,
466
- "loss": 0.3064,
467
- "step": 6400
468
- },
469
- {
470
- "epoch": 1.6539440203562341,
471
- "grad_norm": 7.009209156036377,
472
- "learning_rate": 3.5434282801992483e-06,
473
- "loss": 0.3195,
474
- "step": 6500
475
- },
476
- {
477
- "epoch": 1.6793893129770994,
478
- "grad_norm": 5.872575759887695,
479
- "learning_rate": 3.0550604808475073e-06,
480
- "loss": 0.2682,
481
- "step": 6600
482
- },
483
- {
484
- "epoch": 1.7048346055979644,
485
- "grad_norm": 4.577968120574951,
486
- "learning_rate": 2.6001074361704427e-06,
487
- "loss": 0.3039,
488
- "step": 6700
489
- },
490
- {
491
- "epoch": 1.7302798982188294,
492
- "grad_norm": 9.756656646728516,
493
- "learning_rate": 2.1794662956892585e-06,
494
- "loss": 0.2726,
495
- "step": 6800
496
- },
497
- {
498
- "epoch": 1.7557251908396947,
499
- "grad_norm": 11.115859985351562,
500
- "learning_rate": 1.7939665471987844e-06,
501
- "loss": 0.2947,
502
- "step": 6900
503
- },
504
- {
505
- "epoch": 1.78117048346056,
506
- "grad_norm": 3.8502421379089355,
507
- "learning_rate": 1.4443683810501563e-06,
508
- "loss": 0.2809,
509
- "step": 7000
510
- },
511
- {
512
- "epoch": 1.806615776081425,
513
- "grad_norm": 11.346807479858398,
514
- "learning_rate": 1.1313611910853096e-06,
515
- "loss": 0.3048,
516
- "step": 7100
517
- },
518
- {
519
- "epoch": 1.83206106870229,
520
- "grad_norm": 10.86630916595459,
521
- "learning_rate": 8.555622151794352e-07,
522
- "loss": 0.286,
523
- "step": 7200
524
- },
525
- {
526
- "epoch": 1.8575063613231553,
527
- "grad_norm": 4.079004287719727,
528
- "learning_rate": 6.175153180721571e-07,
529
- "loss": 0.2853,
530
- "step": 7300
531
- },
532
- {
533
- "epoch": 1.8829516539440203,
534
- "grad_norm": 18.409320831298828,
535
- "learning_rate": 4.176899188876271e-07,
536
- "loss": 0.2629,
537
- "step": 7400
538
- },
539
- {
540
- "epoch": 1.9083969465648853,
541
- "grad_norm": 0.5087906718254089,
542
- "learning_rate": 2.564800654584687e-07,
543
- "loss": 0.2831,
544
- "step": 7500
545
- },
546
- {
547
- "epoch": 1.9338422391857506,
548
- "grad_norm": 6.566995620727539,
549
- "learning_rate": 1.342036572789507e-07,
550
- "loss": 0.3214,
551
- "step": 7600
552
- },
553
- {
554
- "epoch": 1.9592875318066159,
555
- "grad_norm": 11.18786334991455,
556
- "learning_rate": 5.1101818619667677e-08,
557
- "loss": 0.3245,
558
- "step": 7700
559
- },
560
- {
561
- "epoch": 1.984732824427481,
562
- "grad_norm": 1.765227198600769,
563
- "learning_rate": 7.338423039955356e-09,
564
- "loss": 0.2703,
565
- "step": 7800
566
- },
567
- {
568
- "epoch": 2.0,
569
- "eval_exact_match": 87.9783881134624,
570
- "eval_f1": 88.03091700435239,
571
- "eval_runtime": 20.4577,
572
- "eval_samples_per_second": 217.131,
573
- "eval_steps_per_second": 27.178,
574
- "step": 7860
575
- },
576
- {
577
- "epoch": 2.0,
578
- "step": 7860,
579
- "total_flos": 4.284348397959168e+16,
580
- "train_loss": 0.5882660921596692,
581
- "train_runtime": 1480.7732,
582
- "train_samples_per_second": 84.908,
583
- "train_steps_per_second": 5.308
584
  }
585
  ],
586
- "logging_steps": 100,
587
- "max_steps": 7860,
588
  "num_input_tokens_seen": 0,
589
- "num_train_epochs": 2,
590
  "save_steps": 500,
591
  "stateful_callbacks": {
592
  "TrainerControl": {
@@ -600,7 +94,7 @@
600
  "attributes": {}
601
  }
602
  },
603
- "total_flos": 4.284348397959168e+16,
604
  "train_batch_size": 16,
605
  "trial_name": null,
606
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
  "eval_steps": 500,
7
+ "global_step": 3930,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  {
13
  "epoch": 0.1272264631043257,
14
+ "grad_norm": 17.1181640625,
15
+ "learning_rate": 3.9911423090147286e-05,
16
+ "loss": 2.3375,
17
  "step": 500
18
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  {
20
  "epoch": 0.2544529262086514,
21
+ "grad_norm": 6.601884841918945,
22
+ "learning_rate": 3.717209892534846e-05,
23
+ "loss": 0.7494,
24
  "step": 1000
25
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  {
27
  "epoch": 0.3816793893129771,
28
+ "grad_norm": 6.359577655792236,
29
+ "learning_rate": 3.110124796432003e-05,
30
+ "loss": 0.6163,
31
  "step": 1500
32
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  {
34
  "epoch": 0.5089058524173028,
35
+ "grad_norm": 2.9361400604248047,
36
+ "learning_rate": 2.2876664647945828e-05,
37
+ "loss": 0.5633,
38
  "step": 2000
39
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  {
41
  "epoch": 0.6361323155216285,
42
+ "grad_norm": 4.359348297119141,
43
+ "learning_rate": 1.4093985003332392e-05,
44
+ "loss": 0.5159,
45
  "step": 2500
46
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  {
48
  "epoch": 0.7633587786259542,
49
+ "grad_norm": 10.268453598022461,
50
+ "learning_rate": 6.457120282443114e-06,
51
+ "loss": 0.4929,
52
  "step": 3000
53
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  {
55
  "epoch": 0.8905852417302799,
56
+ "grad_norm": 14.500295639038086,
57
+ "learning_rate": 1.4476843268606766e-06,
58
+ "loss": 0.4731,
59
  "step": 3500
60
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  {
62
  "epoch": 1.0,
63
+ "eval_exact_match": 86.65015758667268,
64
+ "eval_f1": 86.8584507193242,
65
+ "eval_runtime": 20.7038,
66
+ "eval_samples_per_second": 214.55,
67
+ "eval_steps_per_second": 26.855,
68
  "step": 3930
69
  },
70
  {
71
+ "epoch": 1.0,
72
+ "step": 3930,
73
+ "total_flos": 2.142174198979584e+16,
74
+ "train_loss": 0.7834983495658894,
75
+ "train_runtime": 745.8939,
76
+ "train_samples_per_second": 84.281,
77
+ "train_steps_per_second": 5.269
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  }
79
  ],
80
+ "logging_steps": 500,
81
+ "max_steps": 3930,
82
  "num_input_tokens_seen": 0,
83
+ "num_train_epochs": 1,
84
  "save_steps": 500,
85
  "stateful_callbacks": {
86
  "TrainerControl": {
 
94
  "attributes": {}
95
  }
96
  },
97
+ "total_flos": 2.142174198979584e+16,
98
  "train_batch_size": 16,
99
  "trial_name": null,
100
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef43baae43e94d2eb73cc7449719748b90a41b8bf04c08301b05f12f4f685d7b
3
  size 5713
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0caf4a2bd00f5c4190d771ad1c57c1c2bc06bae8c1880cab947433020aca0ffe
3
  size 5713