edanigoben commited on
Commit
5c71849
·
1 Parent(s): 40a3b7b

(17) Classification on crawler 66 epochs

Browse files
Files changed (7) hide show
  1. config.json +1 -1
  2. optimizer.pt +1 -1
  3. pytorch_model.bin +1 -1
  4. rng_state.pth +1 -1
  5. scheduler.pt +1 -1
  6. trainer_state.json +657 -651
  7. training_args.bin +2 -2
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "distilbert-base-uncased",
3
  "activation": "gelu",
4
  "architectures": [
5
  "DistilBertForSequenceClassification"
 
1
  {
2
+ "_name_or_path": "factored/distilbert-fr-explorer-mlm",
3
  "activation": "gelu",
4
  "architectures": [
5
  "DistilBertForSequenceClassification"
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6fa49861293b096f4ec4a7287f7759ac99a2062f0f275e63f7280fc15e0f9f6
3
  size 535750213
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d83561355ea40d67ab9012c3166f657e1ff760da6c50027af4e43144c0fff262
3
  size 535750213
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:491feb260883d9c03042e7f92de7b291e3680759cf403faee560c936244e4b35
3
  size 267880109
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32ab236035ae34ead55dc6eb93fab584687f0abd2c6e5fef0d08152a262edbf3
3
  size 267880109
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3899805085e9d2ac6cbf41d1e231c8019961099f94de919796ff76ec1a2a6d27
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5da77d5d668ca0e0c026e5aaa5f6d78e9342d0a43738836703db1ca4b7946b7d
3
  size 14575
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:772dd81534f16ccad499eda8bef2dc60508234c8daa4ecae9a5aa2c25f5ac971
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf87c4d674820bdb7f033d4d03e7ab7226e2b1c16d339409520876b41a77b97d
3
  size 627
trainer_state.json CHANGED
@@ -2,988 +2,994 @@
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 65.0,
5
- "global_step": 6760,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.99,
12
- "learning_rate": 4.9238165680473374e-05,
13
- "loss": 1.4565,
14
- "step": 103
15
  },
16
  {
17
  "epoch": 1.0,
18
- "eval_f1": 0.5061315988298739,
19
- "eval_loss": 1.226438045501709,
20
- "eval_runtime": 2.9031,
21
- "eval_samples_per_second": 190.487,
22
- "eval_steps_per_second": 12.056,
23
- "step": 104
24
  },
25
  {
26
- "epoch": 1.98,
27
- "learning_rate": 4.8483727810650895e-05,
28
- "loss": 1.08,
29
- "step": 206
30
  },
31
  {
32
  "epoch": 2.0,
33
- "eval_f1": 0.5585235698125446,
34
- "eval_loss": 1.1974396705627441,
35
- "eval_runtime": 2.9416,
36
- "eval_samples_per_second": 187.991,
37
- "eval_steps_per_second": 11.898,
38
- "step": 208
39
  },
40
  {
41
- "epoch": 2.97,
42
- "learning_rate": 4.772189349112427e-05,
43
- "loss": 0.8073,
44
- "step": 309
45
  },
46
  {
47
  "epoch": 3.0,
48
- "eval_f1": 0.5539323613883379,
49
- "eval_loss": 1.276489496231079,
50
- "eval_runtime": 2.969,
51
- "eval_samples_per_second": 186.259,
52
- "eval_steps_per_second": 11.789,
53
- "step": 312
54
  },
55
  {
56
- "epoch": 3.96,
57
- "learning_rate": 4.696005917159764e-05,
58
- "loss": 0.5577,
59
- "step": 412
60
  },
61
  {
62
  "epoch": 4.0,
63
- "eval_f1": 0.5553847052045314,
64
- "eval_loss": 1.427822232246399,
65
- "eval_runtime": 2.9361,
66
- "eval_samples_per_second": 188.346,
67
- "eval_steps_per_second": 11.921,
68
- "step": 416
69
  },
70
  {
71
- "epoch": 4.95,
72
- "learning_rate": 4.619822485207101e-05,
73
- "loss": 0.3941,
74
- "step": 515
75
  },
76
  {
77
  "epoch": 5.0,
78
- "eval_f1": 0.5570342860609194,
79
- "eval_loss": 1.6517128944396973,
80
- "eval_runtime": 2.9022,
81
- "eval_samples_per_second": 190.542,
82
- "eval_steps_per_second": 12.06,
83
- "step": 520
84
  },
85
  {
86
- "epoch": 5.94,
87
- "learning_rate": 4.543639053254438e-05,
88
- "loss": 0.2878,
89
- "step": 618
90
  },
91
  {
92
  "epoch": 6.0,
93
- "eval_f1": 0.5619826716090497,
94
- "eval_loss": 1.8180437088012695,
95
- "eval_runtime": 2.855,
96
- "eval_samples_per_second": 193.697,
97
- "eval_steps_per_second": 12.259,
98
- "step": 624
99
  },
100
  {
101
- "epoch": 6.93,
102
- "learning_rate": 4.468195266272189e-05,
103
- "loss": 0.2337,
104
- "step": 721
105
  },
106
  {
107
  "epoch": 7.0,
108
- "eval_f1": 0.5674708526030706,
109
- "eval_loss": 1.9061989784240723,
110
- "eval_runtime": 2.8641,
111
- "eval_samples_per_second": 193.077,
112
- "eval_steps_per_second": 12.22,
113
- "step": 728
114
  },
115
  {
116
- "epoch": 7.92,
117
- "learning_rate": 4.392011834319526e-05,
118
- "loss": 0.1743,
119
- "step": 824
120
  },
121
  {
122
  "epoch": 8.0,
123
- "eval_f1": 0.5571774381839604,
124
- "eval_loss": 2.166078805923462,
125
- "eval_runtime": 2.8885,
126
- "eval_samples_per_second": 191.449,
127
- "eval_steps_per_second": 12.117,
128
- "step": 832
129
  },
130
  {
131
- "epoch": 8.91,
132
- "learning_rate": 4.315828402366864e-05,
133
- "loss": 0.1324,
134
- "step": 927
135
  },
136
  {
137
  "epoch": 9.0,
138
- "eval_f1": 0.5586999359656486,
139
- "eval_loss": 2.1434192657470703,
140
- "eval_runtime": 2.8793,
141
- "eval_samples_per_second": 192.062,
142
- "eval_steps_per_second": 12.156,
143
- "step": 936
144
  },
145
  {
146
- "epoch": 9.9,
147
- "learning_rate": 4.239644970414201e-05,
148
- "loss": 0.1051,
149
- "step": 1030
150
  },
151
  {
152
  "epoch": 10.0,
153
- "eval_f1": 0.571861247626083,
154
- "eval_loss": 2.2514231204986572,
155
- "eval_runtime": 2.8876,
156
- "eval_samples_per_second": 191.511,
157
- "eval_steps_per_second": 12.121,
158
- "step": 1040
159
  },
160
  {
161
- "epoch": 10.89,
162
- "learning_rate": 4.163461538461539e-05,
163
- "loss": 0.1016,
164
- "step": 1133
165
  },
166
  {
167
  "epoch": 11.0,
168
- "eval_f1": 0.5608736700927537,
169
- "eval_loss": 2.452277898788452,
170
- "eval_runtime": 2.9662,
171
- "eval_samples_per_second": 186.434,
172
- "eval_steps_per_second": 11.8,
173
- "step": 1144
174
  },
175
  {
176
- "epoch": 11.88,
177
- "learning_rate": 4.0872781065088764e-05,
178
- "loss": 0.0814,
179
- "step": 1236
180
  },
181
  {
182
  "epoch": 12.0,
183
- "eval_f1": 0.5643677851728315,
184
- "eval_loss": 2.5340888500213623,
185
- "eval_runtime": 2.8457,
186
- "eval_samples_per_second": 194.329,
187
- "eval_steps_per_second": 12.299,
188
- "step": 1248
189
  },
190
  {
191
- "epoch": 12.88,
192
- "learning_rate": 4.0110946745562136e-05,
193
- "loss": 0.0673,
194
- "step": 1339
195
  },
196
  {
197
  "epoch": 13.0,
198
- "eval_f1": 0.5738915229311208,
199
- "eval_loss": 2.6217703819274902,
200
- "eval_runtime": 2.9035,
201
- "eval_samples_per_second": 190.46,
202
- "eval_steps_per_second": 12.054,
203
- "step": 1352
204
  },
205
  {
206
- "epoch": 13.87,
207
- "learning_rate": 3.934911242603551e-05,
208
- "loss": 0.0684,
209
- "step": 1442
210
  },
211
  {
212
  "epoch": 14.0,
213
- "eval_f1": 0.5366433281464598,
214
- "eval_loss": 2.9552414417266846,
215
- "eval_runtime": 2.9063,
216
- "eval_samples_per_second": 190.277,
217
- "eval_steps_per_second": 12.043,
218
- "step": 1456
219
  },
220
  {
221
- "epoch": 14.86,
222
- "learning_rate": 3.858727810650888e-05,
223
- "loss": 0.0466,
224
- "step": 1545
225
  },
226
  {
227
  "epoch": 15.0,
228
- "eval_f1": 0.5787084254032917,
229
- "eval_loss": 2.7240512371063232,
230
- "eval_runtime": 2.8887,
231
- "eval_samples_per_second": 191.438,
232
- "eval_steps_per_second": 12.116,
233
- "step": 1560
234
  },
235
  {
236
- "epoch": 15.85,
237
- "learning_rate": 3.782544378698225e-05,
238
- "loss": 0.0577,
239
- "step": 1648
240
  },
241
  {
242
  "epoch": 16.0,
243
- "eval_f1": 0.5666557248979172,
244
- "eval_loss": 2.821897506713867,
245
- "eval_runtime": 2.8994,
246
- "eval_samples_per_second": 190.727,
247
- "eval_steps_per_second": 12.071,
248
- "step": 1664
249
  },
250
  {
251
- "epoch": 16.84,
252
- "learning_rate": 3.706360946745562e-05,
253
- "loss": 0.042,
254
- "step": 1751
255
  },
256
  {
257
  "epoch": 17.0,
258
- "eval_f1": 0.56033452806457,
259
- "eval_loss": 2.9155900478363037,
260
- "eval_runtime": 2.9258,
261
- "eval_samples_per_second": 189.01,
262
- "eval_steps_per_second": 11.963,
263
- "step": 1768
264
  },
265
  {
266
- "epoch": 17.83,
267
- "learning_rate": 3.6301775147928995e-05,
268
- "loss": 0.0404,
269
- "step": 1854
270
  },
271
  {
272
  "epoch": 18.0,
273
- "eval_f1": 0.5621979513908701,
274
- "eval_loss": 2.893630266189575,
275
- "eval_runtime": 2.9319,
276
- "eval_samples_per_second": 188.613,
277
- "eval_steps_per_second": 11.938,
278
- "step": 1872
279
  },
280
  {
281
- "epoch": 18.82,
282
- "learning_rate": 3.553994082840237e-05,
283
- "loss": 0.0426,
284
- "step": 1957
285
  },
286
  {
287
  "epoch": 19.0,
288
- "eval_f1": 0.5766503161850353,
289
- "eval_loss": 3.00762939453125,
290
- "eval_runtime": 2.9064,
291
- "eval_samples_per_second": 190.271,
292
- "eval_steps_per_second": 12.042,
293
- "step": 1976
294
  },
295
  {
296
- "epoch": 19.81,
297
- "learning_rate": 3.477810650887574e-05,
298
- "loss": 0.0361,
299
- "step": 2060
300
  },
301
  {
302
  "epoch": 20.0,
303
- "eval_f1": 0.5565194377868121,
304
- "eval_loss": 3.043562173843384,
305
- "eval_runtime": 2.9099,
306
- "eval_samples_per_second": 190.043,
307
- "eval_steps_per_second": 12.028,
308
- "step": 2080
309
  },
310
  {
311
- "epoch": 20.8,
312
- "learning_rate": 3.401627218934911e-05,
313
- "loss": 0.039,
314
- "step": 2163
315
  },
316
  {
317
  "epoch": 21.0,
318
- "eval_f1": 0.5661283975776907,
319
- "eval_loss": 3.034050226211548,
320
- "eval_runtime": 2.8472,
321
- "eval_samples_per_second": 194.228,
322
- "eval_steps_per_second": 12.293,
323
- "step": 2184
324
  },
325
  {
326
- "epoch": 21.79,
327
- "learning_rate": 3.325443786982248e-05,
328
- "loss": 0.0311,
329
- "step": 2266
330
  },
331
  {
332
  "epoch": 22.0,
333
- "eval_f1": 0.5698598461896062,
334
- "eval_loss": 3.1546428203582764,
335
- "eval_runtime": 2.8965,
336
- "eval_samples_per_second": 190.921,
337
- "eval_steps_per_second": 12.084,
338
- "step": 2288
339
  },
340
  {
341
- "epoch": 22.78,
342
- "learning_rate": 3.2492603550295855e-05,
343
- "loss": 0.0296,
344
- "step": 2369
345
  },
346
  {
347
  "epoch": 23.0,
348
- "eval_f1": 0.5584145320343268,
349
- "eval_loss": 3.3160221576690674,
350
- "eval_runtime": 2.9004,
351
- "eval_samples_per_second": 190.664,
352
- "eval_steps_per_second": 12.067,
353
- "step": 2392
354
  },
355
  {
356
- "epoch": 23.77,
357
- "learning_rate": 3.1730769230769234e-05,
358
- "loss": 0.03,
359
- "step": 2472
360
  },
361
  {
362
  "epoch": 24.0,
363
- "eval_f1": 0.5765799312977243,
364
- "eval_loss": 3.2025678157806396,
365
- "eval_runtime": 2.837,
366
- "eval_samples_per_second": 194.921,
367
- "eval_steps_per_second": 12.337,
368
- "step": 2496
369
  },
370
  {
371
- "epoch": 24.76,
372
- "learning_rate": 3.0968934911242606e-05,
373
- "loss": 0.0333,
374
- "step": 2575
375
  },
376
  {
377
  "epoch": 25.0,
378
- "eval_f1": 0.5689553713820321,
379
- "eval_loss": 3.211634397506714,
380
- "eval_runtime": 2.9044,
381
- "eval_samples_per_second": 190.402,
382
- "eval_steps_per_second": 12.051,
383
- "step": 2600
384
  },
385
  {
386
- "epoch": 25.75,
387
- "learning_rate": 3.0207100591715974e-05,
388
- "loss": 0.0321,
389
- "step": 2678
390
  },
391
  {
392
  "epoch": 26.0,
393
- "eval_f1": 0.5756108062994573,
394
- "eval_loss": 3.2678425312042236,
395
- "eval_runtime": 2.8888,
396
- "eval_samples_per_second": 191.428,
397
- "eval_steps_per_second": 12.116,
398
- "step": 2704
399
  },
400
  {
401
- "epoch": 26.74,
402
- "learning_rate": 2.944526627218935e-05,
403
- "loss": 0.0263,
404
- "step": 2781
405
  },
406
  {
407
  "epoch": 27.0,
408
- "eval_f1": 0.5758065273285641,
409
- "eval_loss": 3.2969822883605957,
410
- "eval_runtime": 2.9527,
411
- "eval_samples_per_second": 187.286,
412
- "eval_steps_per_second": 11.854,
413
- "step": 2808
414
  },
415
  {
416
- "epoch": 27.73,
417
- "learning_rate": 2.8683431952662725e-05,
418
- "loss": 0.0281,
419
- "step": 2884
420
  },
421
  {
422
  "epoch": 28.0,
423
- "eval_f1": 0.5781354966097151,
424
- "eval_loss": 3.3730037212371826,
425
- "eval_runtime": 2.8614,
426
- "eval_samples_per_second": 193.264,
427
- "eval_steps_per_second": 12.232,
428
- "step": 2912
429
  },
430
  {
431
- "epoch": 28.72,
432
- "learning_rate": 2.7921597633136097e-05,
433
- "loss": 0.0282,
434
- "step": 2987
435
  },
436
  {
437
  "epoch": 29.0,
438
- "eval_f1": 0.5741866124789994,
439
- "eval_loss": 3.364117383956909,
440
- "eval_runtime": 2.8696,
441
- "eval_samples_per_second": 192.707,
442
- "eval_steps_per_second": 12.197,
443
- "step": 3016
444
  },
445
  {
446
- "epoch": 29.71,
447
- "learning_rate": 2.7159763313609472e-05,
448
  "loss": 0.0296,
449
- "step": 3090
450
  },
451
  {
452
  "epoch": 30.0,
453
- "eval_f1": 0.5771762774162508,
454
- "eval_loss": 3.3623032569885254,
455
- "eval_runtime": 2.9567,
456
- "eval_samples_per_second": 187.031,
457
- "eval_steps_per_second": 11.837,
458
- "step": 3120
459
  },
460
  {
461
- "epoch": 30.7,
462
- "learning_rate": 2.6397928994082844e-05,
463
- "loss": 0.0308,
464
- "step": 3193
465
  },
466
  {
467
  "epoch": 31.0,
468
- "eval_f1": 0.578537002980747,
469
- "eval_loss": 3.4039528369903564,
470
- "eval_runtime": 2.8263,
471
- "eval_samples_per_second": 195.66,
472
- "eval_steps_per_second": 12.384,
473
- "step": 3224
474
  },
475
  {
476
- "epoch": 31.69,
477
- "learning_rate": 2.5636094674556216e-05,
478
- "loss": 0.0308,
479
- "step": 3296
480
  },
481
  {
482
  "epoch": 32.0,
483
- "eval_f1": 0.575919412837488,
484
- "eval_loss": 3.392319679260254,
485
- "eval_runtime": 2.9375,
486
- "eval_samples_per_second": 188.254,
487
- "eval_steps_per_second": 11.915,
488
- "step": 3328
489
  },
490
  {
491
- "epoch": 32.68,
492
- "learning_rate": 2.4874260355029588e-05,
493
- "loss": 0.0262,
494
- "step": 3399
495
  },
496
  {
497
  "epoch": 33.0,
498
- "eval_f1": 0.5563772891428104,
499
- "eval_loss": 3.4757542610168457,
500
- "eval_runtime": 2.865,
501
- "eval_samples_per_second": 193.019,
502
- "eval_steps_per_second": 12.216,
503
- "step": 3432
504
  },
505
  {
506
- "epoch": 33.67,
507
- "learning_rate": 2.411242603550296e-05,
508
- "loss": 0.0319,
509
- "step": 3502
510
  },
511
  {
512
  "epoch": 34.0,
513
- "eval_f1": 0.5738865992034025,
514
- "eval_loss": 3.425334930419922,
515
- "eval_runtime": 2.9109,
516
- "eval_samples_per_second": 189.974,
517
- "eval_steps_per_second": 12.024,
518
- "step": 3536
519
  },
520
  {
521
- "epoch": 34.66,
522
- "learning_rate": 2.3350591715976332e-05,
523
- "loss": 0.0277,
524
- "step": 3605
525
  },
526
  {
527
  "epoch": 35.0,
528
- "eval_f1": 0.5785980513801816,
529
- "eval_loss": 3.4686436653137207,
530
- "eval_runtime": 2.93,
531
- "eval_samples_per_second": 188.738,
532
- "eval_steps_per_second": 11.945,
533
- "step": 3640
534
  },
535
  {
536
- "epoch": 35.65,
537
- "learning_rate": 2.2588757396449707e-05,
538
- "loss": 0.0289,
539
- "step": 3708
540
  },
541
  {
542
  "epoch": 36.0,
543
- "eval_f1": 0.5836924697871717,
544
- "eval_loss": 3.462078094482422,
545
- "eval_runtime": 2.8428,
546
- "eval_samples_per_second": 194.527,
547
- "eval_steps_per_second": 12.312,
548
- "step": 3744
549
  },
550
  {
551
- "epoch": 36.64,
552
- "learning_rate": 2.182692307692308e-05,
553
- "loss": 0.0247,
554
- "step": 3811
555
  },
556
  {
557
  "epoch": 37.0,
558
- "eval_f1": 0.5734707197245945,
559
- "eval_loss": 3.481998920440674,
560
- "eval_runtime": 3.0017,
561
- "eval_samples_per_second": 184.228,
562
- "eval_steps_per_second": 11.66,
563
- "step": 3848
564
  },
565
  {
566
- "epoch": 37.63,
567
- "learning_rate": 2.106508875739645e-05,
568
- "loss": 0.0303,
569
- "step": 3914
570
  },
571
  {
572
  "epoch": 38.0,
573
- "eval_f1": 0.5770262969511715,
574
- "eval_loss": 3.466510772705078,
575
- "eval_runtime": 2.8587,
576
- "eval_samples_per_second": 193.442,
577
- "eval_steps_per_second": 12.243,
578
- "step": 3952
579
  },
580
  {
581
- "epoch": 38.62,
582
- "learning_rate": 2.0303254437869823e-05,
583
- "loss": 0.0239,
584
- "step": 4017
585
  },
586
  {
587
  "epoch": 39.0,
588
- "eval_f1": 0.5666519467364683,
589
- "eval_loss": 3.5593807697296143,
590
- "eval_runtime": 2.8222,
591
- "eval_samples_per_second": 195.946,
592
- "eval_steps_per_second": 12.402,
593
- "step": 4056
594
  },
595
  {
596
- "epoch": 39.62,
597
- "learning_rate": 1.9541420118343195e-05,
598
- "loss": 0.0262,
599
- "step": 4120
600
  },
601
  {
602
  "epoch": 40.0,
603
- "eval_f1": 0.5808476343157906,
604
- "eval_loss": 3.5302422046661377,
605
- "eval_runtime": 2.8598,
606
- "eval_samples_per_second": 193.368,
607
- "eval_steps_per_second": 12.238,
608
- "step": 4160
609
  },
610
  {
611
- "epoch": 40.61,
612
- "learning_rate": 1.8779585798816567e-05,
613
- "loss": 0.0282,
614
- "step": 4223
615
  },
616
  {
617
  "epoch": 41.0,
618
- "eval_f1": 0.5835890408164021,
619
- "eval_loss": 3.4572339057922363,
620
- "eval_runtime": 2.8566,
621
- "eval_samples_per_second": 193.584,
622
- "eval_steps_per_second": 12.252,
623
- "step": 4264
624
  },
625
  {
626
- "epoch": 41.6,
627
- "learning_rate": 1.8025147928994084e-05,
628
- "loss": 0.0469,
629
- "step": 4326
630
  },
631
  {
632
  "epoch": 42.0,
633
- "eval_f1": 0.5685331156394952,
634
- "eval_loss": 3.609334707260132,
635
- "eval_runtime": 2.8251,
636
- "eval_samples_per_second": 195.747,
637
- "eval_steps_per_second": 12.389,
638
- "step": 4368
639
  },
640
  {
641
- "epoch": 42.59,
642
- "learning_rate": 1.7263313609467456e-05,
643
- "loss": 0.0302,
644
- "step": 4429
645
  },
646
  {
647
  "epoch": 43.0,
648
- "eval_f1": 0.5684067370608473,
649
- "eval_loss": 3.6115400791168213,
650
- "eval_runtime": 2.9194,
651
- "eval_samples_per_second": 189.42,
652
- "eval_steps_per_second": 11.989,
653
- "step": 4472
654
  },
655
  {
656
- "epoch": 43.58,
657
- "learning_rate": 1.650147928994083e-05,
658
- "loss": 0.0289,
659
- "step": 4532
660
  },
661
  {
662
  "epoch": 44.0,
663
- "eval_f1": 0.5757900647671246,
664
- "eval_loss": 3.629568099975586,
665
- "eval_runtime": 2.9036,
666
- "eval_samples_per_second": 190.453,
667
- "eval_steps_per_second": 12.054,
668
- "step": 4576
669
  },
670
  {
671
- "epoch": 44.57,
672
- "learning_rate": 1.5739644970414204e-05,
673
- "loss": 0.0254,
674
- "step": 4635
675
  },
676
  {
677
  "epoch": 45.0,
678
- "eval_f1": 0.5689505752768721,
679
- "eval_loss": 3.7250843048095703,
680
- "eval_runtime": 2.9726,
681
- "eval_samples_per_second": 186.035,
682
- "eval_steps_per_second": 11.774,
683
- "step": 4680
684
  },
685
  {
686
- "epoch": 45.56,
687
- "learning_rate": 1.4977810650887576e-05,
688
- "loss": 0.0283,
689
- "step": 4738
690
  },
691
  {
692
  "epoch": 46.0,
693
- "eval_f1": 0.5592198654774546,
694
- "eval_loss": 3.726353645324707,
695
- "eval_runtime": 2.9328,
696
- "eval_samples_per_second": 188.559,
697
- "eval_steps_per_second": 11.934,
698
- "step": 4784
699
  },
700
  {
701
- "epoch": 46.55,
702
- "learning_rate": 1.4215976331360948e-05,
703
- "loss": 0.0246,
704
- "step": 4841
705
  },
706
  {
707
  "epoch": 47.0,
708
- "eval_f1": 0.5650157110711802,
709
- "eval_loss": 3.7832093238830566,
710
- "eval_runtime": 2.9067,
711
- "eval_samples_per_second": 190.249,
712
- "eval_steps_per_second": 12.041,
713
- "step": 4888
714
  },
715
  {
716
- "epoch": 47.54,
717
- "learning_rate": 1.345414201183432e-05,
718
- "loss": 0.0311,
719
- "step": 4944
720
  },
721
  {
722
  "epoch": 48.0,
723
- "eval_f1": 0.5681512072556809,
724
- "eval_loss": 3.6964025497436523,
725
- "eval_runtime": 2.9008,
726
- "eval_samples_per_second": 190.634,
727
- "eval_steps_per_second": 12.065,
728
- "step": 4992
729
  },
730
  {
731
- "epoch": 48.53,
732
- "learning_rate": 1.2692307692307691e-05,
733
- "loss": 0.0268,
734
- "step": 5047
735
  },
736
  {
737
  "epoch": 49.0,
738
- "eval_f1": 0.5674808111122996,
739
- "eval_loss": 3.7195167541503906,
740
- "eval_runtime": 2.8604,
741
- "eval_samples_per_second": 193.33,
742
- "eval_steps_per_second": 12.236,
743
- "step": 5096
744
  },
745
  {
746
- "epoch": 49.52,
747
- "learning_rate": 1.1930473372781067e-05,
748
- "loss": 0.0293,
749
- "step": 5150
750
  },
751
  {
752
  "epoch": 50.0,
753
- "eval_f1": 0.5614419693521525,
754
- "eval_loss": 3.752530097961426,
755
- "eval_runtime": 2.8761,
756
- "eval_samples_per_second": 192.275,
757
- "eval_steps_per_second": 12.169,
758
- "step": 5200
759
  },
760
  {
761
- "epoch": 50.51,
762
- "learning_rate": 1.1168639053254439e-05,
763
- "loss": 0.0282,
764
- "step": 5253
765
  },
766
  {
767
  "epoch": 51.0,
768
- "eval_f1": 0.5655838635083059,
769
- "eval_loss": 3.7514984607696533,
770
- "eval_runtime": 2.8609,
771
- "eval_samples_per_second": 193.296,
772
- "eval_steps_per_second": 12.234,
773
- "step": 5304
774
  },
775
  {
776
- "epoch": 51.5,
777
- "learning_rate": 1.040680473372781e-05,
778
- "loss": 0.0248,
779
- "step": 5356
 
 
 
 
 
 
 
 
 
780
  },
781
  {
782
  "epoch": 52.0,
783
- "eval_f1": 0.5590951084274065,
784
- "eval_loss": 3.7639315128326416,
785
- "eval_runtime": 2.8211,
786
- "eval_samples_per_second": 196.025,
787
- "eval_steps_per_second": 12.407,
788
- "step": 5408
789
  },
790
  {
791
- "epoch": 52.49,
792
- "learning_rate": 9.644970414201183e-06,
793
- "loss": 0.0257,
794
- "step": 5459
795
  },
796
  {
797
  "epoch": 53.0,
798
- "eval_f1": 0.5480134247467852,
799
- "eval_loss": 3.824922800064087,
800
- "eval_runtime": 2.8475,
801
- "eval_samples_per_second": 194.205,
802
- "eval_steps_per_second": 12.291,
803
- "step": 5512
804
  },
805
  {
806
- "epoch": 53.48,
807
- "learning_rate": 8.883136094674558e-06,
808
- "loss": 0.0235,
809
- "step": 5562
810
  },
811
  {
812
  "epoch": 54.0,
813
- "eval_f1": 0.5565796472147394,
814
- "eval_loss": 3.7871253490448,
815
- "eval_runtime": 2.9817,
816
- "eval_samples_per_second": 185.462,
817
- "eval_steps_per_second": 11.738,
818
- "step": 5616
819
  },
820
  {
821
- "epoch": 54.47,
822
- "learning_rate": 8.12130177514793e-06,
823
- "loss": 0.0299,
824
- "step": 5665
825
  },
826
  {
827
  "epoch": 55.0,
828
- "eval_f1": 0.5574154263000176,
829
- "eval_loss": 3.788760185241699,
830
- "eval_runtime": 2.8852,
831
- "eval_samples_per_second": 191.665,
832
- "eval_steps_per_second": 12.131,
833
- "step": 5720
834
  },
835
  {
836
- "epoch": 55.46,
837
- "learning_rate": 7.359467455621302e-06,
838
- "loss": 0.0277,
839
- "step": 5768
840
  },
841
  {
842
  "epoch": 56.0,
843
- "eval_f1": 0.563024311843682,
844
- "eval_loss": 3.7907044887542725,
845
- "eval_runtime": 2.8658,
846
- "eval_samples_per_second": 192.962,
847
- "eval_steps_per_second": 12.213,
848
- "step": 5824
849
  },
850
  {
851
- "epoch": 56.45,
852
- "learning_rate": 6.597633136094675e-06,
853
- "loss": 0.0256,
854
- "step": 5871
855
  },
856
  {
857
  "epoch": 57.0,
858
- "eval_f1": 0.56153234588093,
859
- "eval_loss": 3.799422264099121,
860
- "eval_runtime": 2.8666,
861
- "eval_samples_per_second": 192.912,
862
- "eval_steps_per_second": 12.21,
863
- "step": 5928
864
  },
865
  {
866
- "epoch": 57.44,
867
- "learning_rate": 5.8357988165680474e-06,
868
- "loss": 0.0226,
869
- "step": 5974
870
  },
871
  {
872
  "epoch": 58.0,
873
- "eval_f1": 0.5555061070073688,
874
- "eval_loss": 3.811858892440796,
875
- "eval_runtime": 2.8683,
876
- "eval_samples_per_second": 192.797,
877
- "eval_steps_per_second": 12.202,
878
- "step": 6032
879
  },
880
  {
881
- "epoch": 58.43,
882
- "learning_rate": 5.07396449704142e-06,
883
- "loss": 0.0284,
884
- "step": 6077
885
  },
886
  {
887
  "epoch": 59.0,
888
- "eval_f1": 0.5597671150511061,
889
- "eval_loss": 3.8192451000213623,
890
- "eval_runtime": 2.8512,
891
- "eval_samples_per_second": 193.951,
892
- "eval_steps_per_second": 12.275,
893
- "step": 6136
894
  },
895
  {
896
- "epoch": 59.42,
897
- "learning_rate": 4.312130177514793e-06,
898
- "loss": 0.0233,
899
- "step": 6180
900
  },
901
  {
902
  "epoch": 60.0,
903
- "eval_f1": 0.5584681716027172,
904
- "eval_loss": 3.823091983795166,
905
- "eval_runtime": 2.9385,
906
- "eval_samples_per_second": 188.191,
907
- "eval_steps_per_second": 11.911,
908
- "step": 6240
909
  },
910
  {
911
- "epoch": 60.41,
912
- "learning_rate": 3.550295857988166e-06,
913
- "loss": 0.0266,
914
- "step": 6283
915
  },
916
  {
917
  "epoch": 61.0,
918
- "eval_f1": 0.5625000576804086,
919
- "eval_loss": 3.8085415363311768,
920
- "eval_runtime": 2.9015,
921
- "eval_samples_per_second": 190.588,
922
- "eval_steps_per_second": 12.063,
923
- "step": 6344
924
  },
925
  {
926
- "epoch": 61.4,
927
- "learning_rate": 2.7958579881656803e-06,
928
- "loss": 0.0267,
929
- "step": 6386
930
  },
931
  {
932
  "epoch": 62.0,
933
- "eval_f1": 0.5622167257088028,
934
- "eval_loss": 3.80642032623291,
935
- "eval_runtime": 2.8514,
936
- "eval_samples_per_second": 193.94,
937
- "eval_steps_per_second": 12.275,
938
- "step": 6448
939
  },
940
  {
941
- "epoch": 62.39,
942
- "learning_rate": 2.034023668639053e-06,
943
- "loss": 0.0281,
944
- "step": 6489
945
  },
946
  {
947
  "epoch": 63.0,
948
- "eval_f1": 0.564106811375439,
949
- "eval_loss": 3.8057875633239746,
950
- "eval_runtime": 2.8945,
951
- "eval_samples_per_second": 191.055,
952
- "eval_steps_per_second": 12.092,
953
- "step": 6552
954
  },
955
  {
956
- "epoch": 63.38,
957
- "learning_rate": 1.2721893491124261e-06,
958
- "loss": 0.025,
959
- "step": 6592
960
  },
961
  {
962
  "epoch": 64.0,
963
- "eval_f1": 0.5644375312998279,
964
- "eval_loss": 3.807055950164795,
965
- "eval_runtime": 2.8941,
966
- "eval_samples_per_second": 191.08,
967
- "eval_steps_per_second": 12.094,
968
- "step": 6656
969
  },
970
  {
971
- "epoch": 64.38,
972
- "learning_rate": 5.103550295857988e-07,
973
- "loss": 0.0226,
974
- "step": 6695
975
  },
976
  {
977
  "epoch": 65.0,
978
- "eval_f1": 0.5644375312998279,
979
- "eval_loss": 3.807528018951416,
980
- "eval_runtime": 2.8626,
981
- "eval_samples_per_second": 193.181,
982
- "eval_steps_per_second": 12.227,
983
- "step": 6760
984
  }
985
  ],
986
- "max_steps": 6760,
987
  "num_train_epochs": 65,
988
  "total_flos": 1.4286659901696e+16,
989
  "trial_name": null,
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 65.0,
5
+ "global_step": 3380,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.98,
12
+ "learning_rate": 4.9245562130177516e-05,
13
+ "loss": 1.4852,
14
+ "step": 51
15
  },
16
  {
17
  "epoch": 1.0,
18
+ "eval_f1": 0.4707671304103,
19
+ "eval_loss": 1.2319071292877197,
20
+ "eval_runtime": 3.2712,
21
+ "eval_samples_per_second": 169.049,
22
+ "eval_steps_per_second": 5.502,
23
+ "step": 52
24
  },
25
  {
26
+ "epoch": 1.96,
27
+ "learning_rate": 4.849112426035503e-05,
28
+ "loss": 1.09,
29
+ "step": 102
30
  },
31
  {
32
  "epoch": 2.0,
33
+ "eval_f1": 0.5451014378336685,
34
+ "eval_loss": 1.1638911962509155,
35
+ "eval_runtime": 3.1885,
36
+ "eval_samples_per_second": 173.438,
37
+ "eval_steps_per_second": 5.645,
38
+ "step": 104
39
  },
40
  {
41
+ "epoch": 2.94,
42
+ "learning_rate": 4.7736686390532545e-05,
43
+ "loss": 0.8292,
44
+ "step": 153
45
  },
46
  {
47
  "epoch": 3.0,
48
+ "eval_f1": 0.5496218087071496,
49
+ "eval_loss": 1.2528048753738403,
50
+ "eval_runtime": 3.3002,
51
+ "eval_samples_per_second": 167.566,
52
+ "eval_steps_per_second": 5.454,
53
+ "step": 156
54
  },
55
  {
56
+ "epoch": 3.92,
57
+ "learning_rate": 4.698224852071006e-05,
58
+ "loss": 0.584,
59
+ "step": 204
60
  },
61
  {
62
  "epoch": 4.0,
63
+ "eval_f1": 0.5422764616085469,
64
+ "eval_loss": 1.3278894424438477,
65
+ "eval_runtime": 3.234,
66
+ "eval_samples_per_second": 170.997,
67
+ "eval_steps_per_second": 5.566,
68
+ "step": 208
69
  },
70
  {
71
+ "epoch": 4.9,
72
+ "learning_rate": 4.622781065088758e-05,
73
+ "loss": 0.4007,
74
+ "step": 255
75
  },
76
  {
77
  "epoch": 5.0,
78
+ "eval_f1": 0.5518226359533898,
79
+ "eval_loss": 1.4516562223434448,
80
+ "eval_runtime": 3.204,
81
+ "eval_samples_per_second": 172.595,
82
+ "eval_steps_per_second": 5.618,
83
+ "step": 260
84
  },
85
  {
86
+ "epoch": 5.88,
87
+ "learning_rate": 4.5473372781065094e-05,
88
+ "loss": 0.2696,
89
+ "step": 306
90
  },
91
  {
92
  "epoch": 6.0,
93
+ "eval_f1": 0.5608323328724897,
94
+ "eval_loss": 1.6225392818450928,
95
+ "eval_runtime": 3.2139,
96
+ "eval_samples_per_second": 172.066,
97
+ "eval_steps_per_second": 5.601,
98
+ "step": 312
99
  },
100
  {
101
+ "epoch": 6.87,
102
+ "learning_rate": 4.471893491124261e-05,
103
+ "loss": 0.2147,
104
+ "step": 357
105
  },
106
  {
107
  "epoch": 7.0,
108
+ "eval_f1": 0.5520909376322324,
109
+ "eval_loss": 1.8203964233398438,
110
+ "eval_runtime": 3.1995,
111
+ "eval_samples_per_second": 172.841,
112
+ "eval_steps_per_second": 5.626,
113
+ "step": 364
114
  },
115
  {
116
+ "epoch": 7.85,
117
+ "learning_rate": 4.396449704142012e-05,
118
+ "loss": 0.1709,
119
+ "step": 408
120
  },
121
  {
122
  "epoch": 8.0,
123
+ "eval_f1": 0.5821755753782172,
124
+ "eval_loss": 1.8175333738327026,
125
+ "eval_runtime": 3.1993,
126
+ "eval_samples_per_second": 172.851,
127
+ "eval_steps_per_second": 5.626,
128
+ "step": 416
129
  },
130
  {
131
+ "epoch": 8.83,
132
+ "learning_rate": 4.3210059171597636e-05,
133
+ "loss": 0.1287,
134
+ "step": 459
135
  },
136
  {
137
  "epoch": 9.0,
138
+ "eval_f1": 0.5518326258604016,
139
+ "eval_loss": 1.9974777698516846,
140
+ "eval_runtime": 3.189,
141
+ "eval_samples_per_second": 173.41,
142
+ "eval_steps_per_second": 5.644,
143
+ "step": 468
144
  },
145
  {
146
+ "epoch": 9.81,
147
+ "learning_rate": 4.245562130177515e-05,
148
+ "loss": 0.1204,
149
+ "step": 510
150
  },
151
  {
152
  "epoch": 10.0,
153
+ "eval_f1": 0.5406559029286824,
154
+ "eval_loss": 2.1665594577789307,
155
+ "eval_runtime": 3.3389,
156
+ "eval_samples_per_second": 165.625,
157
+ "eval_steps_per_second": 5.391,
158
+ "step": 520
159
  },
160
  {
161
+ "epoch": 10.79,
162
+ "learning_rate": 4.1701183431952664e-05,
163
+ "loss": 0.1055,
164
+ "step": 561
165
  },
166
  {
167
  "epoch": 11.0,
168
+ "eval_f1": 0.554377033229914,
169
+ "eval_loss": 2.281852960586548,
170
+ "eval_runtime": 3.2286,
171
+ "eval_samples_per_second": 171.281,
172
+ "eval_steps_per_second": 5.575,
173
+ "step": 572
174
  },
175
  {
176
+ "epoch": 11.77,
177
+ "learning_rate": 4.094674556213018e-05,
178
+ "loss": 0.0847,
179
+ "step": 612
180
  },
181
  {
182
  "epoch": 12.0,
183
+ "eval_f1": 0.5624060360392482,
184
+ "eval_loss": 2.189591646194458,
185
+ "eval_runtime": 3.193,
186
+ "eval_samples_per_second": 173.191,
187
+ "eval_steps_per_second": 5.637,
188
+ "step": 624
189
  },
190
  {
191
+ "epoch": 12.75,
192
+ "learning_rate": 4.019230769230769e-05,
193
+ "loss": 0.0628,
194
+ "step": 663
195
  },
196
  {
197
  "epoch": 13.0,
198
+ "eval_f1": 0.5701909735422656,
199
+ "eval_loss": 2.138963222503662,
200
+ "eval_runtime": 3.1991,
201
+ "eval_samples_per_second": 172.862,
202
+ "eval_steps_per_second": 5.627,
203
+ "step": 676
204
  },
205
  {
206
+ "epoch": 13.73,
207
+ "learning_rate": 3.9437869822485207e-05,
208
+ "loss": 0.0709,
209
+ "step": 714
210
  },
211
  {
212
  "epoch": 14.0,
213
+ "eval_f1": 0.5813682933762342,
214
+ "eval_loss": 2.342538356781006,
215
+ "eval_runtime": 3.1993,
216
+ "eval_samples_per_second": 172.851,
217
+ "eval_steps_per_second": 5.626,
218
+ "step": 728
219
  },
220
  {
221
+ "epoch": 14.71,
222
+ "learning_rate": 3.868343195266273e-05,
223
+ "loss": 0.0558,
224
+ "step": 765
225
  },
226
  {
227
  "epoch": 15.0,
228
+ "eval_f1": 0.549666763295021,
229
+ "eval_loss": 2.5715904235839844,
230
+ "eval_runtime": 3.2358,
231
+ "eval_samples_per_second": 170.901,
232
+ "eval_steps_per_second": 5.563,
233
+ "step": 780
234
  },
235
  {
236
+ "epoch": 15.69,
237
+ "learning_rate": 3.792899408284024e-05,
238
+ "loss": 0.0547,
239
+ "step": 816
240
  },
241
  {
242
  "epoch": 16.0,
243
+ "eval_f1": 0.5734318107728033,
244
+ "eval_loss": 2.3701794147491455,
245
+ "eval_runtime": 3.1669,
246
+ "eval_samples_per_second": 174.62,
247
+ "eval_steps_per_second": 5.684,
248
+ "step": 832
249
  },
250
  {
251
+ "epoch": 16.67,
252
+ "learning_rate": 3.7174556213017756e-05,
253
+ "loss": 0.0364,
254
+ "step": 867
255
  },
256
  {
257
  "epoch": 17.0,
258
+ "eval_f1": 0.5778801114194217,
259
+ "eval_loss": 2.543994665145874,
260
+ "eval_runtime": 3.2632,
261
+ "eval_samples_per_second": 169.464,
262
+ "eval_steps_per_second": 5.516,
263
+ "step": 884
264
  },
265
  {
266
+ "epoch": 17.65,
267
+ "learning_rate": 3.642011834319527e-05,
268
+ "loss": 0.0461,
269
+ "step": 918
270
  },
271
  {
272
  "epoch": 18.0,
273
+ "eval_f1": 0.5500625539858052,
274
+ "eval_loss": 2.6528751850128174,
275
+ "eval_runtime": 3.1926,
276
+ "eval_samples_per_second": 173.215,
277
+ "eval_steps_per_second": 5.638,
278
+ "step": 936
279
  },
280
  {
281
+ "epoch": 18.63,
282
+ "learning_rate": 3.5665680473372784e-05,
283
+ "loss": 0.0427,
284
+ "step": 969
285
  },
286
  {
287
  "epoch": 19.0,
288
+ "eval_f1": 0.5774206875551234,
289
+ "eval_loss": 2.50394606590271,
290
+ "eval_runtime": 3.1994,
291
+ "eval_samples_per_second": 172.847,
292
+ "eval_steps_per_second": 5.626,
293
+ "step": 988
294
  },
295
  {
296
+ "epoch": 19.62,
297
+ "learning_rate": 3.49112426035503e-05,
298
+ "loss": 0.0341,
299
+ "step": 1020
300
  },
301
  {
302
  "epoch": 20.0,
303
+ "eval_f1": 0.5610511407715569,
304
+ "eval_loss": 2.6159815788269043,
305
+ "eval_runtime": 3.1937,
306
+ "eval_samples_per_second": 173.156,
307
+ "eval_steps_per_second": 5.636,
308
+ "step": 1040
309
  },
310
  {
311
+ "epoch": 20.6,
312
+ "learning_rate": 3.415680473372781e-05,
313
+ "loss": 0.043,
314
+ "step": 1071
315
  },
316
  {
317
  "epoch": 21.0,
318
+ "eval_f1": 0.5664231277655097,
319
+ "eval_loss": 2.642545223236084,
320
+ "eval_runtime": 3.2172,
321
+ "eval_samples_per_second": 171.891,
322
+ "eval_steps_per_second": 5.595,
323
+ "step": 1092
324
  },
325
  {
326
+ "epoch": 21.58,
327
+ "learning_rate": 3.3402366863905326e-05,
328
+ "loss": 0.0299,
329
+ "step": 1122
330
  },
331
  {
332
  "epoch": 22.0,
333
+ "eval_f1": 0.5879893270089217,
334
+ "eval_loss": 2.617424488067627,
335
+ "eval_runtime": 3.199,
336
+ "eval_samples_per_second": 172.868,
337
+ "eval_steps_per_second": 5.627,
338
+ "step": 1144
339
  },
340
  {
341
+ "epoch": 22.56,
342
+ "learning_rate": 3.264792899408285e-05,
343
+ "loss": 0.032,
344
+ "step": 1173
345
  },
346
  {
347
  "epoch": 23.0,
348
+ "eval_f1": 0.5728848934589272,
349
+ "eval_loss": 2.663990020751953,
350
+ "eval_runtime": 3.2443,
351
+ "eval_samples_per_second": 170.455,
352
+ "eval_steps_per_second": 5.548,
353
+ "step": 1196
354
  },
355
  {
356
+ "epoch": 23.54,
357
+ "learning_rate": 3.1893491124260354e-05,
358
+ "loss": 0.0303,
359
+ "step": 1224
360
  },
361
  {
362
  "epoch": 24.0,
363
+ "eval_f1": 0.5830104673892542,
364
+ "eval_loss": 2.712137222290039,
365
+ "eval_runtime": 3.2419,
366
+ "eval_samples_per_second": 170.577,
367
+ "eval_steps_per_second": 5.552,
368
+ "step": 1248
369
  },
370
  {
371
+ "epoch": 24.52,
372
+ "learning_rate": 3.1139053254437875e-05,
373
+ "loss": 0.0367,
374
+ "step": 1275
375
  },
376
  {
377
  "epoch": 25.0,
378
+ "eval_f1": 0.5769164843276932,
379
+ "eval_loss": 2.775130033493042,
380
+ "eval_runtime": 3.1631,
381
+ "eval_samples_per_second": 174.826,
382
+ "eval_steps_per_second": 5.691,
383
+ "step": 1300
384
  },
385
  {
386
+ "epoch": 25.5,
387
+ "learning_rate": 3.0384615384615382e-05,
388
+ "loss": 0.0307,
389
+ "step": 1326
390
  },
391
  {
392
  "epoch": 26.0,
393
+ "eval_f1": 0.5821869589539869,
394
+ "eval_loss": 2.7242813110351562,
395
+ "eval_runtime": 3.2145,
396
+ "eval_samples_per_second": 172.034,
397
+ "eval_steps_per_second": 5.6,
398
+ "step": 1352
399
  },
400
  {
401
+ "epoch": 26.48,
402
+ "learning_rate": 2.96301775147929e-05,
403
+ "loss": 0.0316,
404
+ "step": 1377
405
  },
406
  {
407
  "epoch": 27.0,
408
+ "eval_f1": 0.5969806619714588,
409
+ "eval_loss": 2.7311301231384277,
410
+ "eval_runtime": 3.2074,
411
+ "eval_samples_per_second": 172.413,
412
+ "eval_steps_per_second": 5.612,
413
+ "step": 1404
414
  },
415
  {
416
+ "epoch": 27.46,
417
+ "learning_rate": 2.8875739644970417e-05,
418
+ "loss": 0.0259,
419
+ "step": 1428
420
  },
421
  {
422
  "epoch": 28.0,
423
+ "eval_f1": 0.5851034622344854,
424
+ "eval_loss": 2.7522106170654297,
425
+ "eval_runtime": 3.2604,
426
+ "eval_samples_per_second": 169.609,
427
+ "eval_steps_per_second": 5.521,
428
+ "step": 1456
429
  },
430
  {
431
+ "epoch": 28.44,
432
+ "learning_rate": 2.8121301775147928e-05,
433
+ "loss": 0.0312,
434
+ "step": 1479
435
  },
436
  {
437
  "epoch": 29.0,
438
+ "eval_f1": 0.5701248207833717,
439
+ "eval_loss": 2.759821653366089,
440
+ "eval_runtime": 3.1781,
441
+ "eval_samples_per_second": 174.003,
442
+ "eval_steps_per_second": 5.664,
443
+ "step": 1508
444
  },
445
  {
446
+ "epoch": 29.42,
447
+ "learning_rate": 2.7366863905325446e-05,
448
  "loss": 0.0296,
449
+ "step": 1530
450
  },
451
  {
452
  "epoch": 30.0,
453
+ "eval_f1": 0.5647225459731341,
454
+ "eval_loss": 2.9071295261383057,
455
+ "eval_runtime": 3.2316,
456
+ "eval_samples_per_second": 171.124,
457
+ "eval_steps_per_second": 5.57,
458
+ "step": 1560
459
  },
460
  {
461
+ "epoch": 30.4,
462
+ "learning_rate": 2.6612426035502956e-05,
463
+ "loss": 0.0301,
464
+ "step": 1581
465
  },
466
  {
467
  "epoch": 31.0,
468
+ "eval_f1": 0.5812217101167925,
469
+ "eval_loss": 2.818995952606201,
470
+ "eval_runtime": 3.1912,
471
+ "eval_samples_per_second": 173.291,
472
+ "eval_steps_per_second": 5.641,
473
+ "step": 1612
474
  },
475
  {
476
+ "epoch": 31.38,
477
+ "learning_rate": 2.5857988165680474e-05,
478
+ "loss": 0.0283,
479
+ "step": 1632
480
  },
481
  {
482
  "epoch": 32.0,
483
+ "eval_f1": 0.5715581356858528,
484
+ "eval_loss": 2.8269426822662354,
485
+ "eval_runtime": 3.1794,
486
+ "eval_samples_per_second": 173.931,
487
+ "eval_steps_per_second": 5.661,
488
+ "step": 1664
489
  },
490
  {
491
+ "epoch": 32.37,
492
+ "learning_rate": 2.510355029585799e-05,
493
+ "loss": 0.0274,
494
+ "step": 1683
495
  },
496
  {
497
  "epoch": 33.0,
498
+ "eval_f1": 0.5793102191468632,
499
+ "eval_loss": 2.8590047359466553,
500
+ "eval_runtime": 3.236,
501
+ "eval_samples_per_second": 170.888,
502
+ "eval_steps_per_second": 5.562,
503
+ "step": 1716
504
  },
505
  {
506
+ "epoch": 33.35,
507
+ "learning_rate": 2.4349112426035502e-05,
508
+ "loss": 0.0282,
509
+ "step": 1734
510
  },
511
  {
512
  "epoch": 34.0,
513
+ "eval_f1": 0.5745047948377129,
514
+ "eval_loss": 2.861100435256958,
515
+ "eval_runtime": 3.3158,
516
+ "eval_samples_per_second": 166.779,
517
+ "eval_steps_per_second": 5.429,
518
+ "step": 1768
519
  },
520
  {
521
+ "epoch": 34.33,
522
+ "learning_rate": 2.359467455621302e-05,
523
+ "loss": 0.0283,
524
+ "step": 1785
525
  },
526
  {
527
  "epoch": 35.0,
528
+ "eval_f1": 0.5817013281834372,
529
+ "eval_loss": 2.895744562149048,
530
+ "eval_runtime": 3.1931,
531
+ "eval_samples_per_second": 173.188,
532
+ "eval_steps_per_second": 5.637,
533
+ "step": 1820
534
  },
535
  {
536
+ "epoch": 35.31,
537
+ "learning_rate": 2.2840236686390534e-05,
538
+ "loss": 0.0279,
539
+ "step": 1836
540
  },
541
  {
542
  "epoch": 36.0,
543
+ "eval_f1": 0.5775728570319975,
544
+ "eval_loss": 2.9081838130950928,
545
+ "eval_runtime": 3.1654,
546
+ "eval_samples_per_second": 174.702,
547
+ "eval_steps_per_second": 5.686,
548
+ "step": 1872
549
  },
550
  {
551
+ "epoch": 36.29,
552
+ "learning_rate": 2.2085798816568048e-05,
553
+ "loss": 0.0233,
554
+ "step": 1887
555
  },
556
  {
557
  "epoch": 37.0,
558
+ "eval_f1": 0.5849617909408964,
559
+ "eval_loss": 2.89267635345459,
560
+ "eval_runtime": 3.2172,
561
+ "eval_samples_per_second": 171.889,
562
+ "eval_steps_per_second": 5.595,
563
+ "step": 1924
564
  },
565
  {
566
+ "epoch": 37.27,
567
+ "learning_rate": 2.1331360946745562e-05,
568
+ "loss": 0.0254,
569
+ "step": 1938
570
  },
571
  {
572
  "epoch": 38.0,
573
+ "eval_f1": 0.5836059453944484,
574
+ "eval_loss": 2.904184341430664,
575
+ "eval_runtime": 3.1651,
576
+ "eval_samples_per_second": 174.716,
577
+ "eval_steps_per_second": 5.687,
578
+ "step": 1976
579
  },
580
  {
581
+ "epoch": 38.25,
582
+ "learning_rate": 2.0576923076923076e-05,
583
+ "loss": 0.0283,
584
+ "step": 1989
585
  },
586
  {
587
  "epoch": 39.0,
588
+ "eval_f1": 0.5889740350751246,
589
+ "eval_loss": 2.901627779006958,
590
+ "eval_runtime": 3.2172,
591
+ "eval_samples_per_second": 171.888,
592
+ "eval_steps_per_second": 5.595,
593
+ "step": 2028
594
  },
595
  {
596
+ "epoch": 39.23,
597
+ "learning_rate": 1.9822485207100593e-05,
598
+ "loss": 0.0276,
599
+ "step": 2040
600
  },
601
  {
602
  "epoch": 40.0,
603
+ "eval_f1": 0.592031757725387,
604
+ "eval_loss": 2.916260242462158,
605
+ "eval_runtime": 3.1892,
606
+ "eval_samples_per_second": 173.397,
607
+ "eval_steps_per_second": 5.644,
608
+ "step": 2080
609
  },
610
  {
611
+ "epoch": 40.21,
612
+ "learning_rate": 1.9068047337278107e-05,
613
+ "loss": 0.0273,
614
+ "step": 2091
615
  },
616
  {
617
  "epoch": 41.0,
618
+ "eval_f1": 0.5849688005890464,
619
+ "eval_loss": 2.934328317642212,
620
+ "eval_runtime": 3.161,
621
+ "eval_samples_per_second": 174.947,
622
+ "eval_steps_per_second": 5.694,
623
+ "step": 2132
624
  },
625
  {
626
+ "epoch": 41.19,
627
+ "learning_rate": 1.831360946745562e-05,
628
+ "loss": 0.0263,
629
+ "step": 2142
630
  },
631
  {
632
  "epoch": 42.0,
633
+ "eval_f1": 0.5811250364444013,
634
+ "eval_loss": 2.9349024295806885,
635
+ "eval_runtime": 3.1625,
636
+ "eval_samples_per_second": 174.86,
637
+ "eval_steps_per_second": 5.692,
638
+ "step": 2184
639
  },
640
  {
641
+ "epoch": 42.17,
642
+ "learning_rate": 1.7559171597633136e-05,
643
+ "loss": 0.0277,
644
+ "step": 2193
645
  },
646
  {
647
  "epoch": 43.0,
648
+ "eval_f1": 0.573433909955672,
649
+ "eval_loss": 2.940727710723877,
650
+ "eval_runtime": 3.1528,
651
+ "eval_samples_per_second": 175.4,
652
+ "eval_steps_per_second": 5.709,
653
+ "step": 2236
654
  },
655
  {
656
+ "epoch": 43.15,
657
+ "learning_rate": 1.680473372781065e-05,
658
+ "loss": 0.0254,
659
+ "step": 2244
660
  },
661
  {
662
  "epoch": 44.0,
663
+ "eval_f1": 0.5729801056423723,
664
+ "eval_loss": 2.9454445838928223,
665
+ "eval_runtime": 3.2178,
666
+ "eval_samples_per_second": 171.857,
667
+ "eval_steps_per_second": 5.594,
668
+ "step": 2288
669
  },
670
  {
671
+ "epoch": 44.13,
672
+ "learning_rate": 1.6050295857988164e-05,
673
+ "loss": 0.0292,
674
+ "step": 2295
675
  },
676
  {
677
  "epoch": 45.0,
678
+ "eval_f1": 0.5757088867272762,
679
+ "eval_loss": 2.9577114582061768,
680
+ "eval_runtime": 3.2403,
681
+ "eval_samples_per_second": 170.665,
682
+ "eval_steps_per_second": 5.555,
683
+ "step": 2340
684
  },
685
  {
686
+ "epoch": 45.12,
687
+ "learning_rate": 1.529585798816568e-05,
688
+ "loss": 0.0254,
689
+ "step": 2346
690
  },
691
  {
692
  "epoch": 46.0,
693
+ "eval_f1": 0.5725735614892553,
694
+ "eval_loss": 2.972485065460205,
695
+ "eval_runtime": 3.2679,
696
+ "eval_samples_per_second": 169.22,
697
+ "eval_steps_per_second": 5.508,
698
+ "step": 2392
699
  },
700
  {
701
+ "epoch": 46.1,
702
+ "learning_rate": 1.4541420118343197e-05,
703
+ "loss": 0.0276,
704
+ "step": 2397
705
  },
706
  {
707
  "epoch": 47.0,
708
+ "eval_f1": 0.583757236585762,
709
+ "eval_loss": 2.956446886062622,
710
+ "eval_runtime": 3.2021,
711
+ "eval_samples_per_second": 172.699,
712
+ "eval_steps_per_second": 5.621,
713
+ "step": 2444
714
  },
715
  {
716
+ "epoch": 47.08,
717
+ "learning_rate": 1.3786982248520711e-05,
718
+ "loss": 0.0253,
719
+ "step": 2448
720
  },
721
  {
722
  "epoch": 48.0,
723
+ "eval_f1": 0.5853028564259082,
724
+ "eval_loss": 2.9614322185516357,
725
+ "eval_runtime": 3.2052,
726
+ "eval_samples_per_second": 172.533,
727
+ "eval_steps_per_second": 5.616,
728
+ "step": 2496
729
  },
730
  {
731
+ "epoch": 48.06,
732
+ "learning_rate": 1.3032544378698225e-05,
733
+ "loss": 0.028,
734
+ "step": 2499
735
  },
736
  {
737
  "epoch": 49.0,
738
+ "eval_f1": 0.5795749762231537,
739
+ "eval_loss": 2.9922454357147217,
740
+ "eval_runtime": 3.1853,
741
+ "eval_samples_per_second": 173.608,
742
+ "eval_steps_per_second": 5.651,
743
+ "step": 2548
744
  },
745
  {
746
+ "epoch": 49.04,
747
+ "learning_rate": 1.2278106508875741e-05,
748
+ "loss": 0.0256,
749
+ "step": 2550
750
  },
751
  {
752
  "epoch": 50.0,
753
+ "eval_f1": 0.583866618944334,
754
+ "eval_loss": 2.9824576377868652,
755
+ "eval_runtime": 3.193,
756
+ "eval_samples_per_second": 173.194,
757
+ "eval_steps_per_second": 5.637,
758
+ "step": 2600
759
  },
760
  {
761
+ "epoch": 50.02,
762
+ "learning_rate": 1.1523668639053255e-05,
763
+ "loss": 0.0269,
764
+ "step": 2601
765
  },
766
  {
767
  "epoch": 51.0,
768
+ "learning_rate": 1.0769230769230771e-05,
769
+ "loss": 0.0263,
770
+ "step": 2652
 
 
 
771
  },
772
  {
773
+ "epoch": 51.0,
774
+ "eval_f1": 0.5850045217224726,
775
+ "eval_loss": 2.9774513244628906,
776
+ "eval_runtime": 3.1985,
777
+ "eval_samples_per_second": 172.892,
778
+ "eval_steps_per_second": 5.628,
779
+ "step": 2652
780
+ },
781
+ {
782
+ "epoch": 51.98,
783
+ "learning_rate": 1.0014792899408285e-05,
784
+ "loss": 0.0262,
785
+ "step": 2703
786
  },
787
  {
788
  "epoch": 52.0,
789
+ "eval_f1": 0.5828168242240888,
790
+ "eval_loss": 2.986954927444458,
791
+ "eval_runtime": 3.2441,
792
+ "eval_samples_per_second": 170.463,
793
+ "eval_steps_per_second": 5.549,
794
+ "step": 2704
795
  },
796
  {
797
+ "epoch": 52.96,
798
+ "learning_rate": 9.2603550295858e-06,
799
+ "loss": 0.0263,
800
+ "step": 2754
801
  },
802
  {
803
  "epoch": 53.0,
804
+ "eval_f1": 0.5813629807328557,
805
+ "eval_loss": 3.000241756439209,
806
+ "eval_runtime": 3.2231,
807
+ "eval_samples_per_second": 171.574,
808
+ "eval_steps_per_second": 5.585,
809
+ "step": 2756
810
  },
811
  {
812
+ "epoch": 53.94,
813
+ "learning_rate": 8.505917159763315e-06,
814
+ "loss": 0.0269,
815
+ "step": 2805
816
  },
817
  {
818
  "epoch": 54.0,
819
+ "eval_f1": 0.5806797641919809,
820
+ "eval_loss": 3.013312339782715,
821
+ "eval_runtime": 3.1848,
822
+ "eval_samples_per_second": 173.638,
823
+ "eval_steps_per_second": 5.652,
824
+ "step": 2808
825
  },
826
  {
827
+ "epoch": 54.92,
828
+ "learning_rate": 7.751479289940829e-06,
829
+ "loss": 0.0263,
830
+ "step": 2856
831
  },
832
  {
833
  "epoch": 55.0,
834
+ "eval_f1": 0.5781702729189796,
835
+ "eval_loss": 3.0137646198272705,
836
+ "eval_runtime": 3.2153,
837
+ "eval_samples_per_second": 171.991,
838
+ "eval_steps_per_second": 5.598,
839
+ "step": 2860
840
  },
841
  {
842
+ "epoch": 55.9,
843
+ "learning_rate": 6.997041420118343e-06,
844
+ "loss": 0.025,
845
+ "step": 2907
846
  },
847
  {
848
  "epoch": 56.0,
849
+ "eval_f1": 0.5787181162619701,
850
+ "eval_loss": 3.0211129188537598,
851
+ "eval_runtime": 3.2002,
852
+ "eval_samples_per_second": 172.8,
853
+ "eval_steps_per_second": 5.625,
854
+ "step": 2912
855
  },
856
  {
857
+ "epoch": 56.88,
858
+ "learning_rate": 6.242603550295858e-06,
859
+ "loss": 0.0266,
860
+ "step": 2958
861
  },
862
  {
863
  "epoch": 57.0,
864
+ "eval_f1": 0.5789670702676735,
865
+ "eval_loss": 3.0238091945648193,
866
+ "eval_runtime": 3.16,
867
+ "eval_samples_per_second": 175.001,
868
+ "eval_steps_per_second": 5.696,
869
+ "step": 2964
870
  },
871
  {
872
+ "epoch": 57.87,
873
+ "learning_rate": 5.488165680473373e-06,
874
+ "loss": 0.0245,
875
+ "step": 3009
876
  },
877
  {
878
  "epoch": 58.0,
879
+ "eval_f1": 0.5818538088014551,
880
+ "eval_loss": 3.0299057960510254,
881
+ "eval_runtime": 3.2046,
882
+ "eval_samples_per_second": 172.563,
883
+ "eval_steps_per_second": 5.617,
884
+ "step": 3016
885
  },
886
  {
887
+ "epoch": 58.85,
888
+ "learning_rate": 4.733727810650888e-06,
889
+ "loss": 0.0246,
890
+ "step": 3060
891
  },
892
  {
893
  "epoch": 59.0,
894
+ "eval_f1": 0.5789670702676735,
895
+ "eval_loss": 3.030819892883301,
896
+ "eval_runtime": 3.2069,
897
+ "eval_samples_per_second": 172.439,
898
+ "eval_steps_per_second": 5.613,
899
+ "step": 3068
900
  },
901
  {
902
+ "epoch": 59.83,
903
+ "learning_rate": 3.979289940828403e-06,
904
+ "loss": 0.0255,
905
+ "step": 3111
906
  },
907
  {
908
  "epoch": 60.0,
909
+ "eval_f1": 0.5789041950659493,
910
+ "eval_loss": 3.034423828125,
911
+ "eval_runtime": 3.2026,
912
+ "eval_samples_per_second": 172.67,
913
+ "eval_steps_per_second": 5.62,
914
+ "step": 3120
915
  },
916
  {
917
+ "epoch": 60.81,
918
+ "learning_rate": 3.2248520710059175e-06,
919
+ "loss": 0.0263,
920
+ "step": 3162
921
  },
922
  {
923
  "epoch": 61.0,
924
+ "eval_f1": 0.5789041950659493,
925
+ "eval_loss": 3.0331101417541504,
926
+ "eval_runtime": 3.2106,
927
+ "eval_samples_per_second": 172.241,
928
+ "eval_steps_per_second": 5.606,
929
+ "step": 3172
930
  },
931
  {
932
+ "epoch": 61.79,
933
+ "learning_rate": 2.470414201183432e-06,
934
+ "loss": 0.0239,
935
+ "step": 3213
936
  },
937
  {
938
  "epoch": 62.0,
939
+ "eval_f1": 0.5771494365349826,
940
+ "eval_loss": 3.0290367603302,
941
+ "eval_runtime": 3.1976,
942
+ "eval_samples_per_second": 172.941,
943
+ "eval_steps_per_second": 5.629,
944
+ "step": 3224
945
  },
946
  {
947
+ "epoch": 62.77,
948
+ "learning_rate": 1.7159763313609468e-06,
949
+ "loss": 0.0296,
950
+ "step": 3264
951
  },
952
  {
953
  "epoch": 63.0,
954
+ "eval_f1": 0.5807646048923409,
955
+ "eval_loss": 3.03131103515625,
956
+ "eval_runtime": 3.1937,
957
+ "eval_samples_per_second": 173.155,
958
+ "eval_steps_per_second": 5.636,
959
+ "step": 3276
960
  },
961
  {
962
+ "epoch": 63.75,
963
+ "learning_rate": 9.615384615384617e-07,
964
+ "loss": 0.0224,
965
+ "step": 3315
966
  },
967
  {
968
  "epoch": 64.0,
969
+ "eval_f1": 0.5772710868092534,
970
+ "eval_loss": 3.0326967239379883,
971
+ "eval_runtime": 3.2373,
972
+ "eval_samples_per_second": 170.823,
973
+ "eval_steps_per_second": 5.56,
974
+ "step": 3328
975
  },
976
  {
977
+ "epoch": 64.73,
978
+ "learning_rate": 2.0710059171597635e-07,
979
+ "loss": 0.0271,
980
+ "step": 3366
981
  },
982
  {
983
  "epoch": 65.0,
984
+ "eval_f1": 0.5772710868092534,
985
+ "eval_loss": 3.033085346221924,
986
+ "eval_runtime": 3.1894,
987
+ "eval_samples_per_second": 173.385,
988
+ "eval_steps_per_second": 5.644,
989
+ "step": 3380
990
  }
991
  ],
992
+ "max_steps": 3380,
993
  "num_train_epochs": 65,
994
  "total_flos": 1.4286659901696e+16,
995
  "trial_name": null,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69dba15562adc963c0e958faa40949482ed3f4a4e7db086eedc1130e4eecc7b6
3
- size 3707
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d587ee297d25c8e66693d6fc322dd9143b16b65714461c36ef30f3be59b6fd24
3
+ size 3643