AlekseyKorshuk commited on
Commit
22862bd
·
1 Parent(s): a589ae8

huggingartists

Browse files
README.md CHANGED
@@ -45,15 +45,15 @@ from datasets import load_dataset
45
  dataset = load_dataset("huggingartists/eminem")
46
  ```
47
 
48
- [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/2qs74c6g/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Eminem's lyrics.
53
 
54
- Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/2tpuc31a) for full transparency and reproducibility.
55
 
56
- At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/2tpuc31a/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
 
45
  dataset = load_dataset("huggingartists/eminem")
46
  ```
47
 
48
+ [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/s2mjtgo8/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Eminem's lyrics.
53
 
54
+ Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/zikdyrc6) for full transparency and reproducibility.
55
 
56
+ At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/zikdyrc6/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
evaluation.txt CHANGED
@@ -1 +1 @@
1
- {"eval_loss": 0.1634998470544815, "eval_runtime": 14.8673, "eval_samples_per_second": 41.568, "eval_steps_per_second": 5.246, "epoch": 6.0}
 
1
+ {"eval_loss": 0.20774193108081818, "eval_runtime": 15.0542, "eval_samples_per_second": 42.978, "eval_steps_per_second": 5.381, "epoch": 3.0}
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:50a3a0c4e02f3c76233da8dd7daa8bdb92af6f6a517bbe0f81cb9c5af8bc811e
3
  size 497764120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:758c6b87d8212253a5334ae7f98913538370445564928cae53eff8b1f96ba0d1
3
  size 497764120
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2399fa5b0be2b088f4b9d64615b38bf9577c1b2d6121dbdb50e56bcc9686dc80
3
  size 995604017
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a24e99d4dc7013f43cbadb32badf86893275bcd52f36e3a3e6188836728497e1
3
  size 995604017
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5bab5ad9763860d7d14b63f0d5fe99d998b132f1cff9ed17bc063f5fcd361d17
3
  size 510396521
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:470bb080f7f86a03fc1b9714d4c1fcfd018e4a30715683ad366a847fc90b25a7
3
  size 510396521
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70105776b79d7d810d0504318367cfecea051515ab03660167c6b973ba2574a0
3
  size 14567
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dd4f3adbf918816f8bc24c02487c37270992d6dc1de5e29ef3b17931ced7d9e
3
  size 14567
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d3f154dee44c2e3a5b59da341bc4475af22b3ae583ed30333d76616051a6c98
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2eae8c7bd95e47c6b3eb60b363caee41d6c38cf405dd6ff23467a076f503128e
3
  size 623
trainer_state.json CHANGED
@@ -1,1138 +1,1118 @@
1
  {
2
- "best_metric": 0.1634998470544815,
3
- "best_model_checkpoint": "output/eminem/checkpoint-916",
4
  "epoch": 2.0,
5
- "global_step": 916,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.01,
12
- "learning_rate": 0.0001295937875943477,
13
- "loss": 0.4175,
14
  "step": 5
15
  },
16
  {
17
  "epoch": 0.02,
18
- "learning_rate": 0.0001306389012238537,
19
- "loss": 0.4452,
20
  "step": 10
21
  },
22
  {
23
  "epoch": 0.03,
24
- "learning_rate": 0.00013161040580202325,
25
- "loss": 0.4523,
26
  "step": 15
27
  },
28
  {
29
  "epoch": 0.04,
30
- "learning_rate": 0.00013250714864031736,
31
- "loss": 0.4272,
32
  "step": 20
33
  },
34
  {
35
- "epoch": 0.05,
36
- "learning_rate": 0.00013332806575487712,
37
- "loss": 0.4701,
38
  "step": 25
39
  },
40
  {
41
  "epoch": 0.07,
42
- "learning_rate": 0.00013407218312893365,
43
- "loss": 0.4872,
44
  "step": 30
45
  },
46
  {
47
  "epoch": 0.08,
48
- "learning_rate": 0.00013473861786848294,
49
- "loss": 0.4231,
50
  "step": 35
51
  },
52
  {
53
  "epoch": 0.09,
54
- "learning_rate": 0.00013532657924983333,
55
- "loss": 0.4708,
56
  "step": 40
57
  },
58
  {
59
  "epoch": 0.1,
60
- "learning_rate": 0.0001358353696578007,
61
- "loss": 0.5047,
62
  "step": 45
63
  },
64
  {
65
  "epoch": 0.11,
66
- "learning_rate": 0.00013626438541342652,
67
- "loss": 0.4957,
68
  "step": 50
69
  },
70
  {
71
  "epoch": 0.12,
72
- "learning_rate": 0.00013661311749024328,
73
- "loss": 0.4333,
74
  "step": 55
75
  },
76
  {
77
  "epoch": 0.13,
78
- "learning_rate": 0.0001368811521182315,
79
- "loss": 0.4417,
80
  "step": 60
81
  },
82
  {
83
  "epoch": 0.14,
84
- "learning_rate": 0.00013706817127475857,
85
- "loss": 0.4644,
86
  "step": 65
87
  },
88
  {
89
  "epoch": 0.15,
90
- "learning_rate": 0.00013717395306191163,
91
- "loss": 0.5235,
92
  "step": 70
93
  },
94
  {
95
- "epoch": 0.16,
96
- "learning_rate": 0.00013719837196977938,
97
- "loss": 0.4143,
98
  "step": 75
99
  },
100
  {
101
  "epoch": 0.18,
102
- "learning_rate": 0.00013714139902536895,
103
- "loss": 0.4418,
104
  "step": 80
105
  },
106
  {
107
  "epoch": 0.19,
108
- "learning_rate": 0.00013700310182698214,
109
- "loss": 0.4862,
110
  "step": 85
111
  },
112
  {
113
  "epoch": 0.2,
114
- "learning_rate": 0.0001367836444640114,
115
- "loss": 0.5152,
116
  "step": 90
117
  },
118
  {
119
  "epoch": 0.21,
120
- "learning_rate": 0.00013648328732224639,
121
- "loss": 0.4401,
122
  "step": 95
123
  },
124
  {
125
  "epoch": 0.22,
126
- "learning_rate": 0.00013610238677492728,
127
- "loss": 0.4883,
128
  "step": 100
129
  },
130
  {
131
  "epoch": 0.23,
132
- "learning_rate": 0.00013564139475990883,
133
- "loss": 0.475,
134
  "step": 105
135
  },
136
  {
137
  "epoch": 0.24,
138
- "learning_rate": 0.0001351008582434381,
139
- "loss": 0.4708,
140
  "step": 110
141
  },
142
  {
143
  "epoch": 0.25,
144
- "learning_rate": 0.00013448141857117668,
145
- "loss": 0.5114,
146
  "step": 115
147
  },
148
  {
149
  "epoch": 0.26,
150
- "learning_rate": 0.000133783810707247,
151
- "loss": 0.4598,
152
  "step": 120
153
  },
154
  {
155
- "epoch": 0.27,
156
- "learning_rate": 0.00013300886236219912,
157
- "loss": 0.5016,
158
  "step": 125
159
  },
160
  {
161
  "epoch": 0.29,
162
- "learning_rate": 0.00013215749301093531,
163
- "loss": 0.5246,
164
  "step": 130
165
  },
166
  {
167
  "epoch": 0.3,
168
- "learning_rate": 0.0001312307128017492,
169
- "loss": 0.4599,
170
  "step": 135
171
  },
172
  {
173
  "epoch": 0.31,
174
- "learning_rate": 0.00013022962135779,
175
- "loss": 0.5193,
176
  "step": 140
177
  },
178
  {
179
  "epoch": 0.32,
180
- "learning_rate": 0.0001291554064723639,
181
- "loss": 0.4855,
182
  "step": 145
183
  },
184
  {
185
  "epoch": 0.33,
186
- "learning_rate": 0.00012800934269961218,
187
- "loss": 0.4923,
188
  "step": 150
189
  },
190
  {
191
  "epoch": 0.34,
192
- "learning_rate": 0.00012679278984226595,
193
- "loss": 0.5141,
194
  "step": 155
195
  },
196
  {
197
  "epoch": 0.35,
198
- "learning_rate": 0.00012550719133822919,
199
- "loss": 0.4847,
200
  "step": 160
201
  },
202
  {
203
  "epoch": 0.36,
204
- "learning_rate": 0.0001241540725479539,
205
- "loss": 0.4419,
206
  "step": 165
207
  },
208
  {
209
  "epoch": 0.37,
210
- "learning_rate": 0.00012273503894459195,
211
- "loss": 0.5324,
212
  "step": 170
213
  },
214
  {
215
- "epoch": 0.38,
216
- "learning_rate": 0.00012125177420911749,
217
- "loss": 0.4099,
218
  "step": 175
219
  },
220
  {
221
- "epoch": 0.39,
222
- "learning_rate": 0.00011970603823262598,
223
- "loss": 0.4894,
224
  "step": 180
225
  },
226
  {
227
  "epoch": 0.41,
228
- "learning_rate": 0.00011809966502824082,
229
- "loss": 0.5617,
230
  "step": 185
231
  },
232
  {
233
  "epoch": 0.42,
234
- "learning_rate": 0.00011643456055504982,
235
- "loss": 0.5006,
236
  "step": 190
237
  },
238
  {
239
  "epoch": 0.43,
240
- "learning_rate": 0.00011471270045669035,
241
- "loss": 0.4947,
242
  "step": 195
243
  },
244
  {
245
  "epoch": 0.44,
246
- "learning_rate": 0.00011293612771726151,
247
- "loss": 0.5112,
248
  "step": 200
249
  },
250
  {
251
  "epoch": 0.45,
252
- "learning_rate": 0.00011110695023730843,
253
- "loss": 0.4745,
254
  "step": 205
255
  },
256
  {
257
  "epoch": 0.46,
258
- "learning_rate": 0.00010922733833281926,
259
- "loss": 0.4961,
260
  "step": 210
261
  },
262
  {
263
  "epoch": 0.47,
264
- "learning_rate": 0.0001072995221601338,
265
- "loss": 0.5159,
266
  "step": 215
267
  },
268
  {
269
  "epoch": 0.48,
270
- "learning_rate": 0.00010532578906988555,
271
- "loss": 0.4521,
272
  "step": 220
273
  },
274
  {
275
- "epoch": 0.49,
276
- "learning_rate": 0.00010330848089304184,
277
- "loss": 0.4683,
278
  "step": 225
279
  },
280
  {
281
- "epoch": 0.5,
282
- "learning_rate": 0.00010124999116234466,
283
- "loss": 0.4694,
284
  "step": 230
285
  },
286
  {
287
  "epoch": 0.52,
288
- "learning_rate": 9.915276227237154e-05,
289
- "loss": 0.4838,
290
  "step": 235
291
  },
292
  {
293
  "epoch": 0.53,
294
- "learning_rate": 9.701928258165896e-05,
295
- "loss": 0.4934,
296
  "step": 240
297
  },
298
  {
299
  "epoch": 0.54,
300
- "learning_rate": 9.485208346024501e-05,
301
- "loss": 0.4964,
302
  "step": 245
303
  },
304
  {
305
  "epoch": 0.55,
306
- "learning_rate": 9.265373628622407e-05,
307
- "loss": 0.478,
308
  "step": 250
309
  },
310
  {
311
  "epoch": 0.56,
312
- "learning_rate": 9.04268493947969e-05,
313
- "loss": 0.4836,
314
  "step": 255
315
  },
316
  {
317
  "epoch": 0.57,
318
- "learning_rate": 8.817406498348864e-05,
319
- "loss": 0.4783,
320
  "step": 260
321
  },
322
  {
323
  "epoch": 0.58,
324
- "learning_rate": 8.589805597719735e-05,
325
- "loss": 0.5033,
326
  "step": 265
327
  },
328
  {
329
  "epoch": 0.59,
330
- "learning_rate": 8.360152285675815e-05,
331
- "loss": 0.4933,
332
  "step": 270
333
  },
334
  {
335
- "epoch": 0.6,
336
- "learning_rate": 8.128719045483102e-05,
337
- "loss": 0.4802,
338
  "step": 275
339
  },
340
  {
341
- "epoch": 0.61,
342
- "learning_rate": 7.895780472289125e-05,
343
- "loss": 0.4608,
344
  "step": 280
345
  },
346
  {
347
- "epoch": 0.62,
348
- "learning_rate": 7.661612947317637e-05,
349
- "loss": 0.451,
350
  "step": 285
351
  },
352
  {
353
  "epoch": 0.64,
354
- "learning_rate": 7.426494309940237e-05,
355
- "loss": 0.452,
356
  "step": 290
357
  },
358
  {
359
  "epoch": 0.65,
360
- "learning_rate": 7.190703528022759e-05,
361
- "loss": 0.4496,
362
  "step": 295
363
  },
364
  {
365
  "epoch": 0.66,
366
- "learning_rate": 6.95452036692842e-05,
367
- "loss": 0.4758,
368
  "step": 300
369
  },
370
  {
371
  "epoch": 0.67,
372
- "learning_rate": 6.718225057579034e-05,
373
- "loss": 0.4928,
374
  "step": 305
375
  },
376
  {
377
  "epoch": 0.68,
378
- "learning_rate": 6.48209796395876e-05,
379
- "loss": 0.5023,
380
  "step": 310
381
  },
382
  {
383
  "epoch": 0.69,
384
- "learning_rate": 6.246419250465058e-05,
385
- "loss": 0.426,
386
  "step": 315
387
  },
388
  {
389
  "epoch": 0.7,
390
- "learning_rate": 6.011468549492541e-05,
391
- "loss": 0.4651,
392
  "step": 320
393
  },
394
  {
395
- "epoch": 0.71,
396
- "learning_rate": 5.777524629650007e-05,
397
- "loss": 0.5082,
398
  "step": 325
399
  },
400
  {
401
- "epoch": 0.72,
402
- "learning_rate": 5.544865065003111e-05,
403
- "loss": 0.4546,
404
  "step": 330
405
  },
406
  {
407
- "epoch": 0.73,
408
- "learning_rate": 5.313765905731657e-05,
409
- "loss": 0.4512,
410
  "step": 335
411
  },
412
  {
413
  "epoch": 0.75,
414
- "learning_rate": 5.084501350596927e-05,
415
- "loss": 0.4794,
416
  "step": 340
417
  },
418
  {
419
  "epoch": 0.76,
420
- "learning_rate": 4.857343421605311e-05,
421
- "loss": 0.4781,
422
  "step": 345
423
  },
424
  {
425
  "epoch": 0.77,
426
- "learning_rate": 4.63256164125579e-05,
427
- "loss": 0.5233,
428
  "step": 350
429
  },
430
  {
431
  "epoch": 0.78,
432
- "learning_rate": 4.410422712750424e-05,
433
- "loss": 0.4695,
434
  "step": 355
435
  },
436
  {
437
  "epoch": 0.79,
438
- "learning_rate": 4.191190203551854e-05,
439
- "loss": 0.4788,
440
  "step": 360
441
  },
442
  {
443
  "epoch": 0.8,
444
- "learning_rate": 3.975124232661141e-05,
445
- "loss": 0.4318,
446
  "step": 365
447
  },
448
  {
449
  "epoch": 0.81,
450
- "learning_rate": 3.762481161987185e-05,
451
- "loss": 0.4609,
452
  "step": 370
453
  },
454
  {
455
- "epoch": 0.82,
456
- "learning_rate": 3.553513292174085e-05,
457
- "loss": 0.4854,
458
  "step": 375
459
  },
460
  {
461
- "epoch": 0.83,
462
- "learning_rate": 3.348468563245461e-05,
463
- "loss": 0.4337,
464
  "step": 380
465
  },
466
  {
467
- "epoch": 0.84,
468
- "learning_rate": 3.1475902604251e-05,
469
- "loss": 0.4707,
470
  "step": 385
471
  },
472
  {
473
  "epoch": 0.86,
474
- "learning_rate": 2.951116725479596e-05,
475
- "loss": 0.4394,
476
  "step": 390
477
  },
478
  {
479
  "epoch": 0.87,
480
- "learning_rate": 2.7592810739257415e-05,
481
- "loss": 0.5088,
482
  "step": 395
483
  },
484
  {
485
  "epoch": 0.88,
486
- "learning_rate": 2.572310918439686e-05,
487
- "loss": 0.4753,
488
  "step": 400
489
  },
490
  {
491
  "epoch": 0.89,
492
- "learning_rate": 2.3904280987944108e-05,
493
- "loss": 0.4626,
494
  "step": 405
495
  },
496
  {
497
  "epoch": 0.9,
498
- "learning_rate": 2.2138484186474054e-05,
499
- "loss": 0.4473,
500
  "step": 410
501
  },
502
  {
503
  "epoch": 0.91,
504
- "learning_rate": 2.0427813894908452e-05,
505
- "loss": 0.4662,
506
  "step": 415
507
  },
508
  {
509
- "epoch": 0.92,
510
- "learning_rate": 1.877429982065378e-05,
511
- "loss": 0.4383,
512
  "step": 420
513
  },
514
  {
515
- "epoch": 0.93,
516
- "learning_rate": 1.7179903855360063e-05,
517
- "loss": 0.4584,
518
  "step": 425
519
  },
520
  {
521
- "epoch": 0.94,
522
- "learning_rate": 1.564651774714127e-05,
523
- "loss": 0.4932,
524
  "step": 430
525
  },
526
  {
527
- "epoch": 0.95,
528
- "learning_rate": 1.4175960856020567e-05,
529
- "loss": 0.4168,
530
  "step": 435
531
  },
532
  {
533
- "epoch": 0.96,
534
- "learning_rate": 1.2769977995264743e-05,
535
- "loss": 0.5093,
536
  "step": 440
537
  },
538
  {
539
  "epoch": 0.98,
540
- "learning_rate": 1.1430237361156786e-05,
541
- "loss": 0.486,
542
  "step": 445
543
  },
544
  {
545
  "epoch": 0.99,
546
- "learning_rate": 1.0158328553691274e-05,
547
- "loss": 0.4456,
548
  "step": 450
549
  },
550
  {
551
  "epoch": 1.0,
552
- "learning_rate": 8.95576069051646e-06,
553
- "loss": 0.4546,
554
- "step": 455
555
- },
556
- {
557
- "epoch": 1.0,
558
- "eval_loss": 0.200975701212883,
559
- "eval_runtime": 14.7931,
560
- "eval_samples_per_second": 42.993,
561
- "eval_steps_per_second": 5.408,
562
- "step": 456
563
  },
564
  {
565
  "epoch": 1.0,
566
- "eval_loss": 0.16962459683418274,
567
- "eval_runtime": 12.708,
568
- "eval_samples_per_second": 47.687,
569
- "eval_steps_per_second": 5.98,
570
- "step": 459
571
  },
572
  {
573
- "epoch": 1.0,
574
- "learning_rate": 5.5043781594191076e-05,
575
- "loss": 0.4785,
576
  "step": 460
577
  },
578
  {
579
  "epoch": 1.02,
580
- "learning_rate": 5.274583707771519e-05,
581
- "loss": 0.4345,
582
  "step": 465
583
  },
584
  {
585
- "epoch": 1.03,
586
- "learning_rate": 5.046653958583268e-05,
587
- "loss": 0.4198,
588
  "step": 470
589
  },
590
  {
591
- "epoch": 1.04,
592
- "learning_rate": 4.8208569935981205e-05,
593
- "loss": 0.459,
594
  "step": 475
595
  },
596
  {
597
- "epoch": 1.05,
598
- "learning_rate": 4.597458386064845e-05,
599
- "loss": 0.4086,
600
  "step": 480
601
  },
602
  {
603
- "epoch": 1.06,
604
- "learning_rate": 4.376720888382378e-05,
605
- "loss": 0.4452,
606
  "step": 485
607
  },
608
  {
609
- "epoch": 1.07,
610
- "learning_rate": 4.158904123058869e-05,
611
- "loss": 0.3935,
612
  "step": 490
613
  },
614
  {
615
- "epoch": 1.08,
616
- "learning_rate": 3.9442642773556785e-05,
617
- "loss": 0.4194,
618
  "step": 495
619
  },
620
  {
621
- "epoch": 1.09,
622
- "learning_rate": 3.733053801969927e-05,
623
- "loss": 0.4181,
624
  "step": 500
625
  },
626
  {
627
- "epoch": 1.1,
628
- "learning_rate": 3.525521114112028e-05,
629
- "loss": 0.4243,
630
  "step": 505
631
  },
632
  {
633
- "epoch": 1.11,
634
- "learning_rate": 3.321910305328109e-05,
635
- "loss": 0.4525,
636
  "step": 510
637
  },
638
  {
639
- "epoch": 1.12,
640
- "learning_rate": 3.122460854408385e-05,
641
- "loss": 0.4579,
642
  "step": 515
643
  },
644
  {
645
- "epoch": 1.14,
646
- "learning_rate": 2.9274073457225933e-05,
647
- "loss": 0.4268,
648
  "step": 520
649
  },
650
  {
651
- "epoch": 1.15,
652
- "learning_rate": 2.736979193311177e-05,
653
- "loss": 0.455,
654
  "step": 525
655
  },
656
  {
657
- "epoch": 1.16,
658
- "learning_rate": 2.5514003710574786e-05,
659
- "loss": 0.3784,
660
  "step": 530
661
  },
662
  {
663
- "epoch": 1.17,
664
- "learning_rate": 2.3708891492599452e-05,
665
- "loss": 0.446,
666
  "step": 535
667
  },
668
  {
669
- "epoch": 1.18,
670
- "learning_rate": 2.1956578379095964e-05,
671
- "loss": 0.3845,
672
  "step": 540
673
  },
674
  {
675
- "epoch": 1.19,
676
- "learning_rate": 2.0259125369808434e-05,
677
- "loss": 0.4339,
678
  "step": 545
679
  },
680
  {
681
- "epoch": 1.2,
682
- "learning_rate": 1.86185289402487e-05,
683
- "loss": 0.3976,
684
  "step": 550
685
  },
686
  {
687
- "epoch": 1.21,
688
- "learning_rate": 1.703671869351462e-05,
689
- "loss": 0.461,
690
  "step": 555
691
  },
692
  {
693
- "epoch": 1.22,
694
- "learning_rate": 1.551555509078763e-05,
695
- "loss": 0.3707,
696
  "step": 560
697
  },
698
  {
699
- "epoch": 1.23,
700
- "learning_rate": 1.4056827263112375e-05,
701
- "loss": 0.4239,
702
  "step": 565
703
  },
704
  {
705
- "epoch": 1.24,
706
- "learning_rate": 1.2662250907105194e-05,
707
- "loss": 0.4077,
708
  "step": 570
709
  },
710
  {
711
- "epoch": 1.26,
712
- "learning_rate": 1.1333466267017441e-05,
713
- "loss": 0.4455,
714
  "step": 575
715
  },
716
  {
717
- "epoch": 1.27,
718
- "learning_rate": 1.0072036205550979e-05,
719
- "loss": 0.4022,
720
  "step": 580
721
  },
722
  {
723
- "epoch": 1.28,
724
- "learning_rate": 8.879444365675725e-06,
725
- "loss": 0.4127,
726
  "step": 585
727
  },
728
  {
729
- "epoch": 1.29,
730
- "learning_rate": 7.757093425633299e-06,
731
- "loss": 0.4409,
732
  "step": 590
733
  },
734
  {
735
- "epoch": 1.3,
736
- "learning_rate": 6.706303449162559e-06,
737
- "loss": 0.4247,
738
  "step": 595
739
  },
740
  {
741
- "epoch": 1.31,
742
- "learning_rate": 5.72831033289248e-06,
743
- "loss": 0.3729,
744
  "step": 600
745
  },
746
  {
747
- "epoch": 1.32,
748
- "learning_rate": 4.824264352736968e-06,
749
- "loss": 0.3937,
750
  "step": 605
751
  },
752
  {
753
- "epoch": 1.33,
754
- "learning_rate": 3.9952288109771325e-06,
755
- "loss": 0.4288,
756
  "step": 610
757
  },
758
  {
759
- "epoch": 1.34,
760
- "learning_rate": 3.242178785654847e-06,
761
- "loss": 0.3971,
762
  "step": 615
763
  },
764
  {
765
- "epoch": 1.35,
766
- "learning_rate": 2.5659999837236953e-06,
767
- "loss": 0.4354,
768
  "step": 620
769
  },
770
  {
771
- "epoch": 1.36,
772
- "learning_rate": 1.967487699315891e-06,
773
- "loss": 0.4529,
774
  "step": 625
775
  },
776
  {
777
- "epoch": 1.38,
778
- "learning_rate": 1.4473458783513853e-06,
779
- "loss": 0.4242,
780
  "step": 630
781
  },
782
  {
783
- "epoch": 1.39,
784
- "learning_rate": 1.0061862905822621e-06,
785
- "loss": 0.3537,
786
  "step": 635
787
  },
788
  {
789
- "epoch": 1.4,
790
- "learning_rate": 6.445278100559068e-07,
791
- "loss": 0.4698,
792
  "step": 640
793
  },
794
  {
795
- "epoch": 1.41,
796
- "learning_rate": 3.627958048358698e-07,
797
- "loss": 0.3978,
798
  "step": 645
799
  },
800
  {
801
- "epoch": 1.42,
802
- "learning_rate": 1.613216367010383e-07,
803
- "loss": 0.4303,
804
  "step": 650
805
  },
806
  {
807
- "epoch": 1.43,
808
- "learning_rate": 4.034227141189317e-08,
809
- "loss": 0.5101,
810
  "step": 655
811
  },
812
  {
813
- "epoch": 1.44,
814
- "learning_rate": 0.0,
815
- "loss": 0.4254,
816
  "step": 660
817
  },
818
  {
819
- "epoch": 1.45,
820
- "learning_rate": 4.034227141180178e-08,
821
- "loss": 0.4535,
822
  "step": 665
823
  },
824
  {
825
- "epoch": 1.46,
826
- "learning_rate": 1.6132163670086312e-07,
827
- "loss": 0.4398,
828
  "step": 670
829
  },
830
  {
831
- "epoch": 1.47,
832
- "learning_rate": 3.627958048358089e-07,
833
- "loss": 0.3769,
834
  "step": 675
835
  },
836
  {
837
- "epoch": 1.48,
838
- "learning_rate": 6.445278100558231e-07,
839
- "loss": 0.431,
840
  "step": 680
841
  },
842
  {
843
- "epoch": 1.5,
844
- "learning_rate": 1.0061862905821554e-06,
845
- "loss": 0.3816,
846
  "step": 685
847
  },
848
  {
849
- "epoch": 1.51,
850
- "learning_rate": 1.4473458783512634e-06,
851
- "loss": 0.436,
852
  "step": 690
853
  },
854
  {
855
- "epoch": 1.52,
856
- "learning_rate": 1.9674876993157465e-06,
857
- "loss": 0.4403,
858
  "step": 695
859
  },
860
  {
861
- "epoch": 1.53,
862
- "learning_rate": 2.5659999837235352e-06,
863
- "loss": 0.383,
864
  "step": 700
865
  },
866
  {
867
- "epoch": 1.54,
868
- "learning_rate": 3.2421787856546646e-06,
869
- "loss": 0.4027,
870
  "step": 705
871
  },
872
  {
873
- "epoch": 1.55,
874
- "learning_rate": 3.995228810976927e-06,
875
- "loss": 0.3885,
876
  "step": 710
877
  },
878
  {
879
- "epoch": 1.56,
880
- "learning_rate": 4.824264352736739e-06,
881
- "loss": 0.3878,
882
  "step": 715
883
  },
884
  {
885
- "epoch": 1.57,
886
- "learning_rate": 5.728310332892236e-06,
887
- "loss": 0.3898,
888
  "step": 720
889
  },
890
  {
891
- "epoch": 1.58,
892
- "learning_rate": 6.706303449162301e-06,
893
- "loss": 0.4277,
894
  "step": 725
895
  },
896
  {
897
- "epoch": 1.59,
898
- "learning_rate": 7.757093425632118e-06,
899
- "loss": 0.4404,
900
  "step": 730
901
  },
902
  {
903
- "epoch": 1.6,
904
- "learning_rate": 8.87944436567447e-06,
905
- "loss": 0.4314,
906
  "step": 735
907
  },
908
  {
909
- "epoch": 1.62,
910
- "learning_rate": 1.0072036205549646e-05,
911
- "loss": 0.3975,
912
  "step": 740
913
  },
914
  {
915
- "epoch": 1.63,
916
- "learning_rate": 1.1333466267016031e-05,
917
- "loss": 0.3823,
918
  "step": 745
919
  },
920
  {
921
- "epoch": 1.64,
922
- "learning_rate": 1.2662250907104843e-05,
923
- "loss": 0.4271,
924
  "step": 750
925
  },
926
  {
927
- "epoch": 1.65,
928
- "learning_rate": 1.405682726311201e-05,
929
- "loss": 0.4139,
930
  "step": 755
931
  },
932
  {
933
- "epoch": 1.66,
934
- "learning_rate": 1.551555509078725e-05,
935
- "loss": 0.3756,
936
  "step": 760
937
  },
938
  {
939
- "epoch": 1.67,
940
- "learning_rate": 1.7036718693514217e-05,
941
- "loss": 0.3966,
942
  "step": 765
943
  },
944
  {
945
- "epoch": 1.68,
946
- "learning_rate": 1.861852894024695e-05,
947
- "loss": 0.4297,
948
  "step": 770
949
  },
950
  {
951
- "epoch": 1.69,
952
- "learning_rate": 2.025912536980801e-05,
953
- "loss": 0.4243,
954
  "step": 775
955
  },
956
  {
957
- "epoch": 1.7,
958
- "learning_rate": 2.195657837909552e-05,
959
- "loss": 0.4225,
960
  "step": 780
961
  },
962
  {
963
- "epoch": 1.71,
964
- "learning_rate": 2.3708891492598995e-05,
965
- "loss": 0.3649,
966
  "step": 785
967
  },
968
  {
969
- "epoch": 1.72,
970
- "learning_rate": 2.5514003710574315e-05,
971
- "loss": 0.4292,
972
  "step": 790
973
  },
974
  {
975
- "epoch": 1.74,
976
- "learning_rate": 2.7369791933111284e-05,
977
- "loss": 0.4144,
978
  "step": 795
979
  },
980
  {
981
- "epoch": 1.75,
982
- "learning_rate": 2.9274073457225438e-05,
983
- "loss": 0.4083,
984
  "step": 800
985
  },
986
  {
987
- "epoch": 1.76,
988
- "learning_rate": 3.122460854408334e-05,
989
- "loss": 0.4134,
990
  "step": 805
991
  },
992
  {
993
- "epoch": 1.77,
994
- "learning_rate": 3.321910305328057e-05,
995
- "loss": 0.4068,
996
  "step": 810
997
  },
998
  {
999
- "epoch": 1.78,
1000
- "learning_rate": 3.525521114111805e-05,
1001
- "loss": 0.418,
1002
  "step": 815
1003
  },
1004
  {
1005
- "epoch": 1.79,
1006
- "learning_rate": 3.733053801969874e-05,
1007
- "loss": 0.4407,
1008
  "step": 820
1009
  },
1010
  {
1011
- "epoch": 1.8,
1012
- "learning_rate": 3.9442642773556236e-05,
1013
- "loss": 0.4147,
1014
  "step": 825
1015
  },
1016
  {
1017
- "epoch": 1.81,
1018
- "learning_rate": 4.158904123058812e-05,
1019
- "loss": 0.4527,
1020
  "step": 830
1021
  },
1022
  {
1023
- "epoch": 1.82,
1024
- "learning_rate": 4.376720888382321e-05,
1025
- "loss": 0.4208,
1026
  "step": 835
1027
  },
1028
  {
1029
- "epoch": 1.83,
1030
- "learning_rate": 4.5974583860647876e-05,
1031
- "loss": 0.4358,
1032
  "step": 840
1033
  },
1034
  {
1035
- "epoch": 1.84,
1036
- "learning_rate": 4.820856993597877e-05,
1037
- "loss": 0.3957,
1038
  "step": 845
1039
  },
1040
  {
1041
- "epoch": 1.86,
1042
- "learning_rate": 5.046653958583022e-05,
1043
- "loss": 0.4127,
1044
  "step": 850
1045
  },
1046
  {
1047
- "epoch": 1.87,
1048
- "learning_rate": 5.2745837077712705e-05,
1049
- "loss": 0.3921,
1050
  "step": 855
1051
  },
1052
  {
1053
- "epoch": 1.88,
1054
- "learning_rate": 5.5043781594190486e-05,
1055
- "loss": 0.4401,
1056
  "step": 860
1057
  },
1058
  {
1059
- "epoch": 1.89,
1060
- "learning_rate": 5.7357670385952385e-05,
1061
- "loss": 0.4134,
1062
  "step": 865
1063
  },
1064
  {
1065
- "epoch": 1.9,
1066
- "learning_rate": 5.9684781950679994e-05,
1067
- "loss": 0.4436,
1068
  "step": 870
1069
  },
1070
  {
1071
- "epoch": 1.91,
1072
- "learning_rate": 6.202237923395308e-05,
1073
- "loss": 0.3734,
1074
  "step": 875
1075
  },
1076
  {
1077
- "epoch": 1.92,
1078
- "learning_rate": 6.436771284848528e-05,
1079
- "loss": 0.3881,
1080
  "step": 880
1081
  },
1082
  {
1083
- "epoch": 1.93,
1084
- "learning_rate": 6.67180243078268e-05,
1085
- "loss": 0.4471,
1086
  "step": 885
1087
  },
1088
  {
1089
- "epoch": 1.94,
1090
- "learning_rate": 6.907054927078826e-05,
1091
- "loss": 0.4541,
1092
  "step": 890
1093
  },
1094
- {
1095
- "epoch": 1.95,
1096
- "learning_rate": 7.142252079274891e-05,
1097
- "loss": 0.4096,
1098
- "step": 895
1099
- },
1100
  {
1101
  "epoch": 1.97,
1102
- "learning_rate": 7.377117258001702e-05,
1103
- "loss": 0.4057,
1104
- "step": 900
1105
  },
1106
  {
1107
  "epoch": 1.98,
1108
- "learning_rate": 7.611374224344431e-05,
1109
- "loss": 0.443,
1110
- "step": 905
1111
  },
1112
  {
1113
  "epoch": 1.99,
1114
- "learning_rate": 7.844747454742836e-05,
1115
- "loss": 0.455,
1116
- "step": 910
1117
- },
1118
- {
1119
- "epoch": 2.0,
1120
- "learning_rate": 8.076962465051111e-05,
1121
- "loss": 0.3749,
1122
- "step": 915
1123
  },
1124
  {
1125
  "epoch": 2.0,
1126
- "eval_loss": 0.1634998470544815,
1127
- "eval_runtime": 15.0631,
1128
- "eval_samples_per_second": 41.027,
1129
- "eval_steps_per_second": 5.178,
1130
- "step": 916
1131
  }
1132
  ],
1133
- "max_steps": 2748,
1134
- "num_train_epochs": 6,
1135
- "total_flos": 955936899072000.0,
1136
  "trial_name": null,
1137
  "trial_params": null
1138
  }
 
1
  {
2
+ "best_metric": 0.20774193108081818,
3
+ "best_model_checkpoint": "output/eminem/checkpoint-908",
4
  "epoch": 2.0,
5
+ "global_step": 908,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.01,
12
+ "learning_rate": 5.638262768466246e-06,
13
+ "loss": 0.3685,
14
  "step": 5
15
  },
16
  {
17
  "epoch": 0.02,
18
+ "learning_rate": 6.618083216571314e-06,
19
+ "loss": 0.32,
20
  "step": 10
21
  },
22
  {
23
  "epoch": 0.03,
24
+ "learning_rate": 7.672094408674214e-06,
25
+ "loss": 0.3424,
26
  "step": 15
27
  },
28
  {
29
  "epoch": 0.04,
30
+ "learning_rate": 8.79903472075949e-06,
31
+ "loss": 0.3602,
32
  "step": 20
33
  },
34
  {
35
+ "epoch": 0.06,
36
+ "learning_rate": 9.997555234556058e-06,
37
+ "loss": 0.3777,
38
  "step": 25
39
  },
40
  {
41
  "epoch": 0.07,
42
+ "learning_rate": 1.126622135214757e-05,
43
+ "loss": 0.374,
44
  "step": 30
45
  },
46
  {
47
  "epoch": 0.08,
48
+ "learning_rate": 1.2603514513152213e-05,
49
+ "loss": 0.3274,
50
  "step": 35
51
  },
52
  {
53
  "epoch": 0.09,
54
+ "learning_rate": 1.4007834012404736e-05,
55
+ "loss": 0.3659,
56
  "step": 40
57
  },
58
  {
59
  "epoch": 0.1,
60
+ "learning_rate": 1.5477498915945268e-05,
61
+ "loss": 0.3498,
62
  "step": 45
63
  },
64
  {
65
  "epoch": 0.11,
66
+ "learning_rate": 1.7010750073075836e-05,
67
+ "loss": 0.3689,
68
  "step": 50
69
  },
70
  {
71
  "epoch": 0.12,
72
+ "learning_rate": 1.8605752221991583e-05,
73
+ "loss": 0.3778,
74
  "step": 55
75
  },
76
  {
77
  "epoch": 0.13,
78
+ "learning_rate": 2.026059618656492e-05,
79
+ "loss": 0.3858,
80
  "step": 60
81
  },
82
  {
83
  "epoch": 0.14,
84
+ "learning_rate": 2.1973301161563144e-05,
85
+ "loss": 0.3659,
86
  "step": 65
87
  },
88
  {
89
  "epoch": 0.15,
90
+ "learning_rate": 2.3741817083632926e-05,
91
+ "loss": 0.3721,
92
  "step": 70
93
  },
94
  {
95
+ "epoch": 0.17,
96
+ "learning_rate": 2.5564027085161653e-05,
97
+ "loss": 0.3581,
98
  "step": 75
99
  },
100
  {
101
  "epoch": 0.18,
102
+ "learning_rate": 2.7437750028124533e-05,
103
+ "loss": 0.3966,
104
  "step": 80
105
  },
106
  {
107
  "epoch": 0.19,
108
+ "learning_rate": 2.9360743114838212e-05,
109
+ "loss": 0.4075,
110
  "step": 85
111
  },
112
  {
113
  "epoch": 0.2,
114
+ "learning_rate": 3.1330704572545265e-05,
115
+ "loss": 0.3213,
116
  "step": 90
117
  },
118
  {
119
  "epoch": 0.21,
120
+ "learning_rate": 3.334527640855557e-05,
121
+ "loss": 0.3886,
122
  "step": 95
123
  },
124
  {
125
  "epoch": 0.22,
126
+ "learning_rate": 3.540204723273326e-05,
127
+ "loss": 0.3776,
128
  "step": 100
129
  },
130
  {
131
  "epoch": 0.23,
132
+ "learning_rate": 3.7498555143833497e-05,
133
+ "loss": 0.369,
134
  "step": 105
135
  },
136
  {
137
  "epoch": 0.24,
138
+ "learning_rate": 3.963229067635737e-05,
139
+ "loss": 0.3304,
140
  "step": 110
141
  },
142
  {
143
  "epoch": 0.25,
144
+ "learning_rate": 4.180069980431396e-05,
145
+ "loss": 0.3778,
146
  "step": 115
147
  },
148
  {
149
  "epoch": 0.26,
150
+ "learning_rate": 4.400118699831243e-05,
151
+ "loss": 0.3336,
152
  "step": 120
153
  },
154
  {
155
+ "epoch": 0.28,
156
+ "learning_rate": 4.623111833235935e-05,
157
+ "loss": 0.3594,
158
  "step": 125
159
  },
160
  {
161
  "epoch": 0.29,
162
+ "learning_rate": 4.848782463657604e-05,
163
+ "loss": 0.3779,
164
  "step": 130
165
  },
166
  {
167
  "epoch": 0.3,
168
+ "learning_rate": 5.076860469216043e-05,
169
+ "loss": 0.3485,
170
  "step": 135
171
  },
172
  {
173
  "epoch": 0.31,
174
+ "learning_rate": 5.3070728464635136e-05,
175
+ "loss": 0.3823,
176
  "step": 140
177
  },
178
  {
179
  "epoch": 0.32,
180
+ "learning_rate": 5.539144037166318e-05,
181
+ "loss": 0.3733,
182
  "step": 145
183
  },
184
  {
185
  "epoch": 0.33,
186
+ "learning_rate": 5.7727962581384004e-05,
187
+ "loss": 0.3473,
188
  "step": 150
189
  },
190
  {
191
  "epoch": 0.34,
192
+ "learning_rate": 6.007749833742314e-05,
193
+ "loss": 0.4018,
194
  "step": 155
195
  },
196
  {
197
  "epoch": 0.35,
198
+ "learning_rate": 6.243723530652164e-05,
199
+ "loss": 0.396,
200
  "step": 160
201
  },
202
  {
203
  "epoch": 0.36,
204
+ "learning_rate": 6.480434894484319e-05,
205
+ "loss": 0.3803,
206
  "step": 165
207
  },
208
  {
209
  "epoch": 0.37,
210
+ "learning_rate": 6.71760058788546e-05,
211
+ "loss": 0.3732,
212
  "step": 170
213
  },
214
  {
215
+ "epoch": 0.39,
216
+ "learning_rate": 6.954936729683709e-05,
217
+ "loss": 0.3755,
218
  "step": 175
219
  },
220
  {
221
+ "epoch": 0.4,
222
+ "learning_rate": 7.192159234682948e-05,
223
+ "loss": 0.3447,
224
  "step": 180
225
  },
226
  {
227
  "epoch": 0.41,
228
+ "learning_rate": 7.428984153708128e-05,
229
+ "loss": 0.3732,
230
  "step": 185
231
  },
232
  {
233
  "epoch": 0.42,
234
+ "learning_rate": 7.665128013484672e-05,
235
+ "loss": 0.3869,
236
  "step": 190
237
  },
238
  {
239
  "epoch": 0.43,
240
+ "learning_rate": 7.900308155947452e-05,
241
+ "loss": 0.387,
242
  "step": 195
243
  },
244
  {
245
  "epoch": 0.44,
246
+ "learning_rate": 8.134243076576889e-05,
247
+ "loss": 0.426,
248
  "step": 200
249
  },
250
  {
251
  "epoch": 0.45,
252
+ "learning_rate": 8.366652761350052e-05,
253
+ "loss": 0.3936,
254
  "step": 205
255
  },
256
  {
257
  "epoch": 0.46,
258
+ "learning_rate": 8.597259021913802e-05,
259
+ "loss": 0.4021,
260
  "step": 210
261
  },
262
  {
263
  "epoch": 0.47,
264
+ "learning_rate": 8.825785828564833e-05,
265
+ "loss": 0.3664,
266
  "step": 215
267
  },
268
  {
269
  "epoch": 0.48,
270
+ "learning_rate": 9.051959640653656e-05,
271
+ "loss": 0.3974,
272
  "step": 220
273
  },
274
  {
275
+ "epoch": 0.5,
276
+ "learning_rate": 9.275509734003202e-05,
277
+ "loss": 0.3667,
278
  "step": 225
279
  },
280
  {
281
+ "epoch": 0.51,
282
+ "learning_rate": 9.496168524960306e-05,
283
+ "loss": 0.4155,
284
  "step": 230
285
  },
286
  {
287
  "epoch": 0.52,
288
+ "learning_rate": 9.713671890684909e-05,
289
+ "loss": 0.371,
290
  "step": 235
291
  },
292
  {
293
  "epoch": 0.53,
294
+ "learning_rate": 9.927759485299701e-05,
295
+ "loss": 0.3468,
296
  "step": 240
297
  },
298
  {
299
  "epoch": 0.54,
300
+ "learning_rate": 0.00010138175051515928,
301
+ "loss": 0.4078,
302
  "step": 245
303
  },
304
  {
305
  "epoch": 0.55,
306
+ "learning_rate": 0.0001034466672736829,
307
+ "loss": 0.387,
308
  "step": 250
309
  },
310
  {
311
  "epoch": 0.56,
312
+ "learning_rate": 0.00010546987347685277,
313
+ "loss": 0.4011,
314
  "step": 255
315
  },
316
  {
317
  "epoch": 0.57,
318
+ "learning_rate": 0.00010744894739941007,
319
+ "loss": 0.4018,
320
  "step": 260
321
  },
322
  {
323
  "epoch": 0.58,
324
+ "learning_rate": 0.00010938152014129237,
325
+ "loss": 0.4338,
326
  "step": 265
327
  },
328
  {
329
  "epoch": 0.59,
330
+ "learning_rate": 0.00011126527846313597,
331
+ "loss": 0.4123,
332
  "step": 270
333
  },
334
  {
335
+ "epoch": 0.61,
336
+ "learning_rate": 0.00011309796755517588,
337
+ "loss": 0.3855,
338
  "step": 275
339
  },
340
  {
341
+ "epoch": 0.62,
342
+ "learning_rate": 0.00011487739373618224,
343
+ "loss": 0.4182,
344
  "step": 280
345
  },
346
  {
347
+ "epoch": 0.63,
348
+ "learning_rate": 0.00011660142707925318,
349
+ "loss": 0.3913,
350
  "step": 285
351
  },
352
  {
353
  "epoch": 0.64,
354
+ "learning_rate": 0.00011826800396126461,
355
+ "loss": 0.477,
356
  "step": 290
357
  },
358
  {
359
  "epoch": 0.65,
360
+ "learning_rate": 0.00011987512953299345,
361
+ "loss": 0.3962,
362
  "step": 295
363
  },
364
  {
365
  "epoch": 0.66,
366
+ "learning_rate": 0.00012142088010688345,
367
+ "loss": 0.4238,
368
  "step": 300
369
  },
370
  {
371
  "epoch": 0.67,
372
+ "learning_rate": 0.0001229034054596614,
373
+ "loss": 0.4407,
374
  "step": 305
375
  },
376
  {
377
  "epoch": 0.68,
378
+ "learning_rate": 0.00012432093104699607,
379
+ "loss": 0.4689,
380
  "step": 310
381
  },
382
  {
383
  "epoch": 0.69,
384
+ "learning_rate": 0.00012567176012759143,
385
+ "loss": 0.4264,
386
  "step": 315
387
  },
388
  {
389
  "epoch": 0.7,
390
+ "learning_rate": 0.0001269542757941326,
391
+ "loss": 0.3877,
392
  "step": 320
393
  },
394
  {
395
+ "epoch": 0.72,
396
+ "learning_rate": 0.0001281669429086917,
397
+ "loss": 0.4585,
398
  "step": 325
399
  },
400
  {
401
+ "epoch": 0.73,
402
+ "learning_rate": 0.0001293083099402366,
403
+ "loss": 0.4342,
404
  "step": 330
405
  },
406
  {
407
+ "epoch": 0.74,
408
+ "learning_rate": 0.0001303770107020854,
409
+ "loss": 0.4388,
410
  "step": 335
411
  },
412
  {
413
  "epoch": 0.75,
414
+ "learning_rate": 0.00013137176598719452,
415
+ "loss": 0.4933,
416
  "step": 340
417
  },
418
  {
419
  "epoch": 0.76,
420
+ "learning_rate": 0.00013229138509933145,
421
+ "loss": 0.4994,
422
  "step": 345
423
  },
424
  {
425
  "epoch": 0.77,
426
+ "learning_rate": 0.00013313476727831372,
427
+ "loss": 0.4215,
428
  "step": 350
429
  },
430
  {
431
  "epoch": 0.78,
432
+ "learning_rate": 0.00013390090301758416,
433
+ "loss": 0.4488,
434
  "step": 355
435
  },
436
  {
437
  "epoch": 0.79,
438
+ "learning_rate": 0.00013458887527257018,
439
+ "loss": 0.4785,
440
  "step": 360
441
  },
442
  {
443
  "epoch": 0.8,
444
+ "learning_rate": 0.0001351978605583545,
445
+ "loss": 0.444,
446
  "step": 365
447
  },
448
  {
449
  "epoch": 0.81,
450
+ "learning_rate": 0.00013572712993537543,
451
+ "loss": 0.409,
452
  "step": 370
453
  },
454
  {
455
+ "epoch": 0.83,
456
+ "learning_rate": 0.00013617604988193778,
457
+ "loss": 0.4202,
458
  "step": 375
459
  },
460
  {
461
+ "epoch": 0.84,
462
+ "learning_rate": 0.00013654408305253035,
463
+ "loss": 0.4465,
464
  "step": 380
465
  },
466
  {
467
+ "epoch": 0.85,
468
+ "learning_rate": 0.0001368307889210095,
469
+ "loss": 0.487,
470
  "step": 385
471
  },
472
  {
473
  "epoch": 0.86,
474
+ "learning_rate": 0.0001370358243079002,
475
+ "loss": 0.4389,
476
  "step": 390
477
  },
478
  {
479
  "epoch": 0.87,
480
+ "learning_rate": 0.00013715894379117118,
481
+ "loss": 0.4761,
482
  "step": 395
483
  },
484
  {
485
  "epoch": 0.88,
486
+ "learning_rate": 0.0001372,
487
+ "loss": 0.4531,
488
  "step": 400
489
  },
490
  {
491
  "epoch": 0.89,
492
+ "learning_rate": 0.0001371589437911712,
493
+ "loss": 0.4367,
494
  "step": 405
495
  },
496
  {
497
  "epoch": 0.9,
498
+ "learning_rate": 0.0001370358243079003,
499
+ "loss": 0.4265,
500
  "step": 410
501
  },
502
  {
503
  "epoch": 0.91,
504
+ "learning_rate": 0.0001368307889210096,
505
+ "loss": 0.4721,
506
  "step": 415
507
  },
508
  {
509
+ "epoch": 0.93,
510
+ "learning_rate": 0.00013654408305253054,
511
+ "loss": 0.3988,
512
  "step": 420
513
  },
514
  {
515
+ "epoch": 0.94,
516
+ "learning_rate": 0.00013617604988193797,
517
+ "loss": 0.4344,
518
  "step": 425
519
  },
520
  {
521
+ "epoch": 0.95,
522
+ "learning_rate": 0.00013572712993537567,
523
+ "loss": 0.4899,
524
  "step": 430
525
  },
526
  {
527
+ "epoch": 0.96,
528
+ "learning_rate": 0.00013519786055835476,
529
+ "loss": 0.5019,
530
  "step": 435
531
  },
532
  {
533
+ "epoch": 0.97,
534
+ "learning_rate": 0.0001345888752725705,
535
+ "loss": 0.4645,
536
  "step": 440
537
  },
538
  {
539
  "epoch": 0.98,
540
+ "learning_rate": 0.00013390090301758454,
541
+ "loss": 0.3885,
542
  "step": 445
543
  },
544
  {
545
  "epoch": 0.99,
546
+ "learning_rate": 0.0001331347672783141,
547
+ "loss": 0.426,
548
  "step": 450
549
  },
550
  {
551
  "epoch": 1.0,
552
+ "eval_loss": 0.21941067278385162,
553
+ "eval_runtime": 15.1157,
554
+ "eval_samples_per_second": 42.803,
555
+ "eval_steps_per_second": 5.359,
556
+ "step": 454
 
 
 
 
 
 
557
  },
558
  {
559
  "epoch": 1.0,
560
+ "learning_rate": 0.00013229138509933188,
561
+ "loss": 0.4675,
562
+ "step": 455
 
 
563
  },
564
  {
565
+ "epoch": 1.01,
566
+ "learning_rate": 0.00013137176598719498,
567
+ "loss": 0.3498,
568
  "step": 460
569
  },
570
  {
571
  "epoch": 1.02,
572
+ "learning_rate": 0.00013037701070208588,
573
+ "loss": 0.4028,
574
  "step": 465
575
  },
576
  {
577
+ "epoch": 1.04,
578
+ "learning_rate": 0.00012930830994023715,
579
+ "loss": 0.3913,
580
  "step": 470
581
  },
582
  {
583
+ "epoch": 1.05,
584
+ "learning_rate": 0.00012816694290869227,
585
+ "loss": 0.427,
586
  "step": 475
587
  },
588
  {
589
+ "epoch": 1.06,
590
+ "learning_rate": 0.00012695427579413322,
591
+ "loss": 0.3914,
592
  "step": 480
593
  },
594
  {
595
+ "epoch": 1.07,
596
+ "learning_rate": 0.00012567176012759208,
597
+ "loss": 0.4128,
598
  "step": 485
599
  },
600
  {
601
+ "epoch": 1.08,
602
+ "learning_rate": 0.00012432093104699674,
603
+ "loss": 0.3783,
604
  "step": 490
605
  },
606
  {
607
+ "epoch": 1.09,
608
+ "learning_rate": 0.00012290340545966207,
609
+ "loss": 0.4026,
610
  "step": 495
611
  },
612
  {
613
+ "epoch": 1.1,
614
+ "learning_rate": 0.00012142088010688418,
615
+ "loss": 0.3649,
616
  "step": 500
617
  },
618
  {
619
+ "epoch": 1.11,
620
+ "learning_rate": 0.00011987512953299421,
621
+ "loss": 0.4037,
622
  "step": 505
623
  },
624
  {
625
+ "epoch": 1.12,
626
+ "learning_rate": 0.00011826800396126541,
627
+ "loss": 0.3905,
628
  "step": 510
629
  },
630
  {
631
+ "epoch": 1.13,
632
+ "learning_rate": 0.000116601427079254,
633
+ "loss": 0.3741,
634
  "step": 515
635
  },
636
  {
637
+ "epoch": 1.15,
638
+ "learning_rate": 0.00011487739373618308,
639
+ "loss": 0.3776,
640
  "step": 520
641
  },
642
  {
643
+ "epoch": 1.16,
644
+ "learning_rate": 0.00011309796755517674,
645
+ "loss": 0.3915,
646
  "step": 525
647
  },
648
  {
649
+ "epoch": 1.17,
650
+ "learning_rate": 0.00011126527846313685,
651
+ "loss": 0.3885,
652
  "step": 530
653
  },
654
  {
655
+ "epoch": 1.18,
656
+ "learning_rate": 0.00010938152014129329,
657
+ "loss": 0.3599,
658
  "step": 535
659
  },
660
  {
661
+ "epoch": 1.19,
662
+ "learning_rate": 0.00010744894739941099,
663
+ "loss": 0.3955,
664
  "step": 540
665
  },
666
  {
667
+ "epoch": 1.2,
668
+ "learning_rate": 0.00010546987347685372,
669
+ "loss": 0.3638,
670
  "step": 545
671
  },
672
  {
673
+ "epoch": 1.21,
674
+ "learning_rate": 0.00010344666727368387,
675
+ "loss": 0.4055,
676
  "step": 550
677
  },
678
  {
679
+ "epoch": 1.22,
680
+ "learning_rate": 0.00010138175051516028,
681
+ "loss": 0.3965,
682
  "step": 555
683
  },
684
  {
685
+ "epoch": 1.23,
686
+ "learning_rate": 9.927759485299804e-05,
687
+ "loss": 0.3767,
688
  "step": 560
689
  },
690
  {
691
+ "epoch": 1.24,
692
+ "learning_rate": 9.713671890685015e-05,
693
+ "loss": 0.4088,
694
  "step": 565
695
  },
696
  {
697
+ "epoch": 1.26,
698
+ "learning_rate": 9.49616852496041e-05,
699
+ "loss": 0.4329,
700
  "step": 570
701
  },
702
  {
703
+ "epoch": 1.27,
704
+ "learning_rate": 9.275509734003309e-05,
705
+ "loss": 0.3581,
706
  "step": 575
707
  },
708
  {
709
+ "epoch": 1.28,
710
+ "learning_rate": 9.051959640653763e-05,
711
+ "loss": 0.4084,
712
  "step": 580
713
  },
714
  {
715
+ "epoch": 1.29,
716
+ "learning_rate": 8.825785828564943e-05,
717
+ "loss": 0.3755,
718
  "step": 585
719
  },
720
  {
721
+ "epoch": 1.3,
722
+ "learning_rate": 8.597259021913913e-05,
723
+ "loss": 0.4321,
724
  "step": 590
725
  },
726
  {
727
+ "epoch": 1.31,
728
+ "learning_rate": 8.366652761350163e-05,
729
+ "loss": 0.4021,
730
  "step": 595
731
  },
732
  {
733
+ "epoch": 1.32,
734
+ "learning_rate": 8.134243076577001e-05,
735
+ "loss": 0.4115,
736
  "step": 600
737
  },
738
  {
739
+ "epoch": 1.33,
740
+ "learning_rate": 7.900308155947565e-05,
741
+ "loss": 0.3358,
742
  "step": 605
743
  },
744
  {
745
+ "epoch": 1.34,
746
+ "learning_rate": 7.665128013484786e-05,
747
+ "loss": 0.3576,
748
  "step": 610
749
  },
750
  {
751
+ "epoch": 1.35,
752
+ "learning_rate": 7.428984153708243e-05,
753
+ "loss": 0.4326,
754
  "step": 615
755
  },
756
  {
757
+ "epoch": 1.37,
758
+ "learning_rate": 7.192159234683063e-05,
759
+ "loss": 0.3627,
760
  "step": 620
761
  },
762
  {
763
+ "epoch": 1.38,
764
+ "learning_rate": 6.954936729683823e-05,
765
+ "loss": 0.3891,
766
  "step": 625
767
  },
768
  {
769
+ "epoch": 1.39,
770
+ "learning_rate": 6.717600587885573e-05,
771
+ "loss": 0.3983,
772
  "step": 630
773
  },
774
  {
775
+ "epoch": 1.4,
776
+ "learning_rate": 6.480434894484433e-05,
777
+ "loss": 0.3504,
778
  "step": 635
779
  },
780
  {
781
+ "epoch": 1.41,
782
+ "learning_rate": 6.243723530652277e-05,
783
+ "loss": 0.3806,
784
  "step": 640
785
  },
786
  {
787
+ "epoch": 1.42,
788
+ "learning_rate": 6.007749833742428e-05,
789
+ "loss": 0.3616,
790
  "step": 645
791
  },
792
  {
793
+ "epoch": 1.43,
794
+ "learning_rate": 5.772796258138513e-05,
795
+ "loss": 0.4417,
796
  "step": 650
797
  },
798
  {
799
+ "epoch": 1.44,
800
+ "learning_rate": 5.53914403716643e-05,
801
+ "loss": 0.4272,
802
  "step": 655
803
  },
804
  {
805
+ "epoch": 1.45,
806
+ "learning_rate": 5.307072846463625e-05,
807
+ "loss": 0.3909,
808
  "step": 660
809
  },
810
  {
811
+ "epoch": 1.46,
812
+ "learning_rate": 5.076860469216153e-05,
813
+ "loss": 0.3833,
814
  "step": 665
815
  },
816
  {
817
+ "epoch": 1.48,
818
+ "learning_rate": 4.848782463657713e-05,
819
+ "loss": 0.3611,
820
  "step": 670
821
  },
822
  {
823
+ "epoch": 1.49,
824
+ "learning_rate": 4.6231118332360436e-05,
825
+ "loss": 0.415,
826
  "step": 675
827
  },
828
  {
829
+ "epoch": 1.5,
830
+ "learning_rate": 4.40011869983135e-05,
831
+ "loss": 0.4037,
832
  "step": 680
833
  },
834
  {
835
+ "epoch": 1.51,
836
+ "learning_rate": 4.1800699804315014e-05,
837
+ "loss": 0.3836,
838
  "step": 685
839
  },
840
  {
841
+ "epoch": 1.52,
842
+ "learning_rate": 3.9632290676358406e-05,
843
+ "loss": 0.3564,
844
  "step": 690
845
  },
846
  {
847
+ "epoch": 1.53,
848
+ "learning_rate": 3.749855514383451e-05,
849
+ "loss": 0.4255,
850
  "step": 695
851
  },
852
  {
853
+ "epoch": 1.54,
854
+ "learning_rate": 3.5402047232734255e-05,
855
+ "loss": 0.3377,
856
  "step": 700
857
  },
858
  {
859
+ "epoch": 1.55,
860
+ "learning_rate": 3.334527640855654e-05,
861
+ "loss": 0.3671,
862
  "step": 705
863
  },
864
  {
865
+ "epoch": 1.56,
866
+ "learning_rate": 3.133070457254623e-05,
867
+ "loss": 0.3735,
868
  "step": 710
869
  },
870
  {
871
+ "epoch": 1.57,
872
+ "learning_rate": 2.9360743114839147e-05,
873
+ "loss": 0.3614,
874
  "step": 715
875
  },
876
  {
877
+ "epoch": 1.59,
878
+ "learning_rate": 2.7437750028125448e-05,
879
+ "loss": 0.373,
880
  "step": 720
881
  },
882
  {
883
+ "epoch": 1.6,
884
+ "learning_rate": 2.5564027085162544e-05,
885
+ "loss": 0.3795,
886
  "step": 725
887
  },
888
  {
889
+ "epoch": 1.61,
890
+ "learning_rate": 2.3741817083633794e-05,
891
+ "loss": 0.3621,
892
  "step": 730
893
  },
894
  {
895
+ "epoch": 1.62,
896
+ "learning_rate": 2.197330116156398e-05,
897
+ "loss": 0.3882,
898
  "step": 735
899
  },
900
  {
901
+ "epoch": 1.63,
902
+ "learning_rate": 2.026059618656573e-05,
903
+ "loss": 0.3836,
904
  "step": 740
905
  },
906
  {
907
+ "epoch": 1.64,
908
+ "learning_rate": 1.860575222199237e-05,
909
+ "loss": 0.3413,
910
  "step": 745
911
  },
912
  {
913
+ "epoch": 1.65,
914
+ "learning_rate": 1.701075007307659e-05,
915
+ "loss": 0.3664,
916
  "step": 750
917
  },
918
  {
919
+ "epoch": 1.66,
920
+ "learning_rate": 1.5477498915945983e-05,
921
+ "loss": 0.3463,
922
  "step": 755
923
  },
924
  {
925
+ "epoch": 1.67,
926
+ "learning_rate": 1.4007834012405429e-05,
927
+ "loss": 0.3331,
928
  "step": 760
929
  },
930
  {
931
+ "epoch": 1.69,
932
+ "learning_rate": 1.2603514513152868e-05,
933
+ "loss": 0.3815,
934
  "step": 765
935
  },
936
  {
937
+ "epoch": 1.7,
938
+ "learning_rate": 1.1266221352148201e-05,
939
+ "loss": 0.3769,
940
  "step": 770
941
  },
942
  {
943
+ "epoch": 1.71,
944
+ "learning_rate": 9.997555234556651e-06,
945
+ "loss": 0.4108,
946
  "step": 775
947
  },
948
  {
949
+ "epoch": 1.72,
950
+ "learning_rate": 8.799034720760045e-06,
951
+ "loss": 0.3722,
952
  "step": 780
953
  },
954
  {
955
+ "epoch": 1.73,
956
+ "learning_rate": 7.672094408674733e-06,
957
+ "loss": 0.4017,
958
  "step": 785
959
  },
960
  {
961
+ "epoch": 1.74,
962
+ "learning_rate": 6.6180832165718085e-06,
963
+ "loss": 0.4178,
964
  "step": 790
965
  },
966
  {
967
+ "epoch": 1.75,
968
+ "learning_rate": 5.638262768466695e-06,
969
+ "loss": 0.3899,
970
  "step": 795
971
  },
972
  {
973
+ "epoch": 1.76,
974
+ "learning_rate": 4.733805883975914e-06,
975
+ "loss": 0.3741,
976
  "step": 800
977
  },
978
  {
979
+ "epoch": 1.77,
980
+ "learning_rate": 3.905795174492382e-06,
981
+ "loss": 0.3831,
982
  "step": 805
983
  },
984
  {
985
+ "epoch": 1.78,
986
+ "learning_rate": 3.15522174732089e-06,
987
+ "loss": 0.3682,
988
  "step": 810
989
  },
990
  {
991
+ "epoch": 1.8,
992
+ "learning_rate": 2.4829840193538357e-06,
993
+ "loss": 0.391,
994
  "step": 815
995
  },
996
  {
997
+ "epoch": 1.81,
998
+ "learning_rate": 1.88988664168465e-06,
999
+ "loss": 0.3597,
1000
  "step": 820
1001
  },
1002
  {
1003
+ "epoch": 1.82,
1004
+ "learning_rate": 1.3766395364657029e-06,
1005
+ "loss": 0.379,
1006
  "step": 825
1007
  },
1008
  {
1009
+ "epoch": 1.83,
1010
+ "learning_rate": 9.43857047145762e-07,
1011
+ "loss": 0.4247,
1012
  "step": 830
1013
  },
1014
  {
1015
+ "epoch": 1.84,
1016
+ "learning_rate": 5.920572031218176e-07,
1017
+ "loss": 0.3705,
1018
  "step": 835
1019
  },
1020
  {
1021
+ "epoch": 1.85,
1022
+ "learning_rate": 3.2166109966507624e-07,
1023
+ "loss": 0.3148,
1024
  "step": 840
1025
  },
1026
  {
1027
+ "epoch": 1.86,
1028
+ "learning_rate": 1.3299239388725518e-07,
1029
+ "loss": 0.3331,
1030
  "step": 845
1031
  },
1032
  {
1033
+ "epoch": 1.87,
1034
+ "learning_rate": 2.6276917326658224e-08,
1035
+ "loss": 0.3802,
1036
  "step": 850
1037
  },
1038
  {
1039
+ "epoch": 1.88,
1040
+ "learning_rate": 1.6424056362936757e-09,
1041
+ "loss": 0.324,
1042
  "step": 855
1043
  },
1044
  {
1045
+ "epoch": 1.89,
1046
+ "learning_rate": 5.911834568650871e-08,
1047
+ "loss": 0.3751,
1048
  "step": 860
1049
  },
1050
  {
1051
+ "epoch": 1.91,
1052
+ "learning_rate": 1.9863594027063992e-07,
1053
+ "loss": 0.3792,
1054
  "step": 865
1055
  },
1056
  {
1057
+ "epoch": 1.92,
1058
+ "learning_rate": 4.200281904521944e-07,
1059
+ "loss": 0.3776,
1060
  "step": 870
1061
  },
1062
  {
1063
+ "epoch": 1.93,
1064
+ "learning_rate": 7.230300954609269e-07,
1065
+ "loss": 0.3469,
1066
  "step": 875
1067
  },
1068
  {
1069
+ "epoch": 1.94,
1070
+ "learning_rate": 1.1072789698879036e-06,
1071
+ "loss": 0.3464,
1072
  "step": 880
1073
  },
1074
  {
1075
+ "epoch": 1.95,
1076
+ "learning_rate": 1.572314877814697e-06,
1077
+ "loss": 0.3726,
1078
  "step": 885
1079
  },
1080
  {
1081
+ "epoch": 1.96,
1082
+ "learning_rate": 2.11758118334118e-06,
1083
+ "loss": 0.3881,
1084
  "step": 890
1085
  },
 
 
 
 
 
 
1086
  {
1087
  "epoch": 1.97,
1088
+ "learning_rate": 2.742425216867453e-06,
1089
+ "loss": 0.3518,
1090
+ "step": 895
1091
  },
1092
  {
1093
  "epoch": 1.98,
1094
+ "learning_rate": 3.4460990563188536e-06,
1095
+ "loss": 0.3839,
1096
+ "step": 900
1097
  },
1098
  {
1099
  "epoch": 1.99,
1100
+ "learning_rate": 4.227760422393333e-06,
1101
+ "loss": 0.347,
1102
+ "step": 905
 
 
 
 
 
 
1103
  },
1104
  {
1105
  "epoch": 2.0,
1106
+ "eval_loss": 0.20774193108081818,
1107
+ "eval_runtime": 15.111,
1108
+ "eval_samples_per_second": 42.816,
1109
+ "eval_steps_per_second": 5.36,
1110
+ "step": 908
1111
  }
1112
  ],
1113
+ "max_steps": 1362,
1114
+ "num_train_epochs": 3,
1115
+ "total_flos": 948490076160000.0,
1116
  "trial_name": null,
1117
  "trial_params": null
1118
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c475812936432ce71effd04e7f79e77c24b0905863f82593b5cd1090e850fcd
3
  size 3311
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59697cb8fe6b03cd91b8ebe02c7f7dde6782a826ae481ef82af6bcc802bdb131
3
  size 3311