besimray commited on
Commit
84e6383
·
verified ·
1 Parent(s): 891470a

Training in progress, step 130, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dbeb203f46fb1c1ec8f6e277a8fabff750773722580b461a96e1cbac96a2291f
3
  size 45118424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:777125d8bd12de5f7ed18971ab031a8c25535a9628c33bf91a5fc02cd48f84a0
3
  size 45118424
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:23e132835a2fd7e3ed3d5cfe045d84119b7967d8c3a6685bd84957e36e914460
3
  size 23159290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a85d722f100b15fa6de8db1b7863c44b71b3fce19bc20b18ae46f8b628ed0a26
3
  size 23159290
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7ec2f1de877992cebf6fbce0e472b6b0ae06bf82cd1caccb4ee85e3a5063b21
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab76824ef5f4a03a5fc43923056d7e1a2adea903a5e98b8bb7f651e3d75cd0f7
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3295bdd7cfd599ec7f00b8b43fe7ebc7026edcd2d9d9a208dde1fb88ec2e55ef
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e182d30fc85938f253f4b0ba7702798b872e5ce41399e7a5462adca4c40ff6e4
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5263157894736842,
5
  "eval_steps": 8,
6
- "global_step": 25,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -214,6 +214,845 @@
214
  "learning_rate": 0.00019438833303083678,
215
  "loss": 1.199,
216
  "step": 25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  }
218
  ],
219
  "logging_steps": 1,
@@ -233,7 +1072,7 @@
233
  "attributes": {}
234
  }
235
  },
236
- "total_flos": 2528931246243840.0,
237
  "train_batch_size": 10,
238
  "trial_name": null,
239
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.736842105263158,
5
  "eval_steps": 8,
6
+ "global_step": 130,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
214
  "learning_rate": 0.00019438833303083678,
215
  "loss": 1.199,
216
  "step": 25
217
+ },
218
+ {
219
+ "epoch": 0.5473684210526316,
220
+ "grad_norm": 0.4188154339790344,
221
+ "learning_rate": 0.00019362348706397373,
222
+ "loss": 1.2427,
223
+ "step": 26
224
+ },
225
+ {
226
+ "epoch": 0.5684210526315789,
227
+ "grad_norm": 0.40187111496925354,
228
+ "learning_rate": 0.0001928114988519039,
229
+ "loss": 1.2533,
230
+ "step": 27
231
+ },
232
+ {
233
+ "epoch": 0.5894736842105263,
234
+ "grad_norm": 0.3812921643257141,
235
+ "learning_rate": 0.0001919527772551451,
236
+ "loss": 1.2258,
237
+ "step": 28
238
+ },
239
+ {
240
+ "epoch": 0.6105263157894737,
241
+ "grad_norm": 0.36781439185142517,
242
+ "learning_rate": 0.00019104775466588161,
243
+ "loss": 1.3239,
244
+ "step": 29
245
+ },
246
+ {
247
+ "epoch": 0.631578947368421,
248
+ "grad_norm": 0.30295032262802124,
249
+ "learning_rate": 0.0001900968867902419,
250
+ "loss": 1.1369,
251
+ "step": 30
252
+ },
253
+ {
254
+ "epoch": 0.6526315789473685,
255
+ "grad_norm": 0.32395803928375244,
256
+ "learning_rate": 0.0001891006524188368,
257
+ "loss": 1.0771,
258
+ "step": 31
259
+ },
260
+ {
261
+ "epoch": 0.6736842105263158,
262
+ "grad_norm": 0.30691561102867126,
263
+ "learning_rate": 0.0001880595531856738,
264
+ "loss": 1.1445,
265
+ "step": 32
266
+ },
267
+ {
268
+ "epoch": 0.6736842105263158,
269
+ "eval_loss": 1.1577736139297485,
270
+ "eval_runtime": 2.099,
271
+ "eval_samples_per_second": 47.642,
272
+ "eval_steps_per_second": 4.764,
273
+ "step": 32
274
+ },
275
+ {
276
+ "epoch": 0.6947368421052632,
277
+ "grad_norm": 0.34633180499076843,
278
+ "learning_rate": 0.00018697411331556956,
279
+ "loss": 1.2347,
280
+ "step": 33
281
+ },
282
+ {
283
+ "epoch": 0.7157894736842105,
284
+ "grad_norm": 0.41544532775878906,
285
+ "learning_rate": 0.00018584487936018661,
286
+ "loss": 1.1794,
287
+ "step": 34
288
+ },
289
+ {
290
+ "epoch": 0.7368421052631579,
291
+ "grad_norm": 0.3669692575931549,
292
+ "learning_rate": 0.00018467241992282843,
293
+ "loss": 1.1205,
294
+ "step": 35
295
+ },
296
+ {
297
+ "epoch": 0.7578947368421053,
298
+ "grad_norm": 0.35185304284095764,
299
+ "learning_rate": 0.00018345732537213027,
300
+ "loss": 1.2297,
301
+ "step": 36
302
+ },
303
+ {
304
+ "epoch": 0.7789473684210526,
305
+ "grad_norm": 0.3203611671924591,
306
+ "learning_rate": 0.00018220020754479102,
307
+ "loss": 1.2702,
308
+ "step": 37
309
+ },
310
+ {
311
+ "epoch": 0.8,
312
+ "grad_norm": 0.35329145193099976,
313
+ "learning_rate": 0.00018090169943749476,
314
+ "loss": 1.2618,
315
+ "step": 38
316
+ },
317
+ {
318
+ "epoch": 0.8210526315789474,
319
+ "grad_norm": 0.3242780268192291,
320
+ "learning_rate": 0.00017956245488817812,
321
+ "loss": 1.1429,
322
+ "step": 39
323
+ },
324
+ {
325
+ "epoch": 0.8421052631578947,
326
+ "grad_norm": 0.36986637115478516,
327
+ "learning_rate": 0.000178183148246803,
328
+ "loss": 1.1966,
329
+ "step": 40
330
+ },
331
+ {
332
+ "epoch": 0.8421052631578947,
333
+ "eval_loss": 1.1517940759658813,
334
+ "eval_runtime": 2.1353,
335
+ "eval_samples_per_second": 46.832,
336
+ "eval_steps_per_second": 4.683,
337
+ "step": 40
338
+ },
339
+ {
340
+ "epoch": 0.8631578947368421,
341
+ "grad_norm": 0.42866817116737366,
342
+ "learning_rate": 0.0001767644740358011,
343
+ "loss": 1.2441,
344
+ "step": 41
345
+ },
346
+ {
347
+ "epoch": 0.8842105263157894,
348
+ "grad_norm": 0.32174167037010193,
349
+ "learning_rate": 0.00017530714660036112,
350
+ "loss": 1.1713,
351
+ "step": 42
352
+ },
353
+ {
354
+ "epoch": 0.9052631578947369,
355
+ "grad_norm": 0.3334487974643707,
356
+ "learning_rate": 0.00017381189974873407,
357
+ "loss": 1.1588,
358
+ "step": 43
359
+ },
360
+ {
361
+ "epoch": 0.9263157894736842,
362
+ "grad_norm": 0.3190995156764984,
363
+ "learning_rate": 0.00017227948638273916,
364
+ "loss": 1.1455,
365
+ "step": 44
366
+ },
367
+ {
368
+ "epoch": 0.9473684210526315,
369
+ "grad_norm": 0.33743467926979065,
370
+ "learning_rate": 0.00017071067811865476,
371
+ "loss": 1.2266,
372
+ "step": 45
373
+ },
374
+ {
375
+ "epoch": 0.968421052631579,
376
+ "grad_norm": 0.3349515199661255,
377
+ "learning_rate": 0.00016910626489868649,
378
+ "loss": 1.1853,
379
+ "step": 46
380
+ },
381
+ {
382
+ "epoch": 0.9894736842105263,
383
+ "grad_norm": 0.35275840759277344,
384
+ "learning_rate": 0.00016746705459320745,
385
+ "loss": 1.0376,
386
+ "step": 47
387
+ },
388
+ {
389
+ "epoch": 1.0105263157894737,
390
+ "grad_norm": 0.3261784315109253,
391
+ "learning_rate": 0.00016579387259397127,
392
+ "loss": 1.2876,
393
+ "step": 48
394
+ },
395
+ {
396
+ "epoch": 1.0105263157894737,
397
+ "eval_loss": 1.1508827209472656,
398
+ "eval_runtime": 2.1052,
399
+ "eval_samples_per_second": 47.501,
400
+ "eval_steps_per_second": 4.75,
401
+ "step": 48
402
+ },
403
+ {
404
+ "epoch": 1.0315789473684212,
405
+ "grad_norm": 0.3690132200717926,
406
+ "learning_rate": 0.0001640875613985024,
407
+ "loss": 1.079,
408
+ "step": 49
409
+ },
410
+ {
411
+ "epoch": 1.0526315789473684,
412
+ "grad_norm": 0.42905285954475403,
413
+ "learning_rate": 0.00016234898018587337,
414
+ "loss": 1.153,
415
+ "step": 50
416
+ },
417
+ {
418
+ "epoch": 1.0736842105263158,
419
+ "grad_norm": 0.3510225713253021,
420
+ "learning_rate": 0.000160579004384082,
421
+ "loss": 1.1183,
422
+ "step": 51
423
+ },
424
+ {
425
+ "epoch": 1.0947368421052632,
426
+ "grad_norm": 0.36225467920303345,
427
+ "learning_rate": 0.00015877852522924732,
428
+ "loss": 1.0998,
429
+ "step": 52
430
+ },
431
+ {
432
+ "epoch": 1.1157894736842104,
433
+ "grad_norm": 0.37081998586654663,
434
+ "learning_rate": 0.0001569484493168452,
435
+ "loss": 1.172,
436
+ "step": 53
437
+ },
438
+ {
439
+ "epoch": 1.1368421052631579,
440
+ "grad_norm": 0.37817469239234924,
441
+ "learning_rate": 0.00015508969814521025,
442
+ "loss": 1.1103,
443
+ "step": 54
444
+ },
445
+ {
446
+ "epoch": 1.1578947368421053,
447
+ "grad_norm": 0.36000335216522217,
448
+ "learning_rate": 0.00015320320765153367,
449
+ "loss": 1.0199,
450
+ "step": 55
451
+ },
452
+ {
453
+ "epoch": 1.1789473684210527,
454
+ "grad_norm": 0.34209051728248596,
455
+ "learning_rate": 0.00015128992774059063,
456
+ "loss": 1.0651,
457
+ "step": 56
458
+ },
459
+ {
460
+ "epoch": 1.1789473684210527,
461
+ "eval_loss": 1.153894066810608,
462
+ "eval_runtime": 2.0943,
463
+ "eval_samples_per_second": 47.749,
464
+ "eval_steps_per_second": 4.775,
465
+ "step": 56
466
+ },
467
+ {
468
+ "epoch": 1.2,
469
+ "grad_norm": 0.4330388605594635,
470
+ "learning_rate": 0.0001493508218064347,
471
+ "loss": 1.0079,
472
+ "step": 57
473
+ },
474
+ {
475
+ "epoch": 1.2210526315789474,
476
+ "grad_norm": 0.34977588057518005,
477
+ "learning_rate": 0.00014738686624729986,
478
+ "loss": 1.0271,
479
+ "step": 58
480
+ },
481
+ {
482
+ "epoch": 1.2421052631578948,
483
+ "grad_norm": 0.4688788652420044,
484
+ "learning_rate": 0.00014539904997395468,
485
+ "loss": 1.1388,
486
+ "step": 59
487
+ },
488
+ {
489
+ "epoch": 1.263157894736842,
490
+ "grad_norm": 0.3630085289478302,
491
+ "learning_rate": 0.00014338837391175582,
492
+ "loss": 1.0998,
493
+ "step": 60
494
+ },
495
+ {
496
+ "epoch": 1.2842105263157895,
497
+ "grad_norm": 0.4067210853099823,
498
+ "learning_rate": 0.00014135585049665207,
499
+ "loss": 0.9867,
500
+ "step": 61
501
+ },
502
+ {
503
+ "epoch": 1.305263157894737,
504
+ "grad_norm": 0.33548006415367126,
505
+ "learning_rate": 0.00013930250316539238,
506
+ "loss": 0.9863,
507
+ "step": 62
508
+ },
509
+ {
510
+ "epoch": 1.3263157894736843,
511
+ "grad_norm": 0.4114859402179718,
512
+ "learning_rate": 0.00013722936584019453,
513
+ "loss": 1.0526,
514
+ "step": 63
515
+ },
516
+ {
517
+ "epoch": 1.3473684210526315,
518
+ "grad_norm": 0.39736467599868774,
519
+ "learning_rate": 0.0001351374824081343,
520
+ "loss": 1.1337,
521
+ "step": 64
522
+ },
523
+ {
524
+ "epoch": 1.3473684210526315,
525
+ "eval_loss": 1.1499197483062744,
526
+ "eval_runtime": 2.1122,
527
+ "eval_samples_per_second": 47.344,
528
+ "eval_steps_per_second": 4.734,
529
+ "step": 64
530
+ },
531
+ {
532
+ "epoch": 1.368421052631579,
533
+ "grad_norm": 0.33866772055625916,
534
+ "learning_rate": 0.00013302790619551674,
535
+ "loss": 1.1114,
536
+ "step": 65
537
+ },
538
+ {
539
+ "epoch": 1.3894736842105262,
540
+ "grad_norm": 0.42472875118255615,
541
+ "learning_rate": 0.00013090169943749476,
542
+ "loss": 1.0533,
543
+ "step": 66
544
+ },
545
+ {
546
+ "epoch": 1.4105263157894736,
547
+ "grad_norm": 0.45051443576812744,
548
+ "learning_rate": 0.00012875993274320173,
549
+ "loss": 1.1449,
550
+ "step": 67
551
+ },
552
+ {
553
+ "epoch": 1.431578947368421,
554
+ "grad_norm": 0.47155171632766724,
555
+ "learning_rate": 0.00012660368455666752,
556
+ "loss": 1.1683,
557
+ "step": 68
558
+ },
559
+ {
560
+ "epoch": 1.4526315789473685,
561
+ "grad_norm": 0.47672173380851746,
562
+ "learning_rate": 0.0001244340406137894,
563
+ "loss": 1.122,
564
+ "step": 69
565
+ },
566
+ {
567
+ "epoch": 1.4736842105263157,
568
+ "grad_norm": 0.3632158935070038,
569
+ "learning_rate": 0.00012225209339563145,
570
+ "loss": 0.9826,
571
+ "step": 70
572
+ },
573
+ {
574
+ "epoch": 1.4947368421052631,
575
+ "grad_norm": 0.44283154606819153,
576
+ "learning_rate": 0.00012005894157832729,
577
+ "loss": 1.1671,
578
+ "step": 71
579
+ },
580
+ {
581
+ "epoch": 1.5157894736842106,
582
+ "grad_norm": 0.45704108476638794,
583
+ "learning_rate": 0.00011785568947986367,
584
+ "loss": 1.0473,
585
+ "step": 72
586
+ },
587
+ {
588
+ "epoch": 1.5157894736842106,
589
+ "eval_loss": 1.1517176628112793,
590
+ "eval_runtime": 2.1336,
591
+ "eval_samples_per_second": 46.869,
592
+ "eval_steps_per_second": 4.687,
593
+ "step": 72
594
+ },
595
+ {
596
+ "epoch": 1.5368421052631578,
597
+ "grad_norm": 0.39218422770500183,
598
+ "learning_rate": 0.0001156434465040231,
599
+ "loss": 1.1024,
600
+ "step": 73
601
+ },
602
+ {
603
+ "epoch": 1.5578947368421052,
604
+ "grad_norm": 0.3508377969264984,
605
+ "learning_rate": 0.00011342332658176555,
606
+ "loss": 0.9808,
607
+ "step": 74
608
+ },
609
+ {
610
+ "epoch": 1.5789473684210527,
611
+ "grad_norm": 0.3267882466316223,
612
+ "learning_rate": 0.00011119644761033078,
613
+ "loss": 0.9895,
614
+ "step": 75
615
+ },
616
+ {
617
+ "epoch": 1.6,
618
+ "grad_norm": 0.41372963786125183,
619
+ "learning_rate": 0.00010896393089034336,
620
+ "loss": 0.9947,
621
+ "step": 76
622
+ },
623
+ {
624
+ "epoch": 1.6210526315789475,
625
+ "grad_norm": 0.42969149351119995,
626
+ "learning_rate": 0.00010672690056120399,
627
+ "loss": 0.9632,
628
+ "step": 77
629
+ },
630
+ {
631
+ "epoch": 1.6421052631578947,
632
+ "grad_norm": 0.38285690546035767,
633
+ "learning_rate": 0.00010448648303505151,
634
+ "loss": 1.1273,
635
+ "step": 78
636
+ },
637
+ {
638
+ "epoch": 1.663157894736842,
639
+ "grad_norm": 0.43110236525535583,
640
+ "learning_rate": 0.00010224380642958052,
641
+ "loss": 1.1023,
642
+ "step": 79
643
+ },
644
+ {
645
+ "epoch": 1.6842105263157894,
646
+ "grad_norm": 0.46195274591445923,
647
+ "learning_rate": 0.0001,
648
+ "loss": 1.0664,
649
+ "step": 80
650
+ },
651
+ {
652
+ "epoch": 1.6842105263157894,
653
+ "eval_loss": 1.1489697694778442,
654
+ "eval_runtime": 2.1184,
655
+ "eval_samples_per_second": 47.205,
656
+ "eval_steps_per_second": 4.721,
657
+ "step": 80
658
+ },
659
+ {
660
+ "epoch": 1.7052631578947368,
661
+ "grad_norm": 0.4386035203933716,
662
+ "learning_rate": 9.775619357041952e-05,
663
+ "loss": 1.1012,
664
+ "step": 81
665
+ },
666
+ {
667
+ "epoch": 1.7263157894736842,
668
+ "grad_norm": 0.4999752342700958,
669
+ "learning_rate": 9.551351696494854e-05,
670
+ "loss": 1.1244,
671
+ "step": 82
672
+ },
673
+ {
674
+ "epoch": 1.7473684210526317,
675
+ "grad_norm": 0.4127891957759857,
676
+ "learning_rate": 9.327309943879604e-05,
677
+ "loss": 1.187,
678
+ "step": 83
679
+ },
680
+ {
681
+ "epoch": 1.768421052631579,
682
+ "grad_norm": 0.5349937677383423,
683
+ "learning_rate": 9.103606910965666e-05,
684
+ "loss": 1.1489,
685
+ "step": 84
686
+ },
687
+ {
688
+ "epoch": 1.7894736842105263,
689
+ "grad_norm": 0.42807015776634216,
690
+ "learning_rate": 8.880355238966923e-05,
691
+ "loss": 1.1736,
692
+ "step": 85
693
+ },
694
+ {
695
+ "epoch": 1.8105263157894735,
696
+ "grad_norm": 0.3887334167957306,
697
+ "learning_rate": 8.657667341823448e-05,
698
+ "loss": 1.1251,
699
+ "step": 86
700
+ },
701
+ {
702
+ "epoch": 1.831578947368421,
703
+ "grad_norm": 0.4703119993209839,
704
+ "learning_rate": 8.435655349597689e-05,
705
+ "loss": 1.369,
706
+ "step": 87
707
+ },
708
+ {
709
+ "epoch": 1.8526315789473684,
710
+ "grad_norm": 0.5050467252731323,
711
+ "learning_rate": 8.214431052013634e-05,
712
+ "loss": 0.9705,
713
+ "step": 88
714
+ },
715
+ {
716
+ "epoch": 1.8526315789473684,
717
+ "eval_loss": 1.1517329216003418,
718
+ "eval_runtime": 2.0675,
719
+ "eval_samples_per_second": 48.367,
720
+ "eval_steps_per_second": 4.837,
721
+ "step": 88
722
+ },
723
+ {
724
+ "epoch": 1.8736842105263158,
725
+ "grad_norm": 0.48088398575782776,
726
+ "learning_rate": 7.994105842167273e-05,
727
+ "loss": 1.1485,
728
+ "step": 89
729
+ },
730
+ {
731
+ "epoch": 1.8947368421052633,
732
+ "grad_norm": 0.5244817137718201,
733
+ "learning_rate": 7.774790660436858e-05,
734
+ "loss": 1.1301,
735
+ "step": 90
736
+ },
737
+ {
738
+ "epoch": 1.9157894736842105,
739
+ "grad_norm": 0.5362399220466614,
740
+ "learning_rate": 7.556595938621058e-05,
741
+ "loss": 1.1488,
742
+ "step": 91
743
+ },
744
+ {
745
+ "epoch": 1.936842105263158,
746
+ "grad_norm": 0.45146438479423523,
747
+ "learning_rate": 7.339631544333249e-05,
748
+ "loss": 1.0524,
749
+ "step": 92
750
+ },
751
+ {
752
+ "epoch": 1.9578947368421051,
753
+ "grad_norm": 0.48216360807418823,
754
+ "learning_rate": 7.124006725679828e-05,
755
+ "loss": 1.2223,
756
+ "step": 93
757
+ },
758
+ {
759
+ "epoch": 1.9789473684210526,
760
+ "grad_norm": 0.48500946164131165,
761
+ "learning_rate": 6.909830056250527e-05,
762
+ "loss": 1.0837,
763
+ "step": 94
764
+ },
765
+ {
766
+ "epoch": 2.0,
767
+ "grad_norm": 0.46944934129714966,
768
+ "learning_rate": 6.697209380448333e-05,
769
+ "loss": 1.1183,
770
+ "step": 95
771
+ },
772
+ {
773
+ "epoch": 2.0210526315789474,
774
+ "grad_norm": 0.4117797017097473,
775
+ "learning_rate": 6.486251759186572e-05,
776
+ "loss": 1.0669,
777
+ "step": 96
778
+ },
779
+ {
780
+ "epoch": 2.0210526315789474,
781
+ "eval_loss": 1.1518473625183105,
782
+ "eval_runtime": 2.1108,
783
+ "eval_samples_per_second": 47.376,
784
+ "eval_steps_per_second": 4.738,
785
+ "step": 96
786
+ },
787
+ {
788
+ "epoch": 2.042105263157895,
789
+ "grad_norm": 0.39091888070106506,
790
+ "learning_rate": 6.277063415980549e-05,
791
+ "loss": 0.9891,
792
+ "step": 97
793
+ },
794
+ {
795
+ "epoch": 2.0631578947368423,
796
+ "grad_norm": 0.49795445799827576,
797
+ "learning_rate": 6.069749683460765e-05,
798
+ "loss": 0.8838,
799
+ "step": 98
800
+ },
801
+ {
802
+ "epoch": 2.0842105263157893,
803
+ "grad_norm": 0.4604962170124054,
804
+ "learning_rate": 5.864414950334796e-05,
805
+ "loss": 0.9824,
806
+ "step": 99
807
+ },
808
+ {
809
+ "epoch": 2.1052631578947367,
810
+ "grad_norm": 0.5574219226837158,
811
+ "learning_rate": 5.6611626088244194e-05,
812
+ "loss": 1.0056,
813
+ "step": 100
814
+ },
815
+ {
816
+ "epoch": 2.126315789473684,
817
+ "grad_norm": 0.46602797508239746,
818
+ "learning_rate": 5.4600950026045326e-05,
819
+ "loss": 0.9943,
820
+ "step": 101
821
+ },
822
+ {
823
+ "epoch": 2.1473684210526316,
824
+ "grad_norm": 0.464478999376297,
825
+ "learning_rate": 5.261313375270014e-05,
826
+ "loss": 0.8895,
827
+ "step": 102
828
+ },
829
+ {
830
+ "epoch": 2.168421052631579,
831
+ "grad_norm": 0.47825688123703003,
832
+ "learning_rate": 5.0649178193565314e-05,
833
+ "loss": 1.0034,
834
+ "step": 103
835
+ },
836
+ {
837
+ "epoch": 2.1894736842105265,
838
+ "grad_norm": 0.5426080822944641,
839
+ "learning_rate": 4.87100722594094e-05,
840
+ "loss": 0.9732,
841
+ "step": 104
842
+ },
843
+ {
844
+ "epoch": 2.1894736842105265,
845
+ "eval_loss": 1.1610064506530762,
846
+ "eval_runtime": 2.0779,
847
+ "eval_samples_per_second": 48.125,
848
+ "eval_steps_per_second": 4.813,
849
+ "step": 104
850
+ },
851
+ {
852
+ "epoch": 2.2105263157894735,
853
+ "grad_norm": 0.4391036033630371,
854
+ "learning_rate": 4.6796792348466356e-05,
855
+ "loss": 0.9018,
856
+ "step": 105
857
+ },
858
+ {
859
+ "epoch": 2.231578947368421,
860
+ "grad_norm": 0.495150625705719,
861
+ "learning_rate": 4.491030185478976e-05,
862
+ "loss": 1.0982,
863
+ "step": 106
864
+ },
865
+ {
866
+ "epoch": 2.2526315789473683,
867
+ "grad_norm": 0.4889540374279022,
868
+ "learning_rate": 4.305155068315481e-05,
869
+ "loss": 1.1357,
870
+ "step": 107
871
+ },
872
+ {
873
+ "epoch": 2.2736842105263158,
874
+ "grad_norm": 0.47582054138183594,
875
+ "learning_rate": 4.12214747707527e-05,
876
+ "loss": 0.8421,
877
+ "step": 108
878
+ },
879
+ {
880
+ "epoch": 2.294736842105263,
881
+ "grad_norm": 0.43810227513313293,
882
+ "learning_rate": 3.942099561591802e-05,
883
+ "loss": 1.0096,
884
+ "step": 109
885
+ },
886
+ {
887
+ "epoch": 2.3157894736842106,
888
+ "grad_norm": 0.5217084884643555,
889
+ "learning_rate": 3.7651019814126654e-05,
890
+ "loss": 0.9681,
891
+ "step": 110
892
+ },
893
+ {
894
+ "epoch": 2.336842105263158,
895
+ "grad_norm": 0.5350040793418884,
896
+ "learning_rate": 3.591243860149759e-05,
897
+ "loss": 0.9163,
898
+ "step": 111
899
+ },
900
+ {
901
+ "epoch": 2.3578947368421055,
902
+ "grad_norm": 0.4863702654838562,
903
+ "learning_rate": 3.4206127406028745e-05,
904
+ "loss": 1.1016,
905
+ "step": 112
906
+ },
907
+ {
908
+ "epoch": 2.3578947368421055,
909
+ "eval_loss": 1.163386583328247,
910
+ "eval_runtime": 2.0711,
911
+ "eval_samples_per_second": 48.284,
912
+ "eval_steps_per_second": 4.828,
913
+ "step": 112
914
+ },
915
+ {
916
+ "epoch": 2.3789473684210525,
917
+ "grad_norm": 0.4959012269973755,
918
+ "learning_rate": 3.253294540679257e-05,
919
+ "loss": 1.1242,
920
+ "step": 113
921
+ },
922
+ {
923
+ "epoch": 2.4,
924
+ "grad_norm": 0.4682742953300476,
925
+ "learning_rate": 3.089373510131354e-05,
926
+ "loss": 0.8366,
927
+ "step": 114
928
+ },
929
+ {
930
+ "epoch": 2.4210526315789473,
931
+ "grad_norm": 0.5049096941947937,
932
+ "learning_rate": 2.9289321881345254e-05,
933
+ "loss": 1.0976,
934
+ "step": 115
935
+ },
936
+ {
937
+ "epoch": 2.442105263157895,
938
+ "grad_norm": 0.4340517818927765,
939
+ "learning_rate": 2.7720513617260856e-05,
940
+ "loss": 1.0151,
941
+ "step": 116
942
+ },
943
+ {
944
+ "epoch": 2.463157894736842,
945
+ "grad_norm": 0.5189387202262878,
946
+ "learning_rate": 2.6188100251265945e-05,
947
+ "loss": 0.9766,
948
+ "step": 117
949
+ },
950
+ {
951
+ "epoch": 2.4842105263157896,
952
+ "grad_norm": 0.45461520552635193,
953
+ "learning_rate": 2.4692853399638917e-05,
954
+ "loss": 1.08,
955
+ "step": 118
956
+ },
957
+ {
958
+ "epoch": 2.5052631578947366,
959
+ "grad_norm": 0.5745816826820374,
960
+ "learning_rate": 2.323552596419889e-05,
961
+ "loss": 0.9789,
962
+ "step": 119
963
+ },
964
+ {
965
+ "epoch": 2.526315789473684,
966
+ "grad_norm": 0.4734479784965515,
967
+ "learning_rate": 2.181685175319702e-05,
968
+ "loss": 1.108,
969
+ "step": 120
970
+ },
971
+ {
972
+ "epoch": 2.526315789473684,
973
+ "eval_loss": 1.1624512672424316,
974
+ "eval_runtime": 2.0886,
975
+ "eval_samples_per_second": 47.879,
976
+ "eval_steps_per_second": 4.788,
977
+ "step": 120
978
+ },
979
+ {
980
+ "epoch": 2.5473684210526315,
981
+ "grad_norm": 0.5734113454818726,
982
+ "learning_rate": 2.043754511182191e-05,
983
+ "loss": 0.9151,
984
+ "step": 121
985
+ },
986
+ {
987
+ "epoch": 2.568421052631579,
988
+ "grad_norm": 0.5083211064338684,
989
+ "learning_rate": 1.9098300562505266e-05,
990
+ "loss": 0.9903,
991
+ "step": 122
992
+ },
993
+ {
994
+ "epoch": 2.5894736842105264,
995
+ "grad_norm": 0.5377265214920044,
996
+ "learning_rate": 1.7799792455209018e-05,
997
+ "loss": 1.0774,
998
+ "step": 123
999
+ },
1000
+ {
1001
+ "epoch": 2.610526315789474,
1002
+ "grad_norm": 0.4219975471496582,
1003
+ "learning_rate": 1.6542674627869737e-05,
1004
+ "loss": 0.9234,
1005
+ "step": 124
1006
+ },
1007
+ {
1008
+ "epoch": 2.6315789473684212,
1009
+ "grad_norm": 0.49157968163490295,
1010
+ "learning_rate": 1.5327580077171587e-05,
1011
+ "loss": 0.9577,
1012
+ "step": 125
1013
+ },
1014
+ {
1015
+ "epoch": 2.6526315789473687,
1016
+ "grad_norm": 0.4462091326713562,
1017
+ "learning_rate": 1.415512063981339e-05,
1018
+ "loss": 0.9661,
1019
+ "step": 126
1020
+ },
1021
+ {
1022
+ "epoch": 2.6736842105263157,
1023
+ "grad_norm": 0.5062934756278992,
1024
+ "learning_rate": 1.3025886684430467e-05,
1025
+ "loss": 0.9206,
1026
+ "step": 127
1027
+ },
1028
+ {
1029
+ "epoch": 2.694736842105263,
1030
+ "grad_norm": 0.558468759059906,
1031
+ "learning_rate": 1.19404468143262e-05,
1032
+ "loss": 1.0424,
1033
+ "step": 128
1034
+ },
1035
+ {
1036
+ "epoch": 2.694736842105263,
1037
+ "eval_loss": 1.164870023727417,
1038
+ "eval_runtime": 2.1075,
1039
+ "eval_samples_per_second": 47.449,
1040
+ "eval_steps_per_second": 4.745,
1041
+ "step": 128
1042
+ },
1043
+ {
1044
+ "epoch": 2.7157894736842105,
1045
+ "grad_norm": 0.48067817091941833,
1046
+ "learning_rate": 1.0899347581163221e-05,
1047
+ "loss": 0.9617,
1048
+ "step": 129
1049
+ },
1050
+ {
1051
+ "epoch": 2.736842105263158,
1052
+ "grad_norm": 0.6342288255691528,
1053
+ "learning_rate": 9.903113209758096e-06,
1054
+ "loss": 1.0679,
1055
+ "step": 130
1056
  }
1057
  ],
1058
  "logging_steps": 1,
 
1072
  "attributes": {}
1073
  }
1074
  },
1075
+ "total_flos": 1.315875884630016e+16,
1076
  "train_batch_size": 10,
1077
  "trial_name": null,
1078
  "trial_params": null