CocoRoF commited on
Commit
858a7b4
·
verified ·
1 Parent(s): f98f3c3

Training in progress, step 4965, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1823d8d6fc9bad50e233c3f85df11140836f3d1238a4215dcef47f26ec4a45f
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2002857724b9ba9c06e91b94244022cd822af76f616e23253e73b0d37445df8
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de6dd2f249af6019dccef804f01b9fc641389560f0c188cf62f4d7deb12d34ac
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c88886b857669b8c1bda6f46dfba65789f419ec3c8ba7f50d3594ac44f9c8501
3
  size 2375752250
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:530dd30db5e5df3c66d26bc002c7175a973b1fc31851f04f8833f1cc27686333
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02b7e464b14d20e04557f2705a171f16ff23e7e0780b1d8336dd791f6f104a57
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9062588501840838,
5
  "eval_steps": 500,
6
- "global_step": 4500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -6379,6 +6379,657 @@
6379
  "eval_samples_per_second": 1109.992,
6380
  "eval_steps_per_second": 34.688,
6381
  "step": 4500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6382
  }
6383
  ],
6384
  "logging_steps": 5,
@@ -6393,12 +7044,12 @@
6393
  "should_evaluate": false,
6394
  "should_log": false,
6395
  "should_save": true,
6396
- "should_training_stop": false
6397
  },
6398
  "attributes": {}
6399
  }
6400
  },
6401
- "total_flos": 1.9495610967549542e+19,
6402
  "train_batch_size": 4,
6403
  "trial_name": null,
6404
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9999055980364392,
5
  "eval_steps": 500,
6
+ "global_step": 4965,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
6379
  "eval_samples_per_second": 1109.992,
6380
  "eval_steps_per_second": 34.688,
6381
  "step": 4500
6382
+ },
6383
+ {
6384
+ "epoch": 0.9072658044620662,
6385
+ "grad_norm": 104.1875,
6386
+ "learning_rate": 1.0295434198746643e-07,
6387
+ "loss": 96.1257,
6388
+ "step": 4505
6389
+ },
6390
+ {
6391
+ "epoch": 0.9082727587400484,
6392
+ "grad_norm": 105.625,
6393
+ "learning_rate": 1.0183527305282006e-07,
6394
+ "loss": 96.9505,
6395
+ "step": 4510
6396
+ },
6397
+ {
6398
+ "epoch": 0.9092797130180308,
6399
+ "grad_norm": 108.375,
6400
+ "learning_rate": 1.0071620411817368e-07,
6401
+ "loss": 96.6111,
6402
+ "step": 4515
6403
+ },
6404
+ {
6405
+ "epoch": 0.910286667296013,
6406
+ "grad_norm": 106.4375,
6407
+ "learning_rate": 9.95971351835273e-08,
6408
+ "loss": 97.3165,
6409
+ "step": 4520
6410
+ },
6411
+ {
6412
+ "epoch": 0.9112936215739954,
6413
+ "grad_norm": 105.375,
6414
+ "learning_rate": 9.847806624888093e-08,
6415
+ "loss": 97.2006,
6416
+ "step": 4525
6417
+ },
6418
+ {
6419
+ "epoch": 0.9123005758519778,
6420
+ "grad_norm": 109.0,
6421
+ "learning_rate": 9.735899731423455e-08,
6422
+ "loss": 96.5357,
6423
+ "step": 4530
6424
+ },
6425
+ {
6426
+ "epoch": 0.91330753012996,
6427
+ "grad_norm": 104.4375,
6428
+ "learning_rate": 9.623992837958818e-08,
6429
+ "loss": 96.6608,
6430
+ "step": 4535
6431
+ },
6432
+ {
6433
+ "epoch": 0.9143144844079424,
6434
+ "grad_norm": 103.4375,
6435
+ "learning_rate": 9.51208594449418e-08,
6436
+ "loss": 96.2924,
6437
+ "step": 4540
6438
+ },
6439
+ {
6440
+ "epoch": 0.9153214386859246,
6441
+ "grad_norm": 101.5,
6442
+ "learning_rate": 9.400179051029543e-08,
6443
+ "loss": 94.9458,
6444
+ "step": 4545
6445
+ },
6446
+ {
6447
+ "epoch": 0.916328392963907,
6448
+ "grad_norm": 108.875,
6449
+ "learning_rate": 9.288272157564905e-08,
6450
+ "loss": 95.3736,
6451
+ "step": 4550
6452
+ },
6453
+ {
6454
+ "epoch": 0.9173353472418893,
6455
+ "grad_norm": 106.0,
6456
+ "learning_rate": 9.176365264100267e-08,
6457
+ "loss": 94.3943,
6458
+ "step": 4555
6459
+ },
6460
+ {
6461
+ "epoch": 0.9183423015198716,
6462
+ "grad_norm": 105.4375,
6463
+ "learning_rate": 9.06445837063563e-08,
6464
+ "loss": 97.27,
6465
+ "step": 4560
6466
+ },
6467
+ {
6468
+ "epoch": 0.919349255797854,
6469
+ "grad_norm": 106.3125,
6470
+ "learning_rate": 8.952551477170993e-08,
6471
+ "loss": 95.4415,
6472
+ "step": 4565
6473
+ },
6474
+ {
6475
+ "epoch": 0.9203562100758362,
6476
+ "grad_norm": 107.1875,
6477
+ "learning_rate": 8.840644583706356e-08,
6478
+ "loss": 96.8434,
6479
+ "step": 4570
6480
+ },
6481
+ {
6482
+ "epoch": 0.9213631643538186,
6483
+ "grad_norm": 105.1875,
6484
+ "learning_rate": 8.728737690241718e-08,
6485
+ "loss": 96.2896,
6486
+ "step": 4575
6487
+ },
6488
+ {
6489
+ "epoch": 0.9223701186318009,
6490
+ "grad_norm": 104.875,
6491
+ "learning_rate": 8.616830796777082e-08,
6492
+ "loss": 97.0949,
6493
+ "step": 4580
6494
+ },
6495
+ {
6496
+ "epoch": 0.9233770729097832,
6497
+ "grad_norm": 107.375,
6498
+ "learning_rate": 8.504923903312444e-08,
6499
+ "loss": 96.0602,
6500
+ "step": 4585
6501
+ },
6502
+ {
6503
+ "epoch": 0.9243840271877655,
6504
+ "grad_norm": 105.0,
6505
+ "learning_rate": 8.393017009847807e-08,
6506
+ "loss": 96.6697,
6507
+ "step": 4590
6508
+ },
6509
+ {
6510
+ "epoch": 0.9253909814657478,
6511
+ "grad_norm": 103.5,
6512
+ "learning_rate": 8.281110116383169e-08,
6513
+ "loss": 95.5824,
6514
+ "step": 4595
6515
+ },
6516
+ {
6517
+ "epoch": 0.9263979357437301,
6518
+ "grad_norm": 107.5625,
6519
+ "learning_rate": 8.169203222918532e-08,
6520
+ "loss": 96.6081,
6521
+ "step": 4600
6522
+ },
6523
+ {
6524
+ "epoch": 0.9274048900217124,
6525
+ "grad_norm": 108.4375,
6526
+ "learning_rate": 8.057296329453894e-08,
6527
+ "loss": 96.3714,
6528
+ "step": 4605
6529
+ },
6530
+ {
6531
+ "epoch": 0.9284118442996948,
6532
+ "grad_norm": 105.3125,
6533
+ "learning_rate": 7.945389435989256e-08,
6534
+ "loss": 95.8521,
6535
+ "step": 4610
6536
+ },
6537
+ {
6538
+ "epoch": 0.9294187985776771,
6539
+ "grad_norm": 108.125,
6540
+ "learning_rate": 7.833482542524619e-08,
6541
+ "loss": 96.356,
6542
+ "step": 4615
6543
+ },
6544
+ {
6545
+ "epoch": 0.9304257528556594,
6546
+ "grad_norm": 105.9375,
6547
+ "learning_rate": 7.721575649059981e-08,
6548
+ "loss": 96.4865,
6549
+ "step": 4620
6550
+ },
6551
+ {
6552
+ "epoch": 0.9314327071336417,
6553
+ "grad_norm": 105.6875,
6554
+ "learning_rate": 7.609668755595345e-08,
6555
+ "loss": 95.1476,
6556
+ "step": 4625
6557
+ },
6558
+ {
6559
+ "epoch": 0.932439661411624,
6560
+ "grad_norm": 106.9375,
6561
+ "learning_rate": 7.497761862130707e-08,
6562
+ "loss": 95.1061,
6563
+ "step": 4630
6564
+ },
6565
+ {
6566
+ "epoch": 0.9334466156896063,
6567
+ "grad_norm": 105.5625,
6568
+ "learning_rate": 7.38585496866607e-08,
6569
+ "loss": 95.2852,
6570
+ "step": 4635
6571
+ },
6572
+ {
6573
+ "epoch": 0.9344535699675887,
6574
+ "grad_norm": 107.5625,
6575
+ "learning_rate": 7.273948075201432e-08,
6576
+ "loss": 95.0002,
6577
+ "step": 4640
6578
+ },
6579
+ {
6580
+ "epoch": 0.935460524245571,
6581
+ "grad_norm": 106.5625,
6582
+ "learning_rate": 7.162041181736795e-08,
6583
+ "loss": 97.3515,
6584
+ "step": 4645
6585
+ },
6586
+ {
6587
+ "epoch": 0.9364674785235533,
6588
+ "grad_norm": 106.875,
6589
+ "learning_rate": 7.050134288272157e-08,
6590
+ "loss": 96.8893,
6591
+ "step": 4650
6592
+ },
6593
+ {
6594
+ "epoch": 0.9374744328015356,
6595
+ "grad_norm": 106.125,
6596
+ "learning_rate": 6.938227394807519e-08,
6597
+ "loss": 96.1281,
6598
+ "step": 4655
6599
+ },
6600
+ {
6601
+ "epoch": 0.9384813870795179,
6602
+ "grad_norm": 105.4375,
6603
+ "learning_rate": 6.826320501342882e-08,
6604
+ "loss": 95.932,
6605
+ "step": 4660
6606
+ },
6607
+ {
6608
+ "epoch": 0.9394883413575003,
6609
+ "grad_norm": 106.0625,
6610
+ "learning_rate": 6.714413607878245e-08,
6611
+ "loss": 96.195,
6612
+ "step": 4665
6613
+ },
6614
+ {
6615
+ "epoch": 0.9404952956354825,
6616
+ "grad_norm": 106.1875,
6617
+ "learning_rate": 6.602506714413608e-08,
6618
+ "loss": 94.7684,
6619
+ "step": 4670
6620
+ },
6621
+ {
6622
+ "epoch": 0.9415022499134649,
6623
+ "grad_norm": 109.0,
6624
+ "learning_rate": 6.49059982094897e-08,
6625
+ "loss": 96.4495,
6626
+ "step": 4675
6627
+ },
6628
+ {
6629
+ "epoch": 0.9425092041914472,
6630
+ "grad_norm": 109.0,
6631
+ "learning_rate": 6.378692927484333e-08,
6632
+ "loss": 96.9962,
6633
+ "step": 4680
6634
+ },
6635
+ {
6636
+ "epoch": 0.9435161584694295,
6637
+ "grad_norm": 104.3125,
6638
+ "learning_rate": 6.266786034019696e-08,
6639
+ "loss": 94.3069,
6640
+ "step": 4685
6641
+ },
6642
+ {
6643
+ "epoch": 0.9445231127474119,
6644
+ "grad_norm": 107.625,
6645
+ "learning_rate": 6.154879140555059e-08,
6646
+ "loss": 96.7521,
6647
+ "step": 4690
6648
+ },
6649
+ {
6650
+ "epoch": 0.9455300670253941,
6651
+ "grad_norm": 104.3125,
6652
+ "learning_rate": 6.042972247090421e-08,
6653
+ "loss": 96.0066,
6654
+ "step": 4695
6655
+ },
6656
+ {
6657
+ "epoch": 0.9465370213033765,
6658
+ "grad_norm": 104.875,
6659
+ "learning_rate": 5.931065353625783e-08,
6660
+ "loss": 94.7801,
6661
+ "step": 4700
6662
+ },
6663
+ {
6664
+ "epoch": 0.9475439755813587,
6665
+ "grad_norm": 106.375,
6666
+ "learning_rate": 5.819158460161146e-08,
6667
+ "loss": 95.1509,
6668
+ "step": 4705
6669
+ },
6670
+ {
6671
+ "epoch": 0.9485509298593411,
6672
+ "grad_norm": 104.9375,
6673
+ "learning_rate": 5.7072515666965083e-08,
6674
+ "loss": 95.5377,
6675
+ "step": 4710
6676
+ },
6677
+ {
6678
+ "epoch": 0.9495578841373233,
6679
+ "grad_norm": 105.0,
6680
+ "learning_rate": 5.595344673231871e-08,
6681
+ "loss": 96.0342,
6682
+ "step": 4715
6683
+ },
6684
+ {
6685
+ "epoch": 0.9505648384153057,
6686
+ "grad_norm": 106.8125,
6687
+ "learning_rate": 5.483437779767233e-08,
6688
+ "loss": 95.7919,
6689
+ "step": 4720
6690
+ },
6691
+ {
6692
+ "epoch": 0.9515717926932881,
6693
+ "grad_norm": 104.0625,
6694
+ "learning_rate": 5.3715308863025955e-08,
6695
+ "loss": 95.4794,
6696
+ "step": 4725
6697
+ },
6698
+ {
6699
+ "epoch": 0.9525787469712703,
6700
+ "grad_norm": 105.1875,
6701
+ "learning_rate": 5.2596239928379586e-08,
6702
+ "loss": 96.2796,
6703
+ "step": 4730
6704
+ },
6705
+ {
6706
+ "epoch": 0.9535857012492527,
6707
+ "grad_norm": 107.4375,
6708
+ "learning_rate": 5.147717099373322e-08,
6709
+ "loss": 96.9097,
6710
+ "step": 4735
6711
+ },
6712
+ {
6713
+ "epoch": 0.9545926555272349,
6714
+ "grad_norm": 104.0,
6715
+ "learning_rate": 5.035810205908684e-08,
6716
+ "loss": 95.2215,
6717
+ "step": 4740
6718
+ },
6719
+ {
6720
+ "epoch": 0.9555996098052173,
6721
+ "grad_norm": 103.875,
6722
+ "learning_rate": 4.9239033124440465e-08,
6723
+ "loss": 95.531,
6724
+ "step": 4745
6725
+ },
6726
+ {
6727
+ "epoch": 0.9566065640831996,
6728
+ "grad_norm": 104.5625,
6729
+ "learning_rate": 4.811996418979409e-08,
6730
+ "loss": 94.8213,
6731
+ "step": 4750
6732
+ },
6733
+ {
6734
+ "epoch": 0.9576135183611819,
6735
+ "grad_norm": 103.125,
6736
+ "learning_rate": 4.700089525514771e-08,
6737
+ "loss": 94.9124,
6738
+ "step": 4755
6739
+ },
6740
+ {
6741
+ "epoch": 0.9586204726391643,
6742
+ "grad_norm": 108.75,
6743
+ "learning_rate": 4.588182632050134e-08,
6744
+ "loss": 94.8872,
6745
+ "step": 4760
6746
+ },
6747
+ {
6748
+ "epoch": 0.9596274269171465,
6749
+ "grad_norm": 106.0625,
6750
+ "learning_rate": 4.476275738585497e-08,
6751
+ "loss": 94.9003,
6752
+ "step": 4765
6753
+ },
6754
+ {
6755
+ "epoch": 0.9606343811951289,
6756
+ "grad_norm": 109.0,
6757
+ "learning_rate": 4.364368845120859e-08,
6758
+ "loss": 97.4909,
6759
+ "step": 4770
6760
+ },
6761
+ {
6762
+ "epoch": 0.9616413354731111,
6763
+ "grad_norm": 105.1875,
6764
+ "learning_rate": 4.252461951656222e-08,
6765
+ "loss": 95.4977,
6766
+ "step": 4775
6767
+ },
6768
+ {
6769
+ "epoch": 0.9626482897510935,
6770
+ "grad_norm": 103.4375,
6771
+ "learning_rate": 4.1405550581915846e-08,
6772
+ "loss": 95.1702,
6773
+ "step": 4780
6774
+ },
6775
+ {
6776
+ "epoch": 0.9636552440290758,
6777
+ "grad_norm": 105.4375,
6778
+ "learning_rate": 4.028648164726947e-08,
6779
+ "loss": 95.1124,
6780
+ "step": 4785
6781
+ },
6782
+ {
6783
+ "epoch": 0.9646621983070581,
6784
+ "grad_norm": 107.125,
6785
+ "learning_rate": 3.9167412712623094e-08,
6786
+ "loss": 95.5008,
6787
+ "step": 4790
6788
+ },
6789
+ {
6790
+ "epoch": 0.9656691525850404,
6791
+ "grad_norm": 103.8125,
6792
+ "learning_rate": 3.8048343777976725e-08,
6793
+ "loss": 96.745,
6794
+ "step": 4795
6795
+ },
6796
+ {
6797
+ "epoch": 0.9666761068630227,
6798
+ "grad_norm": 103.875,
6799
+ "learning_rate": 3.692927484333035e-08,
6800
+ "loss": 96.3884,
6801
+ "step": 4800
6802
+ },
6803
+ {
6804
+ "epoch": 0.9676830611410051,
6805
+ "grad_norm": 103.375,
6806
+ "learning_rate": 3.581020590868397e-08,
6807
+ "loss": 94.6912,
6808
+ "step": 4805
6809
+ },
6810
+ {
6811
+ "epoch": 0.9686900154189874,
6812
+ "grad_norm": 106.125,
6813
+ "learning_rate": 3.4691136974037597e-08,
6814
+ "loss": 95.0865,
6815
+ "step": 4810
6816
+ },
6817
+ {
6818
+ "epoch": 0.9696969696969697,
6819
+ "grad_norm": 103.625,
6820
+ "learning_rate": 3.357206803939123e-08,
6821
+ "loss": 93.7961,
6822
+ "step": 4815
6823
+ },
6824
+ {
6825
+ "epoch": 0.970703923974952,
6826
+ "grad_norm": 104.3125,
6827
+ "learning_rate": 3.245299910474485e-08,
6828
+ "loss": 95.4935,
6829
+ "step": 4820
6830
+ },
6831
+ {
6832
+ "epoch": 0.9717108782529343,
6833
+ "grad_norm": 102.0625,
6834
+ "learning_rate": 3.133393017009848e-08,
6835
+ "loss": 95.072,
6836
+ "step": 4825
6837
+ },
6838
+ {
6839
+ "epoch": 0.9727178325309166,
6840
+ "grad_norm": 107.1875,
6841
+ "learning_rate": 3.0214861235452106e-08,
6842
+ "loss": 95.2949,
6843
+ "step": 4830
6844
+ },
6845
+ {
6846
+ "epoch": 0.973724786808899,
6847
+ "grad_norm": 105.375,
6848
+ "learning_rate": 2.909579230080573e-08,
6849
+ "loss": 94.6876,
6850
+ "step": 4835
6851
+ },
6852
+ {
6853
+ "epoch": 0.9747317410868813,
6854
+ "grad_norm": 104.5625,
6855
+ "learning_rate": 2.7976723366159354e-08,
6856
+ "loss": 96.9518,
6857
+ "step": 4840
6858
+ },
6859
+ {
6860
+ "epoch": 0.9757386953648636,
6861
+ "grad_norm": 106.1875,
6862
+ "learning_rate": 2.6857654431512978e-08,
6863
+ "loss": 95.4756,
6864
+ "step": 4845
6865
+ },
6866
+ {
6867
+ "epoch": 0.9767456496428459,
6868
+ "grad_norm": 107.5625,
6869
+ "learning_rate": 2.573858549686661e-08,
6870
+ "loss": 94.8716,
6871
+ "step": 4850
6872
+ },
6873
+ {
6874
+ "epoch": 0.9777526039208282,
6875
+ "grad_norm": 103.4375,
6876
+ "learning_rate": 2.4619516562220232e-08,
6877
+ "loss": 95.0801,
6878
+ "step": 4855
6879
+ },
6880
+ {
6881
+ "epoch": 0.9787595581988106,
6882
+ "grad_norm": 102.75,
6883
+ "learning_rate": 2.3500447627573856e-08,
6884
+ "loss": 95.3318,
6885
+ "step": 4860
6886
+ },
6887
+ {
6888
+ "epoch": 0.9797665124767928,
6889
+ "grad_norm": 107.75,
6890
+ "learning_rate": 2.2381378692927484e-08,
6891
+ "loss": 95.9054,
6892
+ "step": 4865
6893
+ },
6894
+ {
6895
+ "epoch": 0.9807734667547752,
6896
+ "grad_norm": 103.1875,
6897
+ "learning_rate": 2.126230975828111e-08,
6898
+ "loss": 95.6948,
6899
+ "step": 4870
6900
+ },
6901
+ {
6902
+ "epoch": 0.9817804210327575,
6903
+ "grad_norm": 107.5625,
6904
+ "learning_rate": 2.0143240823634735e-08,
6905
+ "loss": 95.1651,
6906
+ "step": 4875
6907
+ },
6908
+ {
6909
+ "epoch": 0.9827873753107398,
6910
+ "grad_norm": 102.3125,
6911
+ "learning_rate": 1.9024171888988362e-08,
6912
+ "loss": 95.8977,
6913
+ "step": 4880
6914
+ },
6915
+ {
6916
+ "epoch": 0.9837943295887221,
6917
+ "grad_norm": 107.3125,
6918
+ "learning_rate": 1.7905102954341986e-08,
6919
+ "loss": 94.0943,
6920
+ "step": 4885
6921
+ },
6922
+ {
6923
+ "epoch": 0.9848012838667044,
6924
+ "grad_norm": 105.8125,
6925
+ "learning_rate": 1.6786034019695614e-08,
6926
+ "loss": 96.5686,
6927
+ "step": 4890
6928
+ },
6929
+ {
6930
+ "epoch": 0.9858082381446868,
6931
+ "grad_norm": 104.75,
6932
+ "learning_rate": 1.566696508504924e-08,
6933
+ "loss": 96.2139,
6934
+ "step": 4895
6935
+ },
6936
+ {
6937
+ "epoch": 0.986815192422669,
6938
+ "grad_norm": 106.5625,
6939
+ "learning_rate": 1.4547896150402865e-08,
6940
+ "loss": 96.4123,
6941
+ "step": 4900
6942
+ },
6943
+ {
6944
+ "epoch": 0.9878221467006514,
6945
+ "grad_norm": 107.5,
6946
+ "learning_rate": 1.3428827215756489e-08,
6947
+ "loss": 95.4067,
6948
+ "step": 4905
6949
+ },
6950
+ {
6951
+ "epoch": 0.9888291009786336,
6952
+ "grad_norm": 106.125,
6953
+ "learning_rate": 1.2309758281110116e-08,
6954
+ "loss": 96.4161,
6955
+ "step": 4910
6956
+ },
6957
+ {
6958
+ "epoch": 0.989836055256616,
6959
+ "grad_norm": 104.6875,
6960
+ "learning_rate": 1.1190689346463742e-08,
6961
+ "loss": 94.9028,
6962
+ "step": 4915
6963
+ },
6964
+ {
6965
+ "epoch": 0.9908430095345984,
6966
+ "grad_norm": 106.0625,
6967
+ "learning_rate": 1.0071620411817367e-08,
6968
+ "loss": 96.9095,
6969
+ "step": 4920
6970
+ },
6971
+ {
6972
+ "epoch": 0.9918499638125806,
6973
+ "grad_norm": 106.8125,
6974
+ "learning_rate": 8.952551477170993e-09,
6975
+ "loss": 94.9621,
6976
+ "step": 4925
6977
+ },
6978
+ {
6979
+ "epoch": 0.992856918090563,
6980
+ "grad_norm": 105.75,
6981
+ "learning_rate": 7.83348254252462e-09,
6982
+ "loss": 95.0764,
6983
+ "step": 4930
6984
+ },
6985
+ {
6986
+ "epoch": 0.9938638723685452,
6987
+ "grad_norm": 107.5,
6988
+ "learning_rate": 6.7144136078782444e-09,
6989
+ "loss": 96.6513,
6990
+ "step": 4935
6991
+ },
6992
+ {
6993
+ "epoch": 0.9948708266465276,
6994
+ "grad_norm": 104.1875,
6995
+ "learning_rate": 5.595344673231871e-09,
6996
+ "loss": 94.489,
6997
+ "step": 4940
6998
+ },
6999
+ {
7000
+ "epoch": 0.9958777809245098,
7001
+ "grad_norm": 105.75,
7002
+ "learning_rate": 4.4762757385854966e-09,
7003
+ "loss": 95.3881,
7004
+ "step": 4945
7005
+ },
7006
+ {
7007
+ "epoch": 0.9968847352024922,
7008
+ "grad_norm": 108.0625,
7009
+ "learning_rate": 3.3572068039391222e-09,
7010
+ "loss": 95.4261,
7011
+ "step": 4950
7012
+ },
7013
+ {
7014
+ "epoch": 0.9978916894804746,
7015
+ "grad_norm": 105.9375,
7016
+ "learning_rate": 2.2381378692927483e-09,
7017
+ "loss": 95.8491,
7018
+ "step": 4955
7019
+ },
7020
+ {
7021
+ "epoch": 0.9988986437584568,
7022
+ "grad_norm": 104.6875,
7023
+ "learning_rate": 1.1190689346463741e-09,
7024
+ "loss": 94.8424,
7025
+ "step": 4960
7026
+ },
7027
+ {
7028
+ "epoch": 0.9999055980364392,
7029
+ "grad_norm": 101.75,
7030
+ "learning_rate": 0.0,
7031
+ "loss": 94.7523,
7032
+ "step": 4965
7033
  }
7034
  ],
7035
  "logging_steps": 5,
 
7044
  "should_evaluate": false,
7045
  "should_log": false,
7046
  "should_save": true,
7047
+ "should_training_stop": true
7048
  },
7049
  "attributes": {}
7050
  }
7051
  },
7052
+ "total_flos": 2.151015743419633e+19,
7053
  "train_batch_size": 4,
7054
  "trial_name": null,
7055
  "trial_params": null