rovdetection commited on
Commit
e28cc22
·
verified ·
1 Parent(s): 39671e8

Training in progress, step 4000, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e72f74f4e0e6b9d2ea75ad255fa3eaeba01a7e0823bcb5a98cb4e80510a589e
3
  size 9446744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:835db88c2c568a2a5b9eecd0ca20228d562ccd37375f6d5e37ee4f667bd5c028
3
  size 9446744
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:200b11c2ef8037da35c5780017b6651f6b6076fc914de77cf6066270f2e1b523
3
  size 4879947
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bde2b53b9a0c26662086027ef84b0578651b731c913f116872da22f0740efeab
3
  size 4879947
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f531f0a7c41eca7b2e7efd3f8d7ba2d9d38c29a18b748522bac1a5c64a9df8e8
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa8f41c51c4c045061b2c14ad0e244d1f18ea14e355c0937c51abc1c22235765
3
  size 14917
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ef97fc93f1e58ebc9e20759b520e3bc8c347e36f8f62fa0c9adef310bbf48e6
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcdef9cce1358b15f98ec011b2742b883d23020479104f9b5467277f0c257b88
3
  size 14917
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac1b9358ab1cb9acff90f4d1d692ac08bbdb1986a66544aed9f6fe9b801b17b3
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0033c7745b46bdca3ecab5787678834ca68f7f7e1288869dceeb38812abc253
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2198d94a2d44839615c5c39451a8a9843983d5979dc85a343072f765e711e24
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5ee800e7df74b641553b418c04566b716dade6c517cb6fd519bb2168d1739f3
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 6.013754566946056,
6
  "eval_steps": 500,
7
- "global_step": 3500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3508,6 +3508,506 @@
3508
  "mean_token_accuracy": 0.6643109286760355,
3509
  "num_tokens": 20795175.0,
3510
  "step": 3500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3511
  }
3512
  ],
3513
  "logging_steps": 10,
@@ -3527,7 +4027,7 @@
3527
  "attributes": {}
3528
  }
3529
  },
3530
- "total_flos": 1.7073275440778445e+17,
3531
  "train_batch_size": 2,
3532
  "trial_name": null,
3533
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 6.873415001074576,
6
  "eval_steps": 500,
7
+ "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3508
  "mean_token_accuracy": 0.6643109286760355,
3509
  "num_tokens": 20795175.0,
3510
  "step": 3500
3511
+ },
3512
+ {
3513
+ "entropy": 1.687648557126522,
3514
+ "epoch": 6.0309477756286265,
3515
+ "grad_norm": 0.8348304629325867,
3516
+ "learning_rate": 5.9640000000000005e-05,
3517
+ "loss": 1.7558349609375,
3518
+ "mean_token_accuracy": 0.6784385897219181,
3519
+ "num_tokens": 20852486.0,
3520
+ "step": 3510
3521
+ },
3522
+ {
3523
+ "entropy": 1.6863658234477044,
3524
+ "epoch": 6.048140984311197,
3525
+ "grad_norm": 0.7642632126808167,
3526
+ "learning_rate": 5.924000000000001e-05,
3527
+ "loss": 1.6536775588989259,
3528
+ "mean_token_accuracy": 0.680523382127285,
3529
+ "num_tokens": 20908597.0,
3530
+ "step": 3520
3531
+ },
3532
+ {
3533
+ "entropy": 1.6652932062745094,
3534
+ "epoch": 6.065334192993768,
3535
+ "grad_norm": 0.8676924109458923,
3536
+ "learning_rate": 5.8840000000000006e-05,
3537
+ "loss": 1.7443069458007812,
3538
+ "mean_token_accuracy": 0.6719188451766968,
3539
+ "num_tokens": 20966567.0,
3540
+ "step": 3530
3541
+ },
3542
+ {
3543
+ "entropy": 1.7391631960868836,
3544
+ "epoch": 6.082527401676338,
3545
+ "grad_norm": 0.8444374799728394,
3546
+ "learning_rate": 5.844e-05,
3547
+ "loss": 1.7849775314331056,
3548
+ "mean_token_accuracy": 0.672398941218853,
3549
+ "num_tokens": 21023832.0,
3550
+ "step": 3540
3551
+ },
3552
+ {
3553
+ "entropy": 1.7432220742106437,
3554
+ "epoch": 6.099720610358908,
3555
+ "grad_norm": 0.7972187995910645,
3556
+ "learning_rate": 5.804000000000001e-05,
3557
+ "loss": 1.8264921188354493,
3558
+ "mean_token_accuracy": 0.6713483344763518,
3559
+ "num_tokens": 21080325.0,
3560
+ "step": 3550
3561
+ },
3562
+ {
3563
+ "entropy": 1.7394985787570476,
3564
+ "epoch": 6.1169138190414785,
3565
+ "grad_norm": 0.8266369700431824,
3566
+ "learning_rate": 5.7640000000000004e-05,
3567
+ "loss": 1.819821548461914,
3568
+ "mean_token_accuracy": 0.6708907049149275,
3569
+ "num_tokens": 21143316.0,
3570
+ "step": 3560
3571
+ },
3572
+ {
3573
+ "entropy": 1.7923602670431138,
3574
+ "epoch": 6.134107027724049,
3575
+ "grad_norm": 0.8315872550010681,
3576
+ "learning_rate": 5.724000000000001e-05,
3577
+ "loss": 1.8086809158325194,
3578
+ "mean_token_accuracy": 0.665992408245802,
3579
+ "num_tokens": 21203848.0,
3580
+ "step": 3570
3581
+ },
3582
+ {
3583
+ "entropy": 1.711188006401062,
3584
+ "epoch": 6.15130023640662,
3585
+ "grad_norm": 0.8174048066139221,
3586
+ "learning_rate": 5.6840000000000005e-05,
3587
+ "loss": 1.7656991958618165,
3588
+ "mean_token_accuracy": 0.6711975857615471,
3589
+ "num_tokens": 21266260.0,
3590
+ "step": 3580
3591
+ },
3592
+ {
3593
+ "entropy": 1.8437035098671912,
3594
+ "epoch": 6.16849344508919,
3595
+ "grad_norm": 0.8155949711799622,
3596
+ "learning_rate": 5.644e-05,
3597
+ "loss": 1.877999496459961,
3598
+ "mean_token_accuracy": 0.6532085236161947,
3599
+ "num_tokens": 21326008.0,
3600
+ "step": 3590
3601
+ },
3602
+ {
3603
+ "entropy": 1.7264528393745422,
3604
+ "epoch": 6.18568665377176,
3605
+ "grad_norm": 0.7951272130012512,
3606
+ "learning_rate": 5.6040000000000006e-05,
3607
+ "loss": 1.747119140625,
3608
+ "mean_token_accuracy": 0.6696909107267857,
3609
+ "num_tokens": 21385356.0,
3610
+ "step": 3600
3611
+ },
3612
+ {
3613
+ "entropy": 1.68227918446064,
3614
+ "epoch": 6.20287986245433,
3615
+ "grad_norm": 0.779587984085083,
3616
+ "learning_rate": 5.564e-05,
3617
+ "loss": 1.7062965393066407,
3618
+ "mean_token_accuracy": 0.6786911800503731,
3619
+ "num_tokens": 21443231.0,
3620
+ "step": 3610
3621
+ },
3622
+ {
3623
+ "entropy": 1.7644565671682357,
3624
+ "epoch": 6.220073071136901,
3625
+ "grad_norm": 0.9153981804847717,
3626
+ "learning_rate": 5.524e-05,
3627
+ "loss": 1.8082721710205079,
3628
+ "mean_token_accuracy": 0.6671201888471842,
3629
+ "num_tokens": 21499309.0,
3630
+ "step": 3620
3631
+ },
3632
+ {
3633
+ "entropy": 1.7211210913956165,
3634
+ "epoch": 6.237266279819472,
3635
+ "grad_norm": 0.8166586756706238,
3636
+ "learning_rate": 5.4840000000000003e-05,
3637
+ "loss": 1.769371795654297,
3638
+ "mean_token_accuracy": 0.6694241009652615,
3639
+ "num_tokens": 21558565.0,
3640
+ "step": 3630
3641
+ },
3642
+ {
3643
+ "entropy": 1.7693689942359925,
3644
+ "epoch": 6.254459488502041,
3645
+ "grad_norm": 0.7773623466491699,
3646
+ "learning_rate": 5.444e-05,
3647
+ "loss": 1.848412322998047,
3648
+ "mean_token_accuracy": 0.66685731112957,
3649
+ "num_tokens": 21618504.0,
3650
+ "step": 3640
3651
+ },
3652
+ {
3653
+ "entropy": 1.8090675905346871,
3654
+ "epoch": 6.271652697184612,
3655
+ "grad_norm": 0.9420453310012817,
3656
+ "learning_rate": 5.4040000000000004e-05,
3657
+ "loss": 1.8266836166381837,
3658
+ "mean_token_accuracy": 0.6643423162400722,
3659
+ "num_tokens": 21676861.0,
3660
+ "step": 3650
3661
+ },
3662
+ {
3663
+ "entropy": 1.7340097561478616,
3664
+ "epoch": 6.288845905867182,
3665
+ "grad_norm": 0.805880069732666,
3666
+ "learning_rate": 5.364e-05,
3667
+ "loss": 1.7760274887084961,
3668
+ "mean_token_accuracy": 0.6729184173047542,
3669
+ "num_tokens": 21734874.0,
3670
+ "step": 3660
3671
+ },
3672
+ {
3673
+ "entropy": 1.733542764186859,
3674
+ "epoch": 6.306039114549753,
3675
+ "grad_norm": 0.7459798455238342,
3676
+ "learning_rate": 5.324e-05,
3677
+ "loss": 1.7874065399169923,
3678
+ "mean_token_accuracy": 0.6733234331011773,
3679
+ "num_tokens": 21797467.0,
3680
+ "step": 3670
3681
+ },
3682
+ {
3683
+ "entropy": 1.6855479300022125,
3684
+ "epoch": 6.3232323232323235,
3685
+ "grad_norm": 0.7362611889839172,
3686
+ "learning_rate": 5.284e-05,
3687
+ "loss": 1.7557338714599608,
3688
+ "mean_token_accuracy": 0.6742986045777798,
3689
+ "num_tokens": 21856704.0,
3690
+ "step": 3680
3691
+ },
3692
+ {
3693
+ "entropy": 1.762756396830082,
3694
+ "epoch": 6.340425531914893,
3695
+ "grad_norm": 0.8349901437759399,
3696
+ "learning_rate": 5.244e-05,
3697
+ "loss": 1.784174346923828,
3698
+ "mean_token_accuracy": 0.6732991166412831,
3699
+ "num_tokens": 21915781.0,
3700
+ "step": 3690
3701
+ },
3702
+ {
3703
+ "entropy": 1.7664957396686076,
3704
+ "epoch": 6.357618740597464,
3705
+ "grad_norm": 0.8295337557792664,
3706
+ "learning_rate": 5.204e-05,
3707
+ "loss": 1.8338695526123048,
3708
+ "mean_token_accuracy": 0.6659718155860901,
3709
+ "num_tokens": 21973568.0,
3710
+ "step": 3700
3711
+ },
3712
+ {
3713
+ "entropy": 1.7744196206331253,
3714
+ "epoch": 6.374811949280034,
3715
+ "grad_norm": 0.739115297794342,
3716
+ "learning_rate": 5.164e-05,
3717
+ "loss": 1.8148929595947265,
3718
+ "mean_token_accuracy": 0.6660460762679576,
3719
+ "num_tokens": 22032979.0,
3720
+ "step": 3710
3721
+ },
3722
+ {
3723
+ "entropy": 1.7459667712450027,
3724
+ "epoch": 6.392005157962605,
3725
+ "grad_norm": 0.7716593146324158,
3726
+ "learning_rate": 5.124e-05,
3727
+ "loss": 1.8079204559326172,
3728
+ "mean_token_accuracy": 0.66551748290658,
3729
+ "num_tokens": 22092283.0,
3730
+ "step": 3720
3731
+ },
3732
+ {
3733
+ "entropy": 1.7491293936967849,
3734
+ "epoch": 6.4091983666451755,
3735
+ "grad_norm": 0.8270374536514282,
3736
+ "learning_rate": 5.084e-05,
3737
+ "loss": 1.8020380020141602,
3738
+ "mean_token_accuracy": 0.6673273537307978,
3739
+ "num_tokens": 22150667.0,
3740
+ "step": 3730
3741
+ },
3742
+ {
3743
+ "entropy": 1.6887403331696986,
3744
+ "epoch": 6.426391575327745,
3745
+ "grad_norm": 0.8306758403778076,
3746
+ "learning_rate": 5.044e-05,
3747
+ "loss": 1.7328964233398438,
3748
+ "mean_token_accuracy": 0.676455694437027,
3749
+ "num_tokens": 22211170.0,
3750
+ "step": 3740
3751
+ },
3752
+ {
3753
+ "entropy": 1.8332835257053375,
3754
+ "epoch": 6.443584784010316,
3755
+ "grad_norm": 0.8369497656822205,
3756
+ "learning_rate": 5.0039999999999995e-05,
3757
+ "loss": 1.913273239135742,
3758
+ "mean_token_accuracy": 0.656198850646615,
3759
+ "num_tokens": 22269928.0,
3760
+ "step": 3750
3761
+ },
3762
+ {
3763
+ "entropy": 1.6914366707205772,
3764
+ "epoch": 6.460777992692886,
3765
+ "grad_norm": 0.7562059164047241,
3766
+ "learning_rate": 4.9640000000000006e-05,
3767
+ "loss": 1.7506240844726562,
3768
+ "mean_token_accuracy": 0.67936124317348,
3769
+ "num_tokens": 22328611.0,
3770
+ "step": 3760
3771
+ },
3772
+ {
3773
+ "entropy": 1.7604179099202155,
3774
+ "epoch": 6.477971201375457,
3775
+ "grad_norm": 0.7541300058364868,
3776
+ "learning_rate": 4.924e-05,
3777
+ "loss": 1.8065948486328125,
3778
+ "mean_token_accuracy": 0.6697364591062069,
3779
+ "num_tokens": 22389219.0,
3780
+ "step": 3770
3781
+ },
3782
+ {
3783
+ "entropy": 1.731757602095604,
3784
+ "epoch": 6.4951644100580275,
3785
+ "grad_norm": 0.8319364190101624,
3786
+ "learning_rate": 4.884e-05,
3787
+ "loss": 1.7902181625366211,
3788
+ "mean_token_accuracy": 0.6673447206616402,
3789
+ "num_tokens": 22449858.0,
3790
+ "step": 3780
3791
+ },
3792
+ {
3793
+ "entropy": 1.7152166068553925,
3794
+ "epoch": 6.512357618740597,
3795
+ "grad_norm": 0.8575091361999512,
3796
+ "learning_rate": 4.8440000000000004e-05,
3797
+ "loss": 1.7424659729003906,
3798
+ "mean_token_accuracy": 0.6707747709006071,
3799
+ "num_tokens": 22509375.0,
3800
+ "step": 3790
3801
+ },
3802
+ {
3803
+ "entropy": 1.6641680032014847,
3804
+ "epoch": 6.529550827423168,
3805
+ "grad_norm": 0.7516652345657349,
3806
+ "learning_rate": 4.804e-05,
3807
+ "loss": 1.6937873840332032,
3808
+ "mean_token_accuracy": 0.6811798132956028,
3809
+ "num_tokens": 22566440.0,
3810
+ "step": 3800
3811
+ },
3812
+ {
3813
+ "entropy": 1.7551555022597314,
3814
+ "epoch": 6.546744036105738,
3815
+ "grad_norm": 0.817863941192627,
3816
+ "learning_rate": 4.7640000000000005e-05,
3817
+ "loss": 1.8282489776611328,
3818
+ "mean_token_accuracy": 0.6655839093029499,
3819
+ "num_tokens": 22627900.0,
3820
+ "step": 3810
3821
+ },
3822
+ {
3823
+ "entropy": 1.7025569766759872,
3824
+ "epoch": 6.563937244788309,
3825
+ "grad_norm": 0.757764458656311,
3826
+ "learning_rate": 4.724e-05,
3827
+ "loss": 1.7325496673583984,
3828
+ "mean_token_accuracy": 0.6785391330718994,
3829
+ "num_tokens": 22685738.0,
3830
+ "step": 3820
3831
+ },
3832
+ {
3833
+ "entropy": 1.699775031208992,
3834
+ "epoch": 6.5811304534708785,
3835
+ "grad_norm": 0.7960421442985535,
3836
+ "learning_rate": 4.684e-05,
3837
+ "loss": 1.7602745056152345,
3838
+ "mean_token_accuracy": 0.6698532458394766,
3839
+ "num_tokens": 22745696.0,
3840
+ "step": 3830
3841
+ },
3842
+ {
3843
+ "entropy": 1.8100605458021164,
3844
+ "epoch": 6.598323662153449,
3845
+ "grad_norm": 0.8477244973182678,
3846
+ "learning_rate": 4.644e-05,
3847
+ "loss": 1.8226333618164063,
3848
+ "mean_token_accuracy": 0.6646727129817009,
3849
+ "num_tokens": 22805783.0,
3850
+ "step": 3840
3851
+ },
3852
+ {
3853
+ "entropy": 1.7685839846730231,
3854
+ "epoch": 6.61551687083602,
3855
+ "grad_norm": 0.7853493690490723,
3856
+ "learning_rate": 4.604e-05,
3857
+ "loss": 1.8230281829833985,
3858
+ "mean_token_accuracy": 0.664577030390501,
3859
+ "num_tokens": 22866822.0,
3860
+ "step": 3850
3861
+ },
3862
+ {
3863
+ "entropy": 1.7810854628682136,
3864
+ "epoch": 6.63271007951859,
3865
+ "grad_norm": 0.7139444351196289,
3866
+ "learning_rate": 4.564e-05,
3867
+ "loss": 1.855198287963867,
3868
+ "mean_token_accuracy": 0.6652711797505617,
3869
+ "num_tokens": 22928790.0,
3870
+ "step": 3860
3871
+ },
3872
+ {
3873
+ "entropy": 1.7815292954444886,
3874
+ "epoch": 6.649903288201161,
3875
+ "grad_norm": 0.7039018869400024,
3876
+ "learning_rate": 4.524000000000001e-05,
3877
+ "loss": 1.845859909057617,
3878
+ "mean_token_accuracy": 0.6595252249389887,
3879
+ "num_tokens": 22990170.0,
3880
+ "step": 3870
3881
+ },
3882
+ {
3883
+ "entropy": 1.7107908308506012,
3884
+ "epoch": 6.667096496883731,
3885
+ "grad_norm": 0.7651708126068115,
3886
+ "learning_rate": 4.4840000000000004e-05,
3887
+ "loss": 1.7340824127197265,
3888
+ "mean_token_accuracy": 0.6750431463122368,
3889
+ "num_tokens": 23047902.0,
3890
+ "step": 3880
3891
+ },
3892
+ {
3893
+ "entropy": 1.7069460928440094,
3894
+ "epoch": 6.684289705566301,
3895
+ "grad_norm": 0.7385950088500977,
3896
+ "learning_rate": 4.444e-05,
3897
+ "loss": 1.758881187438965,
3898
+ "mean_token_accuracy": 0.6745327576994896,
3899
+ "num_tokens": 23112106.0,
3900
+ "step": 3890
3901
+ },
3902
+ {
3903
+ "entropy": 1.821124967932701,
3904
+ "epoch": 6.701482914248872,
3905
+ "grad_norm": 0.7827627658843994,
3906
+ "learning_rate": 4.4040000000000005e-05,
3907
+ "loss": 1.913480567932129,
3908
+ "mean_token_accuracy": 0.6593531377613544,
3909
+ "num_tokens": 23170056.0,
3910
+ "step": 3900
3911
+ },
3912
+ {
3913
+ "entropy": 1.7924881175160408,
3914
+ "epoch": 6.718676122931442,
3915
+ "grad_norm": 0.8166612386703491,
3916
+ "learning_rate": 4.364e-05,
3917
+ "loss": 1.855017852783203,
3918
+ "mean_token_accuracy": 0.6593458168208599,
3919
+ "num_tokens": 23228582.0,
3920
+ "step": 3910
3921
+ },
3922
+ {
3923
+ "entropy": 1.736910080909729,
3924
+ "epoch": 6.735869331614013,
3925
+ "grad_norm": 0.779629647731781,
3926
+ "learning_rate": 4.324e-05,
3927
+ "loss": 1.7581821441650392,
3928
+ "mean_token_accuracy": 0.6779871381819248,
3929
+ "num_tokens": 23288702.0,
3930
+ "step": 3920
3931
+ },
3932
+ {
3933
+ "entropy": 1.6776573412120341,
3934
+ "epoch": 6.7530625402965825,
3935
+ "grad_norm": 0.7625913619995117,
3936
+ "learning_rate": 4.284e-05,
3937
+ "loss": 1.7102031707763672,
3938
+ "mean_token_accuracy": 0.6794889360666275,
3939
+ "num_tokens": 23349004.0,
3940
+ "step": 3930
3941
+ },
3942
+ {
3943
+ "entropy": 1.8100020587444305,
3944
+ "epoch": 6.770255748979153,
3945
+ "grad_norm": 0.7499405145645142,
3946
+ "learning_rate": 4.244e-05,
3947
+ "loss": 1.8514158248901367,
3948
+ "mean_token_accuracy": 0.6620845705270767,
3949
+ "num_tokens": 23410874.0,
3950
+ "step": 3940
3951
+ },
3952
+ {
3953
+ "entropy": 1.697011759877205,
3954
+ "epoch": 6.787448957661724,
3955
+ "grad_norm": 0.736323893070221,
3956
+ "learning_rate": 4.2040000000000004e-05,
3957
+ "loss": 1.7609180450439452,
3958
+ "mean_token_accuracy": 0.6772994473576546,
3959
+ "num_tokens": 23472518.0,
3960
+ "step": 3950
3961
+ },
3962
+ {
3963
+ "entropy": 1.764576494693756,
3964
+ "epoch": 6.804642166344294,
3965
+ "grad_norm": 0.8523833751678467,
3966
+ "learning_rate": 4.164e-05,
3967
+ "loss": 1.81484375,
3968
+ "mean_token_accuracy": 0.6644324712455273,
3969
+ "num_tokens": 23531203.0,
3970
+ "step": 3960
3971
+ },
3972
+ {
3973
+ "entropy": 1.7241224959492683,
3974
+ "epoch": 6.821835375026865,
3975
+ "grad_norm": 0.8820350766181946,
3976
+ "learning_rate": 4.124e-05,
3977
+ "loss": 1.739130401611328,
3978
+ "mean_token_accuracy": 0.6771424360573292,
3979
+ "num_tokens": 23590289.0,
3980
+ "step": 3970
3981
+ },
3982
+ {
3983
+ "entropy": 1.6967746496200562,
3984
+ "epoch": 6.8390285837094345,
3985
+ "grad_norm": 0.8161067962646484,
3986
+ "learning_rate": 4.084e-05,
3987
+ "loss": 1.7659534454345702,
3988
+ "mean_token_accuracy": 0.6744477659463882,
3989
+ "num_tokens": 23647985.0,
3990
+ "step": 3980
3991
+ },
3992
+ {
3993
+ "entropy": 1.8578275874257089,
3994
+ "epoch": 6.856221792392005,
3995
+ "grad_norm": 0.778160810470581,
3996
+ "learning_rate": 4.044e-05,
3997
+ "loss": 1.9046249389648438,
3998
+ "mean_token_accuracy": 0.6525318272411823,
3999
+ "num_tokens": 23707387.0,
4000
+ "step": 3990
4001
+ },
4002
+ {
4003
+ "entropy": 1.781902502477169,
4004
+ "epoch": 6.873415001074576,
4005
+ "grad_norm": 0.9398592710494995,
4006
+ "learning_rate": 4.004e-05,
4007
+ "loss": 1.8081722259521484,
4008
+ "mean_token_accuracy": 0.6625144556164742,
4009
+ "num_tokens": 23764831.0,
4010
+ "step": 4000
4011
  }
4012
  ],
4013
  "logging_steps": 10,
 
4027
  "attributes": {}
4028
  }
4029
  },
4030
+ "total_flos": 1.951545327353856e+17,
4031
  "train_batch_size": 2,
4032
  "trial_name": null,
4033
  "trial_params": null