mtzig commited on
Commit
0c5bd42
·
verified ·
1 Parent(s): 6bc3624

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2b93ad52642a85b05dfeb12298d5fc96031e28deca183e4c2c08dee55390519
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bdd41feecab147e3702fc9a09ae4f3759d0fd2eaf5075a4794809ea853b57dd
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2efb917810cc2f95e9f9ce9dbedd055ab564155034f3abefbe6e2c2aba3ca26d
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f2060b9ac678bc735c6aa21c657746c6a3eb4a708a65a556806c9e6a69997c6
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1074fb3e5da840427a8c2bea51f33775455c187a3952a36a214f784d7d2276f
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebff2e905977f2cd428ca1b24f49468350b74d9cb1dcc0461fc80243ceab581f
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:125e0693505ee7832ee7ccc24457433cbae25876a3094529f034b3e75b927697
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49631406cb95eb8f8daf261c6428aa3afdceb3aad5aba318de0a43ed481d41ef
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a160bd7b9e08e738f42a7a457989e3c22a3b5edc1387393e647818565d70087c
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a376864c68cd8140d4a8c8d1313d2085fad00b426c9d7c50f67377d9024c806b
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ecc75e35548f117f565dab4461f84d0eafe694b50abee164ab9bef5eab3b4ee4
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9094a7c70774e5fb0f079bf19e64d3aeedab732ea28f3fdbd261beb1471e797d
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ae5b867cd013595180bd2ccabeca150c9cfa9878165ccfecdfd9293bc574562
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91267c676fe033603ed303ba35beb8e62450b6e31dad2048d004a26b33530e19
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a70ec7443f662da848c4b8210fe462d74b2d7a68b35699dc79e5bd86cb1eae2
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3945204a1afc847a21741b668c19c312b7ccf92d58c6fff63a70c9d4b3319dfb
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c76de5da5afcc5abc533d34acaa4a5f75726ae0dddef8d03dce20b4b0bb16b26
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:076df3ee78299c9288fa283b2488d4715dfacbcac9821b1d2a66d28531d8f15a
3
  size 15088
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe6768c650c318d7513b359980365700ed7262b73426a2846102802665f07202
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b361c84c477f7c6235201218f1bc29b4f72aab9464107d1a89dee126263148f
3
  size 15088
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:984877f9215c7cb40eee3a9c80335cc677560aa6c451b66bba02e1f3bd6a640f
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9e69d08de85b535829406b9c04296fe936a0eae9bdb9304ab3a2b07dbf3dab6
3
  size 15088
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61f55f74596ac7956ac8ae93c9417eb4717b8e1452599a0c994329e13b642c3c
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de91bcd34728c2f7b26ffa826160ccbb9c4c2a7079f6d8d402251e4d4d317506
3
  size 15088
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:976dc23b255118d6686ca07b5635c96d50eb321a91532eac8dd124ba2740fad5
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60135211e354b13f48a59683d9bc3cfc0bb1db62799fa50234aa03cb59f58850
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.423728813559322,
5
  "eval_steps": 20,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3819,6 +3819,766 @@
3819
  "eval_samples_per_second": 5.6,
3820
  "eval_steps_per_second": 0.192,
3821
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3822
  }
3823
  ],
3824
  "logging_steps": 1,
@@ -3838,7 +4598,7 @@
3838
  "attributes": {}
3839
  }
3840
  },
3841
- "total_flos": 1.5414191977188557e+17,
3842
  "train_batch_size": 8,
3843
  "trial_name": null,
3844
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5084745762711864,
5
  "eval_steps": 20,
6
+ "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3819
  "eval_samples_per_second": 5.6,
3820
  "eval_steps_per_second": 0.192,
3821
  "step": 500
3822
+ },
3823
+ {
3824
+ "epoch": 0.4245762711864407,
3825
+ "grad_norm": 3.1827445030212402,
3826
+ "learning_rate": 1.4239583120967125e-05,
3827
+ "loss": 0.0485,
3828
+ "step": 501
3829
+ },
3830
+ {
3831
+ "epoch": 0.42542372881355933,
3832
+ "grad_norm": 1.7482115030288696,
3833
+ "learning_rate": 1.4212772868165957e-05,
3834
+ "loss": 0.0151,
3835
+ "step": 502
3836
+ },
3837
+ {
3838
+ "epoch": 0.42627118644067796,
3839
+ "grad_norm": 1.0564489364624023,
3840
+ "learning_rate": 1.418592575000813e-05,
3841
+ "loss": 0.007,
3842
+ "step": 503
3843
+ },
3844
+ {
3845
+ "epoch": 0.4271186440677966,
3846
+ "grad_norm": 1.7804484367370605,
3847
+ "learning_rate": 1.4159042001428827e-05,
3848
+ "loss": 0.0103,
3849
+ "step": 504
3850
+ },
3851
+ {
3852
+ "epoch": 0.4279661016949153,
3853
+ "grad_norm": 0.9321497082710266,
3854
+ "learning_rate": 1.4132121857683782e-05,
3855
+ "loss": 0.0152,
3856
+ "step": 505
3857
+ },
3858
+ {
3859
+ "epoch": 0.4288135593220339,
3860
+ "grad_norm": 0.712197482585907,
3861
+ "learning_rate": 1.4105165554347227e-05,
3862
+ "loss": 0.0035,
3863
+ "step": 506
3864
+ },
3865
+ {
3866
+ "epoch": 0.42966101694915254,
3867
+ "grad_norm": 0.5874913930892944,
3868
+ "learning_rate": 1.4078173327309807e-05,
3869
+ "loss": 0.0045,
3870
+ "step": 507
3871
+ },
3872
+ {
3873
+ "epoch": 0.43050847457627117,
3874
+ "grad_norm": 1.6754646301269531,
3875
+ "learning_rate": 1.4051145412776536e-05,
3876
+ "loss": 0.0134,
3877
+ "step": 508
3878
+ },
3879
+ {
3880
+ "epoch": 0.43135593220338986,
3881
+ "grad_norm": 0.6901799440383911,
3882
+ "learning_rate": 1.4024082047264729e-05,
3883
+ "loss": 0.0049,
3884
+ "step": 509
3885
+ },
3886
+ {
3887
+ "epoch": 0.4322033898305085,
3888
+ "grad_norm": 1.6095439195632935,
3889
+ "learning_rate": 1.3996983467601921e-05,
3890
+ "loss": 0.0113,
3891
+ "step": 510
3892
+ },
3893
+ {
3894
+ "epoch": 0.4330508474576271,
3895
+ "grad_norm": 2.4264211654663086,
3896
+ "learning_rate": 1.3969849910923803e-05,
3897
+ "loss": 0.0281,
3898
+ "step": 511
3899
+ },
3900
+ {
3901
+ "epoch": 0.43389830508474575,
3902
+ "grad_norm": 0.8501647114753723,
3903
+ "learning_rate": 1.3942681614672144e-05,
3904
+ "loss": 0.0042,
3905
+ "step": 512
3906
+ },
3907
+ {
3908
+ "epoch": 0.4347457627118644,
3909
+ "grad_norm": 1.2504135370254517,
3910
+ "learning_rate": 1.3915478816592715e-05,
3911
+ "loss": 0.0069,
3912
+ "step": 513
3913
+ },
3914
+ {
3915
+ "epoch": 0.43559322033898307,
3916
+ "grad_norm": 0.24446320533752441,
3917
+ "learning_rate": 1.388824175473321e-05,
3918
+ "loss": 0.0013,
3919
+ "step": 514
3920
+ },
3921
+ {
3922
+ "epoch": 0.4364406779661017,
3923
+ "grad_norm": 2.65307354927063,
3924
+ "learning_rate": 1.3860970667441147e-05,
3925
+ "loss": 0.0194,
3926
+ "step": 515
3927
+ },
3928
+ {
3929
+ "epoch": 0.43728813559322033,
3930
+ "grad_norm": 1.6779959201812744,
3931
+ "learning_rate": 1.3833665793361816e-05,
3932
+ "loss": 0.0159,
3933
+ "step": 516
3934
+ },
3935
+ {
3936
+ "epoch": 0.43813559322033896,
3937
+ "grad_norm": 1.3321099281311035,
3938
+ "learning_rate": 1.3806327371436159e-05,
3939
+ "loss": 0.0118,
3940
+ "step": 517
3941
+ },
3942
+ {
3943
+ "epoch": 0.43898305084745765,
3944
+ "grad_norm": 0.5509894490242004,
3945
+ "learning_rate": 1.3778955640898686e-05,
3946
+ "loss": 0.0026,
3947
+ "step": 518
3948
+ },
3949
+ {
3950
+ "epoch": 0.4398305084745763,
3951
+ "grad_norm": 1.3254811763763428,
3952
+ "learning_rate": 1.3751550841275394e-05,
3953
+ "loss": 0.0135,
3954
+ "step": 519
3955
+ },
3956
+ {
3957
+ "epoch": 0.4406779661016949,
3958
+ "grad_norm": 1.8219112157821655,
3959
+ "learning_rate": 1.372411321238166e-05,
3960
+ "loss": 0.0139,
3961
+ "step": 520
3962
+ },
3963
+ {
3964
+ "epoch": 0.4406779661016949,
3965
+ "eval_accuracy": 1.0,
3966
+ "eval_f1": 1.0,
3967
+ "eval_loss": 0.0005393382161855698,
3968
+ "eval_precision": 1.0,
3969
+ "eval_recall": 1.0,
3970
+ "eval_runtime": 50.6981,
3971
+ "eval_samples_per_second": 5.76,
3972
+ "eval_steps_per_second": 0.197,
3973
+ "step": 520
3974
+ },
3975
+ {
3976
+ "epoch": 0.44152542372881354,
3977
+ "grad_norm": 1.3557311296463013,
3978
+ "learning_rate": 1.3696642994320146e-05,
3979
+ "loss": 0.0138,
3980
+ "step": 521
3981
+ },
3982
+ {
3983
+ "epoch": 0.4423728813559322,
3984
+ "grad_norm": 0.22216911613941193,
3985
+ "learning_rate": 1.3669140427478693e-05,
3986
+ "loss": 0.0015,
3987
+ "step": 522
3988
+ },
3989
+ {
3990
+ "epoch": 0.44322033898305085,
3991
+ "grad_norm": 0.794339120388031,
3992
+ "learning_rate": 1.3641605752528225e-05,
3993
+ "loss": 0.0082,
3994
+ "step": 523
3995
+ },
3996
+ {
3997
+ "epoch": 0.4440677966101695,
3998
+ "grad_norm": 1.4078831672668457,
3999
+ "learning_rate": 1.3614039210420638e-05,
4000
+ "loss": 0.0057,
4001
+ "step": 524
4002
+ },
4003
+ {
4004
+ "epoch": 0.4449152542372881,
4005
+ "grad_norm": 2.3533213138580322,
4006
+ "learning_rate": 1.3586441042386694e-05,
4007
+ "loss": 0.0198,
4008
+ "step": 525
4009
+ },
4010
+ {
4011
+ "epoch": 0.4457627118644068,
4012
+ "grad_norm": 3.0721018314361572,
4013
+ "learning_rate": 1.3558811489933909e-05,
4014
+ "loss": 0.0254,
4015
+ "step": 526
4016
+ },
4017
+ {
4018
+ "epoch": 0.44661016949152543,
4019
+ "grad_norm": 1.831836223602295,
4020
+ "learning_rate": 1.353115079484444e-05,
4021
+ "loss": 0.0169,
4022
+ "step": 527
4023
+ },
4024
+ {
4025
+ "epoch": 0.44745762711864406,
4026
+ "grad_norm": 1.5127922296524048,
4027
+ "learning_rate": 1.3503459199172969e-05,
4028
+ "loss": 0.0075,
4029
+ "step": 528
4030
+ },
4031
+ {
4032
+ "epoch": 0.4483050847457627,
4033
+ "grad_norm": 2.0667357444763184,
4034
+ "learning_rate": 1.3475736945244575e-05,
4035
+ "loss": 0.0122,
4036
+ "step": 529
4037
+ },
4038
+ {
4039
+ "epoch": 0.4491525423728814,
4040
+ "grad_norm": 0.386496365070343,
4041
+ "learning_rate": 1.3447984275652638e-05,
4042
+ "loss": 0.0023,
4043
+ "step": 530
4044
+ },
4045
+ {
4046
+ "epoch": 0.45,
4047
+ "grad_norm": 0.4718012511730194,
4048
+ "learning_rate": 1.342020143325669e-05,
4049
+ "loss": 0.0034,
4050
+ "step": 531
4051
+ },
4052
+ {
4053
+ "epoch": 0.45084745762711864,
4054
+ "grad_norm": 2.439950466156006,
4055
+ "learning_rate": 1.3392388661180303e-05,
4056
+ "loss": 0.0186,
4057
+ "step": 532
4058
+ },
4059
+ {
4060
+ "epoch": 0.4516949152542373,
4061
+ "grad_norm": 0.8340148329734802,
4062
+ "learning_rate": 1.3364546202808966e-05,
4063
+ "loss": 0.0067,
4064
+ "step": 533
4065
+ },
4066
+ {
4067
+ "epoch": 0.45254237288135596,
4068
+ "grad_norm": 2.2126450538635254,
4069
+ "learning_rate": 1.3336674301787942e-05,
4070
+ "loss": 0.0187,
4071
+ "step": 534
4072
+ },
4073
+ {
4074
+ "epoch": 0.4533898305084746,
4075
+ "grad_norm": 0.4377054274082184,
4076
+ "learning_rate": 1.330877320202014e-05,
4077
+ "loss": 0.0028,
4078
+ "step": 535
4079
+ },
4080
+ {
4081
+ "epoch": 0.4542372881355932,
4082
+ "grad_norm": 1.3909966945648193,
4083
+ "learning_rate": 1.3280843147663988e-05,
4084
+ "loss": 0.0131,
4085
+ "step": 536
4086
+ },
4087
+ {
4088
+ "epoch": 0.45508474576271185,
4089
+ "grad_norm": 0.3523927330970764,
4090
+ "learning_rate": 1.325288438313129e-05,
4091
+ "loss": 0.0016,
4092
+ "step": 537
4093
+ },
4094
+ {
4095
+ "epoch": 0.4559322033898305,
4096
+ "grad_norm": 3.0238094329833984,
4097
+ "learning_rate": 1.322489715308509e-05,
4098
+ "loss": 0.0175,
4099
+ "step": 538
4100
+ },
4101
+ {
4102
+ "epoch": 0.45677966101694917,
4103
+ "grad_norm": 1.9963982105255127,
4104
+ "learning_rate": 1.3196881702437525e-05,
4105
+ "loss": 0.0181,
4106
+ "step": 539
4107
+ },
4108
+ {
4109
+ "epoch": 0.4576271186440678,
4110
+ "grad_norm": 1.146183729171753,
4111
+ "learning_rate": 1.3168838276347691e-05,
4112
+ "loss": 0.0161,
4113
+ "step": 540
4114
+ },
4115
+ {
4116
+ "epoch": 0.4576271186440678,
4117
+ "eval_accuracy": 1.0,
4118
+ "eval_f1": 1.0,
4119
+ "eval_loss": 0.0002900932158809155,
4120
+ "eval_precision": 1.0,
4121
+ "eval_recall": 1.0,
4122
+ "eval_runtime": 51.127,
4123
+ "eval_samples_per_second": 5.711,
4124
+ "eval_steps_per_second": 0.196,
4125
+ "step": 540
4126
+ },
4127
+ {
4128
+ "epoch": 0.45847457627118643,
4129
+ "grad_norm": 1.0082005262374878,
4130
+ "learning_rate": 1.314076712021949e-05,
4131
+ "loss": 0.0081,
4132
+ "step": 541
4133
+ },
4134
+ {
4135
+ "epoch": 0.45932203389830506,
4136
+ "grad_norm": 0.8905977606773376,
4137
+ "learning_rate": 1.3112668479699486e-05,
4138
+ "loss": 0.0037,
4139
+ "step": 542
4140
+ },
4141
+ {
4142
+ "epoch": 0.46016949152542375,
4143
+ "grad_norm": 1.1065611839294434,
4144
+ "learning_rate": 1.3084542600674756e-05,
4145
+ "loss": 0.0048,
4146
+ "step": 543
4147
+ },
4148
+ {
4149
+ "epoch": 0.4610169491525424,
4150
+ "grad_norm": 1.939026951789856,
4151
+ "learning_rate": 1.305638972927074e-05,
4152
+ "loss": 0.009,
4153
+ "step": 544
4154
+ },
4155
+ {
4156
+ "epoch": 0.461864406779661,
4157
+ "grad_norm": 2.5424704551696777,
4158
+ "learning_rate": 1.3028210111849079e-05,
4159
+ "loss": 0.0121,
4160
+ "step": 545
4161
+ },
4162
+ {
4163
+ "epoch": 0.46271186440677964,
4164
+ "grad_norm": 0.8510853052139282,
4165
+ "learning_rate": 1.3000003995005462e-05,
4166
+ "loss": 0.0053,
4167
+ "step": 546
4168
+ },
4169
+ {
4170
+ "epoch": 0.4635593220338983,
4171
+ "grad_norm": 4.978145599365234,
4172
+ "learning_rate": 1.297177162556748e-05,
4173
+ "loss": 0.0365,
4174
+ "step": 547
4175
+ },
4176
+ {
4177
+ "epoch": 0.46440677966101696,
4178
+ "grad_norm": 1.0519256591796875,
4179
+ "learning_rate": 1.294351325059245e-05,
4180
+ "loss": 0.0046,
4181
+ "step": 548
4182
+ },
4183
+ {
4184
+ "epoch": 0.4652542372881356,
4185
+ "grad_norm": 2.549281120300293,
4186
+ "learning_rate": 1.291522911736526e-05,
4187
+ "loss": 0.0192,
4188
+ "step": 549
4189
+ },
4190
+ {
4191
+ "epoch": 0.4661016949152542,
4192
+ "grad_norm": 1.8708372116088867,
4193
+ "learning_rate": 1.2886919473396212e-05,
4194
+ "loss": 0.0121,
4195
+ "step": 550
4196
+ },
4197
+ {
4198
+ "epoch": 0.4669491525423729,
4199
+ "grad_norm": 3.6299774646759033,
4200
+ "learning_rate": 1.2858584566418837e-05,
4201
+ "loss": 0.0233,
4202
+ "step": 551
4203
+ },
4204
+ {
4205
+ "epoch": 0.46779661016949153,
4206
+ "grad_norm": 1.1099425554275513,
4207
+ "learning_rate": 1.2830224644387742e-05,
4208
+ "loss": 0.0084,
4209
+ "step": 552
4210
+ },
4211
+ {
4212
+ "epoch": 0.46864406779661016,
4213
+ "grad_norm": 3.3964803218841553,
4214
+ "learning_rate": 1.2801839955476444e-05,
4215
+ "loss": 0.0413,
4216
+ "step": 553
4217
+ },
4218
+ {
4219
+ "epoch": 0.4694915254237288,
4220
+ "grad_norm": 2.0021309852600098,
4221
+ "learning_rate": 1.277343074807519e-05,
4222
+ "loss": 0.0144,
4223
+ "step": 554
4224
+ },
4225
+ {
4226
+ "epoch": 0.4703389830508475,
4227
+ "grad_norm": 2.881606340408325,
4228
+ "learning_rate": 1.2744997270788777e-05,
4229
+ "loss": 0.0354,
4230
+ "step": 555
4231
+ },
4232
+ {
4233
+ "epoch": 0.4711864406779661,
4234
+ "grad_norm": 0.47970932722091675,
4235
+ "learning_rate": 1.2716539772434389e-05,
4236
+ "loss": 0.0022,
4237
+ "step": 556
4238
+ },
4239
+ {
4240
+ "epoch": 0.47203389830508474,
4241
+ "grad_norm": 0.41695287823677063,
4242
+ "learning_rate": 1.2688058502039416e-05,
4243
+ "loss": 0.0023,
4244
+ "step": 557
4245
+ },
4246
+ {
4247
+ "epoch": 0.4728813559322034,
4248
+ "grad_norm": 2.176729440689087,
4249
+ "learning_rate": 1.2659553708839273e-05,
4250
+ "loss": 0.0271,
4251
+ "step": 558
4252
+ },
4253
+ {
4254
+ "epoch": 0.47372881355932206,
4255
+ "grad_norm": 0.9656859636306763,
4256
+ "learning_rate": 1.2631025642275212e-05,
4257
+ "loss": 0.0075,
4258
+ "step": 559
4259
+ },
4260
+ {
4261
+ "epoch": 0.4745762711864407,
4262
+ "grad_norm": 0.5131775140762329,
4263
+ "learning_rate": 1.2602474551992165e-05,
4264
+ "loss": 0.0027,
4265
+ "step": 560
4266
+ },
4267
+ {
4268
+ "epoch": 0.4745762711864407,
4269
+ "eval_accuracy": 1.0,
4270
+ "eval_f1": 1.0,
4271
+ "eval_loss": 0.0001880442287074402,
4272
+ "eval_precision": 1.0,
4273
+ "eval_recall": 1.0,
4274
+ "eval_runtime": 50.0747,
4275
+ "eval_samples_per_second": 5.831,
4276
+ "eval_steps_per_second": 0.2,
4277
+ "step": 560
4278
+ },
4279
+ {
4280
+ "epoch": 0.4754237288135593,
4281
+ "grad_norm": 1.1515014171600342,
4282
+ "learning_rate": 1.2573900687836525e-05,
4283
+ "loss": 0.0114,
4284
+ "step": 561
4285
+ },
4286
+ {
4287
+ "epoch": 0.47627118644067795,
4288
+ "grad_norm": 0.5253351926803589,
4289
+ "learning_rate": 1.2545304299853977e-05,
4290
+ "loss": 0.0038,
4291
+ "step": 562
4292
+ },
4293
+ {
4294
+ "epoch": 0.47711864406779664,
4295
+ "grad_norm": 0.9620082378387451,
4296
+ "learning_rate": 1.2516685638287318e-05,
4297
+ "loss": 0.0053,
4298
+ "step": 563
4299
+ },
4300
+ {
4301
+ "epoch": 0.47796610169491527,
4302
+ "grad_norm": 0.7079916596412659,
4303
+ "learning_rate": 1.248804495357425e-05,
4304
+ "loss": 0.0038,
4305
+ "step": 564
4306
+ },
4307
+ {
4308
+ "epoch": 0.4788135593220339,
4309
+ "grad_norm": 1.2482668161392212,
4310
+ "learning_rate": 1.2459382496345199e-05,
4311
+ "loss": 0.0042,
4312
+ "step": 565
4313
+ },
4314
+ {
4315
+ "epoch": 0.47966101694915253,
4316
+ "grad_norm": 0.6608754992485046,
4317
+ "learning_rate": 1.2430698517421117e-05,
4318
+ "loss": 0.006,
4319
+ "step": 566
4320
+ },
4321
+ {
4322
+ "epoch": 0.48050847457627116,
4323
+ "grad_norm": 0.9836096167564392,
4324
+ "learning_rate": 1.2401993267811293e-05,
4325
+ "loss": 0.0087,
4326
+ "step": 567
4327
+ },
4328
+ {
4329
+ "epoch": 0.48135593220338985,
4330
+ "grad_norm": 2.097402334213257,
4331
+ "learning_rate": 1.2373266998711152e-05,
4332
+ "loss": 0.0165,
4333
+ "step": 568
4334
+ },
4335
+ {
4336
+ "epoch": 0.4822033898305085,
4337
+ "grad_norm": 0.6762765645980835,
4338
+ "learning_rate": 1.2344519961500048e-05,
4339
+ "loss": 0.0038,
4340
+ "step": 569
4341
+ },
4342
+ {
4343
+ "epoch": 0.4830508474576271,
4344
+ "grad_norm": 0.8983291983604431,
4345
+ "learning_rate": 1.2315752407739093e-05,
4346
+ "loss": 0.0044,
4347
+ "step": 570
4348
+ },
4349
+ {
4350
+ "epoch": 0.48389830508474574,
4351
+ "grad_norm": 1.4723248481750488,
4352
+ "learning_rate": 1.2286964589168917e-05,
4353
+ "loss": 0.0062,
4354
+ "step": 571
4355
+ },
4356
+ {
4357
+ "epoch": 0.4847457627118644,
4358
+ "grad_norm": 1.0354115962982178,
4359
+ "learning_rate": 1.2258156757707496e-05,
4360
+ "loss": 0.0098,
4361
+ "step": 572
4362
+ },
4363
+ {
4364
+ "epoch": 0.48559322033898306,
4365
+ "grad_norm": 0.8155727982521057,
4366
+ "learning_rate": 1.2229329165447931e-05,
4367
+ "loss": 0.0124,
4368
+ "step": 573
4369
+ },
4370
+ {
4371
+ "epoch": 0.4864406779661017,
4372
+ "grad_norm": 2.297771692276001,
4373
+ "learning_rate": 1.220048206465625e-05,
4374
+ "loss": 0.0094,
4375
+ "step": 574
4376
+ },
4377
+ {
4378
+ "epoch": 0.4872881355932203,
4379
+ "grad_norm": 1.3691686391830444,
4380
+ "learning_rate": 1.217161570776919e-05,
4381
+ "loss": 0.0039,
4382
+ "step": 575
4383
+ },
4384
+ {
4385
+ "epoch": 0.488135593220339,
4386
+ "grad_norm": 2.56219744682312,
4387
+ "learning_rate": 1.2142730347392007e-05,
4388
+ "loss": 0.0088,
4389
+ "step": 576
4390
+ },
4391
+ {
4392
+ "epoch": 0.48898305084745763,
4393
+ "grad_norm": 1.2534739971160889,
4394
+ "learning_rate": 1.2113826236296245e-05,
4395
+ "loss": 0.0046,
4396
+ "step": 577
4397
+ },
4398
+ {
4399
+ "epoch": 0.48983050847457626,
4400
+ "grad_norm": 0.3140455186367035,
4401
+ "learning_rate": 1.2084903627417535e-05,
4402
+ "loss": 0.001,
4403
+ "step": 578
4404
+ },
4405
+ {
4406
+ "epoch": 0.4906779661016949,
4407
+ "grad_norm": 2.9673843383789062,
4408
+ "learning_rate": 1.2055962773853379e-05,
4409
+ "loss": 0.0261,
4410
+ "step": 579
4411
+ },
4412
+ {
4413
+ "epoch": 0.4915254237288136,
4414
+ "grad_norm": 1.1575368642807007,
4415
+ "learning_rate": 1.2027003928860936e-05,
4416
+ "loss": 0.0039,
4417
+ "step": 580
4418
+ },
4419
+ {
4420
+ "epoch": 0.4915254237288136,
4421
+ "eval_accuracy": 1.0,
4422
+ "eval_f1": 1.0,
4423
+ "eval_loss": 0.00029336303123272955,
4424
+ "eval_precision": 1.0,
4425
+ "eval_recall": 1.0,
4426
+ "eval_runtime": 51.1721,
4427
+ "eval_samples_per_second": 5.706,
4428
+ "eval_steps_per_second": 0.195,
4429
+ "step": 580
4430
+ },
4431
+ {
4432
+ "epoch": 0.4923728813559322,
4433
+ "grad_norm": 0.5004251003265381,
4434
+ "learning_rate": 1.1998027345854811e-05,
4435
+ "loss": 0.002,
4436
+ "step": 581
4437
+ },
4438
+ {
4439
+ "epoch": 0.49322033898305084,
4440
+ "grad_norm": 1.6102720499038696,
4441
+ "learning_rate": 1.1969033278404816e-05,
4442
+ "loss": 0.0071,
4443
+ "step": 582
4444
+ },
4445
+ {
4446
+ "epoch": 0.4940677966101695,
4447
+ "grad_norm": 0.36238208413124084,
4448
+ "learning_rate": 1.1940021980233784e-05,
4449
+ "loss": 0.0021,
4450
+ "step": 583
4451
+ },
4452
+ {
4453
+ "epoch": 0.49491525423728816,
4454
+ "grad_norm": 1.1167278289794922,
4455
+ "learning_rate": 1.1910993705215323e-05,
4456
+ "loss": 0.0026,
4457
+ "step": 584
4458
+ },
4459
+ {
4460
+ "epoch": 0.4957627118644068,
4461
+ "grad_norm": 1.2431293725967407,
4462
+ "learning_rate": 1.1881948707371609e-05,
4463
+ "loss": 0.0186,
4464
+ "step": 585
4465
+ },
4466
+ {
4467
+ "epoch": 0.4966101694915254,
4468
+ "grad_norm": 2.63661789894104,
4469
+ "learning_rate": 1.1852887240871145e-05,
4470
+ "loss": 0.024,
4471
+ "step": 586
4472
+ },
4473
+ {
4474
+ "epoch": 0.49745762711864405,
4475
+ "grad_norm": 0.9267066717147827,
4476
+ "learning_rate": 1.1823809560026558e-05,
4477
+ "loss": 0.0055,
4478
+ "step": 587
4479
+ },
4480
+ {
4481
+ "epoch": 0.49830508474576274,
4482
+ "grad_norm": 2.153548240661621,
4483
+ "learning_rate": 1.1794715919292368e-05,
4484
+ "loss": 0.0189,
4485
+ "step": 588
4486
+ },
4487
+ {
4488
+ "epoch": 0.49915254237288137,
4489
+ "grad_norm": 1.1179004907608032,
4490
+ "learning_rate": 1.1765606573262745e-05,
4491
+ "loss": 0.0091,
4492
+ "step": 589
4493
+ },
4494
+ {
4495
+ "epoch": 0.5,
4496
+ "grad_norm": 0.7604763507843018,
4497
+ "learning_rate": 1.1736481776669307e-05,
4498
+ "loss": 0.0038,
4499
+ "step": 590
4500
+ },
4501
+ {
4502
+ "epoch": 0.5008474576271187,
4503
+ "grad_norm": 1.1166861057281494,
4504
+ "learning_rate": 1.1707341784378865e-05,
4505
+ "loss": 0.004,
4506
+ "step": 591
4507
+ },
4508
+ {
4509
+ "epoch": 0.5016949152542373,
4510
+ "grad_norm": 2.1006617546081543,
4511
+ "learning_rate": 1.1678186851391218e-05,
4512
+ "loss": 0.0119,
4513
+ "step": 592
4514
+ },
4515
+ {
4516
+ "epoch": 0.502542372881356,
4517
+ "grad_norm": 0.9687687754631042,
4518
+ "learning_rate": 1.1649017232836899e-05,
4519
+ "loss": 0.0031,
4520
+ "step": 593
4521
+ },
4522
+ {
4523
+ "epoch": 0.5033898305084745,
4524
+ "grad_norm": 1.6780524253845215,
4525
+ "learning_rate": 1.1619833183974959e-05,
4526
+ "loss": 0.0095,
4527
+ "step": 594
4528
+ },
4529
+ {
4530
+ "epoch": 0.5042372881355932,
4531
+ "grad_norm": 1.265393614768982,
4532
+ "learning_rate": 1.1590634960190722e-05,
4533
+ "loss": 0.0113,
4534
+ "step": 595
4535
+ },
4536
+ {
4537
+ "epoch": 0.5050847457627119,
4538
+ "grad_norm": 2.153305768966675,
4539
+ "learning_rate": 1.1561422816993555e-05,
4540
+ "loss": 0.021,
4541
+ "step": 596
4542
+ },
4543
+ {
4544
+ "epoch": 0.5059322033898305,
4545
+ "grad_norm": 1.4506266117095947,
4546
+ "learning_rate": 1.1532197010014636e-05,
4547
+ "loss": 0.0091,
4548
+ "step": 597
4549
+ },
4550
+ {
4551
+ "epoch": 0.5067796610169492,
4552
+ "grad_norm": 2.4317758083343506,
4553
+ "learning_rate": 1.1502957795004706e-05,
4554
+ "loss": 0.0171,
4555
+ "step": 598
4556
+ },
4557
+ {
4558
+ "epoch": 0.5076271186440678,
4559
+ "grad_norm": 1.4479554891586304,
4560
+ "learning_rate": 1.1473705427831843e-05,
4561
+ "loss": 0.0055,
4562
+ "step": 599
4563
+ },
4564
+ {
4565
+ "epoch": 0.5084745762711864,
4566
+ "grad_norm": 1.9250199794769287,
4567
+ "learning_rate": 1.1444440164479215e-05,
4568
+ "loss": 0.0067,
4569
+ "step": 600
4570
+ },
4571
+ {
4572
+ "epoch": 0.5084745762711864,
4573
+ "eval_accuracy": 1.0,
4574
+ "eval_f1": 1.0,
4575
+ "eval_loss": 0.00014447586727328598,
4576
+ "eval_precision": 1.0,
4577
+ "eval_recall": 1.0,
4578
+ "eval_runtime": 49.9263,
4579
+ "eval_samples_per_second": 5.849,
4580
+ "eval_steps_per_second": 0.2,
4581
+ "step": 600
4582
  }
4583
  ],
4584
  "logging_steps": 1,
 
4598
  "attributes": {}
4599
  }
4600
  },
4601
+ "total_flos": 1.849773217934213e+17,
4602
  "train_batch_size": 8,
4603
  "trial_name": null,
4604
  "trial_params": null