mtzig commited on
Commit
1aff730
·
verified ·
1 Parent(s): fe4f44e

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f675b8e523776045fff07cc6069a11f316b731b88040c14db9f32fd2ee4cb3fc
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c7ebbcad844382cb0827a7b1f134dbcd4d34137f17f88cff4f0916eb189ebf5
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f144ad0e0078ea0e907fff1577dca201e23acfe6b187950b4207828616263df7
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7112a496d5fcae5f50cfea78f1f333eab9ed1e0c59313fd80a7a7bc132bf69da
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ea5ca8f53b6619679b02cad3a488679df060e795c7845edb2fadcfae284f3c6
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8017e33ea602d361b2a15d6086fc895939152bd60dd6cf18b54db9895e8fa4ab
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71aead7baaa7488a46b28b54b870b312346cd6aeaefc2ee1bca37b7b3c220410
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b44a3e9c1fc45b6852fc5241f9ff5cc13d8e2ff6074f9df68b4d097ed706134
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d47c1fa7dbfc22211ae4d9ef6a93950aecd82aa0597b76b6088d58ab8cebfce
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecc2212000c5fe210e90809b2e44ab52cdbe049455449ffbb43a7aee4a0f7198
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b2ebcfed4a6ec05a623903e4f7a5369ae2dc76493e6791d714d19397c077eb8
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e14dfa71bc73ec24ac571cc78cf5e1386575d6b3eadffbdf1fdb4f7ef2266da
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5346e91abc6e958cbd6a5b276aa1375688bf66b327a1cf44856c6f64608dfbed
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60546e3c3ddce8d7a69cf9166934de67859396cc08b7574b96d474577977c98
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:15d4b2668fb0f0c7aceb12510a0e81b0ac44e7c44b8fc9b9bc407ca80b2baea3
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bae454cfbbeef253a90b9291e2fe802fd4eab12a25818000192a303e71359d4
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7af657b557210fae0ffbaec36878ba97d5823ecb77a0293ffe43fc6e4f3d427b
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13cdfb0ba0b39344ffb7c51f008a9525a0099ba731e0b27df1e92d8952acacbf
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f09a374443c7d9aedad0fa8df337669a31b68e237c3859c2041ae01b1833bccc
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b00b812fe39f4d94693255c049b0abd4bfba391cf4f816905b3d288faf6ee1ba
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e720f588174a5fb5674a6cf224f79a363e3703e591c74fb40c50cb44c49746c
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8934e21fbebe2892e28c801cf4e90276a2f5fc0d5159b41c7d9ebdd5b89cbbbd
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b312f3be5f5d1b5587804f12a8f6902d1233a4020e9562b05c089db1693ddd6
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c08fecc49906d862bfc6e72f4697782e40b17c8a512b907777ac3c47be4f8aa5
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d6685944070ce9823634f48aac861b22b93e0dde51040bec01d339ab13515a2
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:007a009d54f5cc96725c131dafb54cc883f16e2aa322b400a09312dd7b22f98b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.6188118811881188,
5
  "eval_steps": 20,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3819,6 +3819,766 @@
3819
  "eval_samples_per_second": 5.701,
3820
  "eval_steps_per_second": 0.186,
3821
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3822
  }
3823
  ],
3824
  "logging_steps": 1,
@@ -3838,7 +4598,7 @@
3838
  "attributes": {}
3839
  }
3840
  },
3841
- "total_flos": 1.529363226867794e+17,
3842
  "train_batch_size": 8,
3843
  "trial_name": null,
3844
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7425742574257426,
5
  "eval_steps": 20,
6
+ "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3819
  "eval_samples_per_second": 5.701,
3820
  "eval_steps_per_second": 0.186,
3821
  "step": 500
3822
+ },
3823
+ {
3824
+ "epoch": 0.620049504950495,
3825
+ "grad_norm": 3.656219244003296,
3826
+ "learning_rate": 7.5826446215003695e-06,
3827
+ "loss": 0.1701,
3828
+ "step": 501
3829
+ },
3830
+ {
3831
+ "epoch": 0.6212871287128713,
3832
+ "grad_norm": 3.9823882579803467,
3833
+ "learning_rate": 7.5407358254056995e-06,
3834
+ "loss": 0.1759,
3835
+ "step": 502
3836
+ },
3837
+ {
3838
+ "epoch": 0.6225247524752475,
3839
+ "grad_norm": 5.238447666168213,
3840
+ "learning_rate": 7.4988729528587445e-06,
3841
+ "loss": 0.2084,
3842
+ "step": 503
3843
+ },
3844
+ {
3845
+ "epoch": 0.6237623762376238,
3846
+ "grad_norm": 5.870246887207031,
3847
+ "learning_rate": 7.45705678559399e-06,
3848
+ "loss": 0.2592,
3849
+ "step": 504
3850
+ },
3851
+ {
3852
+ "epoch": 0.625,
3853
+ "grad_norm": 6.469812393188477,
3854
+ "learning_rate": 7.415288104473774e-06,
3855
+ "loss": 0.2664,
3856
+ "step": 505
3857
+ },
3858
+ {
3859
+ "epoch": 0.6262376237623762,
3860
+ "grad_norm": 4.086902141571045,
3861
+ "learning_rate": 7.373567689473683e-06,
3862
+ "loss": 0.123,
3863
+ "step": 506
3864
+ },
3865
+ {
3866
+ "epoch": 0.6274752475247525,
3867
+ "grad_norm": 4.265713691711426,
3868
+ "learning_rate": 7.3318963196679904e-06,
3869
+ "loss": 0.1567,
3870
+ "step": 507
3871
+ },
3872
+ {
3873
+ "epoch": 0.6287128712871287,
3874
+ "grad_norm": 5.820674419403076,
3875
+ "learning_rate": 7.290274773215131e-06,
3876
+ "loss": 0.2661,
3877
+ "step": 508
3878
+ },
3879
+ {
3880
+ "epoch": 0.629950495049505,
3881
+ "grad_norm": 3.3316519260406494,
3882
+ "learning_rate": 7.248703827343142e-06,
3883
+ "loss": 0.1827,
3884
+ "step": 509
3885
+ },
3886
+ {
3887
+ "epoch": 0.6311881188118812,
3888
+ "grad_norm": 6.632443428039551,
3889
+ "learning_rate": 7.207184258335163e-06,
3890
+ "loss": 0.2486,
3891
+ "step": 510
3892
+ },
3893
+ {
3894
+ "epoch": 0.6324257425742574,
3895
+ "grad_norm": 5.278284072875977,
3896
+ "learning_rate": 7.1657168415149396e-06,
3897
+ "loss": 0.205,
3898
+ "step": 511
3899
+ },
3900
+ {
3901
+ "epoch": 0.6336633663366337,
3902
+ "grad_norm": 7.767988204956055,
3903
+ "learning_rate": 7.124302351232337e-06,
3904
+ "loss": 0.1912,
3905
+ "step": 512
3906
+ },
3907
+ {
3908
+ "epoch": 0.6349009900990099,
3909
+ "grad_norm": 4.862364292144775,
3910
+ "learning_rate": 7.0829415608489e-06,
3911
+ "loss": 0.2028,
3912
+ "step": 513
3913
+ },
3914
+ {
3915
+ "epoch": 0.6361386138613861,
3916
+ "grad_norm": 4.547641754150391,
3917
+ "learning_rate": 7.041635242723386e-06,
3918
+ "loss": 0.182,
3919
+ "step": 514
3920
+ },
3921
+ {
3922
+ "epoch": 0.6373762376237624,
3923
+ "grad_norm": 4.835113048553467,
3924
+ "learning_rate": 7.000384168197354e-06,
3925
+ "loss": 0.2437,
3926
+ "step": 515
3927
+ },
3928
+ {
3929
+ "epoch": 0.6386138613861386,
3930
+ "grad_norm": 4.105000019073486,
3931
+ "learning_rate": 6.9591891075807705e-06,
3932
+ "loss": 0.1998,
3933
+ "step": 516
3934
+ },
3935
+ {
3936
+ "epoch": 0.6398514851485149,
3937
+ "grad_norm": 4.884759902954102,
3938
+ "learning_rate": 6.918050830137608e-06,
3939
+ "loss": 0.2597,
3940
+ "step": 517
3941
+ },
3942
+ {
3943
+ "epoch": 0.6410891089108911,
3944
+ "grad_norm": 5.189384460449219,
3945
+ "learning_rate": 6.876970104071483e-06,
3946
+ "loss": 0.2485,
3947
+ "step": 518
3948
+ },
3949
+ {
3950
+ "epoch": 0.6423267326732673,
3951
+ "grad_norm": 4.154090881347656,
3952
+ "learning_rate": 6.8359476965113295e-06,
3953
+ "loss": 0.179,
3954
+ "step": 519
3955
+ },
3956
+ {
3957
+ "epoch": 0.6435643564356436,
3958
+ "grad_norm": 5.2503437995910645,
3959
+ "learning_rate": 6.7949843734970475e-06,
3960
+ "loss": 0.2046,
3961
+ "step": 520
3962
+ },
3963
+ {
3964
+ "epoch": 0.6435643564356436,
3965
+ "eval_accuracy": 0.8647450110864745,
3966
+ "eval_f1": 0.7252252252252253,
3967
+ "eval_loss": 0.29431188106536865,
3968
+ "eval_precision": 0.8429319371727748,
3969
+ "eval_recall": 0.6363636363636364,
3970
+ "eval_runtime": 49.3715,
3971
+ "eval_samples_per_second": 5.59,
3972
+ "eval_steps_per_second": 0.182,
3973
+ "step": 520
3974
+ },
3975
+ {
3976
+ "epoch": 0.6448019801980198,
3977
+ "grad_norm": 3.8733835220336914,
3978
+ "learning_rate": 6.754080899965208e-06,
3979
+ "loss": 0.2034,
3980
+ "step": 521
3981
+ },
3982
+ {
3983
+ "epoch": 0.6460396039603961,
3984
+ "grad_norm": 3.805725574493408,
3985
+ "learning_rate": 6.713238039734788e-06,
3986
+ "loss": 0.151,
3987
+ "step": 522
3988
+ },
3989
+ {
3990
+ "epoch": 0.6472772277227723,
3991
+ "grad_norm": 3.7677907943725586,
3992
+ "learning_rate": 6.67245655549287e-06,
3993
+ "loss": 0.1466,
3994
+ "step": 523
3995
+ },
3996
+ {
3997
+ "epoch": 0.6485148514851485,
3998
+ "grad_norm": 3.6880476474761963,
3999
+ "learning_rate": 6.631737208780433e-06,
4000
+ "loss": 0.1986,
4001
+ "step": 524
4002
+ },
4003
+ {
4004
+ "epoch": 0.6497524752475248,
4005
+ "grad_norm": 4.416601657867432,
4006
+ "learning_rate": 6.5910807599781135e-06,
4007
+ "loss": 0.2105,
4008
+ "step": 525
4009
+ },
4010
+ {
4011
+ "epoch": 0.650990099009901,
4012
+ "grad_norm": 3.7478973865509033,
4013
+ "learning_rate": 6.550487968292013e-06,
4014
+ "loss": 0.1534,
4015
+ "step": 526
4016
+ },
4017
+ {
4018
+ "epoch": 0.6522277227722773,
4019
+ "grad_norm": 5.128391742706299,
4020
+ "learning_rate": 6.509959591739522e-06,
4021
+ "loss": 0.2103,
4022
+ "step": 527
4023
+ },
4024
+ {
4025
+ "epoch": 0.6534653465346535,
4026
+ "grad_norm": 5.070952415466309,
4027
+ "learning_rate": 6.469496387135158e-06,
4028
+ "loss": 0.1674,
4029
+ "step": 528
4030
+ },
4031
+ {
4032
+ "epoch": 0.6547029702970297,
4033
+ "grad_norm": 4.714488983154297,
4034
+ "learning_rate": 6.429099110076436e-06,
4035
+ "loss": 0.221,
4036
+ "step": 529
4037
+ },
4038
+ {
4039
+ "epoch": 0.655940594059406,
4040
+ "grad_norm": 5.199388027191162,
4041
+ "learning_rate": 6.388768514929768e-06,
4042
+ "loss": 0.2027,
4043
+ "step": 530
4044
+ },
4045
+ {
4046
+ "epoch": 0.6571782178217822,
4047
+ "grad_norm": 5.243039608001709,
4048
+ "learning_rate": 6.3485053548163644e-06,
4049
+ "loss": 0.2347,
4050
+ "step": 531
4051
+ },
4052
+ {
4053
+ "epoch": 0.6584158415841584,
4054
+ "grad_norm": 6.204155445098877,
4055
+ "learning_rate": 6.308310381598168e-06,
4056
+ "loss": 0.1924,
4057
+ "step": 532
4058
+ },
4059
+ {
4060
+ "epoch": 0.6596534653465347,
4061
+ "grad_norm": 4.144034385681152,
4062
+ "learning_rate": 6.2681843458638345e-06,
4063
+ "loss": 0.2338,
4064
+ "step": 533
4065
+ },
4066
+ {
4067
+ "epoch": 0.6608910891089109,
4068
+ "grad_norm": 3.8708503246307373,
4069
+ "learning_rate": 6.2281279969146855e-06,
4070
+ "loss": 0.1827,
4071
+ "step": 534
4072
+ },
4073
+ {
4074
+ "epoch": 0.6621287128712872,
4075
+ "grad_norm": 4.874747276306152,
4076
+ "learning_rate": 6.18814208275075e-06,
4077
+ "loss": 0.2345,
4078
+ "step": 535
4079
+ },
4080
+ {
4081
+ "epoch": 0.6633663366336634,
4082
+ "grad_norm": 3.9638774394989014,
4083
+ "learning_rate": 6.148227350056763e-06,
4084
+ "loss": 0.173,
4085
+ "step": 536
4086
+ },
4087
+ {
4088
+ "epoch": 0.6646039603960396,
4089
+ "grad_norm": 3.154224395751953,
4090
+ "learning_rate": 6.10838454418825e-06,
4091
+ "loss": 0.1253,
4092
+ "step": 537
4093
+ },
4094
+ {
4095
+ "epoch": 0.6658415841584159,
4096
+ "grad_norm": 3.55877947807312,
4097
+ "learning_rate": 6.068614409157591e-06,
4098
+ "loss": 0.1708,
4099
+ "step": 538
4100
+ },
4101
+ {
4102
+ "epoch": 0.6670792079207921,
4103
+ "grad_norm": 3.995196580886841,
4104
+ "learning_rate": 6.0289176876201385e-06,
4105
+ "loss": 0.1969,
4106
+ "step": 539
4107
+ },
4108
+ {
4109
+ "epoch": 0.6683168316831684,
4110
+ "grad_norm": 3.824521064758301,
4111
+ "learning_rate": 5.989295120860334e-06,
4112
+ "loss": 0.1548,
4113
+ "step": 540
4114
+ },
4115
+ {
4116
+ "epoch": 0.6683168316831684,
4117
+ "eval_accuracy": 0.8636363636363636,
4118
+ "eval_f1": 0.7223476297968398,
4119
+ "eval_loss": 0.3003367483615875,
4120
+ "eval_precision": 0.8421052631578947,
4121
+ "eval_recall": 0.6324110671936759,
4122
+ "eval_runtime": 49.5296,
4123
+ "eval_samples_per_second": 5.572,
4124
+ "eval_steps_per_second": 0.182,
4125
+ "step": 540
4126
+ },
4127
+ {
4128
+ "epoch": 0.6695544554455446,
4129
+ "grad_norm": 5.372792720794678,
4130
+ "learning_rate": 5.94974744877789e-06,
4131
+ "loss": 0.179,
4132
+ "step": 541
4133
+ },
4134
+ {
4135
+ "epoch": 0.6707920792079208,
4136
+ "grad_norm": 4.743022441864014,
4137
+ "learning_rate": 5.910275409873942e-06,
4138
+ "loss": 0.1794,
4139
+ "step": 542
4140
+ },
4141
+ {
4142
+ "epoch": 0.6720297029702971,
4143
+ "grad_norm": 3.996967315673828,
4144
+ "learning_rate": 5.870879741237285e-06,
4145
+ "loss": 0.1525,
4146
+ "step": 543
4147
+ },
4148
+ {
4149
+ "epoch": 0.6732673267326733,
4150
+ "grad_norm": 7.0855207443237305,
4151
+ "learning_rate": 5.831561178530602e-06,
4152
+ "loss": 0.2256,
4153
+ "step": 544
4154
+ },
4155
+ {
4156
+ "epoch": 0.6745049504950495,
4157
+ "grad_norm": 7.199305057525635,
4158
+ "learning_rate": 5.792320455976714e-06,
4159
+ "loss": 0.2125,
4160
+ "step": 545
4161
+ },
4162
+ {
4163
+ "epoch": 0.6757425742574258,
4164
+ "grad_norm": 5.01775598526001,
4165
+ "learning_rate": 5.753158306344882e-06,
4166
+ "loss": 0.1781,
4167
+ "step": 546
4168
+ },
4169
+ {
4170
+ "epoch": 0.676980198019802,
4171
+ "grad_norm": 3.7600646018981934,
4172
+ "learning_rate": 5.7140754609371255e-06,
4173
+ "loss": 0.2278,
4174
+ "step": 547
4175
+ },
4176
+ {
4177
+ "epoch": 0.6782178217821783,
4178
+ "grad_norm": 5.047920227050781,
4179
+ "learning_rate": 5.675072649574551e-06,
4180
+ "loss": 0.2191,
4181
+ "step": 548
4182
+ },
4183
+ {
4184
+ "epoch": 0.6794554455445545,
4185
+ "grad_norm": 5.662668228149414,
4186
+ "learning_rate": 5.636150600583747e-06,
4187
+ "loss": 0.1901,
4188
+ "step": 549
4189
+ },
4190
+ {
4191
+ "epoch": 0.6806930693069307,
4192
+ "grad_norm": 4.518259525299072,
4193
+ "learning_rate": 5.597310040783161e-06,
4194
+ "loss": 0.2264,
4195
+ "step": 550
4196
+ },
4197
+ {
4198
+ "epoch": 0.681930693069307,
4199
+ "grad_norm": 4.768115043640137,
4200
+ "learning_rate": 5.558551695469532e-06,
4201
+ "loss": 0.2532,
4202
+ "step": 551
4203
+ },
4204
+ {
4205
+ "epoch": 0.6831683168316832,
4206
+ "grad_norm": 4.239420413970947,
4207
+ "learning_rate": 5.519876288404367e-06,
4208
+ "loss": 0.2236,
4209
+ "step": 552
4210
+ },
4211
+ {
4212
+ "epoch": 0.6844059405940595,
4213
+ "grad_norm": 4.318198204040527,
4214
+ "learning_rate": 5.481284541800391e-06,
4215
+ "loss": 0.2504,
4216
+ "step": 553
4217
+ },
4218
+ {
4219
+ "epoch": 0.6856435643564357,
4220
+ "grad_norm": 4.004321575164795,
4221
+ "learning_rate": 5.44277717630809e-06,
4222
+ "loss": 0.1704,
4223
+ "step": 554
4224
+ },
4225
+ {
4226
+ "epoch": 0.6868811881188119,
4227
+ "grad_norm": 6.967254161834717,
4228
+ "learning_rate": 5.404354911002243e-06,
4229
+ "loss": 0.2201,
4230
+ "step": 555
4231
+ },
4232
+ {
4233
+ "epoch": 0.6881188118811881,
4234
+ "grad_norm": 3.623018980026245,
4235
+ "learning_rate": 5.3660184633684895e-06,
4236
+ "loss": 0.1477,
4237
+ "step": 556
4238
+ },
4239
+ {
4240
+ "epoch": 0.6893564356435643,
4241
+ "grad_norm": 4.345696926116943,
4242
+ "learning_rate": 5.3277685492899345e-06,
4243
+ "loss": 0.2465,
4244
+ "step": 557
4245
+ },
4246
+ {
4247
+ "epoch": 0.6905940594059405,
4248
+ "grad_norm": 4.71245813369751,
4249
+ "learning_rate": 5.289605883033793e-06,
4250
+ "loss": 0.1864,
4251
+ "step": 558
4252
+ },
4253
+ {
4254
+ "epoch": 0.6918316831683168,
4255
+ "grad_norm": 5.07157039642334,
4256
+ "learning_rate": 5.251531177238029e-06,
4257
+ "loss": 0.1596,
4258
+ "step": 559
4259
+ },
4260
+ {
4261
+ "epoch": 0.693069306930693,
4262
+ "grad_norm": 3.7650375366210938,
4263
+ "learning_rate": 5.213545142898061e-06,
4264
+ "loss": 0.1626,
4265
+ "step": 560
4266
+ },
4267
+ {
4268
+ "epoch": 0.693069306930693,
4269
+ "eval_accuracy": 0.8625277161862528,
4270
+ "eval_f1": 0.7129629629629629,
4271
+ "eval_loss": 0.2982478439807892,
4272
+ "eval_precision": 0.8603351955307262,
4273
+ "eval_recall": 0.6086956521739131,
4274
+ "eval_runtime": 48.9768,
4275
+ "eval_samples_per_second": 5.635,
4276
+ "eval_steps_per_second": 0.184,
4277
+ "step": 560
4278
+ },
4279
+ {
4280
+ "epoch": 0.6943069306930693,
4281
+ "grad_norm": 5.199242115020752,
4282
+ "learning_rate": 5.175648489353493e-06,
4283
+ "loss": 0.1277,
4284
+ "step": 561
4285
+ },
4286
+ {
4287
+ "epoch": 0.6955445544554455,
4288
+ "grad_norm": 4.108044624328613,
4289
+ "learning_rate": 5.137841924274851e-06,
4290
+ "loss": 0.2117,
4291
+ "step": 562
4292
+ },
4293
+ {
4294
+ "epoch": 0.6967821782178217,
4295
+ "grad_norm": 5.149396896362305,
4296
+ "learning_rate": 5.100126153650379e-06,
4297
+ "loss": 0.1769,
4298
+ "step": 563
4299
+ },
4300
+ {
4301
+ "epoch": 0.698019801980198,
4302
+ "grad_norm": 3.721707820892334,
4303
+ "learning_rate": 5.0625018817728496e-06,
4304
+ "loss": 0.1764,
4305
+ "step": 564
4306
+ },
4307
+ {
4308
+ "epoch": 0.6992574257425742,
4309
+ "grad_norm": 5.771122932434082,
4310
+ "learning_rate": 5.024969811226419e-06,
4311
+ "loss": 0.2841,
4312
+ "step": 565
4313
+ },
4314
+ {
4315
+ "epoch": 0.7004950495049505,
4316
+ "grad_norm": 6.165885925292969,
4317
+ "learning_rate": 4.98753064287351e-06,
4318
+ "loss": 0.2048,
4319
+ "step": 566
4320
+ },
4321
+ {
4322
+ "epoch": 0.7017326732673267,
4323
+ "grad_norm": 3.664384126663208,
4324
+ "learning_rate": 4.950185075841706e-06,
4325
+ "loss": 0.14,
4326
+ "step": 567
4327
+ },
4328
+ {
4329
+ "epoch": 0.7029702970297029,
4330
+ "grad_norm": 6.110241889953613,
4331
+ "learning_rate": 4.912933807510714e-06,
4332
+ "loss": 0.2553,
4333
+ "step": 568
4334
+ },
4335
+ {
4336
+ "epoch": 0.7042079207920792,
4337
+ "grad_norm": 4.46115255355835,
4338
+ "learning_rate": 4.875777533499339e-06,
4339
+ "loss": 0.1871,
4340
+ "step": 569
4341
+ },
4342
+ {
4343
+ "epoch": 0.7054455445544554,
4344
+ "grad_norm": 5.189129829406738,
4345
+ "learning_rate": 4.838716947652485e-06,
4346
+ "loss": 0.1922,
4347
+ "step": 570
4348
+ },
4349
+ {
4350
+ "epoch": 0.7066831683168316,
4351
+ "grad_norm": 3.854255437850952,
4352
+ "learning_rate": 4.801752742028214e-06,
4353
+ "loss": 0.1823,
4354
+ "step": 571
4355
+ },
4356
+ {
4357
+ "epoch": 0.7079207920792079,
4358
+ "grad_norm": 4.2072319984436035,
4359
+ "learning_rate": 4.7648856068848e-06,
4360
+ "loss": 0.1776,
4361
+ "step": 572
4362
+ },
4363
+ {
4364
+ "epoch": 0.7091584158415841,
4365
+ "grad_norm": 3.298652172088623,
4366
+ "learning_rate": 4.728116230667859e-06,
4367
+ "loss": 0.2089,
4368
+ "step": 573
4369
+ },
4370
+ {
4371
+ "epoch": 0.7103960396039604,
4372
+ "grad_norm": 4.39929723739624,
4373
+ "learning_rate": 4.691445299997491e-06,
4374
+ "loss": 0.19,
4375
+ "step": 574
4376
+ },
4377
+ {
4378
+ "epoch": 0.7116336633663366,
4379
+ "grad_norm": 4.1644110679626465,
4380
+ "learning_rate": 4.654873499655449e-06,
4381
+ "loss": 0.1932,
4382
+ "step": 575
4383
+ },
4384
+ {
4385
+ "epoch": 0.7128712871287128,
4386
+ "grad_norm": 6.846812725067139,
4387
+ "learning_rate": 4.618401512572351e-06,
4388
+ "loss": 0.2762,
4389
+ "step": 576
4390
+ },
4391
+ {
4392
+ "epoch": 0.7141089108910891,
4393
+ "grad_norm": 6.345206260681152,
4394
+ "learning_rate": 4.582030019814948e-06,
4395
+ "loss": 0.249,
4396
+ "step": 577
4397
+ },
4398
+ {
4399
+ "epoch": 0.7153465346534653,
4400
+ "grad_norm": 4.541729927062988,
4401
+ "learning_rate": 4.5457597005733774e-06,
4402
+ "loss": 0.199,
4403
+ "step": 578
4404
+ },
4405
+ {
4406
+ "epoch": 0.7165841584158416,
4407
+ "grad_norm": 5.228466510772705,
4408
+ "learning_rate": 4.5095912321484946e-06,
4409
+ "loss": 0.1622,
4410
+ "step": 579
4411
+ },
4412
+ {
4413
+ "epoch": 0.7178217821782178,
4414
+ "grad_norm": 4.646934509277344,
4415
+ "learning_rate": 4.4735252899392335e-06,
4416
+ "loss": 0.2065,
4417
+ "step": 580
4418
+ },
4419
+ {
4420
+ "epoch": 0.7178217821782178,
4421
+ "eval_accuracy": 0.8636363636363636,
4422
+ "eval_f1": 0.7308533916849015,
4423
+ "eval_loss": 0.28770458698272705,
4424
+ "eval_precision": 0.8186274509803921,
4425
+ "eval_recall": 0.6600790513833992,
4426
+ "eval_runtime": 48.9708,
4427
+ "eval_samples_per_second": 5.636,
4428
+ "eval_steps_per_second": 0.184,
4429
+ "step": 580
4430
+ },
4431
+ {
4432
+ "epoch": 0.719059405940594,
4433
+ "grad_norm": 5.637617588043213,
4434
+ "learning_rate": 4.437562547429971e-06,
4435
+ "loss": 0.2249,
4436
+ "step": 581
4437
+ },
4438
+ {
4439
+ "epoch": 0.7202970297029703,
4440
+ "grad_norm": 3.387362480163574,
4441
+ "learning_rate": 4.4017036761779785e-06,
4442
+ "loss": 0.1965,
4443
+ "step": 582
4444
+ },
4445
+ {
4446
+ "epoch": 0.7215346534653465,
4447
+ "grad_norm": 4.963772773742676,
4448
+ "learning_rate": 4.365949345800856e-06,
4449
+ "loss": 0.1589,
4450
+ "step": 583
4451
+ },
4452
+ {
4453
+ "epoch": 0.7227722772277227,
4454
+ "grad_norm": 8.057557106018066,
4455
+ "learning_rate": 4.3303002239640424e-06,
4456
+ "loss": 0.2567,
4457
+ "step": 584
4458
+ },
4459
+ {
4460
+ "epoch": 0.724009900990099,
4461
+ "grad_norm": 4.3113789558410645,
4462
+ "learning_rate": 4.294756976368351e-06,
4463
+ "loss": 0.2492,
4464
+ "step": 585
4465
+ },
4466
+ {
4467
+ "epoch": 0.7252475247524752,
4468
+ "grad_norm": 6.69741153717041,
4469
+ "learning_rate": 4.259320266737522e-06,
4470
+ "loss": 0.2378,
4471
+ "step": 586
4472
+ },
4473
+ {
4474
+ "epoch": 0.7264851485148515,
4475
+ "grad_norm": 5.994034290313721,
4476
+ "learning_rate": 4.223990756805841e-06,
4477
+ "loss": 0.2038,
4478
+ "step": 587
4479
+ },
4480
+ {
4481
+ "epoch": 0.7277227722772277,
4482
+ "grad_norm": 6.540597915649414,
4483
+ "learning_rate": 4.1887691063057865e-06,
4484
+ "loss": 0.1929,
4485
+ "step": 588
4486
+ },
4487
+ {
4488
+ "epoch": 0.7289603960396039,
4489
+ "grad_norm": 4.549102783203125,
4490
+ "learning_rate": 4.153655972955695e-06,
4491
+ "loss": 0.2153,
4492
+ "step": 589
4493
+ },
4494
+ {
4495
+ "epoch": 0.7301980198019802,
4496
+ "grad_norm": 5.070977210998535,
4497
+ "learning_rate": 4.118652012447486e-06,
4498
+ "loss": 0.1908,
4499
+ "step": 590
4500
+ },
4501
+ {
4502
+ "epoch": 0.7314356435643564,
4503
+ "grad_norm": 3.0591437816619873,
4504
+ "learning_rate": 4.0837578784344225e-06,
4505
+ "loss": 0.1806,
4506
+ "step": 591
4507
+ },
4508
+ {
4509
+ "epoch": 0.7326732673267327,
4510
+ "grad_norm": 3.303514003753662,
4511
+ "learning_rate": 4.048974222518905e-06,
4512
+ "loss": 0.1859,
4513
+ "step": 592
4514
+ },
4515
+ {
4516
+ "epoch": 0.7339108910891089,
4517
+ "grad_norm": 3.909907817840576,
4518
+ "learning_rate": 4.01430169424029e-06,
4519
+ "loss": 0.2238,
4520
+ "step": 593
4521
+ },
4522
+ {
4523
+ "epoch": 0.7351485148514851,
4524
+ "grad_norm": 5.40861701965332,
4525
+ "learning_rate": 3.97974094106278e-06,
4526
+ "loss": 0.1768,
4527
+ "step": 594
4528
+ },
4529
+ {
4530
+ "epoch": 0.7363861386138614,
4531
+ "grad_norm": 4.427615165710449,
4532
+ "learning_rate": 3.945292608363312e-06,
4533
+ "loss": 0.2324,
4534
+ "step": 595
4535
+ },
4536
+ {
4537
+ "epoch": 0.7376237623762376,
4538
+ "grad_norm": 3.793356418609619,
4539
+ "learning_rate": 3.9109573394195336e-06,
4540
+ "loss": 0.1758,
4541
+ "step": 596
4542
+ },
4543
+ {
4544
+ "epoch": 0.7388613861386139,
4545
+ "grad_norm": 3.278257369995117,
4546
+ "learning_rate": 3.876735775397759e-06,
4547
+ "loss": 0.1133,
4548
+ "step": 597
4549
+ },
4550
+ {
4551
+ "epoch": 0.7400990099009901,
4552
+ "grad_norm": 3.4571950435638428,
4553
+ "learning_rate": 3.842628555341018e-06,
4554
+ "loss": 0.1381,
4555
+ "step": 598
4556
+ },
4557
+ {
4558
+ "epoch": 0.7413366336633663,
4559
+ "grad_norm": 7.060393810272217,
4560
+ "learning_rate": 3.8086363161571194e-06,
4561
+ "loss": 0.2736,
4562
+ "step": 599
4563
+ },
4564
+ {
4565
+ "epoch": 0.7425742574257426,
4566
+ "grad_norm": 3.590026617050171,
4567
+ "learning_rate": 3.7747596926067485e-06,
4568
+ "loss": 0.1423,
4569
+ "step": 600
4570
+ },
4571
+ {
4572
+ "epoch": 0.7425742574257426,
4573
+ "eval_accuracy": 0.8603104212860311,
4574
+ "eval_f1": 0.7014218009478673,
4575
+ "eval_loss": 0.30313166975975037,
4576
+ "eval_precision": 0.8757396449704142,
4577
+ "eval_recall": 0.5849802371541502,
4578
+ "eval_runtime": 50.2279,
4579
+ "eval_samples_per_second": 5.495,
4580
+ "eval_steps_per_second": 0.179,
4581
+ "step": 600
4582
  }
4583
  ],
4584
  "logging_steps": 1,
 
4598
  "attributes": {}
4599
  }
4600
  },
4601
+ "total_flos": 1.8390793736473805e+17,
4602
  "train_batch_size": 8,
4603
  "trial_name": null,
4604
  "trial_params": null