mtzig commited on
Commit
7cee2c9
·
verified ·
1 Parent(s): d6e7cfb

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c379ef2a6d06f1b6d62f1076fd4c921adfce1635cf0e33794870aa4e4c50cd5
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99da1138da71c7a590a2707e483883e957340e06dae08b8827f18835c34a64b4
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6c6149211a936e9509b71caa3038d2974cbc24f16d2f139c4bce1f7f929ca96
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79c4ba664980c19c5ef3783c79e3d4e043fca5bb4c60f89d82faac7a943cf243
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca8239e8b89a1f8c74d7a7c79a3cdb93732f2743fa3971d79b6663a4fc675a9e
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f424fd03327ecb5ff630a9df1d98261931fcdfc83c1716e339b348d51df7d478
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e0ec3076d00cd1127df3b4ac55a666860fa921fb02d4b5b8f30867f96acb82ce
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a13d0c056d2946548a4902bc1af045b538f2ae540c6e2ef689c1bb7482253a0
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:931a5ebaf87bed2e7bed3aa5b3ecd7d4da726d7c7d64726db93912347ac7a58e
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b31287f991033846081b020484b6de0532ffe6b03b8cf50698e3f201f2e56e9
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a4b8b1241a63c17dd12c1ce295952c7a1c8e1f86004b01703d7542255f0387f
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82e80cce203211b2c2441e9d6c1bb6224c0d09baf614394b407b0f9ce1ded28a
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7ea0758e0b5550a0bad05276c2c8f8f6bed79e6d2e051cca07c9b0179ca83f1
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e7beb1931987579881f896dde9404f4b409e252ef531ece93cbf739876254fe
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:364a002390ab0186583237ae427b0edf353723a14dec95eb28db15d33d2b2de4
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddb5450bd834dd38bde6c77563d509a72456d15718acf61980ebcccc75aef710
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c0996004e3280ba2b8c5308142e245e93b9a3d5870de383914360145085a647
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae91bbe4bb3448f9fe588b5f12c6d570f98cbeb7f79c6b4c021fdd413e35a673
3
  size 15088
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b99dae60d08ae089466b878474ef297a0b281547cd1097ea214ecee77244b16
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5e2bdfc538b632be76938bdc369215d9c9e9696454b505d6d5c099a19d59619
3
  size 15088
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c3acfb47638e30fe1106672a6fd0db74c9187c94c19467e9d22bd366fbb5472
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:654f94a53cbd3a4c0aa96462f7eefb36cea6a40f65967f82f41333fe8d59b3e6
3
  size 15088
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9723827a668573edbd596a65e0f225b208491adf853284b8da3f11b792077fdc
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24ee86115e5e887f663c435c280ac37373efa53275c443e874691073017d1363
3
  size 15088
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a124d1e9d8a7b4a76d7294be394802bfec19da05b0209e12c8dc6b8ab250293
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b85ee37d9a532de8cdb09f3a64e5b2fe9e638521f567e2b493ae4f1f2c3b0617
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.6443298969072165,
5
  "eval_steps": 20,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3819,6 +3819,766 @@
3819
  "eval_samples_per_second": 5.335,
3820
  "eval_steps_per_second": 0.176,
3821
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3822
  }
3823
  ],
3824
  "logging_steps": 1,
@@ -3838,7 +4598,7 @@
3838
  "attributes": {}
3839
  }
3840
  },
3841
- "total_flos": 1.6687462625574912e+17,
3842
  "train_batch_size": 8,
3843
  "trial_name": null,
3844
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7731958762886598,
5
  "eval_steps": 20,
6
+ "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3819
  "eval_samples_per_second": 5.335,
3820
  "eval_steps_per_second": 0.176,
3821
  "step": 500
3822
+ },
3823
+ {
3824
+ "epoch": 0.645618556701031,
3825
+ "grad_norm": 0.2069253921508789,
3826
+ "learning_rate": 6.730609277866644e-06,
3827
+ "loss": 0.0008,
3828
+ "step": 501
3829
+ },
3830
+ {
3831
+ "epoch": 0.6469072164948454,
3832
+ "grad_norm": 0.7460423111915588,
3833
+ "learning_rate": 6.688107468527297e-06,
3834
+ "loss": 0.0043,
3835
+ "step": 502
3836
+ },
3837
+ {
3838
+ "epoch": 0.6481958762886598,
3839
+ "grad_norm": 0.1024048775434494,
3840
+ "learning_rate": 6.645672750209216e-06,
3841
+ "loss": 0.0008,
3842
+ "step": 503
3843
+ },
3844
+ {
3845
+ "epoch": 0.6494845360824743,
3846
+ "grad_norm": 3.0588488578796387,
3847
+ "learning_rate": 6.603305982538295e-06,
3848
+ "loss": 0.0153,
3849
+ "step": 504
3850
+ },
3851
+ {
3852
+ "epoch": 0.6507731958762887,
3853
+ "grad_norm": 0.7051161527633667,
3854
+ "learning_rate": 6.561008023763915e-06,
3855
+ "loss": 0.0014,
3856
+ "step": 505
3857
+ },
3858
+ {
3859
+ "epoch": 0.6520618556701031,
3860
+ "grad_norm": 1.1912554502487183,
3861
+ "learning_rate": 6.518779730741555e-06,
3862
+ "loss": 0.0085,
3863
+ "step": 506
3864
+ },
3865
+ {
3866
+ "epoch": 0.6533505154639175,
3867
+ "grad_norm": 0.12031827121973038,
3868
+ "learning_rate": 6.476621958915426e-06,
3869
+ "loss": 0.0008,
3870
+ "step": 507
3871
+ },
3872
+ {
3873
+ "epoch": 0.654639175257732,
3874
+ "grad_norm": 0.19412291049957275,
3875
+ "learning_rate": 6.434535562301153e-06,
3876
+ "loss": 0.0005,
3877
+ "step": 508
3878
+ },
3879
+ {
3880
+ "epoch": 0.6559278350515464,
3881
+ "grad_norm": 1.5401062965393066,
3882
+ "learning_rate": 6.392521393468471e-06,
3883
+ "loss": 0.003,
3884
+ "step": 509
3885
+ },
3886
+ {
3887
+ "epoch": 0.6572164948453608,
3888
+ "grad_norm": 1.881408929824829,
3889
+ "learning_rate": 6.350580303523947e-06,
3890
+ "loss": 0.0053,
3891
+ "step": 510
3892
+ },
3893
+ {
3894
+ "epoch": 0.6585051546391752,
3895
+ "grad_norm": 1.8034359216690063,
3896
+ "learning_rate": 6.308713142093749e-06,
3897
+ "loss": 0.0255,
3898
+ "step": 511
3899
+ },
3900
+ {
3901
+ "epoch": 0.6597938144329897,
3902
+ "grad_norm": 2.4420573711395264,
3903
+ "learning_rate": 6.266920757306429e-06,
3904
+ "loss": 0.0248,
3905
+ "step": 512
3906
+ },
3907
+ {
3908
+ "epoch": 0.6610824742268041,
3909
+ "grad_norm": 2.5794460773468018,
3910
+ "learning_rate": 6.225203995775746e-06,
3911
+ "loss": 0.0085,
3912
+ "step": 513
3913
+ },
3914
+ {
3915
+ "epoch": 0.6623711340206185,
3916
+ "grad_norm": 0.4044947922229767,
3917
+ "learning_rate": 6.183563702583506e-06,
3918
+ "loss": 0.0016,
3919
+ "step": 514
3920
+ },
3921
+ {
3922
+ "epoch": 0.663659793814433,
3923
+ "grad_norm": 0.47102847695350647,
3924
+ "learning_rate": 6.1420007212624584e-06,
3925
+ "loss": 0.0011,
3926
+ "step": 515
3927
+ },
3928
+ {
3929
+ "epoch": 0.6649484536082474,
3930
+ "grad_norm": 0.08942103385925293,
3931
+ "learning_rate": 6.100515893779188e-06,
3932
+ "loss": 0.0006,
3933
+ "step": 516
3934
+ },
3935
+ {
3936
+ "epoch": 0.6662371134020618,
3937
+ "grad_norm": 1.772922158241272,
3938
+ "learning_rate": 6.05911006051708e-06,
3939
+ "loss": 0.0078,
3940
+ "step": 517
3941
+ },
3942
+ {
3943
+ "epoch": 0.6675257731958762,
3944
+ "grad_norm": 1.0783969163894653,
3945
+ "learning_rate": 6.01778406025928e-06,
3946
+ "loss": 0.0041,
3947
+ "step": 518
3948
+ },
3949
+ {
3950
+ "epoch": 0.6688144329896907,
3951
+ "grad_norm": 1.0612661838531494,
3952
+ "learning_rate": 5.976538730171708e-06,
3953
+ "loss": 0.0064,
3954
+ "step": 519
3955
+ },
3956
+ {
3957
+ "epoch": 0.6701030927835051,
3958
+ "grad_norm": 3.4996323585510254,
3959
+ "learning_rate": 5.935374905786102e-06,
3960
+ "loss": 0.0078,
3961
+ "step": 520
3962
+ },
3963
+ {
3964
+ "epoch": 0.6701030927835051,
3965
+ "eval_accuracy": 0.997020854021847,
3966
+ "eval_f1": 0.9473684210526315,
3967
+ "eval_loss": 0.014883686788380146,
3968
+ "eval_precision": 0.9473684210526315,
3969
+ "eval_recall": 0.9473684210526315,
3970
+ "eval_runtime": 83.6623,
3971
+ "eval_samples_per_second": 5.439,
3972
+ "eval_steps_per_second": 0.179,
3973
+ "step": 520
3974
+ },
3975
+ {
3976
+ "epoch": 0.6713917525773195,
3977
+ "grad_norm": 4.155304431915283,
3978
+ "learning_rate": 5.89429342098309e-06,
3979
+ "loss": 0.0209,
3980
+ "step": 521
3981
+ },
3982
+ {
3983
+ "epoch": 0.6726804123711341,
3984
+ "grad_norm": 1.2153571844100952,
3985
+ "learning_rate": 5.8532951079752895e-06,
3986
+ "loss": 0.0045,
3987
+ "step": 522
3988
+ },
3989
+ {
3990
+ "epoch": 0.6739690721649485,
3991
+ "grad_norm": 0.23559901118278503,
3992
+ "learning_rate": 5.812380797290465e-06,
3993
+ "loss": 0.0011,
3994
+ "step": 523
3995
+ },
3996
+ {
3997
+ "epoch": 0.6752577319587629,
3998
+ "grad_norm": 0.7906387448310852,
3999
+ "learning_rate": 5.771551317754691e-06,
4000
+ "loss": 0.0016,
4001
+ "step": 524
4002
+ },
4003
+ {
4004
+ "epoch": 0.6765463917525774,
4005
+ "grad_norm": 0.6385716199874878,
4006
+ "learning_rate": 5.730807496475568e-06,
4007
+ "loss": 0.0028,
4008
+ "step": 525
4009
+ },
4010
+ {
4011
+ "epoch": 0.6778350515463918,
4012
+ "grad_norm": 0.6003400087356567,
4013
+ "learning_rate": 5.690150158825462e-06,
4014
+ "loss": 0.0017,
4015
+ "step": 526
4016
+ },
4017
+ {
4018
+ "epoch": 0.6791237113402062,
4019
+ "grad_norm": 0.0828804075717926,
4020
+ "learning_rate": 5.649580128424792e-06,
4021
+ "loss": 0.0013,
4022
+ "step": 527
4023
+ },
4024
+ {
4025
+ "epoch": 0.6804123711340206,
4026
+ "grad_norm": 0.4008159637451172,
4027
+ "learning_rate": 5.609098227125334e-06,
4028
+ "loss": 0.0012,
4029
+ "step": 528
4030
+ },
4031
+ {
4032
+ "epoch": 0.6817010309278351,
4033
+ "grad_norm": 1.0619193315505981,
4034
+ "learning_rate": 5.568705274993584e-06,
4035
+ "loss": 0.0026,
4036
+ "step": 529
4037
+ },
4038
+ {
4039
+ "epoch": 0.6829896907216495,
4040
+ "grad_norm": 0.24862630665302277,
4041
+ "learning_rate": 5.528402090294142e-06,
4042
+ "loss": 0.0013,
4043
+ "step": 530
4044
+ },
4045
+ {
4046
+ "epoch": 0.6842783505154639,
4047
+ "grad_norm": 2.5749034881591797,
4048
+ "learning_rate": 5.488189489473133e-06,
4049
+ "loss": 0.0279,
4050
+ "step": 531
4051
+ },
4052
+ {
4053
+ "epoch": 0.6855670103092784,
4054
+ "grad_norm": 0.7257778644561768,
4055
+ "learning_rate": 5.448068287141663e-06,
4056
+ "loss": 0.0029,
4057
+ "step": 532
4058
+ },
4059
+ {
4060
+ "epoch": 0.6868556701030928,
4061
+ "grad_norm": 0.047684453427791595,
4062
+ "learning_rate": 5.4080392960593355e-06,
4063
+ "loss": 0.0003,
4064
+ "step": 533
4065
+ },
4066
+ {
4067
+ "epoch": 0.6881443298969072,
4068
+ "grad_norm": 2.056273937225342,
4069
+ "learning_rate": 5.368103327117768e-06,
4070
+ "loss": 0.017,
4071
+ "step": 534
4072
+ },
4073
+ {
4074
+ "epoch": 0.6894329896907216,
4075
+ "grad_norm": 2.3359081745147705,
4076
+ "learning_rate": 5.328261189324166e-06,
4077
+ "loss": 0.0234,
4078
+ "step": 535
4079
+ },
4080
+ {
4081
+ "epoch": 0.6907216494845361,
4082
+ "grad_norm": 0.4394116699695587,
4083
+ "learning_rate": 5.288513689784951e-06,
4084
+ "loss": 0.0013,
4085
+ "step": 536
4086
+ },
4087
+ {
4088
+ "epoch": 0.6920103092783505,
4089
+ "grad_norm": 0.6693306565284729,
4090
+ "learning_rate": 5.2488616336893915e-06,
4091
+ "loss": 0.0016,
4092
+ "step": 537
4093
+ },
4094
+ {
4095
+ "epoch": 0.6932989690721649,
4096
+ "grad_norm": 1.8861000537872314,
4097
+ "learning_rate": 5.209305824293307e-06,
4098
+ "loss": 0.0068,
4099
+ "step": 538
4100
+ },
4101
+ {
4102
+ "epoch": 0.6945876288659794,
4103
+ "grad_norm": 1.8001142740249634,
4104
+ "learning_rate": 5.1698470629027845e-06,
4105
+ "loss": 0.0163,
4106
+ "step": 539
4107
+ },
4108
+ {
4109
+ "epoch": 0.6958762886597938,
4110
+ "grad_norm": 1.7382943630218506,
4111
+ "learning_rate": 5.130486148857952e-06,
4112
+ "loss": 0.0291,
4113
+ "step": 540
4114
+ },
4115
+ {
4116
+ "epoch": 0.6958762886597938,
4117
+ "eval_accuracy": 0.9975173783515392,
4118
+ "eval_f1": 0.9557522123893806,
4119
+ "eval_loss": 0.014678677543997765,
4120
+ "eval_precision": 0.9642857142857143,
4121
+ "eval_recall": 0.9473684210526315,
4122
+ "eval_runtime": 83.1931,
4123
+ "eval_samples_per_second": 5.469,
4124
+ "eval_steps_per_second": 0.18,
4125
+ "step": 540
4126
+ },
4127
+ {
4128
+ "epoch": 0.6971649484536082,
4129
+ "grad_norm": 1.584878921508789,
4130
+ "learning_rate": 5.0912238795167845e-06,
4131
+ "loss": 0.0218,
4132
+ "step": 541
4133
+ },
4134
+ {
4135
+ "epoch": 0.6984536082474226,
4136
+ "grad_norm": 0.42845746874809265,
4137
+ "learning_rate": 5.05206105023895e-06,
4138
+ "loss": 0.0022,
4139
+ "step": 542
4140
+ },
4141
+ {
4142
+ "epoch": 0.6997422680412371,
4143
+ "grad_norm": 3.7735812664031982,
4144
+ "learning_rate": 5.012998454369701e-06,
4145
+ "loss": 0.0381,
4146
+ "step": 543
4147
+ },
4148
+ {
4149
+ "epoch": 0.7010309278350515,
4150
+ "grad_norm": 1.2307202816009521,
4151
+ "learning_rate": 4.974036883223798e-06,
4152
+ "loss": 0.0065,
4153
+ "step": 544
4154
+ },
4155
+ {
4156
+ "epoch": 0.7023195876288659,
4157
+ "grad_norm": 0.17536193132400513,
4158
+ "learning_rate": 4.935177126069485e-06,
4159
+ "loss": 0.0006,
4160
+ "step": 545
4161
+ },
4162
+ {
4163
+ "epoch": 0.7036082474226805,
4164
+ "grad_norm": 1.0268815755844116,
4165
+ "learning_rate": 4.896419970112499e-06,
4166
+ "loss": 0.0072,
4167
+ "step": 546
4168
+ },
4169
+ {
4170
+ "epoch": 0.7048969072164949,
4171
+ "grad_norm": 0.6802368760108948,
4172
+ "learning_rate": 4.857766200480115e-06,
4173
+ "loss": 0.0018,
4174
+ "step": 547
4175
+ },
4176
+ {
4177
+ "epoch": 0.7061855670103093,
4178
+ "grad_norm": 0.3945528268814087,
4179
+ "learning_rate": 4.819216600205254e-06,
4180
+ "loss": 0.0019,
4181
+ "step": 548
4182
+ },
4183
+ {
4184
+ "epoch": 0.7074742268041238,
4185
+ "grad_norm": 1.625831127166748,
4186
+ "learning_rate": 4.780771950210616e-06,
4187
+ "loss": 0.0087,
4188
+ "step": 549
4189
+ },
4190
+ {
4191
+ "epoch": 0.7087628865979382,
4192
+ "grad_norm": 1.2541440725326538,
4193
+ "learning_rate": 4.742433029292856e-06,
4194
+ "loss": 0.0031,
4195
+ "step": 550
4196
+ },
4197
+ {
4198
+ "epoch": 0.7100515463917526,
4199
+ "grad_norm": 0.1559123545885086,
4200
+ "learning_rate": 4.704200614106813e-06,
4201
+ "loss": 0.0011,
4202
+ "step": 551
4203
+ },
4204
+ {
4205
+ "epoch": 0.711340206185567,
4206
+ "grad_norm": 1.7901103496551514,
4207
+ "learning_rate": 4.6660754791497755e-06,
4208
+ "loss": 0.008,
4209
+ "step": 552
4210
+ },
4211
+ {
4212
+ "epoch": 0.7126288659793815,
4213
+ "grad_norm": 2.5184080600738525,
4214
+ "learning_rate": 4.628058396745787e-06,
4215
+ "loss": 0.0062,
4216
+ "step": 553
4217
+ },
4218
+ {
4219
+ "epoch": 0.7139175257731959,
4220
+ "grad_norm": 1.7694346904754639,
4221
+ "learning_rate": 4.590150137030009e-06,
4222
+ "loss": 0.0067,
4223
+ "step": 554
4224
+ },
4225
+ {
4226
+ "epoch": 0.7152061855670103,
4227
+ "grad_norm": 0.8861400485038757,
4228
+ "learning_rate": 4.552351467933115e-06,
4229
+ "loss": 0.0027,
4230
+ "step": 555
4231
+ },
4232
+ {
4233
+ "epoch": 0.7164948453608248,
4234
+ "grad_norm": 1.867985725402832,
4235
+ "learning_rate": 4.514663155165731e-06,
4236
+ "loss": 0.0076,
4237
+ "step": 556
4238
+ },
4239
+ {
4240
+ "epoch": 0.7177835051546392,
4241
+ "grad_norm": 0.6749151945114136,
4242
+ "learning_rate": 4.477085962202931e-06,
4243
+ "loss": 0.0026,
4244
+ "step": 557
4245
+ },
4246
+ {
4247
+ "epoch": 0.7190721649484536,
4248
+ "grad_norm": 0.47943294048309326,
4249
+ "learning_rate": 4.439620650268771e-06,
4250
+ "loss": 0.0014,
4251
+ "step": 558
4252
+ },
4253
+ {
4254
+ "epoch": 0.720360824742268,
4255
+ "grad_norm": 0.7041411399841309,
4256
+ "learning_rate": 4.402267978320854e-06,
4257
+ "loss": 0.0024,
4258
+ "step": 559
4259
+ },
4260
+ {
4261
+ "epoch": 0.7216494845360825,
4262
+ "grad_norm": 2.199207067489624,
4263
+ "learning_rate": 4.365028703034976e-06,
4264
+ "loss": 0.0119,
4265
+ "step": 560
4266
+ },
4267
+ {
4268
+ "epoch": 0.7216494845360825,
4269
+ "eval_accuracy": 0.997020854021847,
4270
+ "eval_f1": 0.9473684210526315,
4271
+ "eval_loss": 0.013599889352917671,
4272
+ "eval_precision": 0.9473684210526315,
4273
+ "eval_recall": 0.9473684210526315,
4274
+ "eval_runtime": 84.0133,
4275
+ "eval_samples_per_second": 5.416,
4276
+ "eval_steps_per_second": 0.179,
4277
+ "step": 560
4278
+ },
4279
+ {
4280
+ "epoch": 0.7229381443298969,
4281
+ "grad_norm": 2.070563554763794,
4282
+ "learning_rate": 4.327903578789785e-06,
4283
+ "loss": 0.0297,
4284
+ "step": 561
4285
+ },
4286
+ {
4287
+ "epoch": 0.7242268041237113,
4288
+ "grad_norm": 0.7736282348632812,
4289
+ "learning_rate": 4.290893357651502e-06,
4290
+ "loss": 0.0014,
4291
+ "step": 562
4292
+ },
4293
+ {
4294
+ "epoch": 0.7255154639175257,
4295
+ "grad_norm": 1.5043028593063354,
4296
+ "learning_rate": 4.253998789358683e-06,
4297
+ "loss": 0.0121,
4298
+ "step": 563
4299
+ },
4300
+ {
4301
+ "epoch": 0.7268041237113402,
4302
+ "grad_norm": 2.060772180557251,
4303
+ "learning_rate": 4.217220621307043e-06,
4304
+ "loss": 0.0076,
4305
+ "step": 564
4306
+ },
4307
+ {
4308
+ "epoch": 0.7280927835051546,
4309
+ "grad_norm": 0.3604072034358978,
4310
+ "learning_rate": 4.180559598534297e-06,
4311
+ "loss": 0.0012,
4312
+ "step": 565
4313
+ },
4314
+ {
4315
+ "epoch": 0.729381443298969,
4316
+ "grad_norm": 1.343294382095337,
4317
+ "learning_rate": 4.144016463705081e-06,
4318
+ "loss": 0.0043,
4319
+ "step": 566
4320
+ },
4321
+ {
4322
+ "epoch": 0.7306701030927835,
4323
+ "grad_norm": 1.6220418214797974,
4324
+ "learning_rate": 4.107591957095903e-06,
4325
+ "loss": 0.0232,
4326
+ "step": 567
4327
+ },
4328
+ {
4329
+ "epoch": 0.7319587628865979,
4330
+ "grad_norm": 0.1144634336233139,
4331
+ "learning_rate": 4.071286816580142e-06,
4332
+ "loss": 0.001,
4333
+ "step": 568
4334
+ },
4335
+ {
4336
+ "epoch": 0.7332474226804123,
4337
+ "grad_norm": 0.7282357811927795,
4338
+ "learning_rate": 4.035101777613113e-06,
4339
+ "loss": 0.0018,
4340
+ "step": 569
4341
+ },
4342
+ {
4343
+ "epoch": 0.7345360824742269,
4344
+ "grad_norm": 3.7139105796813965,
4345
+ "learning_rate": 3.999037573217157e-06,
4346
+ "loss": 0.0249,
4347
+ "step": 570
4348
+ },
4349
+ {
4350
+ "epoch": 0.7358247422680413,
4351
+ "grad_norm": 2.923678159713745,
4352
+ "learning_rate": 3.963094933966797e-06,
4353
+ "loss": 0.0207,
4354
+ "step": 571
4355
+ },
4356
+ {
4357
+ "epoch": 0.7371134020618557,
4358
+ "grad_norm": 0.20384177565574646,
4359
+ "learning_rate": 3.927274587973935e-06,
4360
+ "loss": 0.0006,
4361
+ "step": 572
4362
+ },
4363
+ {
4364
+ "epoch": 0.7384020618556701,
4365
+ "grad_norm": 0.675167977809906,
4366
+ "learning_rate": 3.8915772608731066e-06,
4367
+ "loss": 0.0014,
4368
+ "step": 573
4369
+ },
4370
+ {
4371
+ "epoch": 0.7396907216494846,
4372
+ "grad_norm": 3.719918727874756,
4373
+ "learning_rate": 3.856003675806777e-06,
4374
+ "loss": 0.0142,
4375
+ "step": 574
4376
+ },
4377
+ {
4378
+ "epoch": 0.740979381443299,
4379
+ "grad_norm": 0.10527591407299042,
4380
+ "learning_rate": 3.820554553410693e-06,
4381
+ "loss": 0.0006,
4382
+ "step": 575
4383
+ },
4384
+ {
4385
+ "epoch": 0.7422680412371134,
4386
+ "grad_norm": 2.821239948272705,
4387
+ "learning_rate": 3.78523061179929e-06,
4388
+ "loss": 0.0142,
4389
+ "step": 576
4390
+ },
4391
+ {
4392
+ "epoch": 0.7435567010309279,
4393
+ "grad_norm": 0.972940981388092,
4394
+ "learning_rate": 3.7500325665511337e-06,
4395
+ "loss": 0.0039,
4396
+ "step": 577
4397
+ },
4398
+ {
4399
+ "epoch": 0.7448453608247423,
4400
+ "grad_norm": 0.08761493116617203,
4401
+ "learning_rate": 3.7149611306944356e-06,
4402
+ "loss": 0.0008,
4403
+ "step": 578
4404
+ },
4405
+ {
4406
+ "epoch": 0.7461340206185567,
4407
+ "grad_norm": 1.4573255777359009,
4408
+ "learning_rate": 3.680017014692604e-06,
4409
+ "loss": 0.0063,
4410
+ "step": 579
4411
+ },
4412
+ {
4413
+ "epoch": 0.7474226804123711,
4414
+ "grad_norm": 0.7035017609596252,
4415
+ "learning_rate": 3.645200926429844e-06,
4416
+ "loss": 0.002,
4417
+ "step": 580
4418
+ },
4419
+ {
4420
+ "epoch": 0.7474226804123711,
4421
+ "eval_accuracy": 0.9980139026812314,
4422
+ "eval_f1": 0.9642857142857143,
4423
+ "eval_loss": 0.0138359684497118,
4424
+ "eval_precision": 0.9818181818181818,
4425
+ "eval_recall": 0.9473684210526315,
4426
+ "eval_runtime": 83.1039,
4427
+ "eval_samples_per_second": 5.475,
4428
+ "eval_steps_per_second": 0.18,
4429
+ "step": 580
4430
+ },
4431
+ {
4432
+ "epoch": 0.7487113402061856,
4433
+ "grad_norm": 0.15962012112140656,
4434
+ "learning_rate": 3.610513571196832e-06,
4435
+ "loss": 0.0007,
4436
+ "step": 581
4437
+ },
4438
+ {
4439
+ "epoch": 0.75,
4440
+ "grad_norm": 0.1321505457162857,
4441
+ "learning_rate": 3.5759556516764205e-06,
4442
+ "loss": 0.0008,
4443
+ "step": 582
4444
+ },
4445
+ {
4446
+ "epoch": 0.7512886597938144,
4447
+ "grad_norm": 0.07236671447753906,
4448
+ "learning_rate": 3.541527867929403e-06,
4449
+ "loss": 0.0004,
4450
+ "step": 583
4451
+ },
4452
+ {
4453
+ "epoch": 0.7525773195876289,
4454
+ "grad_norm": 0.5433388352394104,
4455
+ "learning_rate": 3.507230917380332e-06,
4456
+ "loss": 0.0017,
4457
+ "step": 584
4458
+ },
4459
+ {
4460
+ "epoch": 0.7538659793814433,
4461
+ "grad_norm": 3.733372449874878,
4462
+ "learning_rate": 3.4730654948033957e-06,
4463
+ "loss": 0.0024,
4464
+ "step": 585
4465
+ },
4466
+ {
4467
+ "epoch": 0.7551546391752577,
4468
+ "grad_norm": 4.992002010345459,
4469
+ "learning_rate": 3.4390322923083385e-06,
4470
+ "loss": 0.0182,
4471
+ "step": 586
4472
+ },
4473
+ {
4474
+ "epoch": 0.7564432989690721,
4475
+ "grad_norm": 0.3195149302482605,
4476
+ "learning_rate": 3.4051319993264397e-06,
4477
+ "loss": 0.0009,
4478
+ "step": 587
4479
+ },
4480
+ {
4481
+ "epoch": 0.7577319587628866,
4482
+ "grad_norm": 0.13421054184436798,
4483
+ "learning_rate": 3.3713653025965544e-06,
4484
+ "loss": 0.0008,
4485
+ "step": 588
4486
+ },
4487
+ {
4488
+ "epoch": 0.759020618556701,
4489
+ "grad_norm": 0.12820366024971008,
4490
+ "learning_rate": 3.3377328861511927e-06,
4491
+ "loss": 0.0005,
4492
+ "step": 589
4493
+ },
4494
+ {
4495
+ "epoch": 0.7603092783505154,
4496
+ "grad_norm": 0.8797692060470581,
4497
+ "learning_rate": 3.3042354313026702e-06,
4498
+ "loss": 0.004,
4499
+ "step": 590
4500
+ },
4501
+ {
4502
+ "epoch": 0.7615979381443299,
4503
+ "grad_norm": 0.11013241112232208,
4504
+ "learning_rate": 3.2708736166293064e-06,
4505
+ "loss": 0.0004,
4506
+ "step": 591
4507
+ },
4508
+ {
4509
+ "epoch": 0.7628865979381443,
4510
+ "grad_norm": 0.743179202079773,
4511
+ "learning_rate": 3.237648117961665e-06,
4512
+ "loss": 0.0013,
4513
+ "step": 592
4514
+ },
4515
+ {
4516
+ "epoch": 0.7641752577319587,
4517
+ "grad_norm": 4.201583385467529,
4518
+ "learning_rate": 3.2045596083688814e-06,
4519
+ "loss": 0.0186,
4520
+ "step": 593
4521
+ },
4522
+ {
4523
+ "epoch": 0.7654639175257731,
4524
+ "grad_norm": 1.279377818107605,
4525
+ "learning_rate": 3.1716087581450193e-06,
4526
+ "loss": 0.0031,
4527
+ "step": 594
4528
+ },
4529
+ {
4530
+ "epoch": 0.7667525773195877,
4531
+ "grad_norm": 2.0569307804107666,
4532
+ "learning_rate": 3.1387962347954936e-06,
4533
+ "loss": 0.009,
4534
+ "step": 595
4535
+ },
4536
+ {
4537
+ "epoch": 0.7680412371134021,
4538
+ "grad_norm": 0.5968018174171448,
4539
+ "learning_rate": 3.1061227030235442e-06,
4540
+ "loss": 0.0011,
4541
+ "step": 596
4542
+ },
4543
+ {
4544
+ "epoch": 0.7693298969072165,
4545
+ "grad_norm": 2.1051347255706787,
4546
+ "learning_rate": 3.073588824716777e-06,
4547
+ "loss": 0.005,
4548
+ "step": 597
4549
+ },
4550
+ {
4551
+ "epoch": 0.770618556701031,
4552
+ "grad_norm": 0.3999452292919159,
4553
+ "learning_rate": 3.041195258933749e-06,
4554
+ "loss": 0.0024,
4555
+ "step": 598
4556
+ },
4557
+ {
4558
+ "epoch": 0.7719072164948454,
4559
+ "grad_norm": 1.3566821813583374,
4560
+ "learning_rate": 3.008942661890627e-06,
4561
+ "loss": 0.0027,
4562
+ "step": 599
4563
+ },
4564
+ {
4565
+ "epoch": 0.7731958762886598,
4566
+ "grad_norm": 0.30999279022216797,
4567
+ "learning_rate": 2.976831686947884e-06,
4568
+ "loss": 0.0009,
4569
+ "step": 600
4570
+ },
4571
+ {
4572
+ "epoch": 0.7731958762886598,
4573
+ "eval_accuracy": 0.9980139026812314,
4574
+ "eval_f1": 0.9642857142857143,
4575
+ "eval_loss": 0.013978274539113045,
4576
+ "eval_precision": 0.9818181818181818,
4577
+ "eval_recall": 0.9473684210526315,
4578
+ "eval_runtime": 83.5538,
4579
+ "eval_samples_per_second": 5.446,
4580
+ "eval_steps_per_second": 0.18,
4581
+ "step": 600
4582
  }
4583
  ],
4584
  "logging_steps": 1,
 
4598
  "attributes": {}
4599
  }
4600
  },
4601
+ "total_flos": 2.004089890144256e+17,
4602
  "train_batch_size": 8,
4603
  "trial_name": null,
4604
  "trial_params": null