mtzig commited on
Commit
4882ebd
·
verified ·
1 Parent(s): 2ff964d

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee6fb7682ac2960073f7f7f514a487e812e315d44419de7236d03c8ab15aadb0
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d23824b3f642ae1c179f034608cd4b9f408d94d0f7fcb6cf1a4e20079b004e8c
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88e008363e8993235e6eb2af6c9a5ff56f447d8bdc2cf16eca2f0422b1ede8c6
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be21c5803484f0c4c0b0b4a16dbac528d0b5af1bd54d4586f3265080760294dc
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:44b477d5172b476bbf9b578ef9878ce39658de519ce36fd3743830da1d68fefd
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10500e9d5fe4432017d4802187cbc53c3d8e66cefb65b9df41e5b265ac7d3904
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:145633610b9b152b0d2cfc8f1f6615f8471936dfa77ce591ae7e2e811ac751bb
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd93fdfd38d485f3348d1401133ef3b5e8b9f318176a54d6a3de17e87a0cc244
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:265e64cfbd04131bffb2a5617a8c2c4c2ea31a7fad0d26e752c26ca24a5c050c
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d2d7f1f9c14930a0fe7b92e49a1a38aedb2f9b0b32cf2680ed384f5ea215db8
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a960df6918c81f11623085f85ea94df1837af4b8f4267a1629fb6bab5c469523
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15f402b6d71f018c886c5b84826d5e72c63f77c6a19da8dfa4d1d2ae32c239cb
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5862dd9e056331bb039021a6505bb7a3f1fa98b8a9f5c3fda8a0888d65d5b053
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be1fa2d26d651df333ec36a0c11ff0a77d307383f8747a1e9b552d6a8c33f7d8
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc9bd789414240ee2b44965c46a333328df7f9fce29ffffe29646f97e82894b8
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1880f07905fc31b9ac52835ef69014867a49aee88680a57b62a19264452b89c6
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c60c3492b6f46efa4d46f08722f1ca59fb0c21715cf617886bb5ba859c7d9d5
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a59db3724fd4bbce6ad60141d8e9b678e2c788fb1955c6c880b0f571d42b391
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c8b3388ef358d7f5f11e1879a04d7711953bb9f05b7b304558dfcff8e1df0bd0
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bf84d01b70a9160a4096ae20f0ffd66ca31c1e463be8b3c7a205beb67b59d79
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7c4655d165a45ceaebd2333d540d7ca911eb47056d42042ac402dbf7a3b1875
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fe1aa7e3e5012afa45a34124a8768ba80f30f03a36c3082e890c6fec80f5130
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:076dbf8750dbf683323b1e43171b411204fb0c6b72da57c223dae871c8d1c08e
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7971cf1ed3ade754f5376f0be2af02ca9956ac66a79f39ece9d08be9961f719f
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5b83b9e4abe0e439076bc90dc25d49fcd5d4c81c9c012e6730d7f022accf132
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e43838c246602506a97d56d7f86c5c26c0b2b1f2b549429b3935fd043d6ac6c6
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.46382189239332094,
5
  "eval_steps": 20,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3819,6 +3819,766 @@
3819
  "eval_samples_per_second": 5.71,
3820
  "eval_steps_per_second": 0.186,
3821
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3822
  }
3823
  ],
3824
  "logging_steps": 1,
@@ -3838,7 +4598,7 @@
3838
  "attributes": {}
3839
  }
3840
  },
3841
- "total_flos": 1.5957678843389542e+17,
3842
  "train_batch_size": 8,
3843
  "trial_name": null,
3844
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5565862708719852,
5
  "eval_steps": 20,
6
+ "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3819
  "eval_samples_per_second": 5.71,
3820
  "eval_steps_per_second": 0.186,
3821
  "step": 500
3822
+ },
3823
+ {
3824
+ "epoch": 0.4647495361781076,
3825
+ "grad_norm": 4.0140509605407715,
3826
+ "learning_rate": 1.2935759535959528e-05,
3827
+ "loss": 0.2093,
3828
+ "step": 501
3829
+ },
3830
+ {
3831
+ "epoch": 0.46567717996289426,
3832
+ "grad_norm": 2.7800872325897217,
3833
+ "learning_rate": 1.2904783772807534e-05,
3834
+ "loss": 0.1333,
3835
+ "step": 502
3836
+ },
3837
+ {
3838
+ "epoch": 0.46660482374768086,
3839
+ "grad_norm": 4.866140842437744,
3840
+ "learning_rate": 1.2873777539848284e-05,
3841
+ "loss": 0.2078,
3842
+ "step": 503
3843
+ },
3844
+ {
3845
+ "epoch": 0.4675324675324675,
3846
+ "grad_norm": 4.730177879333496,
3847
+ "learning_rate": 1.2842741162322487e-05,
3848
+ "loss": 0.1834,
3849
+ "step": 504
3850
+ },
3851
+ {
3852
+ "epoch": 0.4684601113172542,
3853
+ "grad_norm": 3.826939582824707,
3854
+ "learning_rate": 1.2811674965787058e-05,
3855
+ "loss": 0.1712,
3856
+ "step": 505
3857
+ },
3858
+ {
3859
+ "epoch": 0.46938775510204084,
3860
+ "grad_norm": 16.19306755065918,
3861
+ "learning_rate": 1.2780579276111702e-05,
3862
+ "loss": 0.2195,
3863
+ "step": 506
3864
+ },
3865
+ {
3866
+ "epoch": 0.47031539888682744,
3867
+ "grad_norm": 4.020465850830078,
3868
+ "learning_rate": 1.2749454419475486e-05,
3869
+ "loss": 0.1387,
3870
+ "step": 507
3871
+ },
3872
+ {
3873
+ "epoch": 0.4712430426716141,
3874
+ "grad_norm": 3.083519697189331,
3875
+ "learning_rate": 1.2718300722363431e-05,
3876
+ "loss": 0.1595,
3877
+ "step": 508
3878
+ },
3879
+ {
3880
+ "epoch": 0.47217068645640076,
3881
+ "grad_norm": 3.982968807220459,
3882
+ "learning_rate": 1.2687118511563075e-05,
3883
+ "loss": 0.1304,
3884
+ "step": 509
3885
+ },
3886
+ {
3887
+ "epoch": 0.47309833024118736,
3888
+ "grad_norm": 4.213962554931641,
3889
+ "learning_rate": 1.2655908114161053e-05,
3890
+ "loss": 0.1269,
3891
+ "step": 510
3892
+ },
3893
+ {
3894
+ "epoch": 0.474025974025974,
3895
+ "grad_norm": 3.9727156162261963,
3896
+ "learning_rate": 1.2624669857539669e-05,
3897
+ "loss": 0.1327,
3898
+ "step": 511
3899
+ },
3900
+ {
3901
+ "epoch": 0.4749536178107607,
3902
+ "grad_norm": 4.792732238769531,
3903
+ "learning_rate": 1.2593404069373452e-05,
3904
+ "loss": 0.1782,
3905
+ "step": 512
3906
+ },
3907
+ {
3908
+ "epoch": 0.47588126159554733,
3909
+ "grad_norm": 3.0668811798095703,
3910
+ "learning_rate": 1.2562111077625723e-05,
3911
+ "loss": 0.1568,
3912
+ "step": 513
3913
+ },
3914
+ {
3915
+ "epoch": 0.47680890538033394,
3916
+ "grad_norm": 4.204139709472656,
3917
+ "learning_rate": 1.2530791210545163e-05,
3918
+ "loss": 0.149,
3919
+ "step": 514
3920
+ },
3921
+ {
3922
+ "epoch": 0.4777365491651206,
3923
+ "grad_norm": 4.396578788757324,
3924
+ "learning_rate": 1.2499444796662354e-05,
3925
+ "loss": 0.1943,
3926
+ "step": 515
3927
+ },
3928
+ {
3929
+ "epoch": 0.47866419294990725,
3930
+ "grad_norm": 8.477376937866211,
3931
+ "learning_rate": 1.2468072164786342e-05,
3932
+ "loss": 0.3153,
3933
+ "step": 516
3934
+ },
3935
+ {
3936
+ "epoch": 0.47959183673469385,
3937
+ "grad_norm": 4.493762016296387,
3938
+ "learning_rate": 1.2436673644001196e-05,
3939
+ "loss": 0.2028,
3940
+ "step": 517
3941
+ },
3942
+ {
3943
+ "epoch": 0.4805194805194805,
3944
+ "grad_norm": 3.7687320709228516,
3945
+ "learning_rate": 1.2405249563662539e-05,
3946
+ "loss": 0.1834,
3947
+ "step": 518
3948
+ },
3949
+ {
3950
+ "epoch": 0.48144712430426717,
3951
+ "grad_norm": 5.828054428100586,
3952
+ "learning_rate": 1.23738002533941e-05,
3953
+ "loss": 0.1587,
3954
+ "step": 519
3955
+ },
3956
+ {
3957
+ "epoch": 0.48237476808905383,
3958
+ "grad_norm": 4.38277006149292,
3959
+ "learning_rate": 1.2342326043084268e-05,
3960
+ "loss": 0.1809,
3961
+ "step": 520
3962
+ },
3963
+ {
3964
+ "epoch": 0.48237476808905383,
3965
+ "eval_accuracy": 0.8458980044345898,
3966
+ "eval_f1": 0.6584766584766585,
3967
+ "eval_loss": 0.3318649232387543,
3968
+ "eval_precision": 0.8701298701298701,
3969
+ "eval_recall": 0.5296442687747036,
3970
+ "eval_runtime": 46.7049,
3971
+ "eval_samples_per_second": 5.909,
3972
+ "eval_steps_per_second": 0.193,
3973
+ "step": 520
3974
+ },
3975
+ {
3976
+ "epoch": 0.48330241187384043,
3977
+ "grad_norm": 4.514082908630371,
3978
+ "learning_rate": 1.2310827262882614e-05,
3979
+ "loss": 0.1759,
3980
+ "step": 521
3981
+ },
3982
+ {
3983
+ "epoch": 0.4842300556586271,
3984
+ "grad_norm": 3.4471170902252197,
3985
+ "learning_rate": 1.2279304243196438e-05,
3986
+ "loss": 0.1364,
3987
+ "step": 522
3988
+ },
3989
+ {
3990
+ "epoch": 0.48515769944341375,
3991
+ "grad_norm": 2.6771576404571533,
3992
+ "learning_rate": 1.2247757314687296e-05,
3993
+ "loss": 0.146,
3994
+ "step": 523
3995
+ },
3996
+ {
3997
+ "epoch": 0.48608534322820035,
3998
+ "grad_norm": 3.6538710594177246,
3999
+ "learning_rate": 1.2216186808267544e-05,
4000
+ "loss": 0.2009,
4001
+ "step": 524
4002
+ },
4003
+ {
4004
+ "epoch": 0.487012987012987,
4005
+ "grad_norm": 4.4636149406433105,
4006
+ "learning_rate": 1.2184593055096853e-05,
4007
+ "loss": 0.1474,
4008
+ "step": 525
4009
+ },
4010
+ {
4011
+ "epoch": 0.48794063079777367,
4012
+ "grad_norm": 5.249742031097412,
4013
+ "learning_rate": 1.215297638657875e-05,
4014
+ "loss": 0.1564,
4015
+ "step": 526
4016
+ },
4017
+ {
4018
+ "epoch": 0.48886827458256027,
4019
+ "grad_norm": 6.001680850982666,
4020
+ "learning_rate": 1.2121337134357121e-05,
4021
+ "loss": 0.1718,
4022
+ "step": 527
4023
+ },
4024
+ {
4025
+ "epoch": 0.4897959183673469,
4026
+ "grad_norm": 6.546911716461182,
4027
+ "learning_rate": 1.2089675630312755e-05,
4028
+ "loss": 0.2193,
4029
+ "step": 528
4030
+ },
4031
+ {
4032
+ "epoch": 0.4907235621521336,
4033
+ "grad_norm": 2.6112513542175293,
4034
+ "learning_rate": 1.2057992206559837e-05,
4035
+ "loss": 0.1295,
4036
+ "step": 529
4037
+ },
4038
+ {
4039
+ "epoch": 0.49165120593692024,
4040
+ "grad_norm": 3.9656717777252197,
4041
+ "learning_rate": 1.2026287195442503e-05,
4042
+ "loss": 0.1707,
4043
+ "step": 530
4044
+ },
4045
+ {
4046
+ "epoch": 0.49257884972170685,
4047
+ "grad_norm": 3.8426477909088135,
4048
+ "learning_rate": 1.199456092953131e-05,
4049
+ "loss": 0.1768,
4050
+ "step": 531
4051
+ },
4052
+ {
4053
+ "epoch": 0.4935064935064935,
4054
+ "grad_norm": 3.001831531524658,
4055
+ "learning_rate": 1.1962813741619777e-05,
4056
+ "loss": 0.1839,
4057
+ "step": 532
4058
+ },
4059
+ {
4060
+ "epoch": 0.49443413729128016,
4061
+ "grad_norm": 5.149347305297852,
4062
+ "learning_rate": 1.1931045964720882e-05,
4063
+ "loss": 0.2559,
4064
+ "step": 533
4065
+ },
4066
+ {
4067
+ "epoch": 0.49536178107606677,
4068
+ "grad_norm": 3.0468552112579346,
4069
+ "learning_rate": 1.189925793206357e-05,
4070
+ "loss": 0.1408,
4071
+ "step": 534
4072
+ },
4073
+ {
4074
+ "epoch": 0.4962894248608534,
4075
+ "grad_norm": 4.5860490798950195,
4076
+ "learning_rate": 1.1867449977089264e-05,
4077
+ "loss": 0.1945,
4078
+ "step": 535
4079
+ },
4080
+ {
4081
+ "epoch": 0.4972170686456401,
4082
+ "grad_norm": 3.8901429176330566,
4083
+ "learning_rate": 1.1835622433448361e-05,
4084
+ "loss": 0.2126,
4085
+ "step": 536
4086
+ },
4087
+ {
4088
+ "epoch": 0.49814471243042674,
4089
+ "grad_norm": 4.597271919250488,
4090
+ "learning_rate": 1.1803775634996735e-05,
4091
+ "loss": 0.1977,
4092
+ "step": 537
4093
+ },
4094
+ {
4095
+ "epoch": 0.49907235621521334,
4096
+ "grad_norm": 3.079770803451538,
4097
+ "learning_rate": 1.177190991579223e-05,
4098
+ "loss": 0.1758,
4099
+ "step": 538
4100
+ },
4101
+ {
4102
+ "epoch": 0.5,
4103
+ "grad_norm": 4.214216709136963,
4104
+ "learning_rate": 1.174002561009116e-05,
4105
+ "loss": 0.141,
4106
+ "step": 539
4107
+ },
4108
+ {
4109
+ "epoch": 0.5009276437847866,
4110
+ "grad_norm": 5.213557243347168,
4111
+ "learning_rate": 1.1708123052344803e-05,
4112
+ "loss": 0.1605,
4113
+ "step": 540
4114
+ },
4115
+ {
4116
+ "epoch": 0.5009276437847866,
4117
+ "eval_accuracy": 0.8614190687361419,
4118
+ "eval_f1": 0.7072599531615925,
4119
+ "eval_loss": 0.30164089798927307,
4120
+ "eval_precision": 0.867816091954023,
4121
+ "eval_recall": 0.5968379446640316,
4122
+ "eval_runtime": 47.208,
4123
+ "eval_samples_per_second": 5.846,
4124
+ "eval_steps_per_second": 0.191,
4125
+ "step": 540
4126
+ },
4127
+ {
4128
+ "epoch": 0.5018552875695733,
4129
+ "grad_norm": 2.1639671325683594,
4130
+ "learning_rate": 1.1676202577195901e-05,
4131
+ "loss": 0.0809,
4132
+ "step": 541
4133
+ },
4134
+ {
4135
+ "epoch": 0.5027829313543599,
4136
+ "grad_norm": 4.442990303039551,
4137
+ "learning_rate": 1.164426451947513e-05,
4138
+ "loss": 0.1677,
4139
+ "step": 542
4140
+ },
4141
+ {
4142
+ "epoch": 0.5037105751391465,
4143
+ "grad_norm": 3.968435049057007,
4144
+ "learning_rate": 1.1612309214197599e-05,
4145
+ "loss": 0.2114,
4146
+ "step": 543
4147
+ },
4148
+ {
4149
+ "epoch": 0.5046382189239332,
4150
+ "grad_norm": 9.67383098602295,
4151
+ "learning_rate": 1.1580336996559343e-05,
4152
+ "loss": 0.2159,
4153
+ "step": 544
4154
+ },
4155
+ {
4156
+ "epoch": 0.5055658627087198,
4157
+ "grad_norm": 3.427710771560669,
4158
+ "learning_rate": 1.1548348201933799e-05,
4159
+ "loss": 0.0846,
4160
+ "step": 545
4161
+ },
4162
+ {
4163
+ "epoch": 0.5064935064935064,
4164
+ "grad_norm": 3.03241229057312,
4165
+ "learning_rate": 1.151634316586828e-05,
4166
+ "loss": 0.1195,
4167
+ "step": 546
4168
+ },
4169
+ {
4170
+ "epoch": 0.5074211502782932,
4171
+ "grad_norm": 5.138626575469971,
4172
+ "learning_rate": 1.1484322224080474e-05,
4173
+ "loss": 0.1542,
4174
+ "step": 547
4175
+ },
4176
+ {
4177
+ "epoch": 0.5083487940630798,
4178
+ "grad_norm": 4.37513542175293,
4179
+ "learning_rate": 1.1452285712454905e-05,
4180
+ "loss": 0.2118,
4181
+ "step": 548
4182
+ },
4183
+ {
4184
+ "epoch": 0.5092764378478665,
4185
+ "grad_norm": 4.746356010437012,
4186
+ "learning_rate": 1.1420233967039423e-05,
4187
+ "loss": 0.1456,
4188
+ "step": 549
4189
+ },
4190
+ {
4191
+ "epoch": 0.5102040816326531,
4192
+ "grad_norm": 5.190648555755615,
4193
+ "learning_rate": 1.138816732404167e-05,
4194
+ "loss": 0.1921,
4195
+ "step": 550
4196
+ },
4197
+ {
4198
+ "epoch": 0.5111317254174397,
4199
+ "grad_norm": 3.955061197280884,
4200
+ "learning_rate": 1.1356086119825553e-05,
4201
+ "loss": 0.1964,
4202
+ "step": 551
4203
+ },
4204
+ {
4205
+ "epoch": 0.5120593692022264,
4206
+ "grad_norm": 7.773352146148682,
4207
+ "learning_rate": 1.1323990690907734e-05,
4208
+ "loss": 0.2178,
4209
+ "step": 552
4210
+ },
4211
+ {
4212
+ "epoch": 0.512987012987013,
4213
+ "grad_norm": 3.211651563644409,
4214
+ "learning_rate": 1.1291881373954066e-05,
4215
+ "loss": 0.1859,
4216
+ "step": 553
4217
+ },
4218
+ {
4219
+ "epoch": 0.5139146567717996,
4220
+ "grad_norm": 3.536742687225342,
4221
+ "learning_rate": 1.1259758505776092e-05,
4222
+ "loss": 0.1949,
4223
+ "step": 554
4224
+ },
4225
+ {
4226
+ "epoch": 0.5148423005565863,
4227
+ "grad_norm": 4.817080974578857,
4228
+ "learning_rate": 1.1227622423327501e-05,
4229
+ "loss": 0.2482,
4230
+ "step": 555
4231
+ },
4232
+ {
4233
+ "epoch": 0.5157699443413729,
4234
+ "grad_norm": 4.828971862792969,
4235
+ "learning_rate": 1.119547346370059e-05,
4236
+ "loss": 0.216,
4237
+ "step": 556
4238
+ },
4239
+ {
4240
+ "epoch": 0.5166975881261595,
4241
+ "grad_norm": 4.580413818359375,
4242
+ "learning_rate": 1.1163311964122733e-05,
4243
+ "loss": 0.2267,
4244
+ "step": 557
4245
+ },
4246
+ {
4247
+ "epoch": 0.5176252319109462,
4248
+ "grad_norm": 2.9997427463531494,
4249
+ "learning_rate": 1.1131138261952845e-05,
4250
+ "loss": 0.1556,
4251
+ "step": 558
4252
+ },
4253
+ {
4254
+ "epoch": 0.5185528756957328,
4255
+ "grad_norm": 2.876847982406616,
4256
+ "learning_rate": 1.109895269467783e-05,
4257
+ "loss": 0.1551,
4258
+ "step": 559
4259
+ },
4260
+ {
4261
+ "epoch": 0.5194805194805194,
4262
+ "grad_norm": 6.003294467926025,
4263
+ "learning_rate": 1.1066755599909065e-05,
4264
+ "loss": 0.2123,
4265
+ "step": 560
4266
+ },
4267
+ {
4268
+ "epoch": 0.5194805194805194,
4269
+ "eval_accuracy": 0.8603104212860311,
4270
+ "eval_f1": 0.7136363636363636,
4271
+ "eval_loss": 0.2982672452926636,
4272
+ "eval_precision": 0.839572192513369,
4273
+ "eval_recall": 0.6205533596837944,
4274
+ "eval_runtime": 48.6549,
4275
+ "eval_samples_per_second": 5.673,
4276
+ "eval_steps_per_second": 0.185,
4277
+ "step": 560
4278
+ },
4279
+ {
4280
+ "epoch": 0.5204081632653061,
4281
+ "grad_norm": 3.7513365745544434,
4282
+ "learning_rate": 1.1034547315378838e-05,
4283
+ "loss": 0.1808,
4284
+ "step": 561
4285
+ },
4286
+ {
4287
+ "epoch": 0.5213358070500927,
4288
+ "grad_norm": 2.755664110183716,
4289
+ "learning_rate": 1.1002328178936813e-05,
4290
+ "loss": 0.1272,
4291
+ "step": 562
4292
+ },
4293
+ {
4294
+ "epoch": 0.5222634508348795,
4295
+ "grad_norm": 6.458003520965576,
4296
+ "learning_rate": 1.0970098528546482e-05,
4297
+ "loss": 0.161,
4298
+ "step": 563
4299
+ },
4300
+ {
4301
+ "epoch": 0.5231910946196661,
4302
+ "grad_norm": 3.156869649887085,
4303
+ "learning_rate": 1.0937858702281631e-05,
4304
+ "loss": 0.1423,
4305
+ "step": 564
4306
+ },
4307
+ {
4308
+ "epoch": 0.5241187384044527,
4309
+ "grad_norm": 4.826041221618652,
4310
+ "learning_rate": 1.090560903832278e-05,
4311
+ "loss": 0.1717,
4312
+ "step": 565
4313
+ },
4314
+ {
4315
+ "epoch": 0.5250463821892394,
4316
+ "grad_norm": 2.8493692874908447,
4317
+ "learning_rate": 1.087334987495364e-05,
4318
+ "loss": 0.147,
4319
+ "step": 566
4320
+ },
4321
+ {
4322
+ "epoch": 0.525974025974026,
4323
+ "grad_norm": 3.3068854808807373,
4324
+ "learning_rate": 1.0841081550557577e-05,
4325
+ "loss": 0.1084,
4326
+ "step": 567
4327
+ },
4328
+ {
4329
+ "epoch": 0.5269016697588126,
4330
+ "grad_norm": 3.908871650695801,
4331
+ "learning_rate": 1.0808804403614044e-05,
4332
+ "loss": 0.1484,
4333
+ "step": 568
4334
+ },
4335
+ {
4336
+ "epoch": 0.5278293135435993,
4337
+ "grad_norm": 3.8984129428863525,
4338
+ "learning_rate": 1.0776518772695035e-05,
4339
+ "loss": 0.1883,
4340
+ "step": 569
4341
+ },
4342
+ {
4343
+ "epoch": 0.5287569573283859,
4344
+ "grad_norm": 3.58467960357666,
4345
+ "learning_rate": 1.0744224996461541e-05,
4346
+ "loss": 0.1238,
4347
+ "step": 570
4348
+ },
4349
+ {
4350
+ "epoch": 0.5296846011131725,
4351
+ "grad_norm": 5.206255912780762,
4352
+ "learning_rate": 1.0711923413659995e-05,
4353
+ "loss": 0.2008,
4354
+ "step": 571
4355
+ },
4356
+ {
4357
+ "epoch": 0.5306122448979592,
4358
+ "grad_norm": 2.4864425659179688,
4359
+ "learning_rate": 1.0679614363118718e-05,
4360
+ "loss": 0.1181,
4361
+ "step": 572
4362
+ },
4363
+ {
4364
+ "epoch": 0.5315398886827458,
4365
+ "grad_norm": 3.949312686920166,
4366
+ "learning_rate": 1.0647298183744359e-05,
4367
+ "loss": 0.1927,
4368
+ "step": 573
4369
+ },
4370
+ {
4371
+ "epoch": 0.5324675324675324,
4372
+ "grad_norm": 6.005074501037598,
4373
+ "learning_rate": 1.061497521451835e-05,
4374
+ "loss": 0.1936,
4375
+ "step": 574
4376
+ },
4377
+ {
4378
+ "epoch": 0.5333951762523191,
4379
+ "grad_norm": 4.429588317871094,
4380
+ "learning_rate": 1.0582645794493337e-05,
4381
+ "loss": 0.2031,
4382
+ "step": 575
4383
+ },
4384
+ {
4385
+ "epoch": 0.5343228200371057,
4386
+ "grad_norm": 4.36995792388916,
4387
+ "learning_rate": 1.055031026278965e-05,
4388
+ "loss": 0.1846,
4389
+ "step": 576
4390
+ },
4391
+ {
4392
+ "epoch": 0.5352504638218923,
4393
+ "grad_norm": 4.059164524078369,
4394
+ "learning_rate": 1.0517968958591705e-05,
4395
+ "loss": 0.1681,
4396
+ "step": 577
4397
+ },
4398
+ {
4399
+ "epoch": 0.536178107606679,
4400
+ "grad_norm": 4.211386203765869,
4401
+ "learning_rate": 1.0485622221144485e-05,
4402
+ "loss": 0.1296,
4403
+ "step": 578
4404
+ },
4405
+ {
4406
+ "epoch": 0.5371057513914657,
4407
+ "grad_norm": 4.114365100860596,
4408
+ "learning_rate": 1.0453270389749956e-05,
4409
+ "loss": 0.164,
4410
+ "step": 579
4411
+ },
4412
+ {
4413
+ "epoch": 0.5380333951762524,
4414
+ "grad_norm": 7.943957805633545,
4415
+ "learning_rate": 1.0420913803763522e-05,
4416
+ "loss": 0.2279,
4417
+ "step": 580
4418
+ },
4419
+ {
4420
+ "epoch": 0.5380333951762524,
4421
+ "eval_accuracy": 0.8558758314855875,
4422
+ "eval_f1": 0.7018348623853211,
4423
+ "eval_loss": 0.30458346009254456,
4424
+ "eval_precision": 0.8360655737704918,
4425
+ "eval_recall": 0.6047430830039525,
4426
+ "eval_runtime": 46.8877,
4427
+ "eval_samples_per_second": 5.886,
4428
+ "eval_steps_per_second": 0.192,
4429
+ "step": 580
4430
+ },
4431
+ {
4432
+ "epoch": 0.538961038961039,
4433
+ "grad_norm": 7.277109146118164,
4434
+ "learning_rate": 1.0388552802590461e-05,
4435
+ "loss": 0.0867,
4436
+ "step": 581
4437
+ },
4438
+ {
4439
+ "epoch": 0.5398886827458256,
4440
+ "grad_norm": 3.722276449203491,
4441
+ "learning_rate": 1.0356187725682359e-05,
4442
+ "loss": 0.1233,
4443
+ "step": 582
4444
+ },
4445
+ {
4446
+ "epoch": 0.5408163265306123,
4447
+ "grad_norm": 4.75911283493042,
4448
+ "learning_rate": 1.0323818912533561e-05,
4449
+ "loss": 0.2018,
4450
+ "step": 583
4451
+ },
4452
+ {
4453
+ "epoch": 0.5417439703153989,
4454
+ "grad_norm": 5.309391498565674,
4455
+ "learning_rate": 1.0291446702677598e-05,
4456
+ "loss": 0.1853,
4457
+ "step": 584
4458
+ },
4459
+ {
4460
+ "epoch": 0.5426716141001855,
4461
+ "grad_norm": 3.6361756324768066,
4462
+ "learning_rate": 1.0259071435683636e-05,
4463
+ "loss": 0.1594,
4464
+ "step": 585
4465
+ },
4466
+ {
4467
+ "epoch": 0.5435992578849722,
4468
+ "grad_norm": 8.379057884216309,
4469
+ "learning_rate": 1.02266934511529e-05,
4470
+ "loss": 0.1496,
4471
+ "step": 586
4472
+ },
4473
+ {
4474
+ "epoch": 0.5445269016697588,
4475
+ "grad_norm": 6.6115593910217285,
4476
+ "learning_rate": 1.0194313088715135e-05,
4477
+ "loss": 0.1887,
4478
+ "step": 587
4479
+ },
4480
+ {
4481
+ "epoch": 0.5454545454545454,
4482
+ "grad_norm": 3.5273900032043457,
4483
+ "learning_rate": 1.0161930688025018e-05,
4484
+ "loss": 0.1549,
4485
+ "step": 588
4486
+ },
4487
+ {
4488
+ "epoch": 0.5463821892393321,
4489
+ "grad_norm": 4.194179058074951,
4490
+ "learning_rate": 1.0129546588758605e-05,
4491
+ "loss": 0.1748,
4492
+ "step": 589
4493
+ },
4494
+ {
4495
+ "epoch": 0.5473098330241187,
4496
+ "grad_norm": 3.131457805633545,
4497
+ "learning_rate": 1.0097161130609774e-05,
4498
+ "loss": 0.1319,
4499
+ "step": 590
4500
+ },
4501
+ {
4502
+ "epoch": 0.5482374768089053,
4503
+ "grad_norm": 4.083921909332275,
4504
+ "learning_rate": 1.0064774653286662e-05,
4505
+ "loss": 0.1759,
4506
+ "step": 591
4507
+ },
4508
+ {
4509
+ "epoch": 0.549165120593692,
4510
+ "grad_norm": 3.384917736053467,
4511
+ "learning_rate": 1.003238749650809e-05,
4512
+ "loss": 0.1434,
4513
+ "step": 592
4514
+ },
4515
+ {
4516
+ "epoch": 0.5500927643784786,
4517
+ "grad_norm": 2.5680859088897705,
4518
+ "learning_rate": 1e-05,
4519
+ "loss": 0.1626,
4520
+ "step": 593
4521
+ },
4522
+ {
4523
+ "epoch": 0.5510204081632653,
4524
+ "grad_norm": 3.510484457015991,
4525
+ "learning_rate": 9.967612503491915e-06,
4526
+ "loss": 0.1701,
4527
+ "step": 594
4528
+ },
4529
+ {
4530
+ "epoch": 0.551948051948052,
4531
+ "grad_norm": 5.625200271606445,
4532
+ "learning_rate": 9.935225346713341e-06,
4533
+ "loss": 0.1486,
4534
+ "step": 595
4535
+ },
4536
+ {
4537
+ "epoch": 0.5528756957328386,
4538
+ "grad_norm": 5.352198123931885,
4539
+ "learning_rate": 9.90283886939023e-06,
4540
+ "loss": 0.1158,
4541
+ "step": 596
4542
+ },
4543
+ {
4544
+ "epoch": 0.5538033395176253,
4545
+ "grad_norm": 3.0487513542175293,
4546
+ "learning_rate": 9.870453411241399e-06,
4547
+ "loss": 0.1339,
4548
+ "step": 597
4549
+ },
4550
+ {
4551
+ "epoch": 0.5547309833024119,
4552
+ "grad_norm": 3.2191503047943115,
4553
+ "learning_rate": 9.838069311974986e-06,
4554
+ "loss": 0.1147,
4555
+ "step": 598
4556
+ },
4557
+ {
4558
+ "epoch": 0.5556586270871985,
4559
+ "grad_norm": 4.020503520965576,
4560
+ "learning_rate": 9.805686911284867e-06,
4561
+ "loss": 0.1163,
4562
+ "step": 599
4563
+ },
4564
+ {
4565
+ "epoch": 0.5565862708719852,
4566
+ "grad_norm": 6.643052101135254,
4567
+ "learning_rate": 9.773306548847102e-06,
4568
+ "loss": 0.2224,
4569
+ "step": 600
4570
+ },
4571
+ {
4572
+ "epoch": 0.5565862708719852,
4573
+ "eval_accuracy": 0.8381374722838137,
4574
+ "eval_f1": 0.6313131313131313,
4575
+ "eval_loss": 0.3394555449485779,
4576
+ "eval_precision": 0.8741258741258742,
4577
+ "eval_recall": 0.49407114624505927,
4578
+ "eval_runtime": 48.1127,
4579
+ "eval_samples_per_second": 5.737,
4580
+ "eval_steps_per_second": 0.187,
4581
+ "step": 600
4582
  }
4583
  ],
4584
  "logging_steps": 1,
 
4598
  "attributes": {}
4599
  }
4600
  },
4601
+ "total_flos": 1.915741517465518e+17,
4602
  "train_batch_size": 8,
4603
  "trial_name": null,
4604
  "trial_params": null