mtzig commited on
Commit
0b06254
·
verified ·
1 Parent(s): 51a8e90

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:046ad15b3a172be6c8a55556a3c20f15ef4ee714b05b61a7d6c92d4c6c9e3474
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7454f5845a5f23b4913750fbc16c23c17c15de424259fe838f50365af1a6fc31
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c99483ad02ea22340771991cc6669d1256f76d10e032f69951c7480de7534bf0
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85f7e2225ecc77b15d23f17a6bea4d6eff8126f32e4809d5fb9012178465eb09
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:15aeeac899877619eba9935ba0590b8f1fa55e2d75c220d96a220798bf78d453
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a00dd5beda4d499c230a9f8ca29e291ebb388ce836dab2a836a479101ade1b29
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c991dfb7fae6e7e8e823a6ff78f7059aa6c3e2ee08cfa323cee3a4c276002a52
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f93bf48e5d0d2bd9aeb51d2e8ded2cd18f7df64624a0dc6007c452f77b97c0b
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2efd166763c9a22763a8e34b47f368dce987da9bde3aa0da236e9078b6b587f0
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:321edf67e47d0b4c67f8b27b7638c352cf813e649e5ee9996cca152fb9c75062
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e892fe343502bd3fc0cfd63b8565786111da6ae6996697589256c318e3c3076d
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fed803da648fcbfb47d40579c66789c1b5813d1ca024980a285cc0b048653350
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e97d6a38ca3edf5744f51d03cec6812554f609ad1d7c762e2e3dcca3bc8af260
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c97e2f6d521cea150acde4f0539f7f18dddad1bc75aab9306523cb6a5047e1ae
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13a2e456b9f475387054566e8f129204eb628e3726aee77c4412ff11fc720706
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f33f2493dd6209c0c885d7b0bc168e9eae2ef749d5250330e5bbff5e28b5a6d9
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99b43935a8d0c3ec7f6a15b5d02d38b25daf586d495f97529bae66a69e46d216
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79f19fb56fbaf65a779ecd9cb3b30247bfd45158eafdcd105585ee87f1735d98
3
  size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31626c13f5c4cf0cf88e6b691ef4408c4d52105b3855f7889d25ca5f4f0a0734
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afab5b50c52c688a707864652d6c5558ef69395b7a2375fa583b1b7139f6c609
3
  size 15024
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04807176bb2c22eeb6b0258c9226b2dfd4f8b8398c96841c95458ef393e0f56a
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1755b69d438567bb52e96e1dc58c619fdbd9a694f1794f20010dff70a9f6c151
3
  size 15024
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40e15fa019a19c6cd7ce2d72f5afc609c5c0b834df5220f887d5ba71dae814ca
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd2c36f5a9cc648d59e856fb43df0193c9d5e0acbe931f548fb0ceebfba5c6a1
3
  size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:689f27834221968cbb24970b2a0ef37515a668dd8bd2e8a00c81e11a90d7d4a4
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be9988adf605fcbeeb14de5bdc4b2db6b176f9774ef7818d04698d021a01fbf6
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.6276150627615062,
5
  "eval_steps": 20,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3819,6 +3819,766 @@
3819
  "eval_samples_per_second": 5.236,
3820
  "eval_steps_per_second": 0.17,
3821
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3822
  }
3823
  ],
3824
  "logging_steps": 1,
@@ -3838,7 +4598,7 @@
3838
  "attributes": {}
3839
  }
3840
  },
3841
- "total_flos": 1.8275482733012582e+17,
3842
  "train_batch_size": 6,
3843
  "trial_name": null,
3844
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7531380753138075,
5
  "eval_steps": 20,
6
+ "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3819
  "eval_samples_per_second": 5.236,
3820
  "eval_steps_per_second": 0.17,
3821
  "step": 500
3822
+ },
3823
+ {
3824
+ "epoch": 0.6288702928870293,
3825
+ "grad_norm": 4.248108863830566,
3826
+ "learning_rate": 7.270818333554665e-06,
3827
+ "loss": 0.2752,
3828
+ "step": 501
3829
+ },
3830
+ {
3831
+ "epoch": 0.6301255230125523,
3832
+ "grad_norm": 3.575007677078247,
3833
+ "learning_rate": 7.228633438760138e-06,
3834
+ "loss": 0.238,
3835
+ "step": 502
3836
+ },
3837
+ {
3838
+ "epoch": 0.6313807531380753,
3839
+ "grad_norm": 5.464937210083008,
3840
+ "learning_rate": 7.186501897964644e-06,
3841
+ "loss": 0.2215,
3842
+ "step": 503
3843
+ },
3844
+ {
3845
+ "epoch": 0.6326359832635984,
3846
+ "grad_norm": 4.194279670715332,
3847
+ "learning_rate": 7.144424522279283e-06,
3848
+ "loss": 0.2387,
3849
+ "step": 504
3850
+ },
3851
+ {
3852
+ "epoch": 0.6338912133891214,
3853
+ "grad_norm": 4.254185199737549,
3854
+ "learning_rate": 7.102402121772378e-06,
3855
+ "loss": 0.2539,
3856
+ "step": 505
3857
+ },
3858
+ {
3859
+ "epoch": 0.6351464435146443,
3860
+ "grad_norm": 8.297247886657715,
3861
+ "learning_rate": 7.060435505453884e-06,
3862
+ "loss": 0.2829,
3863
+ "step": 506
3864
+ },
3865
+ {
3866
+ "epoch": 0.6364016736401673,
3867
+ "grad_norm": 5.86175012588501,
3868
+ "learning_rate": 7.018525481259787e-06,
3869
+ "loss": 0.2731,
3870
+ "step": 507
3871
+ },
3872
+ {
3873
+ "epoch": 0.6376569037656904,
3874
+ "grad_norm": 4.584890365600586,
3875
+ "learning_rate": 6.976672856036586e-06,
3876
+ "loss": 0.1941,
3877
+ "step": 508
3878
+ },
3879
+ {
3880
+ "epoch": 0.6389121338912134,
3881
+ "grad_norm": 4.639788627624512,
3882
+ "learning_rate": 6.934878435525736e-06,
3883
+ "loss": 0.2143,
3884
+ "step": 509
3885
+ },
3886
+ {
3887
+ "epoch": 0.6401673640167364,
3888
+ "grad_norm": 4.085133075714111,
3889
+ "learning_rate": 6.893143024348137e-06,
3890
+ "loss": 0.2428,
3891
+ "step": 510
3892
+ },
3893
+ {
3894
+ "epoch": 0.6414225941422594,
3895
+ "grad_norm": 5.64658784866333,
3896
+ "learning_rate": 6.851467425988663e-06,
3897
+ "loss": 0.2014,
3898
+ "step": 511
3899
+ },
3900
+ {
3901
+ "epoch": 0.6426778242677824,
3902
+ "grad_norm": 4.088956832885742,
3903
+ "learning_rate": 6.809852442780664e-06,
3904
+ "loss": 0.185,
3905
+ "step": 512
3906
+ },
3907
+ {
3908
+ "epoch": 0.6439330543933054,
3909
+ "grad_norm": 5.6827921867370605,
3910
+ "learning_rate": 6.768298875890541e-06,
3911
+ "loss": 0.3019,
3912
+ "step": 513
3913
+ },
3914
+ {
3915
+ "epoch": 0.6451882845188285,
3916
+ "grad_norm": 6.169975757598877,
3917
+ "learning_rate": 6.726807525302319e-06,
3918
+ "loss": 0.2872,
3919
+ "step": 514
3920
+ },
3921
+ {
3922
+ "epoch": 0.6464435146443515,
3923
+ "grad_norm": 4.995835304260254,
3924
+ "learning_rate": 6.685379189802241e-06,
3925
+ "loss": 0.2611,
3926
+ "step": 515
3927
+ },
3928
+ {
3929
+ "epoch": 0.6476987447698744,
3930
+ "grad_norm": 3.8299150466918945,
3931
+ "learning_rate": 6.6440146669633855e-06,
3932
+ "loss": 0.1959,
3933
+ "step": 516
3934
+ },
3935
+ {
3936
+ "epoch": 0.6489539748953975,
3937
+ "grad_norm": 4.961380958557129,
3938
+ "learning_rate": 6.602714753130322e-06,
3939
+ "loss": 0.2274,
3940
+ "step": 517
3941
+ },
3942
+ {
3943
+ "epoch": 0.6502092050209205,
3944
+ "grad_norm": 4.710041522979736,
3945
+ "learning_rate": 6.561480243403776e-06,
3946
+ "loss": 0.3025,
3947
+ "step": 518
3948
+ },
3949
+ {
3950
+ "epoch": 0.6514644351464435,
3951
+ "grad_norm": 3.762503147125244,
3952
+ "learning_rate": 6.520311931625325e-06,
3953
+ "loss": 0.25,
3954
+ "step": 519
3955
+ },
3956
+ {
3957
+ "epoch": 0.6527196652719666,
3958
+ "grad_norm": 3.377311944961548,
3959
+ "learning_rate": 6.479210610362103e-06,
3960
+ "loss": 0.1953,
3961
+ "step": 520
3962
+ },
3963
+ {
3964
+ "epoch": 0.6527196652719666,
3965
+ "eval_accuracy": 0.8576158940397351,
3966
+ "eval_f1": 0.7020785219399538,
3967
+ "eval_loss": 0.29889100790023804,
3968
+ "eval_precision": 0.8491620111731844,
3969
+ "eval_recall": 0.5984251968503937,
3970
+ "eval_runtime": 50.9239,
3971
+ "eval_samples_per_second": 5.439,
3972
+ "eval_steps_per_second": 0.177,
3973
+ "step": 520
3974
+ },
3975
+ {
3976
+ "epoch": 0.6539748953974895,
3977
+ "grad_norm": 4.013047218322754,
3978
+ "learning_rate": 6.4381770708915594e-06,
3979
+ "loss": 0.2467,
3980
+ "step": 521
3981
+ },
3982
+ {
3983
+ "epoch": 0.6552301255230125,
3984
+ "grad_norm": 3.758030652999878,
3985
+ "learning_rate": 6.397212103186214e-06,
3986
+ "loss": 0.2211,
3987
+ "step": 522
3988
+ },
3989
+ {
3990
+ "epoch": 0.6564853556485356,
3991
+ "grad_norm": 3.948408603668213,
3992
+ "learning_rate": 6.35631649589845e-06,
3993
+ "loss": 0.188,
3994
+ "step": 523
3995
+ },
3996
+ {
3997
+ "epoch": 0.6577405857740586,
3998
+ "grad_norm": 2.879676103591919,
3999
+ "learning_rate": 6.315491036345338e-06,
4000
+ "loss": 0.2009,
4001
+ "step": 524
4002
+ },
4003
+ {
4004
+ "epoch": 0.6589958158995816,
4005
+ "grad_norm": 4.440194606781006,
4006
+ "learning_rate": 6.274736510493462e-06,
4007
+ "loss": 0.2608,
4008
+ "step": 525
4009
+ },
4010
+ {
4011
+ "epoch": 0.6602510460251046,
4012
+ "grad_norm": 7.7365403175354,
4013
+ "learning_rate": 6.23405370294381e-06,
4014
+ "loss": 0.2733,
4015
+ "step": 526
4016
+ },
4017
+ {
4018
+ "epoch": 0.6615062761506276,
4019
+ "grad_norm": 4.252779960632324,
4020
+ "learning_rate": 6.1934433969166575e-06,
4021
+ "loss": 0.1904,
4022
+ "step": 527
4023
+ },
4024
+ {
4025
+ "epoch": 0.6627615062761506,
4026
+ "grad_norm": 3.153885841369629,
4027
+ "learning_rate": 6.1529063742364844e-06,
4028
+ "loss": 0.1833,
4029
+ "step": 528
4030
+ },
4031
+ {
4032
+ "epoch": 0.6640167364016737,
4033
+ "grad_norm": 7.1857476234436035,
4034
+ "learning_rate": 6.112443415316934e-06,
4035
+ "loss": 0.2215,
4036
+ "step": 529
4037
+ },
4038
+ {
4039
+ "epoch": 0.6652719665271967,
4040
+ "grad_norm": 2.9891092777252197,
4041
+ "learning_rate": 6.072055299145778e-06,
4042
+ "loss": 0.2156,
4043
+ "step": 530
4044
+ },
4045
+ {
4046
+ "epoch": 0.6665271966527196,
4047
+ "grad_norm": 3.8794708251953125,
4048
+ "learning_rate": 6.031742803269931e-06,
4049
+ "loss": 0.251,
4050
+ "step": 531
4051
+ },
4052
+ {
4053
+ "epoch": 0.6677824267782427,
4054
+ "grad_norm": 3.384833335876465,
4055
+ "learning_rate": 5.991506703780475e-06,
4056
+ "loss": 0.218,
4057
+ "step": 532
4058
+ },
4059
+ {
4060
+ "epoch": 0.6690376569037657,
4061
+ "grad_norm": 3.629615306854248,
4062
+ "learning_rate": 5.95134777529771e-06,
4063
+ "loss": 0.2329,
4064
+ "step": 533
4065
+ },
4066
+ {
4067
+ "epoch": 0.6702928870292887,
4068
+ "grad_norm": 3.8525490760803223,
4069
+ "learning_rate": 5.911266790956258e-06,
4070
+ "loss": 0.229,
4071
+ "step": 534
4072
+ },
4073
+ {
4074
+ "epoch": 0.6715481171548117,
4075
+ "grad_norm": 4.439032077789307,
4076
+ "learning_rate": 5.871264522390165e-06,
4077
+ "loss": 0.2752,
4078
+ "step": 535
4079
+ },
4080
+ {
4081
+ "epoch": 0.6728033472803348,
4082
+ "grad_norm": 3.8713276386260986,
4083
+ "learning_rate": 5.831341739718055e-06,
4084
+ "loss": 0.2427,
4085
+ "step": 536
4086
+ },
4087
+ {
4088
+ "epoch": 0.6740585774058577,
4089
+ "grad_norm": 3.2495763301849365,
4090
+ "learning_rate": 5.791499211528302e-06,
4091
+ "loss": 0.2424,
4092
+ "step": 537
4093
+ },
4094
+ {
4095
+ "epoch": 0.6753138075313807,
4096
+ "grad_norm": 4.471564292907715,
4097
+ "learning_rate": 5.751737704864224e-06,
4098
+ "loss": 0.1954,
4099
+ "step": 538
4100
+ },
4101
+ {
4102
+ "epoch": 0.6765690376569038,
4103
+ "grad_norm": 4.963108539581299,
4104
+ "learning_rate": 5.712057985209325e-06,
4105
+ "loss": 0.2393,
4106
+ "step": 539
4107
+ },
4108
+ {
4109
+ "epoch": 0.6778242677824268,
4110
+ "grad_norm": 5.474493503570557,
4111
+ "learning_rate": 5.672460816472556e-06,
4112
+ "loss": 0.3153,
4113
+ "step": 540
4114
+ },
4115
+ {
4116
+ "epoch": 0.6778242677824268,
4117
+ "eval_accuracy": 0.8642384105960265,
4118
+ "eval_f1": 0.7260579064587973,
4119
+ "eval_loss": 0.2864134609699249,
4120
+ "eval_precision": 0.8358974358974359,
4121
+ "eval_recall": 0.6417322834645669,
4122
+ "eval_runtime": 51.7722,
4123
+ "eval_samples_per_second": 5.35,
4124
+ "eval_steps_per_second": 0.174,
4125
+ "step": 540
4126
+ },
4127
+ {
4128
+ "epoch": 0.6790794979079497,
4129
+ "grad_norm": 5.037370681762695,
4130
+ "learning_rate": 5.632946960973611e-06,
4131
+ "loss": 0.2517,
4132
+ "step": 541
4133
+ },
4134
+ {
4135
+ "epoch": 0.6803347280334728,
4136
+ "grad_norm": 5.2865142822265625,
4137
+ "learning_rate": 5.5935171794282426e-06,
4138
+ "loss": 0.2962,
4139
+ "step": 542
4140
+ },
4141
+ {
4142
+ "epoch": 0.6815899581589958,
4143
+ "grad_norm": 3.1313962936401367,
4144
+ "learning_rate": 5.554172230933628e-06,
4145
+ "loss": 0.1967,
4146
+ "step": 543
4147
+ },
4148
+ {
4149
+ "epoch": 0.6828451882845188,
4150
+ "grad_norm": 4.365119934082031,
4151
+ "learning_rate": 5.514912872953746e-06,
4152
+ "loss": 0.2568,
4153
+ "step": 544
4154
+ },
4155
+ {
4156
+ "epoch": 0.6841004184100419,
4157
+ "grad_norm": 2.9407169818878174,
4158
+ "learning_rate": 5.4757398613047985e-06,
4159
+ "loss": 0.2133,
4160
+ "step": 545
4161
+ },
4162
+ {
4163
+ "epoch": 0.6853556485355649,
4164
+ "grad_norm": 4.410444259643555,
4165
+ "learning_rate": 5.436653950140657e-06,
4166
+ "loss": 0.2656,
4167
+ "step": 546
4168
+ },
4169
+ {
4170
+ "epoch": 0.6866108786610878,
4171
+ "grad_norm": 5.113467216491699,
4172
+ "learning_rate": 5.397655891938348e-06,
4173
+ "loss": 0.2425,
4174
+ "step": 547
4175
+ },
4176
+ {
4177
+ "epoch": 0.6878661087866109,
4178
+ "grad_norm": 3.6607089042663574,
4179
+ "learning_rate": 5.35874643748356e-06,
4180
+ "loss": 0.1942,
4181
+ "step": 548
4182
+ },
4183
+ {
4184
+ "epoch": 0.6891213389121339,
4185
+ "grad_norm": 4.0803914070129395,
4186
+ "learning_rate": 5.3199263358562e-06,
4187
+ "loss": 0.1702,
4188
+ "step": 549
4189
+ },
4190
+ {
4191
+ "epoch": 0.6903765690376569,
4192
+ "grad_norm": 7.306187629699707,
4193
+ "learning_rate": 5.281196334415968e-06,
4194
+ "loss": 0.1774,
4195
+ "step": 550
4196
+ },
4197
+ {
4198
+ "epoch": 0.69163179916318,
4199
+ "grad_norm": 6.191274166107178,
4200
+ "learning_rate": 5.2425571787879455e-06,
4201
+ "loss": 0.2996,
4202
+ "step": 551
4203
+ },
4204
+ {
4205
+ "epoch": 0.6928870292887029,
4206
+ "grad_norm": 4.841433048248291,
4207
+ "learning_rate": 5.204009612848288e-06,
4208
+ "loss": 0.3033,
4209
+ "step": 552
4210
+ },
4211
+ {
4212
+ "epoch": 0.6941422594142259,
4213
+ "grad_norm": 4.5002899169921875,
4214
+ "learning_rate": 5.165554378709857e-06,
4215
+ "loss": 0.2149,
4216
+ "step": 553
4217
+ },
4218
+ {
4219
+ "epoch": 0.695397489539749,
4220
+ "grad_norm": 3.8781685829162598,
4221
+ "learning_rate": 5.127192216707974e-06,
4222
+ "loss": 0.2828,
4223
+ "step": 554
4224
+ },
4225
+ {
4226
+ "epoch": 0.696652719665272,
4227
+ "grad_norm": 4.068243980407715,
4228
+ "learning_rate": 5.088923865386133e-06,
4229
+ "loss": 0.1836,
4230
+ "step": 555
4231
+ },
4232
+ {
4233
+ "epoch": 0.697907949790795,
4234
+ "grad_norm": 4.608306407928467,
4235
+ "learning_rate": 5.050750061481799e-06,
4236
+ "loss": 0.2514,
4237
+ "step": 556
4238
+ },
4239
+ {
4240
+ "epoch": 0.699163179916318,
4241
+ "grad_norm": 5.066010475158691,
4242
+ "learning_rate": 5.012671539912226e-06,
4243
+ "loss": 0.2174,
4244
+ "step": 557
4245
+ },
4246
+ {
4247
+ "epoch": 0.700418410041841,
4248
+ "grad_norm": 4.110201358795166,
4249
+ "learning_rate": 4.9746890337603005e-06,
4250
+ "loss": 0.1672,
4251
+ "step": 558
4252
+ },
4253
+ {
4254
+ "epoch": 0.701673640167364,
4255
+ "grad_norm": 4.265486717224121,
4256
+ "learning_rate": 4.936803274260434e-06,
4257
+ "loss": 0.2198,
4258
+ "step": 559
4259
+ },
4260
+ {
4261
+ "epoch": 0.702928870292887,
4262
+ "grad_norm": 4.03239107131958,
4263
+ "learning_rate": 4.899014990784485e-06,
4264
+ "loss": 0.2172,
4265
+ "step": 560
4266
+ },
4267
+ {
4268
+ "epoch": 0.702928870292887,
4269
+ "eval_accuracy": 0.8443708609271523,
4270
+ "eval_f1": 0.6483790523690773,
4271
+ "eval_loss": 0.31904953718185425,
4272
+ "eval_precision": 0.8843537414965986,
4273
+ "eval_recall": 0.5118110236220472,
4274
+ "eval_runtime": 51.3743,
4275
+ "eval_samples_per_second": 5.392,
4276
+ "eval_steps_per_second": 0.175,
4277
+ "step": 560
4278
+ },
4279
+ {
4280
+ "epoch": 0.7041841004184101,
4281
+ "grad_norm": 4.523290157318115,
4282
+ "learning_rate": 4.861324910827714e-06,
4283
+ "loss": 0.2345,
4284
+ "step": 561
4285
+ },
4286
+ {
4287
+ "epoch": 0.705439330543933,
4288
+ "grad_norm": 4.160706520080566,
4289
+ "learning_rate": 4.8237337599947795e-06,
4290
+ "loss": 0.2406,
4291
+ "step": 562
4292
+ },
4293
+ {
4294
+ "epoch": 0.706694560669456,
4295
+ "grad_norm": 6.3733811378479,
4296
+ "learning_rate": 4.786242261985772e-06,
4297
+ "loss": 0.2486,
4298
+ "step": 563
4299
+ },
4300
+ {
4301
+ "epoch": 0.7079497907949791,
4302
+ "grad_norm": 6.861822128295898,
4303
+ "learning_rate": 4.748851138582269e-06,
4304
+ "loss": 0.2124,
4305
+ "step": 564
4306
+ },
4307
+ {
4308
+ "epoch": 0.7092050209205021,
4309
+ "grad_norm": 4.6429829597473145,
4310
+ "learning_rate": 4.711561109633466e-06,
4311
+ "loss": 0.2569,
4312
+ "step": 565
4313
+ },
4314
+ {
4315
+ "epoch": 0.7104602510460251,
4316
+ "grad_norm": 3.4625086784362793,
4317
+ "learning_rate": 4.674372893042287e-06,
4318
+ "loss": 0.2174,
4319
+ "step": 566
4320
+ },
4321
+ {
4322
+ "epoch": 0.7117154811715481,
4323
+ "grad_norm": 7.073486804962158,
4324
+ "learning_rate": 4.63728720475158e-06,
4325
+ "loss": 0.2145,
4326
+ "step": 567
4327
+ },
4328
+ {
4329
+ "epoch": 0.7129707112970711,
4330
+ "grad_norm": 4.345584869384766,
4331
+ "learning_rate": 4.6003047587303376e-06,
4332
+ "loss": 0.1827,
4333
+ "step": 568
4334
+ },
4335
+ {
4336
+ "epoch": 0.7142259414225941,
4337
+ "grad_norm": 4.599338054656982,
4338
+ "learning_rate": 4.563426266959932e-06,
4339
+ "loss": 0.2167,
4340
+ "step": 569
4341
+ },
4342
+ {
4343
+ "epoch": 0.7154811715481172,
4344
+ "grad_norm": 3.284950017929077,
4345
+ "learning_rate": 4.526652439420427e-06,
4346
+ "loss": 0.1409,
4347
+ "step": 570
4348
+ },
4349
+ {
4350
+ "epoch": 0.7167364016736402,
4351
+ "grad_norm": 3.3235983848571777,
4352
+ "learning_rate": 4.489983984076918e-06,
4353
+ "loss": 0.2377,
4354
+ "step": 571
4355
+ },
4356
+ {
4357
+ "epoch": 0.7179916317991631,
4358
+ "grad_norm": 5.1698079109191895,
4359
+ "learning_rate": 4.453421606865869e-06,
4360
+ "loss": 0.2022,
4361
+ "step": 572
4362
+ },
4363
+ {
4364
+ "epoch": 0.7192468619246862,
4365
+ "grad_norm": 5.351980209350586,
4366
+ "learning_rate": 4.416966011681548e-06,
4367
+ "loss": 0.1903,
4368
+ "step": 573
4369
+ },
4370
+ {
4371
+ "epoch": 0.7205020920502092,
4372
+ "grad_norm": 2.702564239501953,
4373
+ "learning_rate": 4.380617900362473e-06,
4374
+ "loss": 0.1935,
4375
+ "step": 574
4376
+ },
4377
+ {
4378
+ "epoch": 0.7217573221757322,
4379
+ "grad_norm": 3.770988702774048,
4380
+ "learning_rate": 4.34437797267789e-06,
4381
+ "loss": 0.175,
4382
+ "step": 575
4383
+ },
4384
+ {
4385
+ "epoch": 0.7230125523012553,
4386
+ "grad_norm": 3.6061084270477295,
4387
+ "learning_rate": 4.308246926314307e-06,
4388
+ "loss": 0.2515,
4389
+ "step": 576
4390
+ },
4391
+ {
4392
+ "epoch": 0.7242677824267783,
4393
+ "grad_norm": 5.896265983581543,
4394
+ "learning_rate": 4.272225456862076e-06,
4395
+ "loss": 0.2694,
4396
+ "step": 577
4397
+ },
4398
+ {
4399
+ "epoch": 0.7255230125523012,
4400
+ "grad_norm": 7.105819225311279,
4401
+ "learning_rate": 4.236314257801968e-06,
4402
+ "loss": 0.3122,
4403
+ "step": 578
4404
+ },
4405
+ {
4406
+ "epoch": 0.7267782426778243,
4407
+ "grad_norm": 5.691869735717773,
4408
+ "learning_rate": 4.200514020491854e-06,
4409
+ "loss": 0.2672,
4410
+ "step": 579
4411
+ },
4412
+ {
4413
+ "epoch": 0.7280334728033473,
4414
+ "grad_norm": 3.698089838027954,
4415
+ "learning_rate": 4.164825434153381e-06,
4416
+ "loss": 0.2604,
4417
+ "step": 580
4418
+ },
4419
+ {
4420
+ "epoch": 0.7280334728033473,
4421
+ "eval_accuracy": 0.8686534216335541,
4422
+ "eval_f1": 0.7384615384615385,
4423
+ "eval_loss": 0.28295037150382996,
4424
+ "eval_precision": 0.835820895522388,
4425
+ "eval_recall": 0.6614173228346457,
4426
+ "eval_runtime": 53.3249,
4427
+ "eval_samples_per_second": 5.195,
4428
+ "eval_steps_per_second": 0.169,
4429
+ "step": 580
4430
+ },
4431
+ {
4432
+ "epoch": 0.7292887029288703,
4433
+ "grad_norm": 5.295552730560303,
4434
+ "learning_rate": 4.129249185858704e-06,
4435
+ "loss": 0.2536,
4436
+ "step": 581
4437
+ },
4438
+ {
4439
+ "epoch": 0.7305439330543934,
4440
+ "grad_norm": 3.8178629875183105,
4441
+ "learning_rate": 4.093785960517269e-06,
4442
+ "loss": 0.2233,
4443
+ "step": 582
4444
+ },
4445
+ {
4446
+ "epoch": 0.7317991631799163,
4447
+ "grad_norm": 5.176862716674805,
4448
+ "learning_rate": 4.0584364408626065e-06,
4449
+ "loss": 0.3026,
4450
+ "step": 583
4451
+ },
4452
+ {
4453
+ "epoch": 0.7330543933054393,
4454
+ "grad_norm": 6.326966762542725,
4455
+ "learning_rate": 4.0232013074392065e-06,
4456
+ "loss": 0.2652,
4457
+ "step": 584
4458
+ },
4459
+ {
4460
+ "epoch": 0.7343096234309623,
4461
+ "grad_norm": 8.710590362548828,
4462
+ "learning_rate": 3.988081238589406e-06,
4463
+ "loss": 0.2439,
4464
+ "step": 585
4465
+ },
4466
+ {
4467
+ "epoch": 0.7355648535564854,
4468
+ "grad_norm": 5.5283026695251465,
4469
+ "learning_rate": 3.953076910440337e-06,
4470
+ "loss": 0.2445,
4471
+ "step": 586
4472
+ },
4473
+ {
4474
+ "epoch": 0.7368200836820084,
4475
+ "grad_norm": 4.787403583526611,
4476
+ "learning_rate": 3.918188996890903e-06,
4477
+ "loss": 0.2705,
4478
+ "step": 587
4479
+ },
4480
+ {
4481
+ "epoch": 0.7380753138075313,
4482
+ "grad_norm": 6.294352054595947,
4483
+ "learning_rate": 3.883418169598808e-06,
4484
+ "loss": 0.2813,
4485
+ "step": 588
4486
+ },
4487
+ {
4488
+ "epoch": 0.7393305439330544,
4489
+ "grad_norm": 4.5643415451049805,
4490
+ "learning_rate": 3.84876509796763e-06,
4491
+ "loss": 0.2417,
4492
+ "step": 589
4493
+ },
4494
+ {
4495
+ "epoch": 0.7405857740585774,
4496
+ "grad_norm": 6.011057376861572,
4497
+ "learning_rate": 3.814230449133928e-06,
4498
+ "loss": 0.3062,
4499
+ "step": 590
4500
+ },
4501
+ {
4502
+ "epoch": 0.7418410041841004,
4503
+ "grad_norm": 5.100391387939453,
4504
+ "learning_rate": 3.7798148879543983e-06,
4505
+ "loss": 0.2424,
4506
+ "step": 591
4507
+ },
4508
+ {
4509
+ "epoch": 0.7430962343096235,
4510
+ "grad_norm": 3.619565963745117,
4511
+ "learning_rate": 3.745519076993078e-06,
4512
+ "loss": 0.262,
4513
+ "step": 592
4514
+ },
4515
+ {
4516
+ "epoch": 0.7443514644351464,
4517
+ "grad_norm": 5.100575923919678,
4518
+ "learning_rate": 3.7113436765085865e-06,
4519
+ "loss": 0.2577,
4520
+ "step": 593
4521
+ },
4522
+ {
4523
+ "epoch": 0.7456066945606694,
4524
+ "grad_norm": 6.600237846374512,
4525
+ "learning_rate": 3.6772893444414226e-06,
4526
+ "loss": 0.2571,
4527
+ "step": 594
4528
+ },
4529
+ {
4530
+ "epoch": 0.7468619246861925,
4531
+ "grad_norm": 4.155444145202637,
4532
+ "learning_rate": 3.643356736401289e-06,
4533
+ "loss": 0.2558,
4534
+ "step": 595
4535
+ },
4536
+ {
4537
+ "epoch": 0.7481171548117155,
4538
+ "grad_norm": 3.4668867588043213,
4539
+ "learning_rate": 3.609546505654462e-06,
4540
+ "loss": 0.1694,
4541
+ "step": 596
4542
+ },
4543
+ {
4544
+ "epoch": 0.7493723849372385,
4545
+ "grad_norm": 4.315099239349365,
4546
+ "learning_rate": 3.5758593031112364e-06,
4547
+ "loss": 0.2029,
4548
+ "step": 597
4549
+ },
4550
+ {
4551
+ "epoch": 0.7506276150627615,
4552
+ "grad_norm": 4.79595422744751,
4553
+ "learning_rate": 3.5422957773133804e-06,
4554
+ "loss": 0.2165,
4555
+ "step": 598
4556
+ },
4557
+ {
4558
+ "epoch": 0.7518828451882845,
4559
+ "grad_norm": 3.9190430641174316,
4560
+ "learning_rate": 3.5088565744216574e-06,
4561
+ "loss": 0.2107,
4562
+ "step": 599
4563
+ },
4564
+ {
4565
+ "epoch": 0.7531380753138075,
4566
+ "grad_norm": 4.860176086425781,
4567
+ "learning_rate": 3.475542338203377e-06,
4568
+ "loss": 0.2671,
4569
+ "step": 600
4570
+ },
4571
+ {
4572
+ "epoch": 0.7531380753138075,
4573
+ "eval_accuracy": 0.8565121412803532,
4574
+ "eval_f1": 0.6976744186046512,
4575
+ "eval_loss": 0.29695039987564087,
4576
+ "eval_precision": 0.8522727272727273,
4577
+ "eval_recall": 0.5905511811023622,
4578
+ "eval_runtime": 51.86,
4579
+ "eval_samples_per_second": 5.341,
4580
+ "eval_steps_per_second": 0.174,
4581
+ "step": 600
4582
  }
4583
  ],
4584
  "logging_steps": 1,
 
4598
  "attributes": {}
4599
  }
4600
  },
4601
+ "total_flos": 2.19779349803434e+17,
4602
  "train_batch_size": 6,
4603
  "trial_name": null,
4604
  "trial_params": null