mtzig commited on
Commit
82a89ac
·
verified ·
1 Parent(s): e88290c

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aeb8e62aebef304af7f29719c0fad923798eb330385aeb4124ebe5905d2f7893
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a352f0d84009b1817ea378a4704c01130220431cda057a719176edb53b9ce38
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:925451c7a47e11cfdf5d3e79ff8e1d1616fa31bfbe9fd4dd921a8d07495100a5
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:192c82f34e86d685c6f351fd58c1000ddea9a13d640195ac79c49fbf42423aa5
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:80aebfaa38d8fd15efc2f80ca22db9271add0dfa1df7cd1b9ec3a6cafcc1e980
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b90ca6c9a0d45f633e326ad429b79dcb8a229254c394c0026c58947de8b6ccb1
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e93ab70baf5fb3f49fc902766d3981884a3c59e7111a93ea08d704b5eaca5524
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1ddcd3b678ecc28638f1325c2c32db98cad1876b80914907eec102e20d65888
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1f8093292209f6718d35e0ccd016f16652167381bf80627ae426fc0a96d439c
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:441eb9b06b4fc0f3fa0a9291de25b8426d0d9f412df64f69773da2db1b4860b2
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f14ea077a90622e6b4dc501a0231b02a369eb0516f972a2f1408a934b610f29
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01659b87d6d23358ab75fa4077af9feedf08b369b1c157aa83e98851b9c0d1ee
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:716d166395b1d3204d4c7983923c858c7814b0e98a579025f7d5396f16ff6dbc
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71450373e32f8a9a1b7bd7c09bbf7665cd2aab9935d9141b9e0d70c0fce7c3de
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f369ee493042be075f560b3402758308791790a6b19967c254c943ef54144890
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:173eff09d590e65fe2dd1179e23f7fb059beaf649179bf2d537bde02e80545b0
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d70614fd7a6e0bc09ddffd7fa93961bcf9eea5616a951374e008e680df41fe5
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6afe62f64f980792c5f93908f1252e0efd7d9d6dd9a401096016c0cf0f6e9df7
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53ddb89813eac5e34cc35dc1ec465872bcd28d173f301c7ce65f1667e4d5f404
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c96a88229c7cf8988c09092a9afef0bd222230400623a17d132e957aa024720
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dbbf9a445428ea3735d412cb42b094a0445cbab134f49cd9d71dd69330ce45b5
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50d2280d0785bc9b8dd3a1397de7a4d5f6e608d8e08010244249962de0f0c423
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe5328f1346602824c30aa9cc75535e926a4d8ce4ca9da88e40f8ce89791ebde
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05e89f8a1132e0f0def133732be826c04d18fb1ddc8e499809e4f481802df182
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed607b5799c4b3e2659af93e02b0fa11a91a2ab37a2feec0e0666f1663f216c4
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ead26a1aba46fa0b3384e323e0349ee0e9c3d6b20dad4ce8e9c9bf15675155cc
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8348794063079777,
5
  "eval_steps": 20,
6
- "global_step": 900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -6859,6 +6859,766 @@
6859
  "eval_samples_per_second": 5.818,
6860
  "eval_steps_per_second": 0.19,
6861
  "step": 900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6862
  }
6863
  ],
6864
  "logging_steps": 1,
@@ -6878,7 +7638,7 @@
6878
  "attributes": {}
6879
  }
6880
  },
6881
- "total_flos": 2.8777892777715302e+17,
6882
  "train_batch_size": 8,
6883
  "trial_name": null,
6884
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9276437847866419,
5
  "eval_steps": 20,
6
+ "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
6859
  "eval_samples_per_second": 5.818,
6860
  "eval_steps_per_second": 0.19,
6861
  "step": 900
6862
+ },
6863
+ {
6864
+ "epoch": 0.8358070500927643,
6865
+ "grad_norm": 3.852740526199341,
6866
+ "learning_rate": 1.5986252796969482e-06,
6867
+ "loss": 0.1165,
6868
+ "step": 901
6869
+ },
6870
+ {
6871
+ "epoch": 0.8367346938775511,
6872
+ "grad_norm": 5.131833076477051,
6873
+ "learning_rate": 1.5811032226467304e-06,
6874
+ "loss": 0.198,
6875
+ "step": 902
6876
+ },
6877
+ {
6878
+ "epoch": 0.8376623376623377,
6879
+ "grad_norm": 4.975651741027832,
6880
+ "learning_rate": 1.5636694758399563e-06,
6881
+ "loss": 0.1891,
6882
+ "step": 903
6883
+ },
6884
+ {
6885
+ "epoch": 0.8385899814471243,
6886
+ "grad_norm": 3.24419903755188,
6887
+ "learning_rate": 1.5463242221483742e-06,
6888
+ "loss": 0.0935,
6889
+ "step": 904
6890
+ },
6891
+ {
6892
+ "epoch": 0.839517625231911,
6893
+ "grad_norm": 3.5641651153564453,
6894
+ "learning_rate": 1.5290676435154949e-06,
6895
+ "loss": 0.1533,
6896
+ "step": 905
6897
+ },
6898
+ {
6899
+ "epoch": 0.8404452690166976,
6900
+ "grad_norm": 3.872134208679199,
6901
+ "learning_rate": 1.511899920954656e-06,
6902
+ "loss": 0.1545,
6903
+ "step": 906
6904
+ },
6905
+ {
6906
+ "epoch": 0.8413729128014842,
6907
+ "grad_norm": 6.075543403625488,
6908
+ "learning_rate": 1.4948212345471492e-06,
6909
+ "loss": 0.2032,
6910
+ "step": 907
6911
+ },
6912
+ {
6913
+ "epoch": 0.8423005565862709,
6914
+ "grad_norm": 2.9056954383850098,
6915
+ "learning_rate": 1.4778317634403082e-06,
6916
+ "loss": 0.0986,
6917
+ "step": 908
6918
+ },
6919
+ {
6920
+ "epoch": 0.8432282003710575,
6921
+ "grad_norm": 5.516162872314453,
6922
+ "learning_rate": 1.460931685845649e-06,
6923
+ "loss": 0.1868,
6924
+ "step": 909
6925
+ },
6926
+ {
6927
+ "epoch": 0.8441558441558441,
6928
+ "grad_norm": 2.8610849380493164,
6929
+ "learning_rate": 1.4441211790369892e-06,
6930
+ "loss": 0.0923,
6931
+ "step": 910
6932
+ },
6933
+ {
6934
+ "epoch": 0.8450834879406308,
6935
+ "grad_norm": 3.700863838195801,
6936
+ "learning_rate": 1.427400419348588e-06,
6937
+ "loss": 0.1291,
6938
+ "step": 911
6939
+ },
6940
+ {
6941
+ "epoch": 0.8460111317254174,
6942
+ "grad_norm": 4.772455215454102,
6943
+ "learning_rate": 1.4107695821733026e-06,
6944
+ "loss": 0.1352,
6945
+ "step": 912
6946
+ },
6947
+ {
6948
+ "epoch": 0.8469387755102041,
6949
+ "grad_norm": 3.5742745399475098,
6950
+ "learning_rate": 1.3942288419607476e-06,
6951
+ "loss": 0.1824,
6952
+ "step": 913
6953
+ },
6954
+ {
6955
+ "epoch": 0.8478664192949907,
6956
+ "grad_norm": 8.259415626525879,
6957
+ "learning_rate": 1.3777783722154603e-06,
6958
+ "loss": 0.2448,
6959
+ "step": 914
6960
+ },
6961
+ {
6962
+ "epoch": 0.8487940630797773,
6963
+ "grad_norm": 3.900238513946533,
6964
+ "learning_rate": 1.3614183454950824e-06,
6965
+ "loss": 0.1273,
6966
+ "step": 915
6967
+ },
6968
+ {
6969
+ "epoch": 0.849721706864564,
6970
+ "grad_norm": 2.9773433208465576,
6971
+ "learning_rate": 1.3451489334085555e-06,
6972
+ "loss": 0.1522,
6973
+ "step": 916
6974
+ },
6975
+ {
6976
+ "epoch": 0.8506493506493507,
6977
+ "grad_norm": 3.071232318878174,
6978
+ "learning_rate": 1.3289703066143112e-06,
6979
+ "loss": 0.1256,
6980
+ "step": 917
6981
+ },
6982
+ {
6983
+ "epoch": 0.8515769944341373,
6984
+ "grad_norm": 3.8165667057037354,
6985
+ "learning_rate": 1.3128826348184886e-06,
6986
+ "loss": 0.1111,
6987
+ "step": 918
6988
+ },
6989
+ {
6990
+ "epoch": 0.852504638218924,
6991
+ "grad_norm": 3.7821688652038574,
6992
+ "learning_rate": 1.296886086773157e-06,
6993
+ "loss": 0.2091,
6994
+ "step": 919
6995
+ },
6996
+ {
6997
+ "epoch": 0.8534322820037106,
6998
+ "grad_norm": 4.833895206451416,
6999
+ "learning_rate": 1.2809808302745298e-06,
7000
+ "loss": 0.1762,
7001
+ "step": 920
7002
+ },
7003
+ {
7004
+ "epoch": 0.8534322820037106,
7005
+ "eval_accuracy": 0.8603104212860311,
7006
+ "eval_f1": 0.704225352112676,
7007
+ "eval_loss": 0.30113720893859863,
7008
+ "eval_precision": 0.8670520231213873,
7009
+ "eval_recall": 0.5928853754940712,
7010
+ "eval_runtime": 47.313,
7011
+ "eval_samples_per_second": 5.833,
7012
+ "eval_steps_per_second": 0.19,
7013
+ "step": 920
7014
+ },
7015
+ {
7016
+ "epoch": 0.8543599257884972,
7017
+ "grad_norm": 3.3207972049713135,
7018
+ "learning_rate": 1.2651670321612264e-06,
7019
+ "loss": 0.1367,
7020
+ "step": 921
7021
+ },
7022
+ {
7023
+ "epoch": 0.8552875695732839,
7024
+ "grad_norm": 3.202796697616577,
7025
+ "learning_rate": 1.249444858312502e-06,
7026
+ "loss": 0.1379,
7027
+ "step": 922
7028
+ },
7029
+ {
7030
+ "epoch": 0.8562152133580705,
7031
+ "grad_norm": 6.188356876373291,
7032
+ "learning_rate": 1.233814473646524e-06,
7033
+ "loss": 0.2627,
7034
+ "step": 923
7035
+ },
7036
+ {
7037
+ "epoch": 0.8571428571428571,
7038
+ "grad_norm": 3.4624321460723877,
7039
+ "learning_rate": 1.218276042118629e-06,
7040
+ "loss": 0.1318,
7041
+ "step": 924
7042
+ },
7043
+ {
7044
+ "epoch": 0.8580705009276438,
7045
+ "grad_norm": 3.288809061050415,
7046
+ "learning_rate": 1.202829726719611e-06,
7047
+ "loss": 0.1188,
7048
+ "step": 925
7049
+ },
7050
+ {
7051
+ "epoch": 0.8589981447124304,
7052
+ "grad_norm": 2.691675901412964,
7053
+ "learning_rate": 1.1874756894740137e-06,
7054
+ "loss": 0.1252,
7055
+ "step": 926
7056
+ },
7057
+ {
7058
+ "epoch": 0.859925788497217,
7059
+ "grad_norm": 3.750600576400757,
7060
+ "learning_rate": 1.1722140914384162e-06,
7061
+ "loss": 0.1644,
7062
+ "step": 927
7063
+ },
7064
+ {
7065
+ "epoch": 0.8608534322820037,
7066
+ "grad_norm": 3.1353397369384766,
7067
+ "learning_rate": 1.1570450926997657e-06,
7068
+ "loss": 0.1461,
7069
+ "step": 928
7070
+ },
7071
+ {
7072
+ "epoch": 0.8617810760667903,
7073
+ "grad_norm": 5.295469760894775,
7074
+ "learning_rate": 1.1419688523736761e-06,
7075
+ "loss": 0.1967,
7076
+ "step": 929
7077
+ },
7078
+ {
7079
+ "epoch": 0.862708719851577,
7080
+ "grad_norm": 3.461599349975586,
7081
+ "learning_rate": 1.1269855286027798e-06,
7082
+ "loss": 0.1426,
7083
+ "step": 930
7084
+ },
7085
+ {
7086
+ "epoch": 0.8636363636363636,
7087
+ "grad_norm": 6.9660420417785645,
7088
+ "learning_rate": 1.1120952785550477e-06,
7089
+ "loss": 0.2015,
7090
+ "step": 931
7091
+ },
7092
+ {
7093
+ "epoch": 0.8645640074211502,
7094
+ "grad_norm": 2.989213705062866,
7095
+ "learning_rate": 1.0972982584221592e-06,
7096
+ "loss": 0.1204,
7097
+ "step": 932
7098
+ },
7099
+ {
7100
+ "epoch": 0.865491651205937,
7101
+ "grad_norm": 4.492414474487305,
7102
+ "learning_rate": 1.0825946234178575e-06,
7103
+ "loss": 0.1579,
7104
+ "step": 933
7105
+ },
7106
+ {
7107
+ "epoch": 0.8664192949907236,
7108
+ "grad_norm": 4.693439960479736,
7109
+ "learning_rate": 1.067984527776309e-06,
7110
+ "loss": 0.1959,
7111
+ "step": 934
7112
+ },
7113
+ {
7114
+ "epoch": 0.8673469387755102,
7115
+ "grad_norm": 5.462426662445068,
7116
+ "learning_rate": 1.0534681247505107e-06,
7117
+ "loss": 0.1435,
7118
+ "step": 935
7119
+ },
7120
+ {
7121
+ "epoch": 0.8682745825602969,
7122
+ "grad_norm": 2.594604730606079,
7123
+ "learning_rate": 1.0390455666106547e-06,
7124
+ "loss": 0.115,
7125
+ "step": 936
7126
+ },
7127
+ {
7128
+ "epoch": 0.8692022263450835,
7129
+ "grad_norm": 5.900606155395508,
7130
+ "learning_rate": 1.024717004642557e-06,
7131
+ "loss": 0.1749,
7132
+ "step": 937
7133
+ },
7134
+ {
7135
+ "epoch": 0.8701298701298701,
7136
+ "grad_norm": 5.774359226226807,
7137
+ "learning_rate": 1.010482589146048e-06,
7138
+ "loss": 0.1802,
7139
+ "step": 938
7140
+ },
7141
+ {
7142
+ "epoch": 0.8710575139146568,
7143
+ "grad_norm": 4.002913951873779,
7144
+ "learning_rate": 9.963424694334122e-07,
7145
+ "loss": 0.1277,
7146
+ "step": 939
7147
+ },
7148
+ {
7149
+ "epoch": 0.8719851576994434,
7150
+ "grad_norm": 3.6173672676086426,
7151
+ "learning_rate": 9.822967938278172e-07,
7152
+ "loss": 0.1561,
7153
+ "step": 940
7154
+ },
7155
+ {
7156
+ "epoch": 0.8719851576994434,
7157
+ "eval_accuracy": 0.8603104212860311,
7158
+ "eval_f1": 0.704225352112676,
7159
+ "eval_loss": 0.29984721541404724,
7160
+ "eval_precision": 0.8670520231213873,
7161
+ "eval_recall": 0.5928853754940712,
7162
+ "eval_runtime": 48.0345,
7163
+ "eval_samples_per_second": 5.746,
7164
+ "eval_steps_per_second": 0.187,
7165
+ "step": 940
7166
+ },
7167
+ {
7168
+ "epoch": 0.87291280148423,
7169
+ "grad_norm": 5.298496723175049,
7170
+ "learning_rate": 9.683457096617487e-07,
7171
+ "loss": 0.1343,
7172
+ "step": 941
7173
+ },
7174
+ {
7175
+ "epoch": 0.8738404452690167,
7176
+ "grad_norm": 4.087591648101807,
7177
+ "learning_rate": 9.544893632754816e-07,
7178
+ "loss": 0.1342,
7179
+ "step": 942
7180
+ },
7181
+ {
7182
+ "epoch": 0.8747680890538033,
7183
+ "grad_norm": 3.6953861713409424,
7184
+ "learning_rate": 9.407279000155311e-07,
7185
+ "loss": 0.1125,
7186
+ "step": 943
7187
+ },
7188
+ {
7189
+ "epoch": 0.87569573283859,
7190
+ "grad_norm": 5.693349838256836,
7191
+ "learning_rate": 9.270614642331377e-07,
7192
+ "loss": 0.2285,
7193
+ "step": 944
7194
+ },
7195
+ {
7196
+ "epoch": 0.8766233766233766,
7197
+ "grad_norm": 4.321276664733887,
7198
+ "learning_rate": 9.134901992827427e-07,
7199
+ "loss": 0.2169,
7200
+ "step": 945
7201
+ },
7202
+ {
7203
+ "epoch": 0.8775510204081632,
7204
+ "grad_norm": 5.951560020446777,
7205
+ "learning_rate": 9.000142475204965e-07,
7206
+ "loss": 0.2039,
7207
+ "step": 946
7208
+ },
7209
+ {
7210
+ "epoch": 0.87847866419295,
7211
+ "grad_norm": 5.382765293121338,
7212
+ "learning_rate": 8.866337503027523e-07,
7213
+ "loss": 0.1347,
7214
+ "step": 947
7215
+ },
7216
+ {
7217
+ "epoch": 0.8794063079777366,
7218
+ "grad_norm": 4.566171646118164,
7219
+ "learning_rate": 8.733488479845997e-07,
7220
+ "loss": 0.1929,
7221
+ "step": 948
7222
+ },
7223
+ {
7224
+ "epoch": 0.8803339517625232,
7225
+ "grad_norm": 4.413459300994873,
7226
+ "learning_rate": 8.60159679918372e-07,
7227
+ "loss": 0.1463,
7228
+ "step": 949
7229
+ },
7230
+ {
7231
+ "epoch": 0.8812615955473099,
7232
+ "grad_norm": 3.8674092292785645,
7233
+ "learning_rate": 8.470663844522053e-07,
7234
+ "loss": 0.1523,
7235
+ "step": 950
7236
+ },
7237
+ {
7238
+ "epoch": 0.8821892393320965,
7239
+ "grad_norm": 3.844576597213745,
7240
+ "learning_rate": 8.340690989285727e-07,
7241
+ "loss": 0.1248,
7242
+ "step": 951
7243
+ },
7244
+ {
7245
+ "epoch": 0.8831168831168831,
7246
+ "grad_norm": 4.541808605194092,
7247
+ "learning_rate": 8.211679596828481e-07,
7248
+ "loss": 0.1571,
7249
+ "step": 952
7250
+ },
7251
+ {
7252
+ "epoch": 0.8840445269016698,
7253
+ "grad_norm": 3.0702145099639893,
7254
+ "learning_rate": 8.083631020418792e-07,
7255
+ "loss": 0.157,
7256
+ "step": 953
7257
+ },
7258
+ {
7259
+ "epoch": 0.8849721706864564,
7260
+ "grad_norm": 3.5125439167022705,
7261
+ "learning_rate": 7.956546603225601e-07,
7262
+ "loss": 0.1011,
7263
+ "step": 954
7264
+ },
7265
+ {
7266
+ "epoch": 0.885899814471243,
7267
+ "grad_norm": 4.256104469299316,
7268
+ "learning_rate": 7.830427678304353e-07,
7269
+ "loss": 0.1411,
7270
+ "step": 955
7271
+ },
7272
+ {
7273
+ "epoch": 0.8868274582560297,
7274
+ "grad_norm": 4.931686878204346,
7275
+ "learning_rate": 7.705275568582848e-07,
7276
+ "loss": 0.1953,
7277
+ "step": 956
7278
+ },
7279
+ {
7280
+ "epoch": 0.8877551020408163,
7281
+ "grad_norm": 5.233354091644287,
7282
+ "learning_rate": 7.581091586847522e-07,
7283
+ "loss": 0.2095,
7284
+ "step": 957
7285
+ },
7286
+ {
7287
+ "epoch": 0.8886827458256029,
7288
+ "grad_norm": 6.383068084716797,
7289
+ "learning_rate": 7.457877035729588e-07,
7290
+ "loss": 0.2274,
7291
+ "step": 958
7292
+ },
7293
+ {
7294
+ "epoch": 0.8896103896103896,
7295
+ "grad_norm": 2.8475682735443115,
7296
+ "learning_rate": 7.335633207691362e-07,
7297
+ "loss": 0.1336,
7298
+ "step": 959
7299
+ },
7300
+ {
7301
+ "epoch": 0.8905380333951762,
7302
+ "grad_norm": 3.393915891647339,
7303
+ "learning_rate": 7.21436138501278e-07,
7304
+ "loss": 0.1633,
7305
+ "step": 960
7306
+ },
7307
+ {
7308
+ "epoch": 0.8905380333951762,
7309
+ "eval_accuracy": 0.8569844789356984,
7310
+ "eval_f1": 0.6935866983372921,
7311
+ "eval_loss": 0.3064272701740265,
7312
+ "eval_precision": 0.8690476190476191,
7313
+ "eval_recall": 0.5770750988142292,
7314
+ "eval_runtime": 48.2701,
7315
+ "eval_samples_per_second": 5.718,
7316
+ "eval_steps_per_second": 0.186,
7317
+ "step": 960
7318
+ },
7319
+ {
7320
+ "epoch": 0.891465677179963,
7321
+ "grad_norm": 4.68550968170166,
7322
+ "learning_rate": 7.094062839777838e-07,
7323
+ "loss": 0.1854,
7324
+ "step": 961
7325
+ },
7326
+ {
7327
+ "epoch": 0.8923933209647495,
7328
+ "grad_norm": 5.072958946228027,
7329
+ "learning_rate": 6.974738833861383e-07,
7330
+ "loss": 0.1762,
7331
+ "step": 962
7332
+ },
7333
+ {
7334
+ "epoch": 0.8933209647495362,
7335
+ "grad_norm": 4.519327640533447,
7336
+ "learning_rate": 6.856390618915775e-07,
7337
+ "loss": 0.182,
7338
+ "step": 963
7339
+ },
7340
+ {
7341
+ "epoch": 0.8942486085343229,
7342
+ "grad_norm": 5.558988094329834,
7343
+ "learning_rate": 6.739019436357774e-07,
7344
+ "loss": 0.1665,
7345
+ "step": 964
7346
+ },
7347
+ {
7348
+ "epoch": 0.8951762523191095,
7349
+ "grad_norm": 2.263278007507324,
7350
+ "learning_rate": 6.622626517355557e-07,
7351
+ "loss": 0.1112,
7352
+ "step": 965
7353
+ },
7354
+ {
7355
+ "epoch": 0.8961038961038961,
7356
+ "grad_norm": 5.888603687286377,
7357
+ "learning_rate": 6.507213082815745e-07,
7358
+ "loss": 0.1455,
7359
+ "step": 966
7360
+ },
7361
+ {
7362
+ "epoch": 0.8970315398886828,
7363
+ "grad_norm": 5.091086387634277,
7364
+ "learning_rate": 6.392780343370686e-07,
7365
+ "loss": 0.1812,
7366
+ "step": 967
7367
+ },
7368
+ {
7369
+ "epoch": 0.8979591836734694,
7370
+ "grad_norm": 6.290548324584961,
7371
+ "learning_rate": 6.279329499365649e-07,
7372
+ "loss": 0.1527,
7373
+ "step": 968
7374
+ },
7375
+ {
7376
+ "epoch": 0.898886827458256,
7377
+ "grad_norm": 6.533473014831543,
7378
+ "learning_rate": 6.166861740846297e-07,
7379
+ "loss": 0.2105,
7380
+ "step": 969
7381
+ },
7382
+ {
7383
+ "epoch": 0.8998144712430427,
7384
+ "grad_norm": 3.4495279788970947,
7385
+ "learning_rate": 6.055378247546217e-07,
7386
+ "loss": 0.1222,
7387
+ "step": 970
7388
+ },
7389
+ {
7390
+ "epoch": 0.9007421150278293,
7391
+ "grad_norm": 5.290384769439697,
7392
+ "learning_rate": 5.94488018887448e-07,
7393
+ "loss": 0.2046,
7394
+ "step": 971
7395
+ },
7396
+ {
7397
+ "epoch": 0.9016697588126159,
7398
+ "grad_norm": 6.091614723205566,
7399
+ "learning_rate": 5.835368723903456e-07,
7400
+ "loss": 0.2643,
7401
+ "step": 972
7402
+ },
7403
+ {
7404
+ "epoch": 0.9025974025974026,
7405
+ "grad_norm": 4.488548278808594,
7406
+ "learning_rate": 5.726845001356573e-07,
7407
+ "loss": 0.1263,
7408
+ "step": 973
7409
+ },
7410
+ {
7411
+ "epoch": 0.9035250463821892,
7412
+ "grad_norm": 2.7875099182128906,
7413
+ "learning_rate": 5.619310159596358e-07,
7414
+ "loss": 0.0922,
7415
+ "step": 974
7416
+ },
7417
+ {
7418
+ "epoch": 0.9044526901669759,
7419
+ "grad_norm": 5.558516025543213,
7420
+ "learning_rate": 5.51276532661238e-07,
7421
+ "loss": 0.2045,
7422
+ "step": 975
7423
+ },
7424
+ {
7425
+ "epoch": 0.9053803339517625,
7426
+ "grad_norm": 5.901011943817139,
7427
+ "learning_rate": 5.407211620009545e-07,
7428
+ "loss": 0.1743,
7429
+ "step": 976
7430
+ },
7431
+ {
7432
+ "epoch": 0.9063079777365491,
7433
+ "grad_norm": 3.838674783706665,
7434
+ "learning_rate": 5.30265014699628e-07,
7435
+ "loss": 0.1728,
7436
+ "step": 977
7437
+ },
7438
+ {
7439
+ "epoch": 0.9072356215213359,
7440
+ "grad_norm": 3.811453104019165,
7441
+ "learning_rate": 5.199082004372958e-07,
7442
+ "loss": 0.153,
7443
+ "step": 978
7444
+ },
7445
+ {
7446
+ "epoch": 0.9081632653061225,
7447
+ "grad_norm": 5.14892578125,
7448
+ "learning_rate": 5.096508278520385e-07,
7449
+ "loss": 0.1991,
7450
+ "step": 979
7451
+ },
7452
+ {
7453
+ "epoch": 0.9090909090909091,
7454
+ "grad_norm": 3.6292712688446045,
7455
+ "learning_rate": 4.994930045388414e-07,
7456
+ "loss": 0.1452,
7457
+ "step": 980
7458
+ },
7459
+ {
7460
+ "epoch": 0.9090909090909091,
7461
+ "eval_accuracy": 0.8603104212860311,
7462
+ "eval_f1": 0.7028301886792453,
7463
+ "eval_loss": 0.3034472167491913,
7464
+ "eval_precision": 0.8713450292397661,
7465
+ "eval_recall": 0.5889328063241107,
7466
+ "eval_runtime": 48.7572,
7467
+ "eval_samples_per_second": 5.661,
7468
+ "eval_steps_per_second": 0.185,
7469
+ "step": 980
7470
+ },
7471
+ {
7472
+ "epoch": 0.9100185528756958,
7473
+ "grad_norm": 3.438109874725342,
7474
+ "learning_rate": 4.894348370484648e-07,
7475
+ "loss": 0.1054,
7476
+ "step": 981
7477
+ },
7478
+ {
7479
+ "epoch": 0.9109461966604824,
7480
+ "grad_norm": 5.481462478637695,
7481
+ "learning_rate": 4.794764308863242e-07,
7482
+ "loss": 0.1463,
7483
+ "step": 982
7484
+ },
7485
+ {
7486
+ "epoch": 0.911873840445269,
7487
+ "grad_norm": 6.784456253051758,
7488
+ "learning_rate": 4.696178905113913e-07,
7489
+ "loss": 0.1634,
7490
+ "step": 983
7491
+ },
7492
+ {
7493
+ "epoch": 0.9128014842300557,
7494
+ "grad_norm": 3.902355194091797,
7495
+ "learning_rate": 4.5985931933508757e-07,
7496
+ "loss": 0.1689,
7497
+ "step": 984
7498
+ },
7499
+ {
7500
+ "epoch": 0.9137291280148423,
7501
+ "grad_norm": 4.524623394012451,
7502
+ "learning_rate": 4.502008197202068e-07,
7503
+ "loss": 0.1428,
7504
+ "step": 985
7505
+ },
7506
+ {
7507
+ "epoch": 0.9146567717996289,
7508
+ "grad_norm": 3.976349353790283,
7509
+ "learning_rate": 4.406424929798403e-07,
7510
+ "loss": 0.1864,
7511
+ "step": 986
7512
+ },
7513
+ {
7514
+ "epoch": 0.9155844155844156,
7515
+ "grad_norm": 3.3905527591705322,
7516
+ "learning_rate": 4.3118443937631094e-07,
7517
+ "loss": 0.1719,
7518
+ "step": 987
7519
+ },
7520
+ {
7521
+ "epoch": 0.9165120593692022,
7522
+ "grad_norm": 4.1316938400268555,
7523
+ "learning_rate": 4.218267581201296e-07,
7524
+ "loss": 0.1124,
7525
+ "step": 988
7526
+ },
7527
+ {
7528
+ "epoch": 0.9174397031539888,
7529
+ "grad_norm": 5.6381754875183105,
7530
+ "learning_rate": 4.125695473689406e-07,
7531
+ "loss": 0.1994,
7532
+ "step": 989
7533
+ },
7534
+ {
7535
+ "epoch": 0.9183673469387755,
7536
+ "grad_norm": 7.625948905944824,
7537
+ "learning_rate": 4.034129042265067e-07,
7538
+ "loss": 0.2211,
7539
+ "step": 990
7540
+ },
7541
+ {
7542
+ "epoch": 0.9192949907235621,
7543
+ "grad_norm": 3.567246437072754,
7544
+ "learning_rate": 3.943569247416801e-07,
7545
+ "loss": 0.1359,
7546
+ "step": 991
7547
+ },
7548
+ {
7549
+ "epoch": 0.9202226345083488,
7550
+ "grad_norm": 4.336119174957275,
7551
+ "learning_rate": 3.8540170390740097e-07,
7552
+ "loss": 0.1519,
7553
+ "step": 992
7554
+ },
7555
+ {
7556
+ "epoch": 0.9211502782931354,
7557
+ "grad_norm": 4.9389848709106445,
7558
+ "learning_rate": 3.7654733565969826e-07,
7559
+ "loss": 0.1874,
7560
+ "step": 993
7561
+ },
7562
+ {
7563
+ "epoch": 0.922077922077922,
7564
+ "grad_norm": 3.25769305229187,
7565
+ "learning_rate": 3.67793912876705e-07,
7566
+ "loss": 0.1191,
7567
+ "step": 994
7568
+ },
7569
+ {
7570
+ "epoch": 0.9230055658627088,
7571
+ "grad_norm": 3.4334826469421387,
7572
+ "learning_rate": 3.591415273776855e-07,
7573
+ "loss": 0.1012,
7574
+ "step": 995
7575
+ },
7576
+ {
7577
+ "epoch": 0.9239332096474954,
7578
+ "grad_norm": 3.1981468200683594,
7579
+ "learning_rate": 3.5059026992206645e-07,
7580
+ "loss": 0.0812,
7581
+ "step": 996
7582
+ },
7583
+ {
7584
+ "epoch": 0.924860853432282,
7585
+ "grad_norm": 5.118222236633301,
7586
+ "learning_rate": 3.421402302084953e-07,
7587
+ "loss": 0.1293,
7588
+ "step": 997
7589
+ },
7590
+ {
7591
+ "epoch": 0.9257884972170687,
7592
+ "grad_norm": 4.047184944152832,
7593
+ "learning_rate": 3.3379149687388866e-07,
7594
+ "loss": 0.1723,
7595
+ "step": 998
7596
+ },
7597
+ {
7598
+ "epoch": 0.9267161410018553,
7599
+ "grad_norm": 7.083133220672607,
7600
+ "learning_rate": 3.255441574925089e-07,
7601
+ "loss": 0.2061,
7602
+ "step": 999
7603
+ },
7604
+ {
7605
+ "epoch": 0.9276437847866419,
7606
+ "grad_norm": 2.8097355365753174,
7607
+ "learning_rate": 3.1739829857504235e-07,
7608
+ "loss": 0.086,
7609
+ "step": 1000
7610
+ },
7611
+ {
7612
+ "epoch": 0.9276437847866419,
7613
+ "eval_accuracy": 0.8580931263858093,
7614
+ "eval_f1": 0.6966824644549763,
7615
+ "eval_loss": 0.30505669116973877,
7616
+ "eval_precision": 0.8698224852071006,
7617
+ "eval_recall": 0.5810276679841897,
7618
+ "eval_runtime": 47.8654,
7619
+ "eval_samples_per_second": 5.766,
7620
+ "eval_steps_per_second": 0.188,
7621
+ "step": 1000
7622
  }
7623
  ],
7624
  "logging_steps": 1,
 
7638
  "attributes": {}
7639
  }
7640
  },
7641
+ "total_flos": 3.198993040534405e+17,
7642
  "train_batch_size": 8,
7643
  "trial_name": null,
7644
  "trial_params": null