mtzig commited on
Commit
e88290c
·
verified ·
1 Parent(s): 634757b

Training in progress, step 900, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68eb5ed2e06aaf8ccf51155c64c6d00bc22fb8856e437e0118b64ceffe8a67cb
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb8e62aebef304af7f29719c0fad923798eb330385aeb4124ebe5905d2f7893
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed69f0ff9655bfaab7272f6ad7c0ae7dd2a7e69609ff879b264ac67fb89fa9f1
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:925451c7a47e11cfdf5d3e79ff8e1d1616fa31bfbe9fd4dd921a8d07495100a5
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9bbd790913cc32d61c035f94711e5213bc995991b0da5ce1e10cae6c207d6aff
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80aebfaa38d8fd15efc2f80ca22db9271add0dfa1df7cd1b9ec3a6cafcc1e980
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:47ee402b977d2bcdbecf23c2cac2e41b1eece5636309269bebeeddbf9c83f8d2
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e93ab70baf5fb3f49fc902766d3981884a3c59e7111a93ea08d704b5eaca5524
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3d4686b220e210f4828731ecf8026fe744b87154e9a5102165f52041740b6ac
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1f8093292209f6718d35e0ccd016f16652167381bf80627ae426fc0a96d439c
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff71901f9fc3dd0a0169fd09f191b1474c7af397ce776f7c375bb3b28eb7bd0a
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f14ea077a90622e6b4dc501a0231b02a369eb0516f972a2f1408a934b610f29
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7bd67655dcdb83e7ff22694d01a9823713b17e32ab4a1e5c80da2d423d2c37fc
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:716d166395b1d3204d4c7983923c858c7814b0e98a579025f7d5396f16ff6dbc
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bebb458ab33cd9a17efd02365cbba8cf0457b1be88b705a8de29ec92988fadf4
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f369ee493042be075f560b3402758308791790a6b19967c254c943ef54144890
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:930942e93c4300034c378c60f4a0dab9ede67a9d39a6de67496fcfc48b953599
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d70614fd7a6e0bc09ddffd7fa93961bcf9eea5616a951374e008e680df41fe5
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:07bfe832d1e9096039f344c8603706dbf51d8a2520da53b6fb91c2a831e3c65c
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53ddb89813eac5e34cc35dc1ec465872bcd28d173f301c7ce65f1667e4d5f404
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f806da0a270ca93d00536c431b16e74277199f672ba1a254a8c26faa0eba661b
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbbf9a445428ea3735d412cb42b094a0445cbab134f49cd9d71dd69330ce45b5
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a4acd0e09a01d2645011aa7cce0fbe01327e6aa40a58caa03b7a2ab1d8edd4a
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe5328f1346602824c30aa9cc75535e926a4d8ce4ca9da88e40f8ce89791ebde
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:044d30e23b739bee6ba5f240459efac0778a387063ea9930d3e3133b74846187
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed607b5799c4b3e2659af93e02b0fa11a91a2ab37a2feec0e0666f1663f216c4
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7421150278293135,
5
  "eval_steps": 20,
6
- "global_step": 800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -6099,6 +6099,766 @@
6099
  "eval_samples_per_second": 5.805,
6100
  "eval_steps_per_second": 0.189,
6101
  "step": 800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6102
  }
6103
  ],
6104
  "logging_steps": 1,
@@ -6118,7 +6878,7 @@
6118
  "attributes": {}
6119
  }
6120
  },
6121
- "total_flos": 2.5585969473467187e+17,
6122
  "train_batch_size": 8,
6123
  "trial_name": null,
6124
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.8348794063079777,
5
  "eval_steps": 20,
6
+ "global_step": 900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
6099
  "eval_samples_per_second": 5.805,
6100
  "eval_steps_per_second": 0.189,
6101
  "step": 800
6102
+ },
6103
+ {
6104
+ "epoch": 0.7430426716141002,
6105
+ "grad_norm": 3.8346760272979736,
6106
+ "learning_rate": 3.7614852762326303e-06,
6107
+ "loss": 0.1511,
6108
+ "step": 801
6109
+ },
6110
+ {
6111
+ "epoch": 0.7439703153988868,
6112
+ "grad_norm": 3.3582327365875244,
6113
+ "learning_rate": 3.736205777078381e-06,
6114
+ "loss": 0.099,
6115
+ "step": 802
6116
+ },
6117
+ {
6118
+ "epoch": 0.7448979591836735,
6119
+ "grad_norm": 4.043883323669434,
6120
+ "learning_rate": 3.7109919821615546e-06,
6121
+ "loss": 0.1684,
6122
+ "step": 803
6123
+ },
6124
+ {
6125
+ "epoch": 0.7458256029684601,
6126
+ "grad_norm": 4.061108112335205,
6127
+ "learning_rate": 3.685844155962931e-06,
6128
+ "loss": 0.1874,
6129
+ "step": 804
6130
+ },
6131
+ {
6132
+ "epoch": 0.7467532467532467,
6133
+ "grad_norm": 2.596107244491577,
6134
+ "learning_rate": 3.6607625622713005e-06,
6135
+ "loss": 0.1146,
6136
+ "step": 805
6137
+ },
6138
+ {
6139
+ "epoch": 0.7476808905380334,
6140
+ "grad_norm": 3.7484891414642334,
6141
+ "learning_rate": 3.63574746418072e-06,
6142
+ "loss": 0.1027,
6143
+ "step": 806
6144
+ },
6145
+ {
6146
+ "epoch": 0.74860853432282,
6147
+ "grad_norm": 4.787120342254639,
6148
+ "learning_rate": 3.610799124087725e-06,
6149
+ "loss": 0.2284,
6150
+ "step": 807
6151
+ },
6152
+ {
6153
+ "epoch": 0.7495361781076066,
6154
+ "grad_norm": 3.2740108966827393,
6155
+ "learning_rate": 3.585917803688603e-06,
6156
+ "loss": 0.126,
6157
+ "step": 808
6158
+ },
6159
+ {
6160
+ "epoch": 0.7504638218923934,
6161
+ "grad_norm": 3.000568389892578,
6162
+ "learning_rate": 3.5611037639766267e-06,
6163
+ "loss": 0.1258,
6164
+ "step": 809
6165
+ },
6166
+ {
6167
+ "epoch": 0.75139146567718,
6168
+ "grad_norm": 3.994319200515747,
6169
+ "learning_rate": 3.536357265239333e-06,
6170
+ "loss": 0.2042,
6171
+ "step": 810
6172
+ },
6173
+ {
6174
+ "epoch": 0.7523191094619666,
6175
+ "grad_norm": 3.8152012825012207,
6176
+ "learning_rate": 3.511678567055786e-06,
6177
+ "loss": 0.147,
6178
+ "step": 811
6179
+ },
6180
+ {
6181
+ "epoch": 0.7532467532467533,
6182
+ "grad_norm": 3.8970158100128174,
6183
+ "learning_rate": 3.487067928293848e-06,
6184
+ "loss": 0.142,
6185
+ "step": 812
6186
+ },
6187
+ {
6188
+ "epoch": 0.7541743970315399,
6189
+ "grad_norm": 4.383775234222412,
6190
+ "learning_rate": 3.4625256071074776e-06,
6191
+ "loss": 0.1752,
6192
+ "step": 813
6193
+ },
6194
+ {
6195
+ "epoch": 0.7551020408163265,
6196
+ "grad_norm": 3.43391489982605,
6197
+ "learning_rate": 3.4380518609340076e-06,
6198
+ "loss": 0.1207,
6199
+ "step": 814
6200
+ },
6201
+ {
6202
+ "epoch": 0.7560296846011132,
6203
+ "grad_norm": 3.767289161682129,
6204
+ "learning_rate": 3.413646946491458e-06,
6205
+ "loss": 0.1301,
6206
+ "step": 815
6207
+ },
6208
+ {
6209
+ "epoch": 0.7569573283858998,
6210
+ "grad_norm": 3.6454761028289795,
6211
+ "learning_rate": 3.3893111197758276e-06,
6212
+ "loss": 0.1729,
6213
+ "step": 816
6214
+ },
6215
+ {
6216
+ "epoch": 0.7578849721706865,
6217
+ "grad_norm": 4.36724328994751,
6218
+ "learning_rate": 3.3650446360584276e-06,
6219
+ "loss": 0.1707,
6220
+ "step": 817
6221
+ },
6222
+ {
6223
+ "epoch": 0.7588126159554731,
6224
+ "grad_norm": 2.9927897453308105,
6225
+ "learning_rate": 3.3408477498831917e-06,
6226
+ "loss": 0.112,
6227
+ "step": 818
6228
+ },
6229
+ {
6230
+ "epoch": 0.7597402597402597,
6231
+ "grad_norm": 4.448176860809326,
6232
+ "learning_rate": 3.3167207150640003e-06,
6233
+ "loss": 0.1064,
6234
+ "step": 819
6235
+ },
6236
+ {
6237
+ "epoch": 0.7606679035250464,
6238
+ "grad_norm": 6.467900276184082,
6239
+ "learning_rate": 3.2926637846820366e-06,
6240
+ "loss": 0.249,
6241
+ "step": 820
6242
+ },
6243
+ {
6244
+ "epoch": 0.7606679035250464,
6245
+ "eval_accuracy": 0.8592017738359202,
6246
+ "eval_f1": 0.6997635933806147,
6247
+ "eval_loss": 0.3030702769756317,
6248
+ "eval_precision": 0.8705882352941177,
6249
+ "eval_recall": 0.5849802371541502,
6250
+ "eval_runtime": 47.6515,
6251
+ "eval_samples_per_second": 5.792,
6252
+ "eval_steps_per_second": 0.189,
6253
+ "step": 820
6254
+ },
6255
+ {
6256
+ "epoch": 0.761595547309833,
6257
+ "grad_norm": 3.21474027633667,
6258
+ "learning_rate": 3.268677211083109e-06,
6259
+ "loss": 0.1103,
6260
+ "step": 821
6261
+ },
6262
+ {
6263
+ "epoch": 0.7625231910946196,
6264
+ "grad_norm": 3.8230926990509033,
6265
+ "learning_rate": 3.2447612458750365e-06,
6266
+ "loss": 0.1608,
6267
+ "step": 822
6268
+ },
6269
+ {
6270
+ "epoch": 0.7634508348794063,
6271
+ "grad_norm": 4.736266136169434,
6272
+ "learning_rate": 3.2209161399249677e-06,
6273
+ "loss": 0.1549,
6274
+ "step": 823
6275
+ },
6276
+ {
6277
+ "epoch": 0.764378478664193,
6278
+ "grad_norm": 5.4161481857299805,
6279
+ "learning_rate": 3.197142143356787e-06,
6280
+ "loss": 0.1905,
6281
+ "step": 824
6282
+ },
6283
+ {
6284
+ "epoch": 0.7653061224489796,
6285
+ "grad_norm": 3.857360601425171,
6286
+ "learning_rate": 3.1734395055484623e-06,
6287
+ "loss": 0.1287,
6288
+ "step": 825
6289
+ },
6290
+ {
6291
+ "epoch": 0.7662337662337663,
6292
+ "grad_norm": 4.246245861053467,
6293
+ "learning_rate": 3.1498084751294523e-06,
6294
+ "loss": 0.1751,
6295
+ "step": 826
6296
+ },
6297
+ {
6298
+ "epoch": 0.7671614100185529,
6299
+ "grad_norm": 4.21674108505249,
6300
+ "learning_rate": 3.126249299978086e-06,
6301
+ "loss": 0.1593,
6302
+ "step": 827
6303
+ },
6304
+ {
6305
+ "epoch": 0.7680890538033395,
6306
+ "grad_norm": 3.7095324993133545,
6307
+ "learning_rate": 3.1027622272189572e-06,
6308
+ "loss": 0.1384,
6309
+ "step": 828
6310
+ },
6311
+ {
6312
+ "epoch": 0.7690166975881262,
6313
+ "grad_norm": 6.3794965744018555,
6314
+ "learning_rate": 3.0793475032203513e-06,
6315
+ "loss": 0.1583,
6316
+ "step": 829
6317
+ },
6318
+ {
6319
+ "epoch": 0.7699443413729128,
6320
+ "grad_norm": 3.0277578830718994,
6321
+ "learning_rate": 3.0560053735916372e-06,
6322
+ "loss": 0.1043,
6323
+ "step": 830
6324
+ },
6325
+ {
6326
+ "epoch": 0.7708719851576994,
6327
+ "grad_norm": 5.117831707000732,
6328
+ "learning_rate": 3.032736083180716e-06,
6329
+ "loss": 0.15,
6330
+ "step": 831
6331
+ },
6332
+ {
6333
+ "epoch": 0.7717996289424861,
6334
+ "grad_norm": 2.76505184173584,
6335
+ "learning_rate": 3.009539876071427e-06,
6336
+ "loss": 0.0558,
6337
+ "step": 832
6338
+ },
6339
+ {
6340
+ "epoch": 0.7727272727272727,
6341
+ "grad_norm": 4.057276725769043,
6342
+ "learning_rate": 2.9864169955810085e-06,
6343
+ "loss": 0.1391,
6344
+ "step": 833
6345
+ },
6346
+ {
6347
+ "epoch": 0.7736549165120594,
6348
+ "grad_norm": 3.700852870941162,
6349
+ "learning_rate": 2.9633676842575386e-06,
6350
+ "loss": 0.1721,
6351
+ "step": 834
6352
+ },
6353
+ {
6354
+ "epoch": 0.774582560296846,
6355
+ "grad_norm": 4.1468939781188965,
6356
+ "learning_rate": 2.940392183877382e-06,
6357
+ "loss": 0.1866,
6358
+ "step": 835
6359
+ },
6360
+ {
6361
+ "epoch": 0.7755102040816326,
6362
+ "grad_norm": 3.1190547943115234,
6363
+ "learning_rate": 2.9174907354426696e-06,
6364
+ "loss": 0.1292,
6365
+ "step": 836
6366
+ },
6367
+ {
6368
+ "epoch": 0.7764378478664193,
6369
+ "grad_norm": 3.0235095024108887,
6370
+ "learning_rate": 2.8946635791787546e-06,
6371
+ "loss": 0.0629,
6372
+ "step": 837
6373
+ },
6374
+ {
6375
+ "epoch": 0.7773654916512059,
6376
+ "grad_norm": 3.435035467147827,
6377
+ "learning_rate": 2.8719109545317102e-06,
6378
+ "loss": 0.1064,
6379
+ "step": 838
6380
+ },
6381
+ {
6382
+ "epoch": 0.7782931354359925,
6383
+ "grad_norm": 5.368072986602783,
6384
+ "learning_rate": 2.849233100165795e-06,
6385
+ "loss": 0.1662,
6386
+ "step": 839
6387
+ },
6388
+ {
6389
+ "epoch": 0.7792207792207793,
6390
+ "grad_norm": 4.488304615020752,
6391
+ "learning_rate": 2.8266302539609747e-06,
6392
+ "loss": 0.2033,
6393
+ "step": 840
6394
+ },
6395
+ {
6396
+ "epoch": 0.7792207792207793,
6397
+ "eval_accuracy": 0.8592017738359202,
6398
+ "eval_f1": 0.6983372921615202,
6399
+ "eval_loss": 0.30760514736175537,
6400
+ "eval_precision": 0.875,
6401
+ "eval_recall": 0.5810276679841897,
6402
+ "eval_runtime": 47.6262,
6403
+ "eval_samples_per_second": 5.795,
6404
+ "eval_steps_per_second": 0.189,
6405
+ "step": 840
6406
+ },
6407
+ {
6408
+ "epoch": 0.7801484230055659,
6409
+ "grad_norm": 4.42378044128418,
6410
+ "learning_rate": 2.8041026530104144e-06,
6411
+ "loss": 0.1223,
6412
+ "step": 841
6413
+ },
6414
+ {
6415
+ "epoch": 0.7810760667903525,
6416
+ "grad_norm": 3.143146514892578,
6417
+ "learning_rate": 2.78165053361798e-06,
6418
+ "loss": 0.1408,
6419
+ "step": 842
6420
+ },
6421
+ {
6422
+ "epoch": 0.7820037105751392,
6423
+ "grad_norm": 3.8427817821502686,
6424
+ "learning_rate": 2.759274131295787e-06,
6425
+ "loss": 0.0995,
6426
+ "step": 843
6427
+ },
6428
+ {
6429
+ "epoch": 0.7829313543599258,
6430
+ "grad_norm": 4.896306991577148,
6431
+ "learning_rate": 2.736973680761702e-06,
6432
+ "loss": 0.1621,
6433
+ "step": 844
6434
+ },
6435
+ {
6436
+ "epoch": 0.7838589981447124,
6437
+ "grad_norm": 5.344152450561523,
6438
+ "learning_rate": 2.714749415936904e-06,
6439
+ "loss": 0.1749,
6440
+ "step": 845
6441
+ },
6442
+ {
6443
+ "epoch": 0.7847866419294991,
6444
+ "grad_norm": 4.093927383422852,
6445
+ "learning_rate": 2.692601569943407e-06,
6446
+ "loss": 0.1716,
6447
+ "step": 846
6448
+ },
6449
+ {
6450
+ "epoch": 0.7857142857142857,
6451
+ "grad_norm": 3.3568267822265625,
6452
+ "learning_rate": 2.670530375101641e-06,
6453
+ "loss": 0.1658,
6454
+ "step": 847
6455
+ },
6456
+ {
6457
+ "epoch": 0.7866419294990723,
6458
+ "grad_norm": 3.3225290775299072,
6459
+ "learning_rate": 2.648536062927999e-06,
6460
+ "loss": 0.1358,
6461
+ "step": 848
6462
+ },
6463
+ {
6464
+ "epoch": 0.787569573283859,
6465
+ "grad_norm": 4.490353584289551,
6466
+ "learning_rate": 2.6266188641324e-06,
6467
+ "loss": 0.1178,
6468
+ "step": 849
6469
+ },
6470
+ {
6471
+ "epoch": 0.7884972170686456,
6472
+ "grad_norm": 3.4706945419311523,
6473
+ "learning_rate": 2.604779008615895e-06,
6474
+ "loss": 0.0946,
6475
+ "step": 850
6476
+ },
6477
+ {
6478
+ "epoch": 0.7894248608534323,
6479
+ "grad_norm": 5.027894020080566,
6480
+ "learning_rate": 2.583016725468226e-06,
6481
+ "loss": 0.1434,
6482
+ "step": 851
6483
+ },
6484
+ {
6485
+ "epoch": 0.7903525046382189,
6486
+ "grad_norm": 4.010559558868408,
6487
+ "learning_rate": 2.5613322429654573e-06,
6488
+ "loss": 0.158,
6489
+ "step": 852
6490
+ },
6491
+ {
6492
+ "epoch": 0.7912801484230055,
6493
+ "grad_norm": 6.057810306549072,
6494
+ "learning_rate": 2.5397257885675396e-06,
6495
+ "loss": 0.1912,
6496
+ "step": 853
6497
+ },
6498
+ {
6499
+ "epoch": 0.7922077922077922,
6500
+ "grad_norm": 5.5663371086120605,
6501
+ "learning_rate": 2.5181975889159615e-06,
6502
+ "loss": 0.2398,
6503
+ "step": 854
6504
+ },
6505
+ {
6506
+ "epoch": 0.7931354359925789,
6507
+ "grad_norm": 2.8672025203704834,
6508
+ "learning_rate": 2.496747869831345e-06,
6509
+ "loss": 0.0944,
6510
+ "step": 855
6511
+ },
6512
+ {
6513
+ "epoch": 0.7940630797773655,
6514
+ "grad_norm": 4.048581123352051,
6515
+ "learning_rate": 2.475376856311097e-06,
6516
+ "loss": 0.1603,
6517
+ "step": 856
6518
+ },
6519
+ {
6520
+ "epoch": 0.7949907235621522,
6521
+ "grad_norm": 5.737659931182861,
6522
+ "learning_rate": 2.4540847725270376e-06,
6523
+ "loss": 0.1366,
6524
+ "step": 857
6525
+ },
6526
+ {
6527
+ "epoch": 0.7959183673469388,
6528
+ "grad_norm": 4.4261155128479,
6529
+ "learning_rate": 2.432871841823047e-06,
6530
+ "loss": 0.1939,
6531
+ "step": 858
6532
+ },
6533
+ {
6534
+ "epoch": 0.7968460111317254,
6535
+ "grad_norm": 5.629834175109863,
6536
+ "learning_rate": 2.411738286712735e-06,
6537
+ "loss": 0.2281,
6538
+ "step": 859
6539
+ },
6540
+ {
6541
+ "epoch": 0.7977736549165121,
6542
+ "grad_norm": 3.919034004211426,
6543
+ "learning_rate": 2.390684328877089e-06,
6544
+ "loss": 0.1418,
6545
+ "step": 860
6546
+ },
6547
+ {
6548
+ "epoch": 0.7977736549165121,
6549
+ "eval_accuracy": 0.8614190687361419,
6550
+ "eval_f1": 0.7072599531615925,
6551
+ "eval_loss": 0.29977986216545105,
6552
+ "eval_precision": 0.867816091954023,
6553
+ "eval_recall": 0.5968379446640316,
6554
+ "eval_runtime": 47.4113,
6555
+ "eval_samples_per_second": 5.821,
6556
+ "eval_steps_per_second": 0.19,
6557
+ "step": 860
6558
+ },
6559
+ {
6560
+ "epoch": 0.7987012987012987,
6561
+ "grad_norm": 7.607851982116699,
6562
+ "learning_rate": 2.36971018916217e-06,
6563
+ "loss": 0.187,
6564
+ "step": 861
6565
+ },
6566
+ {
6567
+ "epoch": 0.7996289424860853,
6568
+ "grad_norm": 3.1179118156433105,
6569
+ "learning_rate": 2.3488160875767717e-06,
6570
+ "loss": 0.1326,
6571
+ "step": 862
6572
+ },
6573
+ {
6574
+ "epoch": 0.800556586270872,
6575
+ "grad_norm": 3.8754749298095703,
6576
+ "learning_rate": 2.328002243290138e-06,
6577
+ "loss": 0.1497,
6578
+ "step": 863
6579
+ },
6580
+ {
6581
+ "epoch": 0.8014842300556586,
6582
+ "grad_norm": 4.216552734375,
6583
+ "learning_rate": 2.307268874629649e-06,
6584
+ "loss": 0.1259,
6585
+ "step": 864
6586
+ },
6587
+ {
6588
+ "epoch": 0.8024118738404453,
6589
+ "grad_norm": 5.980984210968018,
6590
+ "learning_rate": 2.2866161990785228e-06,
6591
+ "loss": 0.1778,
6592
+ "step": 865
6593
+ },
6594
+ {
6595
+ "epoch": 0.8033395176252319,
6596
+ "grad_norm": 4.638891220092773,
6597
+ "learning_rate": 2.266044433273562e-06,
6598
+ "loss": 0.2258,
6599
+ "step": 866
6600
+ },
6601
+ {
6602
+ "epoch": 0.8042671614100185,
6603
+ "grad_norm": 3.3244616985321045,
6604
+ "learning_rate": 2.245553793002849e-06,
6605
+ "loss": 0.1215,
6606
+ "step": 867
6607
+ },
6608
+ {
6609
+ "epoch": 0.8051948051948052,
6610
+ "grad_norm": 6.757506370544434,
6611
+ "learning_rate": 2.2251444932035094e-06,
6612
+ "loss": 0.159,
6613
+ "step": 868
6614
+ },
6615
+ {
6616
+ "epoch": 0.8061224489795918,
6617
+ "grad_norm": 5.527317523956299,
6618
+ "learning_rate": 2.204816747959434e-06,
6619
+ "loss": 0.1025,
6620
+ "step": 869
6621
+ },
6622
+ {
6623
+ "epoch": 0.8070500927643784,
6624
+ "grad_norm": 3.5259664058685303,
6625
+ "learning_rate": 2.184570770499056e-06,
6626
+ "loss": 0.1522,
6627
+ "step": 870
6628
+ },
6629
+ {
6630
+ "epoch": 0.8079777365491652,
6631
+ "grad_norm": 4.936224937438965,
6632
+ "learning_rate": 2.1644067731931005e-06,
6633
+ "loss": 0.1679,
6634
+ "step": 871
6635
+ },
6636
+ {
6637
+ "epoch": 0.8089053803339518,
6638
+ "grad_norm": 4.338299751281738,
6639
+ "learning_rate": 2.1443249675523536e-06,
6640
+ "loss": 0.1705,
6641
+ "step": 872
6642
+ },
6643
+ {
6644
+ "epoch": 0.8098330241187384,
6645
+ "grad_norm": 3.631812334060669,
6646
+ "learning_rate": 2.124325564225458e-06,
6647
+ "loss": 0.1034,
6648
+ "step": 873
6649
+ },
6650
+ {
6651
+ "epoch": 0.8107606679035251,
6652
+ "grad_norm": 3.276409864425659,
6653
+ "learning_rate": 2.1044087729966856e-06,
6654
+ "loss": 0.1486,
6655
+ "step": 874
6656
+ },
6657
+ {
6658
+ "epoch": 0.8116883116883117,
6659
+ "grad_norm": 5.677032947540283,
6660
+ "learning_rate": 2.0845748027837585e-06,
6661
+ "loss": 0.2155,
6662
+ "step": 875
6663
+ },
6664
+ {
6665
+ "epoch": 0.8126159554730983,
6666
+ "grad_norm": 6.064774036407471,
6667
+ "learning_rate": 2.064823861635633e-06,
6668
+ "loss": 0.1773,
6669
+ "step": 876
6670
+ },
6671
+ {
6672
+ "epoch": 0.813543599257885,
6673
+ "grad_norm": 3.783052444458008,
6674
+ "learning_rate": 2.0451561567303378e-06,
6675
+ "loss": 0.1696,
6676
+ "step": 877
6677
+ },
6678
+ {
6679
+ "epoch": 0.8144712430426716,
6680
+ "grad_norm": 3.9778928756713867,
6681
+ "learning_rate": 2.025571894372794e-06,
6682
+ "loss": 0.0916,
6683
+ "step": 878
6684
+ },
6685
+ {
6686
+ "epoch": 0.8153988868274582,
6687
+ "grad_norm": 3.2632224559783936,
6688
+ "learning_rate": 2.0060712799926407e-06,
6689
+ "loss": 0.1004,
6690
+ "step": 879
6691
+ },
6692
+ {
6693
+ "epoch": 0.8163265306122449,
6694
+ "grad_norm": 5.624824047088623,
6695
+ "learning_rate": 1.9866545181421016e-06,
6696
+ "loss": 0.1826,
6697
+ "step": 880
6698
+ },
6699
+ {
6700
+ "epoch": 0.8163265306122449,
6701
+ "eval_accuracy": 0.8625277161862528,
6702
+ "eval_f1": 0.7089201877934272,
6703
+ "eval_loss": 0.30142825841903687,
6704
+ "eval_precision": 0.8728323699421965,
6705
+ "eval_recall": 0.5968379446640316,
6706
+ "eval_runtime": 47.5017,
6707
+ "eval_samples_per_second": 5.81,
6708
+ "eval_steps_per_second": 0.189,
6709
+ "step": 880
6710
+ },
6711
+ {
6712
+ "epoch": 0.8172541743970315,
6713
+ "grad_norm": 5.620955944061279,
6714
+ "learning_rate": 1.967321812493813e-06,
6715
+ "loss": 0.159,
6716
+ "step": 881
6717
+ },
6718
+ {
6719
+ "epoch": 0.8181818181818182,
6720
+ "grad_norm": 3.5917294025421143,
6721
+ "learning_rate": 1.9480733658387175e-06,
6722
+ "loss": 0.1544,
6723
+ "step": 882
6724
+ },
6725
+ {
6726
+ "epoch": 0.8191094619666048,
6727
+ "grad_norm": 4.478516101837158,
6728
+ "learning_rate": 1.9289093800839067e-06,
6729
+ "loss": 0.1835,
6730
+ "step": 883
6731
+ },
6732
+ {
6733
+ "epoch": 0.8200371057513914,
6734
+ "grad_norm": 5.72329044342041,
6735
+ "learning_rate": 1.9098300562505266e-06,
6736
+ "loss": 0.181,
6737
+ "step": 884
6738
+ },
6739
+ {
6740
+ "epoch": 0.8209647495361782,
6741
+ "grad_norm": 4.014090538024902,
6742
+ "learning_rate": 1.8908355944716516e-06,
6743
+ "loss": 0.123,
6744
+ "step": 885
6745
+ },
6746
+ {
6747
+ "epoch": 0.8218923933209648,
6748
+ "grad_norm": 3.9502549171447754,
6749
+ "learning_rate": 1.8719261939902023e-06,
6750
+ "loss": 0.1675,
6751
+ "step": 886
6752
+ },
6753
+ {
6754
+ "epoch": 0.8228200371057514,
6755
+ "grad_norm": 3.701931953430176,
6756
+ "learning_rate": 1.8531020531568377e-06,
6757
+ "loss": 0.1297,
6758
+ "step": 887
6759
+ },
6760
+ {
6761
+ "epoch": 0.8237476808905381,
6762
+ "grad_norm": 4.026066303253174,
6763
+ "learning_rate": 1.8343633694278895e-06,
6764
+ "loss": 0.1366,
6765
+ "step": 888
6766
+ },
6767
+ {
6768
+ "epoch": 0.8246753246753247,
6769
+ "grad_norm": 4.122823715209961,
6770
+ "learning_rate": 1.8157103393632869e-06,
6771
+ "loss": 0.1819,
6772
+ "step": 889
6773
+ },
6774
+ {
6775
+ "epoch": 0.8256029684601113,
6776
+ "grad_norm": 4.512097358703613,
6777
+ "learning_rate": 1.7971431586244814e-06,
6778
+ "loss": 0.2281,
6779
+ "step": 890
6780
+ },
6781
+ {
6782
+ "epoch": 0.826530612244898,
6783
+ "grad_norm": 3.6927201747894287,
6784
+ "learning_rate": 1.7786620219724205e-06,
6785
+ "loss": 0.0749,
6786
+ "step": 891
6787
+ },
6788
+ {
6789
+ "epoch": 0.8274582560296846,
6790
+ "grad_norm": 5.3355207443237305,
6791
+ "learning_rate": 1.7602671232654755e-06,
6792
+ "loss": 0.1781,
6793
+ "step": 892
6794
+ },
6795
+ {
6796
+ "epoch": 0.8283858998144712,
6797
+ "grad_norm": 3.310504674911499,
6798
+ "learning_rate": 1.7419586554574364e-06,
6799
+ "loss": 0.0927,
6800
+ "step": 893
6801
+ },
6802
+ {
6803
+ "epoch": 0.8293135435992579,
6804
+ "grad_norm": 3.6718716621398926,
6805
+ "learning_rate": 1.723736810595461e-06,
6806
+ "loss": 0.159,
6807
+ "step": 894
6808
+ },
6809
+ {
6810
+ "epoch": 0.8302411873840445,
6811
+ "grad_norm": 4.083915710449219,
6812
+ "learning_rate": 1.7056017798180824e-06,
6813
+ "loss": 0.1516,
6814
+ "step": 895
6815
+ },
6816
+ {
6817
+ "epoch": 0.8311688311688312,
6818
+ "grad_norm": 8.39147663116455,
6819
+ "learning_rate": 1.687553753353195e-06,
6820
+ "loss": 0.1548,
6821
+ "step": 896
6822
+ },
6823
+ {
6824
+ "epoch": 0.8320964749536178,
6825
+ "grad_norm": 3.83030366897583,
6826
+ "learning_rate": 1.669592920516049e-06,
6827
+ "loss": 0.1613,
6828
+ "step": 897
6829
+ },
6830
+ {
6831
+ "epoch": 0.8330241187384044,
6832
+ "grad_norm": 3.559238910675049,
6833
+ "learning_rate": 1.6517194697072903e-06,
6834
+ "loss": 0.1193,
6835
+ "step": 898
6836
+ },
6837
+ {
6838
+ "epoch": 0.8339517625231911,
6839
+ "grad_norm": 4.015861511230469,
6840
+ "learning_rate": 1.633933588410952e-06,
6841
+ "loss": 0.1383,
6842
+ "step": 899
6843
+ },
6844
+ {
6845
+ "epoch": 0.8348794063079777,
6846
+ "grad_norm": 4.48312520980835,
6847
+ "learning_rate": 1.6162354631925203e-06,
6848
+ "loss": 0.1538,
6849
+ "step": 900
6850
+ },
6851
+ {
6852
+ "epoch": 0.8348794063079777,
6853
+ "eval_accuracy": 0.8614190687361419,
6854
+ "eval_f1": 0.7016706443914081,
6855
+ "eval_loss": 0.3091951012611389,
6856
+ "eval_precision": 0.8855421686746988,
6857
+ "eval_recall": 0.5810276679841897,
6858
+ "eval_runtime": 47.4361,
6859
+ "eval_samples_per_second": 5.818,
6860
+ "eval_steps_per_second": 0.19,
6861
+ "step": 900
6862
  }
6863
  ],
6864
  "logging_steps": 1,
 
6878
  "attributes": {}
6879
  }
6880
  },
6881
+ "total_flos": 2.8777892777715302e+17,
6882
  "train_batch_size": 8,
6883
  "trial_name": null,
6884
  "trial_params": null