mtzig commited on
Commit
af5c80b
·
verified ·
1 Parent(s): 506becf

Training in progress, step 6300, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9532cf3853865f83aa4b1512fed11a043caac16c7c7a479336cb00c08c47445f
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b28a8c69423684ee4c64da8962a7bfc59ba0c98b1b135f97d468efb2d682b7f3
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac0306f8cd79071439fb2e032b6a794dfe130b78d3f6139dacf123dfc6184db8
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:256356e5f5f129661266fd2ec5986d64e8a618f50386558442d8fd5e211f9d75
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8952265544fc4294d6fa38bb32c8013d07436ac0fa10a7ef59f2d03aaf69a899
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4400418753cdae533886a325d8574dc0fd9e84c371d8423f3b0575671aff9b5
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74cac162bd00e61ab073a6b6fa81138d15f540573e2730c348646239c0af2746
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6d0003531b8a67ee8629b1863a22b3c8772704ff5ae56a9428b25b3f9af27ca
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79f9fd3ffa5c298f49aa683a89f30a3b293edf8a4bf04e3e2e1304208647e606
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d59b6204db24eaafdf19a89c40f08932737a129af907b8fa01e86a38e864b7b
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b62917f83638a2302f8bdb8e4696e57f59c8864664078b94923b1e2952d78862
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a488e42a6c1233774282544efdbb895b44374f17a7953d74ea138b797268fdd1
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3dddadf1f078604529c0f4d51b0dfabc290ef123390e4b641aa10c7584948cc1
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a157099efd1a2813560e813b422c6d600f68c33a2bb205d7f3a61370a041b79
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21fb7db76e3758690c774743f26cd5ccb3de7c9e9ec9421fb6347ba964f73792
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd7f94bcc3a523e515db8e62f1b61f8f766e6f97044ede3fb1d022d6fec18097
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ebcdd9cb3a00187b7caf8ccddabd7425b6b74eafab1a8a7e286f4cf2c1e0dc5
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f80073c48585f31ea8d8b021958a20a34c2dfc7e8e8ec02b7ace68d8369bd89d
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3802beb66fc35db4df22557b4497b6a8fdfdf3e582059b4fe079309c7d84ad1a
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae3b6324078ae2ab8d58a5fe3558de31400b69d699a72fa9072c4fd896d7f841
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61d911caf90f35f3e5e63bf349703d8ac88e88dcfb0f587f0a27fb4ec2d5b04b
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eeb00a30bd3348fef7fa7a0dc88bf9a7a5a32f4484761a26220beef20b2e2ee5
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4faf604ceb02aaa7b878afc6f9935dd3d58f0bba74657b78471494e5a2ee20b
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95a2a891e4d47fc182ed74e57aef0f749cc61efcda057957b66e209db024a9f5
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:529be97fb31f3c3cb5a6124f64514f96e9dc11d13d1ad58796326c25a10ede28
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f98eaae48265d25e6b8b613f21a112d74712c3c7822c1f5228bd295d2e702437
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9147915898192549,
5
  "eval_steps": 20,
6
- "global_step": 6200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -47139,6 +47139,766 @@
47139
  "eval_samples_per_second": 5.842,
47140
  "eval_steps_per_second": 0.201,
47141
  "step": 6200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47142
  }
47143
  ],
47144
  "logging_steps": 1,
@@ -47158,7 +47918,7 @@
47158
  "attributes": {}
47159
  }
47160
  },
47161
- "total_flos": 1.9099781569372488e+18,
47162
  "train_batch_size": 8,
47163
  "trial_name": null,
47164
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9295462928808558,
5
  "eval_steps": 20,
6
+ "global_step": 6300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
47139
  "eval_samples_per_second": 5.842,
47140
  "eval_steps_per_second": 0.201,
47141
  "step": 6200
47142
+ },
47143
+ {
47144
+ "epoch": 0.9149391368498709,
47145
+ "grad_norm": 1.5670089721679688,
47146
+ "learning_rate": 4.369273406291519e-07,
47147
+ "loss": 0.0568,
47148
+ "step": 6201
47149
+ },
47150
+ {
47151
+ "epoch": 0.9150866838804869,
47152
+ "grad_norm": 1.30291748046875,
47153
+ "learning_rate": 4.354226460843414e-07,
47154
+ "loss": 0.0265,
47155
+ "step": 6202
47156
+ },
47157
+ {
47158
+ "epoch": 0.9152342309111029,
47159
+ "grad_norm": 1.8413887023925781,
47160
+ "learning_rate": 4.339204892861215e-07,
47161
+ "loss": 0.0398,
47162
+ "step": 6203
47163
+ },
47164
+ {
47165
+ "epoch": 0.9153817779417189,
47166
+ "grad_norm": 2.272320032119751,
47167
+ "learning_rate": 4.3242087063305684e-07,
47168
+ "loss": 0.1022,
47169
+ "step": 6204
47170
+ },
47171
+ {
47172
+ "epoch": 0.915529324972335,
47173
+ "grad_norm": 2.3131356239318848,
47174
+ "learning_rate": 4.3092379052303457e-07,
47175
+ "loss": 0.0353,
47176
+ "step": 6205
47177
+ },
47178
+ {
47179
+ "epoch": 0.9156768720029509,
47180
+ "grad_norm": 4.904222011566162,
47181
+ "learning_rate": 4.294292493532737e-07,
47182
+ "loss": 0.0836,
47183
+ "step": 6206
47184
+ },
47185
+ {
47186
+ "epoch": 0.9158244190335669,
47187
+ "grad_norm": 1.8545290231704712,
47188
+ "learning_rate": 4.2793724752031807e-07,
47189
+ "loss": 0.0445,
47190
+ "step": 6207
47191
+ },
47192
+ {
47193
+ "epoch": 0.915971966064183,
47194
+ "grad_norm": 4.446885585784912,
47195
+ "learning_rate": 4.264477854200366e-07,
47196
+ "loss": 0.065,
47197
+ "step": 6208
47198
+ },
47199
+ {
47200
+ "epoch": 0.916119513094799,
47201
+ "grad_norm": 0.9898476004600525,
47202
+ "learning_rate": 4.24960863447621e-07,
47203
+ "loss": 0.0215,
47204
+ "step": 6209
47205
+ },
47206
+ {
47207
+ "epoch": 0.9162670601254149,
47208
+ "grad_norm": 2.3899457454681396,
47209
+ "learning_rate": 4.2347648199759784e-07,
47210
+ "loss": 0.0282,
47211
+ "step": 6210
47212
+ },
47213
+ {
47214
+ "epoch": 0.916414607156031,
47215
+ "grad_norm": 1.5149897336959839,
47216
+ "learning_rate": 4.219946414638132e-07,
47217
+ "loss": 0.0236,
47218
+ "step": 6211
47219
+ },
47220
+ {
47221
+ "epoch": 0.916562154186647,
47222
+ "grad_norm": 2.0021674633026123,
47223
+ "learning_rate": 4.205153422394381e-07,
47224
+ "loss": 0.0429,
47225
+ "step": 6212
47226
+ },
47227
+ {
47228
+ "epoch": 0.916709701217263,
47229
+ "grad_norm": 3.0172886848449707,
47230
+ "learning_rate": 4.1903858471697424e-07,
47231
+ "loss": 0.0445,
47232
+ "step": 6213
47233
+ },
47234
+ {
47235
+ "epoch": 0.9168572482478791,
47236
+ "grad_norm": 4.3301310539245605,
47237
+ "learning_rate": 4.175643692882436e-07,
47238
+ "loss": 0.1072,
47239
+ "step": 6214
47240
+ },
47241
+ {
47242
+ "epoch": 0.917004795278495,
47243
+ "grad_norm": 3.8663575649261475,
47244
+ "learning_rate": 4.160926963443979e-07,
47245
+ "loss": 0.0675,
47246
+ "step": 6215
47247
+ },
47248
+ {
47249
+ "epoch": 0.917152342309111,
47250
+ "grad_norm": 1.8558573722839355,
47251
+ "learning_rate": 4.1462356627591236e-07,
47252
+ "loss": 0.0268,
47253
+ "step": 6216
47254
+ },
47255
+ {
47256
+ "epoch": 0.9172998893397271,
47257
+ "grad_norm": 1.4661530256271362,
47258
+ "learning_rate": 4.131569794725876e-07,
47259
+ "loss": 0.0256,
47260
+ "step": 6217
47261
+ },
47262
+ {
47263
+ "epoch": 0.9174474363703431,
47264
+ "grad_norm": 1.4858981370925903,
47265
+ "learning_rate": 4.1169293632355114e-07,
47266
+ "loss": 0.0414,
47267
+ "step": 6218
47268
+ },
47269
+ {
47270
+ "epoch": 0.917594983400959,
47271
+ "grad_norm": 4.485099792480469,
47272
+ "learning_rate": 4.1023143721725e-07,
47273
+ "loss": 0.0801,
47274
+ "step": 6219
47275
+ },
47276
+ {
47277
+ "epoch": 0.917742530431575,
47278
+ "grad_norm": 3.052419662475586,
47279
+ "learning_rate": 4.0877248254146404e-07,
47280
+ "loss": 0.0737,
47281
+ "step": 6220
47282
+ },
47283
+ {
47284
+ "epoch": 0.917742530431575,
47285
+ "eval_accuracy": 0.9797395079594791,
47286
+ "eval_f1": 0.9653465346534653,
47287
+ "eval_loss": 0.05517810955643654,
47288
+ "eval_precision": 0.9848484848484849,
47289
+ "eval_recall": 0.9466019417475728,
47290
+ "eval_runtime": 50.2014,
47291
+ "eval_samples_per_second": 5.797,
47292
+ "eval_steps_per_second": 0.199,
47293
+ "step": 6220
47294
+ },
47295
+ {
47296
+ "epoch": 0.9178900774621911,
47297
+ "grad_norm": 2.319796562194824,
47298
+ "learning_rate": 4.0731607268329477e-07,
47299
+ "loss": 0.0639,
47300
+ "step": 6221
47301
+ },
47302
+ {
47303
+ "epoch": 0.9180376244928071,
47304
+ "grad_norm": 2.6878068447113037,
47305
+ "learning_rate": 4.058622080291652e-07,
47306
+ "loss": 0.0571,
47307
+ "step": 6222
47308
+ },
47309
+ {
47310
+ "epoch": 0.918185171523423,
47311
+ "grad_norm": 2.0226361751556396,
47312
+ "learning_rate": 4.0441088896482574e-07,
47313
+ "loss": 0.0681,
47314
+ "step": 6223
47315
+ },
47316
+ {
47317
+ "epoch": 0.9183327185540391,
47318
+ "grad_norm": 1.9138096570968628,
47319
+ "learning_rate": 4.029621158753538e-07,
47320
+ "loss": 0.0566,
47321
+ "step": 6224
47322
+ },
47323
+ {
47324
+ "epoch": 0.9184802655846551,
47325
+ "grad_norm": 2.777580976486206,
47326
+ "learning_rate": 4.015158891451476e-07,
47327
+ "loss": 0.0417,
47328
+ "step": 6225
47329
+ },
47330
+ {
47331
+ "epoch": 0.9186278126152712,
47332
+ "grad_norm": 1.0402213335037231,
47333
+ "learning_rate": 4.000722091579301e-07,
47334
+ "loss": 0.0174,
47335
+ "step": 6226
47336
+ },
47337
+ {
47338
+ "epoch": 0.9187753596458871,
47339
+ "grad_norm": 3.61226749420166,
47340
+ "learning_rate": 3.986310762967527e-07,
47341
+ "loss": 0.0224,
47342
+ "step": 6227
47343
+ },
47344
+ {
47345
+ "epoch": 0.9189229066765031,
47346
+ "grad_norm": 1.522729516029358,
47347
+ "learning_rate": 3.971924909439828e-07,
47348
+ "loss": 0.052,
47349
+ "step": 6228
47350
+ },
47351
+ {
47352
+ "epoch": 0.9190704537071192,
47353
+ "grad_norm": 3.099111557006836,
47354
+ "learning_rate": 3.9575645348132074e-07,
47355
+ "loss": 0.1204,
47356
+ "step": 6229
47357
+ },
47358
+ {
47359
+ "epoch": 0.9192180007377352,
47360
+ "grad_norm": 1.7007861137390137,
47361
+ "learning_rate": 3.943229642897861e-07,
47362
+ "loss": 0.0405,
47363
+ "step": 6230
47364
+ },
47365
+ {
47366
+ "epoch": 0.9193655477683511,
47367
+ "grad_norm": 5.425076484680176,
47368
+ "learning_rate": 3.9289202374972247e-07,
47369
+ "loss": 0.1331,
47370
+ "step": 6231
47371
+ },
47372
+ {
47373
+ "epoch": 0.9195130947989671,
47374
+ "grad_norm": 3.1894729137420654,
47375
+ "learning_rate": 3.9146363224079943e-07,
47376
+ "loss": 0.1074,
47377
+ "step": 6232
47378
+ },
47379
+ {
47380
+ "epoch": 0.9196606418295832,
47381
+ "grad_norm": 4.616338729858398,
47382
+ "learning_rate": 3.90037790142006e-07,
47383
+ "loss": 0.0738,
47384
+ "step": 6233
47385
+ },
47386
+ {
47387
+ "epoch": 0.9198081888601992,
47388
+ "grad_norm": 2.9994616508483887,
47389
+ "learning_rate": 3.886144978316586e-07,
47390
+ "loss": 0.1025,
47391
+ "step": 6234
47392
+ },
47393
+ {
47394
+ "epoch": 0.9199557358908153,
47395
+ "grad_norm": 3.527212142944336,
47396
+ "learning_rate": 3.8719375568739834e-07,
47397
+ "loss": 0.0572,
47398
+ "step": 6235
47399
+ },
47400
+ {
47401
+ "epoch": 0.9201032829214312,
47402
+ "grad_norm": 1.8691866397857666,
47403
+ "learning_rate": 3.8577556408618487e-07,
47404
+ "loss": 0.0501,
47405
+ "step": 6236
47406
+ },
47407
+ {
47408
+ "epoch": 0.9202508299520472,
47409
+ "grad_norm": 3.163731813430786,
47410
+ "learning_rate": 3.8435992340430383e-07,
47411
+ "loss": 0.0944,
47412
+ "step": 6237
47413
+ },
47414
+ {
47415
+ "epoch": 0.9203983769826632,
47416
+ "grad_norm": 2.161836862564087,
47417
+ "learning_rate": 3.829468340173637e-07,
47418
+ "loss": 0.0457,
47419
+ "step": 6238
47420
+ },
47421
+ {
47422
+ "epoch": 0.9205459240132793,
47423
+ "grad_norm": 3.0532407760620117,
47424
+ "learning_rate": 3.8153629630029666e-07,
47425
+ "loss": 0.0415,
47426
+ "step": 6239
47427
+ },
47428
+ {
47429
+ "epoch": 0.9206934710438952,
47430
+ "grad_norm": 2.5379703044891357,
47431
+ "learning_rate": 3.80128310627359e-07,
47432
+ "loss": 0.042,
47433
+ "step": 6240
47434
+ },
47435
+ {
47436
+ "epoch": 0.9206934710438952,
47437
+ "eval_accuracy": 0.9782923299565847,
47438
+ "eval_f1": 0.9629629629629629,
47439
+ "eval_loss": 0.0559084378182888,
47440
+ "eval_precision": 0.9798994974874372,
47441
+ "eval_recall": 0.9466019417475728,
47442
+ "eval_runtime": 49.6899,
47443
+ "eval_samples_per_second": 5.856,
47444
+ "eval_steps_per_second": 0.201,
47445
+ "step": 6240
47446
+ },
47447
+ {
47448
+ "epoch": 0.9208410180745112,
47449
+ "grad_norm": 2.550798177719116,
47450
+ "learning_rate": 3.787228773721252e-07,
47451
+ "loss": 0.1024,
47452
+ "step": 6241
47453
+ },
47454
+ {
47455
+ "epoch": 0.9209885651051273,
47456
+ "grad_norm": 5.740802764892578,
47457
+ "learning_rate": 3.773199969074959e-07,
47458
+ "loss": 0.0571,
47459
+ "step": 6242
47460
+ },
47461
+ {
47462
+ "epoch": 0.9211361121357433,
47463
+ "grad_norm": 3.259659767150879,
47464
+ "learning_rate": 3.759196696056955e-07,
47465
+ "loss": 0.0458,
47466
+ "step": 6243
47467
+ },
47468
+ {
47469
+ "epoch": 0.9212836591663592,
47470
+ "grad_norm": 1.5455894470214844,
47471
+ "learning_rate": 3.7452189583827017e-07,
47472
+ "loss": 0.0435,
47473
+ "step": 6244
47474
+ },
47475
+ {
47476
+ "epoch": 0.9214312061969753,
47477
+ "grad_norm": 3.3945140838623047,
47478
+ "learning_rate": 3.731266759760854e-07,
47479
+ "loss": 0.1067,
47480
+ "step": 6245
47481
+ },
47482
+ {
47483
+ "epoch": 0.9215787532275913,
47484
+ "grad_norm": 2.3547747135162354,
47485
+ "learning_rate": 3.717340103893341e-07,
47486
+ "loss": 0.0584,
47487
+ "step": 6246
47488
+ },
47489
+ {
47490
+ "epoch": 0.9217263002582073,
47491
+ "grad_norm": 1.9721163511276245,
47492
+ "learning_rate": 3.7034389944752613e-07,
47493
+ "loss": 0.0601,
47494
+ "step": 6247
47495
+ },
47496
+ {
47497
+ "epoch": 0.9218738472888233,
47498
+ "grad_norm": 4.440569877624512,
47499
+ "learning_rate": 3.689563435194976e-07,
47500
+ "loss": 0.1317,
47501
+ "step": 6248
47502
+ },
47503
+ {
47504
+ "epoch": 0.9220213943194393,
47505
+ "grad_norm": 1.7474677562713623,
47506
+ "learning_rate": 3.6757134297340735e-07,
47507
+ "loss": 0.044,
47508
+ "step": 6249
47509
+ },
47510
+ {
47511
+ "epoch": 0.9221689413500553,
47512
+ "grad_norm": 1.8465862274169922,
47513
+ "learning_rate": 3.661888981767314e-07,
47514
+ "loss": 0.0436,
47515
+ "step": 6250
47516
+ },
47517
+ {
47518
+ "epoch": 0.9223164883806714,
47519
+ "grad_norm": 1.5237339735031128,
47520
+ "learning_rate": 3.6480900949627306e-07,
47521
+ "loss": 0.0412,
47522
+ "step": 6251
47523
+ },
47524
+ {
47525
+ "epoch": 0.9224640354112873,
47526
+ "grad_norm": 4.066259384155273,
47527
+ "learning_rate": 3.6343167729815164e-07,
47528
+ "loss": 0.0675,
47529
+ "step": 6252
47530
+ },
47531
+ {
47532
+ "epoch": 0.9226115824419033,
47533
+ "grad_norm": 1.155721664428711,
47534
+ "learning_rate": 3.6205690194781487e-07,
47535
+ "loss": 0.0208,
47536
+ "step": 6253
47537
+ },
47538
+ {
47539
+ "epoch": 0.9227591294725194,
47540
+ "grad_norm": 2.956277370452881,
47541
+ "learning_rate": 3.606846838100264e-07,
47542
+ "loss": 0.0557,
47543
+ "step": 6254
47544
+ },
47545
+ {
47546
+ "epoch": 0.9229066765031354,
47547
+ "grad_norm": 2.8474464416503906,
47548
+ "learning_rate": 3.5931502324887624e-07,
47549
+ "loss": 0.1175,
47550
+ "step": 6255
47551
+ },
47552
+ {
47553
+ "epoch": 0.9230542235337513,
47554
+ "grad_norm": 0.9943166971206665,
47555
+ "learning_rate": 3.579479206277692e-07,
47556
+ "loss": 0.0091,
47557
+ "step": 6256
47558
+ },
47559
+ {
47560
+ "epoch": 0.9232017705643674,
47561
+ "grad_norm": 2.0411195755004883,
47562
+ "learning_rate": 3.565833763094373e-07,
47563
+ "loss": 0.0226,
47564
+ "step": 6257
47565
+ },
47566
+ {
47567
+ "epoch": 0.9233493175949834,
47568
+ "grad_norm": 3.972092390060425,
47569
+ "learning_rate": 3.552213906559343e-07,
47570
+ "loss": 0.0739,
47571
+ "step": 6258
47572
+ },
47573
+ {
47574
+ "epoch": 0.9234968646255994,
47575
+ "grad_norm": 2.7468929290771484,
47576
+ "learning_rate": 3.538619640286278e-07,
47577
+ "loss": 0.1084,
47578
+ "step": 6259
47579
+ },
47580
+ {
47581
+ "epoch": 0.9236444116562155,
47582
+ "grad_norm": 3.2310478687286377,
47583
+ "learning_rate": 3.52505096788216e-07,
47584
+ "loss": 0.0505,
47585
+ "step": 6260
47586
+ },
47587
+ {
47588
+ "epoch": 0.9236444116562155,
47589
+ "eval_accuracy": 0.9782923299565847,
47590
+ "eval_f1": 0.9629629629629629,
47591
+ "eval_loss": 0.05572787672281265,
47592
+ "eval_precision": 0.9798994974874372,
47593
+ "eval_recall": 0.9466019417475728,
47594
+ "eval_runtime": 51.4965,
47595
+ "eval_samples_per_second": 5.651,
47596
+ "eval_steps_per_second": 0.194,
47597
+ "step": 6260
47598
+ },
47599
+ {
47600
+ "epoch": 0.9237919586868314,
47601
+ "grad_norm": 2.4628522396087646,
47602
+ "learning_rate": 3.5115078929470856e-07,
47603
+ "loss": 0.1245,
47604
+ "step": 6261
47605
+ },
47606
+ {
47607
+ "epoch": 0.9239395057174474,
47608
+ "grad_norm": 2.1519012451171875,
47609
+ "learning_rate": 3.4979904190744486e-07,
47610
+ "loss": 0.0936,
47611
+ "step": 6262
47612
+ },
47613
+ {
47614
+ "epoch": 0.9240870527480635,
47615
+ "grad_norm": 1.2964609861373901,
47616
+ "learning_rate": 3.48449854985079e-07,
47617
+ "loss": 0.0132,
47618
+ "step": 6263
47619
+ },
47620
+ {
47621
+ "epoch": 0.9242345997786795,
47622
+ "grad_norm": 1.963150143623352,
47623
+ "learning_rate": 3.471032288855869e-07,
47624
+ "loss": 0.0431,
47625
+ "step": 6264
47626
+ },
47627
+ {
47628
+ "epoch": 0.9243821468092954,
47629
+ "grad_norm": 2.247939109802246,
47630
+ "learning_rate": 3.457591639662672e-07,
47631
+ "loss": 0.0297,
47632
+ "step": 6265
47633
+ },
47634
+ {
47635
+ "epoch": 0.9245296938399115,
47636
+ "grad_norm": 1.91328763961792,
47637
+ "learning_rate": 3.444176605837368e-07,
47638
+ "loss": 0.0626,
47639
+ "step": 6266
47640
+ },
47641
+ {
47642
+ "epoch": 0.9246772408705275,
47643
+ "grad_norm": 2.5009827613830566,
47644
+ "learning_rate": 3.430787190939322e-07,
47645
+ "loss": 0.0545,
47646
+ "step": 6267
47647
+ },
47648
+ {
47649
+ "epoch": 0.9248247879011435,
47650
+ "grad_norm": 0.9863361120223999,
47651
+ "learning_rate": 3.4174233985211467e-07,
47652
+ "loss": 0.0193,
47653
+ "step": 6268
47654
+ },
47655
+ {
47656
+ "epoch": 0.9249723349317595,
47657
+ "grad_norm": 1.2787401676177979,
47658
+ "learning_rate": 3.4040852321285954e-07,
47659
+ "loss": 0.0104,
47660
+ "step": 6269
47661
+ },
47662
+ {
47663
+ "epoch": 0.9251198819623755,
47664
+ "grad_norm": 2.8463070392608643,
47665
+ "learning_rate": 3.39077269530067e-07,
47666
+ "loss": 0.0565,
47667
+ "step": 6270
47668
+ },
47669
+ {
47670
+ "epoch": 0.9252674289929915,
47671
+ "grad_norm": 2.008657217025757,
47672
+ "learning_rate": 3.3774857915695346e-07,
47673
+ "loss": 0.0718,
47674
+ "step": 6271
47675
+ },
47676
+ {
47677
+ "epoch": 0.9254149760236076,
47678
+ "grad_norm": 0.9314476251602173,
47679
+ "learning_rate": 3.364224524460602e-07,
47680
+ "loss": 0.0236,
47681
+ "step": 6272
47682
+ },
47683
+ {
47684
+ "epoch": 0.9255625230542235,
47685
+ "grad_norm": 2.7881734371185303,
47686
+ "learning_rate": 3.3509888974924243e-07,
47687
+ "loss": 0.0565,
47688
+ "step": 6273
47689
+ },
47690
+ {
47691
+ "epoch": 0.9257100700848395,
47692
+ "grad_norm": 3.7052223682403564,
47693
+ "learning_rate": 3.3377789141768035e-07,
47694
+ "loss": 0.0528,
47695
+ "step": 6274
47696
+ },
47697
+ {
47698
+ "epoch": 0.9258576171154556,
47699
+ "grad_norm": 1.0346524715423584,
47700
+ "learning_rate": 3.324594578018681e-07,
47701
+ "loss": 0.0221,
47702
+ "step": 6275
47703
+ },
47704
+ {
47705
+ "epoch": 0.9260051641460716,
47706
+ "grad_norm": 1.8550926446914673,
47707
+ "learning_rate": 3.3114358925162573e-07,
47708
+ "loss": 0.0368,
47709
+ "step": 6276
47710
+ },
47711
+ {
47712
+ "epoch": 0.9261527111766875,
47713
+ "grad_norm": 4.289306163787842,
47714
+ "learning_rate": 3.298302861160885e-07,
47715
+ "loss": 0.0772,
47716
+ "step": 6277
47717
+ },
47718
+ {
47719
+ "epoch": 0.9263002582073036,
47720
+ "grad_norm": 3.7169032096862793,
47721
+ "learning_rate": 3.2851954874371095e-07,
47722
+ "loss": 0.116,
47723
+ "step": 6278
47724
+ },
47725
+ {
47726
+ "epoch": 0.9264478052379196,
47727
+ "grad_norm": 1.6580818891525269,
47728
+ "learning_rate": 3.272113774822694e-07,
47729
+ "loss": 0.0495,
47730
+ "step": 6279
47731
+ },
47732
+ {
47733
+ "epoch": 0.9265953522685356,
47734
+ "grad_norm": 2.004760980606079,
47735
+ "learning_rate": 3.2590577267885726e-07,
47736
+ "loss": 0.0424,
47737
+ "step": 6280
47738
+ },
47739
+ {
47740
+ "epoch": 0.9265953522685356,
47741
+ "eval_accuracy": 0.9797395079594791,
47742
+ "eval_f1": 0.9653465346534653,
47743
+ "eval_loss": 0.05522174760699272,
47744
+ "eval_precision": 0.9848484848484849,
47745
+ "eval_recall": 0.9466019417475728,
47746
+ "eval_runtime": 52.1889,
47747
+ "eval_samples_per_second": 5.576,
47748
+ "eval_steps_per_second": 0.192,
47749
+ "step": 6280
47750
+ },
47751
+ {
47752
+ "epoch": 0.9267428992991517,
47753
+ "grad_norm": 2.8334689140319824,
47754
+ "learning_rate": 3.2460273467988635e-07,
47755
+ "loss": 0.0398,
47756
+ "step": 6281
47757
+ },
47758
+ {
47759
+ "epoch": 0.9268904463297676,
47760
+ "grad_norm": 0.9800840616226196,
47761
+ "learning_rate": 3.233022638310901e-07,
47762
+ "loss": 0.0199,
47763
+ "step": 6282
47764
+ },
47765
+ {
47766
+ "epoch": 0.9270379933603836,
47767
+ "grad_norm": 1.883017897605896,
47768
+ "learning_rate": 3.2200436047752026e-07,
47769
+ "loss": 0.0243,
47770
+ "step": 6283
47771
+ },
47772
+ {
47773
+ "epoch": 0.9271855403909997,
47774
+ "grad_norm": 3.470026731491089,
47775
+ "learning_rate": 3.207090249635436e-07,
47776
+ "loss": 0.0418,
47777
+ "step": 6284
47778
+ },
47779
+ {
47780
+ "epoch": 0.9273330874216157,
47781
+ "grad_norm": 3.9536657333374023,
47782
+ "learning_rate": 3.194162576328508e-07,
47783
+ "loss": 0.1209,
47784
+ "step": 6285
47785
+ },
47786
+ {
47787
+ "epoch": 0.9274806344522316,
47788
+ "grad_norm": 1.0981996059417725,
47789
+ "learning_rate": 3.181260588284485e-07,
47790
+ "loss": 0.0188,
47791
+ "step": 6286
47792
+ },
47793
+ {
47794
+ "epoch": 0.9276281814828476,
47795
+ "grad_norm": 13.924962997436523,
47796
+ "learning_rate": 3.168384288926596e-07,
47797
+ "loss": 0.1309,
47798
+ "step": 6287
47799
+ },
47800
+ {
47801
+ "epoch": 0.9277757285134637,
47802
+ "grad_norm": 1.7328006029129028,
47803
+ "learning_rate": 3.155533681671319e-07,
47804
+ "loss": 0.0376,
47805
+ "step": 6288
47806
+ },
47807
+ {
47808
+ "epoch": 0.9279232755440797,
47809
+ "grad_norm": 7.636415481567383,
47810
+ "learning_rate": 3.1427087699282375e-07,
47811
+ "loss": 0.0774,
47812
+ "step": 6289
47813
+ },
47814
+ {
47815
+ "epoch": 0.9280708225746956,
47816
+ "grad_norm": 2.2514214515686035,
47817
+ "learning_rate": 3.1299095571001745e-07,
47818
+ "loss": 0.1002,
47819
+ "step": 6290
47820
+ },
47821
+ {
47822
+ "epoch": 0.9282183696053117,
47823
+ "grad_norm": 2.580007314682007,
47824
+ "learning_rate": 3.1171360465831245e-07,
47825
+ "loss": 0.066,
47826
+ "step": 6291
47827
+ },
47828
+ {
47829
+ "epoch": 0.9283659166359277,
47830
+ "grad_norm": 4.111058712005615,
47831
+ "learning_rate": 3.104388241766232e-07,
47832
+ "loss": 0.1297,
47833
+ "step": 6292
47834
+ },
47835
+ {
47836
+ "epoch": 0.9285134636665437,
47837
+ "grad_norm": 2.8205816745758057,
47838
+ "learning_rate": 3.091666146031858e-07,
47839
+ "loss": 0.0468,
47840
+ "step": 6293
47841
+ },
47842
+ {
47843
+ "epoch": 0.9286610106971597,
47844
+ "grad_norm": 1.6066216230392456,
47845
+ "learning_rate": 3.0789697627555124e-07,
47846
+ "loss": 0.0386,
47847
+ "step": 6294
47848
+ },
47849
+ {
47850
+ "epoch": 0.9288085577277757,
47851
+ "grad_norm": 2.5085225105285645,
47852
+ "learning_rate": 3.0662990953058803e-07,
47853
+ "loss": 0.0928,
47854
+ "step": 6295
47855
+ },
47856
+ {
47857
+ "epoch": 0.9289561047583917,
47858
+ "grad_norm": 1.2958418130874634,
47859
+ "learning_rate": 3.0536541470448824e-07,
47860
+ "loss": 0.0261,
47861
+ "step": 6296
47862
+ },
47863
+ {
47864
+ "epoch": 0.9291036517890078,
47865
+ "grad_norm": 1.3905576467514038,
47866
+ "learning_rate": 3.041034921327557e-07,
47867
+ "loss": 0.0206,
47868
+ "step": 6297
47869
+ },
47870
+ {
47871
+ "epoch": 0.9292511988196237,
47872
+ "grad_norm": 2.7028305530548096,
47873
+ "learning_rate": 3.028441421502115e-07,
47874
+ "loss": 0.0687,
47875
+ "step": 6298
47876
+ },
47877
+ {
47878
+ "epoch": 0.9293987458502397,
47879
+ "grad_norm": 2.6002049446105957,
47880
+ "learning_rate": 3.015873650909984e-07,
47881
+ "loss": 0.0873,
47882
+ "step": 6299
47883
+ },
47884
+ {
47885
+ "epoch": 0.9295462928808558,
47886
+ "grad_norm": 2.208272695541382,
47887
+ "learning_rate": 3.003331612885718e-07,
47888
+ "loss": 0.0506,
47889
+ "step": 6300
47890
+ },
47891
+ {
47892
+ "epoch": 0.9295462928808558,
47893
+ "eval_accuracy": 0.9782923299565847,
47894
+ "eval_f1": 0.9629629629629629,
47895
+ "eval_loss": 0.05601061135530472,
47896
+ "eval_precision": 0.9798994974874372,
47897
+ "eval_recall": 0.9466019417475728,
47898
+ "eval_runtime": 50.1847,
47899
+ "eval_samples_per_second": 5.799,
47900
+ "eval_steps_per_second": 0.199,
47901
+ "step": 6300
47902
  }
47903
  ],
47904
  "logging_steps": 1,
 
47918
  "attributes": {}
47919
  }
47920
  },
47921
+ "total_flos": 1.9408628309913764e+18,
47922
  "train_batch_size": 8,
47923
  "trial_name": null,
47924
  "trial_params": null