ErrorAI commited on
Commit
aeae31b
·
verified ·
1 Parent(s): 1194d65

Training in progress, step 1570, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:776e3a60fc3ddacab8a74c719807c6ac17fc0ea6578b478112a3887015d61df1
3
  size 50365768
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4708548b45c8348628ae58470ab2c8de30081e4857f0ea0f0012c207b4b19a1
3
  size 50365768
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0ed3cfac196364e7f703086c34678624ce9cdab1779b3183c7652ea39248a09
3
  size 25859412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3eafb75aabef5d499a7b4b505c732d9aae83f1e9b33ae5c2a8b87bfb0e3d8ea7
3
  size 25859412
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e51717d4b453ecdefb3e1b21a1d457b9afb8eea5b3ba753b7f2ece6e2f24fa15
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9120e5e0f2890bc6d75221f2df80f8c47456ada42a7b7b6bb80bbadd64eed705
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6f940fecfc43794835ec25ad54cdeb0d35006f38f4199f153111d4cfd7b09dd
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1f9acbfb9eb31ecbbbaf93a19358576018d3e7f8c62edbce6b77a1fdef5699a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5336048879837068,
5
  "eval_steps": 393,
6
- "global_step": 1179,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -8292,6 +8292,2743 @@
8292
  "eval_samples_per_second": 18.037,
8293
  "eval_steps_per_second": 9.028,
8294
  "step": 1179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8295
  }
8296
  ],
8297
  "logging_steps": 1,
@@ -8306,12 +11043,12 @@
8306
  "should_evaluate": false,
8307
  "should_log": false,
8308
  "should_save": true,
8309
- "should_training_stop": false
8310
  },
8311
  "attributes": {}
8312
  }
8313
  },
8314
- "total_flos": 2.150076298989404e+17,
8315
  "train_batch_size": 2,
8316
  "trial_name": null,
8317
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7105680018103643,
5
  "eval_steps": 393,
6
+ "global_step": 1570,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
8292
  "eval_samples_per_second": 18.037,
8293
  "eval_steps_per_second": 9.028,
8294
  "step": 1179
8295
+ },
8296
+ {
8297
+ "epoch": 0.5340574790676623,
8298
+ "grad_norm": 0.48865947127342224,
8299
+ "learning_rate": 2.9289321881345254e-05,
8300
+ "loss": 0.762,
8301
+ "step": 1180
8302
+ },
8303
+ {
8304
+ "epoch": 0.534510070151618,
8305
+ "grad_norm": 0.5784907341003418,
8306
+ "learning_rate": 2.9147065268982666e-05,
8307
+ "loss": 1.0548,
8308
+ "step": 1181
8309
+ },
8310
+ {
8311
+ "epoch": 0.5349626612355737,
8312
+ "grad_norm": 0.5535488128662109,
8313
+ "learning_rate": 2.9005096004664177e-05,
8314
+ "loss": 0.8442,
8315
+ "step": 1182
8316
+ },
8317
+ {
8318
+ "epoch": 0.5354152523195294,
8319
+ "grad_norm": 0.541508674621582,
8320
+ "learning_rate": 2.886341466415412e-05,
8321
+ "loss": 0.6527,
8322
+ "step": 1183
8323
+ },
8324
+ {
8325
+ "epoch": 0.5358678434034849,
8326
+ "grad_norm": 0.5890561938285828,
8327
+ "learning_rate": 2.8722021822049027e-05,
8328
+ "loss": 0.8764,
8329
+ "step": 1184
8330
+ },
8331
+ {
8332
+ "epoch": 0.5363204344874406,
8333
+ "grad_norm": 0.4952785074710846,
8334
+ "learning_rate": 2.858091805177554e-05,
8335
+ "loss": 0.8619,
8336
+ "step": 1185
8337
+ },
8338
+ {
8339
+ "epoch": 0.5367730255713963,
8340
+ "grad_norm": 0.4867090582847595,
8341
+ "learning_rate": 2.84401039255879e-05,
8342
+ "loss": 0.7619,
8343
+ "step": 1186
8344
+ },
8345
+ {
8346
+ "epoch": 0.5372256166553518,
8347
+ "grad_norm": 0.5630788207054138,
8348
+ "learning_rate": 2.8299580014565664e-05,
8349
+ "loss": 0.9779,
8350
+ "step": 1187
8351
+ },
8352
+ {
8353
+ "epoch": 0.5376782077393075,
8354
+ "grad_norm": 0.5053603053092957,
8355
+ "learning_rate": 2.815934688861146e-05,
8356
+ "loss": 0.8156,
8357
+ "step": 1188
8358
+ },
8359
+ {
8360
+ "epoch": 0.5381307988232632,
8361
+ "grad_norm": 0.463606595993042,
8362
+ "learning_rate": 2.8019405116448516e-05,
8363
+ "loss": 0.6967,
8364
+ "step": 1189
8365
+ },
8366
+ {
8367
+ "epoch": 0.5385833899072189,
8368
+ "grad_norm": 0.4266325533390045,
8369
+ "learning_rate": 2.7879755265618555e-05,
8370
+ "loss": 0.6799,
8371
+ "step": 1190
8372
+ },
8373
+ {
8374
+ "epoch": 0.5390359809911744,
8375
+ "grad_norm": 0.4817747175693512,
8376
+ "learning_rate": 2.7740397902479387e-05,
8377
+ "loss": 0.765,
8378
+ "step": 1191
8379
+ },
8380
+ {
8381
+ "epoch": 0.5394885720751301,
8382
+ "grad_norm": 0.5528002381324768,
8383
+ "learning_rate": 2.7601333592202583e-05,
8384
+ "loss": 0.8884,
8385
+ "step": 1192
8386
+ },
8387
+ {
8388
+ "epoch": 0.5399411631590858,
8389
+ "grad_norm": 0.6013336777687073,
8390
+ "learning_rate": 2.746256289877126e-05,
8391
+ "loss": 1.1148,
8392
+ "step": 1193
8393
+ },
8394
+ {
8395
+ "epoch": 0.5403937542430414,
8396
+ "grad_norm": 0.45258426666259766,
8397
+ "learning_rate": 2.7324086384977698e-05,
8398
+ "loss": 0.6737,
8399
+ "step": 1194
8400
+ },
8401
+ {
8402
+ "epoch": 0.540846345326997,
8403
+ "grad_norm": 0.4618009030818939,
8404
+ "learning_rate": 2.7185904612421176e-05,
8405
+ "loss": 0.6267,
8406
+ "step": 1195
8407
+ },
8408
+ {
8409
+ "epoch": 0.5412989364109527,
8410
+ "grad_norm": 0.5477524399757385,
8411
+ "learning_rate": 2.7048018141505604e-05,
8412
+ "loss": 0.6358,
8413
+ "step": 1196
8414
+ },
8415
+ {
8416
+ "epoch": 0.5417515274949084,
8417
+ "grad_norm": 0.5210055112838745,
8418
+ "learning_rate": 2.6910427531437287e-05,
8419
+ "loss": 0.8353,
8420
+ "step": 1197
8421
+ },
8422
+ {
8423
+ "epoch": 0.542204118578864,
8424
+ "grad_norm": 0.489916056394577,
8425
+ "learning_rate": 2.677313334022268e-05,
8426
+ "loss": 0.7661,
8427
+ "step": 1198
8428
+ },
8429
+ {
8430
+ "epoch": 0.5426567096628196,
8431
+ "grad_norm": 0.5317739844322205,
8432
+ "learning_rate": 2.6636136124666e-05,
8433
+ "loss": 0.8718,
8434
+ "step": 1199
8435
+ },
8436
+ {
8437
+ "epoch": 0.5431093007467753,
8438
+ "grad_norm": 0.563149094581604,
8439
+ "learning_rate": 2.6499436440367165e-05,
8440
+ "loss": 1.0695,
8441
+ "step": 1200
8442
+ },
8443
+ {
8444
+ "epoch": 0.5435618918307309,
8445
+ "grad_norm": 0.6181795597076416,
8446
+ "learning_rate": 2.6363034841719392e-05,
8447
+ "loss": 0.6715,
8448
+ "step": 1201
8449
+ },
8450
+ {
8451
+ "epoch": 0.5440144829146866,
8452
+ "grad_norm": 0.5255316495895386,
8453
+ "learning_rate": 2.622693188190699e-05,
8454
+ "loss": 0.7982,
8455
+ "step": 1202
8456
+ },
8457
+ {
8458
+ "epoch": 0.5444670739986422,
8459
+ "grad_norm": 0.5731631517410278,
8460
+ "learning_rate": 2.609112811290315e-05,
8461
+ "loss": 0.9418,
8462
+ "step": 1203
8463
+ },
8464
+ {
8465
+ "epoch": 0.5449196650825979,
8466
+ "grad_norm": 0.44508278369903564,
8467
+ "learning_rate": 2.59556240854677e-05,
8468
+ "loss": 0.7452,
8469
+ "step": 1204
8470
+ },
8471
+ {
8472
+ "epoch": 0.5453722561665535,
8473
+ "grad_norm": 0.5581162571907043,
8474
+ "learning_rate": 2.5820420349144693e-05,
8475
+ "loss": 0.9905,
8476
+ "step": 1205
8477
+ },
8478
+ {
8479
+ "epoch": 0.5458248472505092,
8480
+ "grad_norm": 0.4913443922996521,
8481
+ "learning_rate": 2.5685517452260567e-05,
8482
+ "loss": 0.8525,
8483
+ "step": 1206
8484
+ },
8485
+ {
8486
+ "epoch": 0.5462774383344648,
8487
+ "grad_norm": 0.6032697558403015,
8488
+ "learning_rate": 2.5550915941921526e-05,
8489
+ "loss": 0.8903,
8490
+ "step": 1207
8491
+ },
8492
+ {
8493
+ "epoch": 0.5467300294184204,
8494
+ "grad_norm": 0.5988163352012634,
8495
+ "learning_rate": 2.541661636401157e-05,
8496
+ "loss": 0.8709,
8497
+ "step": 1208
8498
+ },
8499
+ {
8500
+ "epoch": 0.5471826205023761,
8501
+ "grad_norm": 0.4508473873138428,
8502
+ "learning_rate": 2.52826192631901e-05,
8503
+ "loss": 0.6725,
8504
+ "step": 1209
8505
+ },
8506
+ {
8507
+ "epoch": 0.5476352115863318,
8508
+ "grad_norm": 0.5168390274047852,
8509
+ "learning_rate": 2.514892518288988e-05,
8510
+ "loss": 0.6997,
8511
+ "step": 1210
8512
+ },
8513
+ {
8514
+ "epoch": 0.5480878026702874,
8515
+ "grad_norm": 0.48190394043922424,
8516
+ "learning_rate": 2.5015534665314755e-05,
8517
+ "loss": 0.8461,
8518
+ "step": 1211
8519
+ },
8520
+ {
8521
+ "epoch": 0.548540393754243,
8522
+ "grad_norm": 0.5142277479171753,
8523
+ "learning_rate": 2.488244825143743e-05,
8524
+ "loss": 0.9458,
8525
+ "step": 1212
8526
+ },
8527
+ {
8528
+ "epoch": 0.5489929848381987,
8529
+ "grad_norm": 0.5585710406303406,
8530
+ "learning_rate": 2.4749666480997337e-05,
8531
+ "loss": 0.8265,
8532
+ "step": 1213
8533
+ },
8534
+ {
8535
+ "epoch": 0.5494455759221544,
8536
+ "grad_norm": 0.5171539783477783,
8537
+ "learning_rate": 2.4617189892498327e-05,
8538
+ "loss": 0.8442,
8539
+ "step": 1214
8540
+ },
8541
+ {
8542
+ "epoch": 0.5498981670061099,
8543
+ "grad_norm": 0.5711890459060669,
8544
+ "learning_rate": 2.4485019023206635e-05,
8545
+ "loss": 0.9659,
8546
+ "step": 1215
8547
+ },
8548
+ {
8549
+ "epoch": 0.5503507580900656,
8550
+ "grad_norm": 0.5380867719650269,
8551
+ "learning_rate": 2.4353154409148637e-05,
8552
+ "loss": 0.8394,
8553
+ "step": 1216
8554
+ },
8555
+ {
8556
+ "epoch": 0.5508033491740213,
8557
+ "grad_norm": 0.4734165370464325,
8558
+ "learning_rate": 2.4221596585108663e-05,
8559
+ "loss": 0.5909,
8560
+ "step": 1217
8561
+ },
8562
+ {
8563
+ "epoch": 0.551255940257977,
8564
+ "grad_norm": 0.5847920775413513,
8565
+ "learning_rate": 2.409034608462686e-05,
8566
+ "loss": 0.9881,
8567
+ "step": 1218
8568
+ },
8569
+ {
8570
+ "epoch": 0.5517085313419325,
8571
+ "grad_norm": 0.5413870811462402,
8572
+ "learning_rate": 2.3959403439996907e-05,
8573
+ "loss": 0.9771,
8574
+ "step": 1219
8575
+ },
8576
+ {
8577
+ "epoch": 0.5521611224258882,
8578
+ "grad_norm": 0.5978078842163086,
8579
+ "learning_rate": 2.382876918226409e-05,
8580
+ "loss": 0.7637,
8581
+ "step": 1220
8582
+ },
8583
+ {
8584
+ "epoch": 0.5526137135098439,
8585
+ "grad_norm": 0.5320140719413757,
8586
+ "learning_rate": 2.369844384122293e-05,
8587
+ "loss": 0.8887,
8588
+ "step": 1221
8589
+ },
8590
+ {
8591
+ "epoch": 0.5530663045937995,
8592
+ "grad_norm": 0.38298875093460083,
8593
+ "learning_rate": 2.356842794541516e-05,
8594
+ "loss": 0.4509,
8595
+ "step": 1222
8596
+ },
8597
+ {
8598
+ "epoch": 0.5535188956777551,
8599
+ "grad_norm": 0.4932439625263214,
8600
+ "learning_rate": 2.3438722022127546e-05,
8601
+ "loss": 0.7125,
8602
+ "step": 1223
8603
+ },
8604
+ {
8605
+ "epoch": 0.5539714867617108,
8606
+ "grad_norm": 0.4653611481189728,
8607
+ "learning_rate": 2.330932659738967e-05,
8608
+ "loss": 0.7279,
8609
+ "step": 1224
8610
+ },
8611
+ {
8612
+ "epoch": 0.5544240778456664,
8613
+ "grad_norm": 0.6703446507453918,
8614
+ "learning_rate": 2.318024219597196e-05,
8615
+ "loss": 1.2327,
8616
+ "step": 1225
8617
+ },
8618
+ {
8619
+ "epoch": 0.5548766689296221,
8620
+ "grad_norm": 0.5315276980400085,
8621
+ "learning_rate": 2.3051469341383402e-05,
8622
+ "loss": 0.9666,
8623
+ "step": 1226
8624
+ },
8625
+ {
8626
+ "epoch": 0.5553292600135777,
8627
+ "grad_norm": 0.4821360111236572,
8628
+ "learning_rate": 2.2923008555869552e-05,
8629
+ "loss": 0.7568,
8630
+ "step": 1227
8631
+ },
8632
+ {
8633
+ "epoch": 0.5557818510975334,
8634
+ "grad_norm": 0.7255955338478088,
8635
+ "learning_rate": 2.2794860360410342e-05,
8636
+ "loss": 1.0656,
8637
+ "step": 1228
8638
+ },
8639
+ {
8640
+ "epoch": 0.556234442181489,
8641
+ "grad_norm": 0.5304329991340637,
8642
+ "learning_rate": 2.266702527471788e-05,
8643
+ "loss": 0.8769,
8644
+ "step": 1229
8645
+ },
8646
+ {
8647
+ "epoch": 0.5566870332654447,
8648
+ "grad_norm": 0.5752689242362976,
8649
+ "learning_rate": 2.2539503817234553e-05,
8650
+ "loss": 1.0325,
8651
+ "step": 1230
8652
+ },
8653
+ {
8654
+ "epoch": 0.5571396243494003,
8655
+ "grad_norm": 0.5128302574157715,
8656
+ "learning_rate": 2.241229650513077e-05,
8657
+ "loss": 0.8263,
8658
+ "step": 1231
8659
+ },
8660
+ {
8661
+ "epoch": 0.5575922154333559,
8662
+ "grad_norm": 0.4604417383670807,
8663
+ "learning_rate": 2.2285403854302912e-05,
8664
+ "loss": 0.6971,
8665
+ "step": 1232
8666
+ },
8667
+ {
8668
+ "epoch": 0.5580448065173116,
8669
+ "grad_norm": 0.5629538893699646,
8670
+ "learning_rate": 2.2158826379371258e-05,
8671
+ "loss": 0.7893,
8672
+ "step": 1233
8673
+ },
8674
+ {
8675
+ "epoch": 0.5584973976012673,
8676
+ "grad_norm": 0.5437077879905701,
8677
+ "learning_rate": 2.2032564593677774e-05,
8678
+ "loss": 0.6816,
8679
+ "step": 1234
8680
+ },
8681
+ {
8682
+ "epoch": 0.5589499886852229,
8683
+ "grad_norm": 0.550563633441925,
8684
+ "learning_rate": 2.1906619009284257e-05,
8685
+ "loss": 0.8575,
8686
+ "step": 1235
8687
+ },
8688
+ {
8689
+ "epoch": 0.5594025797691785,
8690
+ "grad_norm": 0.5454031825065613,
8691
+ "learning_rate": 2.178099013697005e-05,
8692
+ "loss": 0.7705,
8693
+ "step": 1236
8694
+ },
8695
+ {
8696
+ "epoch": 0.5598551708531342,
8697
+ "grad_norm": 0.5278245210647583,
8698
+ "learning_rate": 2.165567848623009e-05,
8699
+ "loss": 0.9149,
8700
+ "step": 1237
8701
+ },
8702
+ {
8703
+ "epoch": 0.5603077619370899,
8704
+ "grad_norm": 0.4524843394756317,
8705
+ "learning_rate": 2.153068456527283e-05,
8706
+ "loss": 0.6749,
8707
+ "step": 1238
8708
+ },
8709
+ {
8710
+ "epoch": 0.5607603530210454,
8711
+ "grad_norm": 0.5835239887237549,
8712
+ "learning_rate": 2.1406008881018047e-05,
8713
+ "loss": 0.8808,
8714
+ "step": 1239
8715
+ },
8716
+ {
8717
+ "epoch": 0.5612129441050011,
8718
+ "grad_norm": 0.5145358443260193,
8719
+ "learning_rate": 2.1281651939094992e-05,
8720
+ "loss": 0.9971,
8721
+ "step": 1240
8722
+ },
8723
+ {
8724
+ "epoch": 0.5616655351889568,
8725
+ "grad_norm": 0.5458048582077026,
8726
+ "learning_rate": 2.1157614243840206e-05,
8727
+ "loss": 0.9558,
8728
+ "step": 1241
8729
+ },
8730
+ {
8731
+ "epoch": 0.5621181262729125,
8732
+ "grad_norm": 0.502225935459137,
8733
+ "learning_rate": 2.1033896298295508e-05,
8734
+ "loss": 0.7903,
8735
+ "step": 1242
8736
+ },
8737
+ {
8738
+ "epoch": 0.562570717356868,
8739
+ "grad_norm": 0.5830442905426025,
8740
+ "learning_rate": 2.0910498604205986e-05,
8741
+ "loss": 1.1186,
8742
+ "step": 1243
8743
+ },
8744
+ {
8745
+ "epoch": 0.5630233084408237,
8746
+ "grad_norm": 0.47947055101394653,
8747
+ "learning_rate": 2.0787421662017825e-05,
8748
+ "loss": 0.7421,
8749
+ "step": 1244
8750
+ },
8751
+ {
8752
+ "epoch": 0.5634758995247794,
8753
+ "grad_norm": 0.5443291068077087,
8754
+ "learning_rate": 2.0664665970876496e-05,
8755
+ "loss": 1.0004,
8756
+ "step": 1245
8757
+ },
8758
+ {
8759
+ "epoch": 0.563928490608735,
8760
+ "grad_norm": 0.6094164848327637,
8761
+ "learning_rate": 2.0542232028624586e-05,
8762
+ "loss": 0.9367,
8763
+ "step": 1246
8764
+ },
8765
+ {
8766
+ "epoch": 0.5643810816926906,
8767
+ "grad_norm": 0.5203437805175781,
8768
+ "learning_rate": 2.0420120331799786e-05,
8769
+ "loss": 1.035,
8770
+ "step": 1247
8771
+ },
8772
+ {
8773
+ "epoch": 0.5648336727766463,
8774
+ "grad_norm": 0.5212861895561218,
8775
+ "learning_rate": 2.0298331375632962e-05,
8776
+ "loss": 1.0298,
8777
+ "step": 1248
8778
+ },
8779
+ {
8780
+ "epoch": 0.565286263860602,
8781
+ "grad_norm": 0.5760666728019714,
8782
+ "learning_rate": 2.0176865654045974e-05,
8783
+ "loss": 1.0492,
8784
+ "step": 1249
8785
+ },
8786
+ {
8787
+ "epoch": 0.5657388549445576,
8788
+ "grad_norm": 0.5951511859893799,
8789
+ "learning_rate": 2.0055723659649904e-05,
8790
+ "loss": 1.0702,
8791
+ "step": 1250
8792
+ },
8793
+ {
8794
+ "epoch": 0.5661914460285132,
8795
+ "grad_norm": 0.4950634837150574,
8796
+ "learning_rate": 1.9934905883742882e-05,
8797
+ "loss": 0.6607,
8798
+ "step": 1251
8799
+ },
8800
+ {
8801
+ "epoch": 0.5666440371124689,
8802
+ "grad_norm": 0.42447158694267273,
8803
+ "learning_rate": 1.981441281630816e-05,
8804
+ "loss": 0.6971,
8805
+ "step": 1252
8806
+ },
8807
+ {
8808
+ "epoch": 0.5670966281964245,
8809
+ "grad_norm": 0.5706565976142883,
8810
+ "learning_rate": 1.969424494601213e-05,
8811
+ "loss": 1.0281,
8812
+ "step": 1253
8813
+ },
8814
+ {
8815
+ "epoch": 0.5675492192803802,
8816
+ "grad_norm": 0.5865153074264526,
8817
+ "learning_rate": 1.9574402760202315e-05,
8818
+ "loss": 1.1408,
8819
+ "step": 1254
8820
+ },
8821
+ {
8822
+ "epoch": 0.5680018103643358,
8823
+ "grad_norm": 0.5091719627380371,
8824
+ "learning_rate": 1.94548867449054e-05,
8825
+ "loss": 0.7702,
8826
+ "step": 1255
8827
+ },
8828
+ {
8829
+ "epoch": 0.5684544014482915,
8830
+ "grad_norm": 0.5343332290649414,
8831
+ "learning_rate": 1.933569738482529e-05,
8832
+ "loss": 0.9963,
8833
+ "step": 1256
8834
+ },
8835
+ {
8836
+ "epoch": 0.5689069925322471,
8837
+ "grad_norm": 0.5233331918716431,
8838
+ "learning_rate": 1.9216835163341106e-05,
8839
+ "loss": 0.9176,
8840
+ "step": 1257
8841
+ },
8842
+ {
8843
+ "epoch": 0.5693595836162028,
8844
+ "grad_norm": 0.45247891545295715,
8845
+ "learning_rate": 1.9098300562505266e-05,
8846
+ "loss": 0.6073,
8847
+ "step": 1258
8848
+ },
8849
+ {
8850
+ "epoch": 0.5698121747001584,
8851
+ "grad_norm": 0.4159603416919708,
8852
+ "learning_rate": 1.8980094063041432e-05,
8853
+ "loss": 0.6153,
8854
+ "step": 1259
8855
+ },
8856
+ {
8857
+ "epoch": 0.570264765784114,
8858
+ "grad_norm": 0.45843058824539185,
8859
+ "learning_rate": 1.8862216144342692e-05,
8860
+ "loss": 0.5988,
8861
+ "step": 1260
8862
+ },
8863
+ {
8864
+ "epoch": 0.5707173568680697,
8865
+ "grad_norm": 0.5220310091972351,
8866
+ "learning_rate": 1.8744667284469575e-05,
8867
+ "loss": 0.8899,
8868
+ "step": 1261
8869
+ },
8870
+ {
8871
+ "epoch": 0.5711699479520254,
8872
+ "grad_norm": 0.529350221157074,
8873
+ "learning_rate": 1.8627447960148037e-05,
8874
+ "loss": 0.818,
8875
+ "step": 1262
8876
+ },
8877
+ {
8878
+ "epoch": 0.571622539035981,
8879
+ "grad_norm": 0.48126599192619324,
8880
+ "learning_rate": 1.851055864676765e-05,
8881
+ "loss": 0.7028,
8882
+ "step": 1263
8883
+ },
8884
+ {
8885
+ "epoch": 0.5720751301199366,
8886
+ "grad_norm": 0.4716106653213501,
8887
+ "learning_rate": 1.8393999818379525e-05,
8888
+ "loss": 0.7965,
8889
+ "step": 1264
8890
+ },
8891
+ {
8892
+ "epoch": 0.5725277212038923,
8893
+ "grad_norm": 0.550206184387207,
8894
+ "learning_rate": 1.8277771947694523e-05,
8895
+ "loss": 1.1316,
8896
+ "step": 1265
8897
+ },
8898
+ {
8899
+ "epoch": 0.572980312287848,
8900
+ "grad_norm": 0.5137092471122742,
8901
+ "learning_rate": 1.8161875506081293e-05,
8902
+ "loss": 0.8549,
8903
+ "step": 1266
8904
+ },
8905
+ {
8906
+ "epoch": 0.5734329033718035,
8907
+ "grad_norm": 0.6171010136604309,
8908
+ "learning_rate": 1.804631096356435e-05,
8909
+ "loss": 0.9547,
8910
+ "step": 1267
8911
+ },
8912
+ {
8913
+ "epoch": 0.5738854944557592,
8914
+ "grad_norm": 0.5315536260604858,
8915
+ "learning_rate": 1.7931078788822175e-05,
8916
+ "loss": 0.896,
8917
+ "step": 1268
8918
+ },
8919
+ {
8920
+ "epoch": 0.5743380855397149,
8921
+ "grad_norm": 0.5210160613059998,
8922
+ "learning_rate": 1.781617944918528e-05,
8923
+ "loss": 0.8643,
8924
+ "step": 1269
8925
+ },
8926
+ {
8927
+ "epoch": 0.5747906766236706,
8928
+ "grad_norm": 0.4929242432117462,
8929
+ "learning_rate": 1.7701613410634365e-05,
8930
+ "loss": 0.7251,
8931
+ "step": 1270
8932
+ },
8933
+ {
8934
+ "epoch": 0.5752432677076261,
8935
+ "grad_norm": 0.5473180413246155,
8936
+ "learning_rate": 1.7587381137798432e-05,
8937
+ "loss": 0.9597,
8938
+ "step": 1271
8939
+ },
8940
+ {
8941
+ "epoch": 0.5756958587915818,
8942
+ "grad_norm": 0.5931307077407837,
8943
+ "learning_rate": 1.747348309395286e-05,
8944
+ "loss": 0.9572,
8945
+ "step": 1272
8946
+ },
8947
+ {
8948
+ "epoch": 0.5761484498755375,
8949
+ "grad_norm": 0.6130375862121582,
8950
+ "learning_rate": 1.735991974101756e-05,
8951
+ "loss": 1.0984,
8952
+ "step": 1273
8953
+ },
8954
+ {
8955
+ "epoch": 0.576601040959493,
8956
+ "grad_norm": 0.5877550840377808,
8957
+ "learning_rate": 1.7246691539555028e-05,
8958
+ "loss": 1.08,
8959
+ "step": 1274
8960
+ },
8961
+ {
8962
+ "epoch": 0.5770536320434487,
8963
+ "grad_norm": 0.498898446559906,
8964
+ "learning_rate": 1.7133798948768597e-05,
8965
+ "loss": 0.8566,
8966
+ "step": 1275
8967
+ },
8968
+ {
8969
+ "epoch": 0.5775062231274044,
8970
+ "grad_norm": 0.4698163568973541,
8971
+ "learning_rate": 1.7021242426500493e-05,
8972
+ "loss": 0.7786,
8973
+ "step": 1276
8974
+ },
8975
+ {
8976
+ "epoch": 0.5779588142113601,
8977
+ "grad_norm": 0.4652780592441559,
8978
+ "learning_rate": 1.6909022429229982e-05,
8979
+ "loss": 0.8628,
8980
+ "step": 1277
8981
+ },
8982
+ {
8983
+ "epoch": 0.5784114052953157,
8984
+ "grad_norm": 0.4151431620121002,
8985
+ "learning_rate": 1.6797139412071584e-05,
8986
+ "loss": 0.6402,
8987
+ "step": 1278
8988
+ },
8989
+ {
8990
+ "epoch": 0.5788639963792713,
8991
+ "grad_norm": 0.47546225786209106,
8992
+ "learning_rate": 1.6685593828773095e-05,
8993
+ "loss": 0.7404,
8994
+ "step": 1279
8995
+ },
8996
+ {
8997
+ "epoch": 0.579316587463227,
8998
+ "grad_norm": 0.47583967447280884,
8999
+ "learning_rate": 1.657438613171387e-05,
9000
+ "loss": 0.7162,
9001
+ "step": 1280
9002
+ },
9003
+ {
9004
+ "epoch": 0.5797691785471826,
9005
+ "grad_norm": 0.5199639797210693,
9006
+ "learning_rate": 1.6463516771902988e-05,
9007
+ "loss": 0.8366,
9008
+ "step": 1281
9009
+ },
9010
+ {
9011
+ "epoch": 0.5802217696311383,
9012
+ "grad_norm": 0.5876262784004211,
9013
+ "learning_rate": 1.6352986198977325e-05,
9014
+ "loss": 1.0567,
9015
+ "step": 1282
9016
+ },
9017
+ {
9018
+ "epoch": 0.5806743607150939,
9019
+ "grad_norm": 0.5289322137832642,
9020
+ "learning_rate": 1.624279486119984e-05,
9021
+ "loss": 0.9105,
9022
+ "step": 1283
9023
+ },
9024
+ {
9025
+ "epoch": 0.5811269517990496,
9026
+ "grad_norm": 0.5706743597984314,
9027
+ "learning_rate": 1.6132943205457606e-05,
9028
+ "loss": 1.1233,
9029
+ "step": 1284
9030
+ },
9031
+ {
9032
+ "epoch": 0.5815795428830052,
9033
+ "grad_norm": 0.5402660369873047,
9034
+ "learning_rate": 1.6023431677260214e-05,
9035
+ "loss": 0.7923,
9036
+ "step": 1285
9037
+ },
9038
+ {
9039
+ "epoch": 0.5820321339669609,
9040
+ "grad_norm": 0.4756923317909241,
9041
+ "learning_rate": 1.5914260720737795e-05,
9042
+ "loss": 0.7638,
9043
+ "step": 1286
9044
+ },
9045
+ {
9046
+ "epoch": 0.5824847250509165,
9047
+ "grad_norm": 0.5244125127792358,
9048
+ "learning_rate": 1.5805430778639263e-05,
9049
+ "loss": 0.7667,
9050
+ "step": 1287
9051
+ },
9052
+ {
9053
+ "epoch": 0.5829373161348721,
9054
+ "grad_norm": 0.5348141193389893,
9055
+ "learning_rate": 1.5696942292330576e-05,
9056
+ "loss": 0.8909,
9057
+ "step": 1288
9058
+ },
9059
+ {
9060
+ "epoch": 0.5833899072188278,
9061
+ "grad_norm": 0.5148683786392212,
9062
+ "learning_rate": 1.5588795701792803e-05,
9063
+ "loss": 0.9972,
9064
+ "step": 1289
9065
+ },
9066
+ {
9067
+ "epoch": 0.5838424983027835,
9068
+ "grad_norm": 0.5113561749458313,
9069
+ "learning_rate": 1.5480991445620542e-05,
9070
+ "loss": 0.8348,
9071
+ "step": 1290
9072
+ },
9073
+ {
9074
+ "epoch": 0.5842950893867391,
9075
+ "grad_norm": 0.4689948260784149,
9076
+ "learning_rate": 1.5373529961019974e-05,
9077
+ "loss": 0.7019,
9078
+ "step": 1291
9079
+ },
9080
+ {
9081
+ "epoch": 0.5847476804706947,
9082
+ "grad_norm": 0.45713579654693604,
9083
+ "learning_rate": 1.5266411683807168e-05,
9084
+ "loss": 0.6865,
9085
+ "step": 1292
9086
+ },
9087
+ {
9088
+ "epoch": 0.5852002715546504,
9089
+ "grad_norm": 0.46141064167022705,
9090
+ "learning_rate": 1.5159637048406328e-05,
9091
+ "loss": 0.6813,
9092
+ "step": 1293
9093
+ },
9094
+ {
9095
+ "epoch": 0.585652862638606,
9096
+ "grad_norm": 0.6206589341163635,
9097
+ "learning_rate": 1.5053206487847914e-05,
9098
+ "loss": 0.9386,
9099
+ "step": 1294
9100
+ },
9101
+ {
9102
+ "epoch": 0.5861054537225616,
9103
+ "grad_norm": 0.4815228283405304,
9104
+ "learning_rate": 1.4947120433767047e-05,
9105
+ "loss": 0.7759,
9106
+ "step": 1295
9107
+ },
9108
+ {
9109
+ "epoch": 0.5865580448065173,
9110
+ "grad_norm": 0.6688939929008484,
9111
+ "learning_rate": 1.484137931640167e-05,
9112
+ "loss": 0.9813,
9113
+ "step": 1296
9114
+ },
9115
+ {
9116
+ "epoch": 0.587010635890473,
9117
+ "grad_norm": 0.4881376624107361,
9118
+ "learning_rate": 1.4735983564590783e-05,
9119
+ "loss": 0.8419,
9120
+ "step": 1297
9121
+ },
9122
+ {
9123
+ "epoch": 0.5874632269744287,
9124
+ "grad_norm": 0.49161502718925476,
9125
+ "learning_rate": 1.4630933605772801e-05,
9126
+ "loss": 0.8166,
9127
+ "step": 1298
9128
+ },
9129
+ {
9130
+ "epoch": 0.5879158180583842,
9131
+ "grad_norm": 0.4156933128833771,
9132
+ "learning_rate": 1.4526229865983665e-05,
9133
+ "loss": 0.4593,
9134
+ "step": 1299
9135
+ },
9136
+ {
9137
+ "epoch": 0.5883684091423399,
9138
+ "grad_norm": 0.5126404166221619,
9139
+ "learning_rate": 1.442187276985526e-05,
9140
+ "loss": 0.952,
9141
+ "step": 1300
9142
+ },
9143
+ {
9144
+ "epoch": 0.5888210002262956,
9145
+ "grad_norm": 0.5474767088890076,
9146
+ "learning_rate": 1.4317862740613664e-05,
9147
+ "loss": 0.8347,
9148
+ "step": 1301
9149
+ },
9150
+ {
9151
+ "epoch": 0.5892735913102511,
9152
+ "grad_norm": 0.5409084558486938,
9153
+ "learning_rate": 1.4214200200077343e-05,
9154
+ "loss": 0.9025,
9155
+ "step": 1302
9156
+ },
9157
+ {
9158
+ "epoch": 0.5897261823942068,
9159
+ "grad_norm": 0.48775801062583923,
9160
+ "learning_rate": 1.4110885568655564e-05,
9161
+ "loss": 0.8533,
9162
+ "step": 1303
9163
+ },
9164
+ {
9165
+ "epoch": 0.5901787734781625,
9166
+ "grad_norm": 0.5462921857833862,
9167
+ "learning_rate": 1.400791926534657e-05,
9168
+ "loss": 0.9145,
9169
+ "step": 1304
9170
+ },
9171
+ {
9172
+ "epoch": 0.5906313645621182,
9173
+ "grad_norm": 0.5149967074394226,
9174
+ "learning_rate": 1.3905301707735985e-05,
9175
+ "loss": 0.8007,
9176
+ "step": 1305
9177
+ },
9178
+ {
9179
+ "epoch": 0.5910839556460737,
9180
+ "grad_norm": 0.4886469841003418,
9181
+ "learning_rate": 1.3803033311995072e-05,
9182
+ "loss": 0.6551,
9183
+ "step": 1306
9184
+ },
9185
+ {
9186
+ "epoch": 0.5915365467300294,
9187
+ "grad_norm": 0.42015543580055237,
9188
+ "learning_rate": 1.3701114492879007e-05,
9189
+ "loss": 0.5759,
9190
+ "step": 1307
9191
+ },
9192
+ {
9193
+ "epoch": 0.5919891378139851,
9194
+ "grad_norm": 0.43609780073165894,
9195
+ "learning_rate": 1.3599545663725321e-05,
9196
+ "loss": 0.639,
9197
+ "step": 1308
9198
+ },
9199
+ {
9200
+ "epoch": 0.5924417288979407,
9201
+ "grad_norm": 0.4982304871082306,
9202
+ "learning_rate": 1.3498327236452013e-05,
9203
+ "loss": 0.8886,
9204
+ "step": 1309
9205
+ },
9206
+ {
9207
+ "epoch": 0.5928943199818963,
9208
+ "grad_norm": 0.7850152850151062,
9209
+ "learning_rate": 1.339745962155613e-05,
9210
+ "loss": 0.7553,
9211
+ "step": 1310
9212
+ },
9213
+ {
9214
+ "epoch": 0.593346911065852,
9215
+ "grad_norm": 0.6349214911460876,
9216
+ "learning_rate": 1.3296943228111925e-05,
9217
+ "loss": 0.825,
9218
+ "step": 1311
9219
+ },
9220
+ {
9221
+ "epoch": 0.5937995021498077,
9222
+ "grad_norm": 0.509701669216156,
9223
+ "learning_rate": 1.3196778463769255e-05,
9224
+ "loss": 0.7534,
9225
+ "step": 1312
9226
+ },
9227
+ {
9228
+ "epoch": 0.5942520932337633,
9229
+ "grad_norm": 0.4454813301563263,
9230
+ "learning_rate": 1.3096965734751943e-05,
9231
+ "loss": 0.69,
9232
+ "step": 1313
9233
+ },
9234
+ {
9235
+ "epoch": 0.594704684317719,
9236
+ "grad_norm": 0.5413982272148132,
9237
+ "learning_rate": 1.2997505445856084e-05,
9238
+ "loss": 0.8602,
9239
+ "step": 1314
9240
+ },
9241
+ {
9242
+ "epoch": 0.5951572754016746,
9243
+ "grad_norm": 0.460597425699234,
9244
+ "learning_rate": 1.2898398000448443e-05,
9245
+ "loss": 0.6843,
9246
+ "step": 1315
9247
+ },
9248
+ {
9249
+ "epoch": 0.5956098664856302,
9250
+ "grad_norm": 0.49289670586586,
9251
+ "learning_rate": 1.2799643800464834e-05,
9252
+ "loss": 0.6259,
9253
+ "step": 1316
9254
+ },
9255
+ {
9256
+ "epoch": 0.5960624575695859,
9257
+ "grad_norm": 0.5211944580078125,
9258
+ "learning_rate": 1.2701243246408422e-05,
9259
+ "loss": 0.8669,
9260
+ "step": 1317
9261
+ },
9262
+ {
9263
+ "epoch": 0.5965150486535415,
9264
+ "grad_norm": 0.42615196108818054,
9265
+ "learning_rate": 1.260319673734821e-05,
9266
+ "loss": 0.6003,
9267
+ "step": 1318
9268
+ },
9269
+ {
9270
+ "epoch": 0.5969676397374972,
9271
+ "grad_norm": 0.6688699722290039,
9272
+ "learning_rate": 1.2505504670917256e-05,
9273
+ "loss": 1.1166,
9274
+ "step": 1319
9275
+ },
9276
+ {
9277
+ "epoch": 0.5974202308214528,
9278
+ "grad_norm": 0.4943235218524933,
9279
+ "learning_rate": 1.2408167443311214e-05,
9280
+ "loss": 0.726,
9281
+ "step": 1320
9282
+ },
9283
+ {
9284
+ "epoch": 0.5978728219054085,
9285
+ "grad_norm": 0.5515215992927551,
9286
+ "learning_rate": 1.2311185449286677e-05,
9287
+ "loss": 1.0609,
9288
+ "step": 1321
9289
+ },
9290
+ {
9291
+ "epoch": 0.5983254129893641,
9292
+ "grad_norm": 0.5061673521995544,
9293
+ "learning_rate": 1.2214559082159537e-05,
9294
+ "loss": 1.0484,
9295
+ "step": 1322
9296
+ },
9297
+ {
9298
+ "epoch": 0.5987780040733197,
9299
+ "grad_norm": 0.5210549831390381,
9300
+ "learning_rate": 1.2118288733803473e-05,
9301
+ "loss": 0.81,
9302
+ "step": 1323
9303
+ },
9304
+ {
9305
+ "epoch": 0.5992305951572754,
9306
+ "grad_norm": 1.2081853151321411,
9307
+ "learning_rate": 1.2022374794648228e-05,
9308
+ "loss": 0.7087,
9309
+ "step": 1324
9310
+ },
9311
+ {
9312
+ "epoch": 0.5996831862412311,
9313
+ "grad_norm": 0.5520872473716736,
9314
+ "learning_rate": 1.1926817653678157e-05,
9315
+ "loss": 0.6642,
9316
+ "step": 1325
9317
+ },
9318
+ {
9319
+ "epoch": 0.6001357773251867,
9320
+ "grad_norm": 0.5837110280990601,
9321
+ "learning_rate": 1.1831617698430609e-05,
9322
+ "loss": 0.8913,
9323
+ "step": 1326
9324
+ },
9325
+ {
9326
+ "epoch": 0.6005883684091423,
9327
+ "grad_norm": 0.47259557247161865,
9328
+ "learning_rate": 1.1736775314994342e-05,
9329
+ "loss": 0.6563,
9330
+ "step": 1327
9331
+ },
9332
+ {
9333
+ "epoch": 0.601040959493098,
9334
+ "grad_norm": 0.4800024628639221,
9335
+ "learning_rate": 1.1642290888007956e-05,
9336
+ "loss": 0.7851,
9337
+ "step": 1328
9338
+ },
9339
+ {
9340
+ "epoch": 0.6014935505770537,
9341
+ "grad_norm": 0.566657304763794,
9342
+ "learning_rate": 1.15481648006583e-05,
9343
+ "loss": 1.0709,
9344
+ "step": 1329
9345
+ },
9346
+ {
9347
+ "epoch": 0.6019461416610092,
9348
+ "grad_norm": 0.40888628363609314,
9349
+ "learning_rate": 1.1454397434679021e-05,
9350
+ "loss": 0.3452,
9351
+ "step": 1330
9352
+ },
9353
+ {
9354
+ "epoch": 0.6023987327449649,
9355
+ "grad_norm": 0.4961101710796356,
9356
+ "learning_rate": 1.1360989170348902e-05,
9357
+ "loss": 0.7467,
9358
+ "step": 1331
9359
+ },
9360
+ {
9361
+ "epoch": 0.6028513238289206,
9362
+ "grad_norm": 0.4577196538448334,
9363
+ "learning_rate": 1.1267940386490416e-05,
9364
+ "loss": 0.6897,
9365
+ "step": 1332
9366
+ },
9367
+ {
9368
+ "epoch": 0.6033039149128763,
9369
+ "grad_norm": 0.4734448194503784,
9370
+ "learning_rate": 1.1175251460468117e-05,
9371
+ "loss": 0.8086,
9372
+ "step": 1333
9373
+ },
9374
+ {
9375
+ "epoch": 0.6037565059968318,
9376
+ "grad_norm": 0.48683977127075195,
9377
+ "learning_rate": 1.10829227681871e-05,
9378
+ "loss": 0.8685,
9379
+ "step": 1334
9380
+ },
9381
+ {
9382
+ "epoch": 0.6042090970807875,
9383
+ "grad_norm": 0.46204137802124023,
9384
+ "learning_rate": 1.0990954684091558e-05,
9385
+ "loss": 0.5823,
9386
+ "step": 1335
9387
+ },
9388
+ {
9389
+ "epoch": 0.6046616881647432,
9390
+ "grad_norm": 0.47728225588798523,
9391
+ "learning_rate": 1.0899347581163221e-05,
9392
+ "loss": 0.8974,
9393
+ "step": 1336
9394
+ },
9395
+ {
9396
+ "epoch": 0.6051142792486988,
9397
+ "grad_norm": 0.4536563754081726,
9398
+ "learning_rate": 1.0808101830919814e-05,
9399
+ "loss": 0.7476,
9400
+ "step": 1337
9401
+ },
9402
+ {
9403
+ "epoch": 0.6055668703326544,
9404
+ "grad_norm": 0.519241452217102,
9405
+ "learning_rate": 1.0717217803413604e-05,
9406
+ "loss": 0.8848,
9407
+ "step": 1338
9408
+ },
9409
+ {
9410
+ "epoch": 0.6060194614166101,
9411
+ "grad_norm": 0.5880079865455627,
9412
+ "learning_rate": 1.062669586722983e-05,
9413
+ "loss": 1.1778,
9414
+ "step": 1339
9415
+ },
9416
+ {
9417
+ "epoch": 0.6064720525005657,
9418
+ "grad_norm": 0.5012646317481995,
9419
+ "learning_rate": 1.0536536389485275e-05,
9420
+ "loss": 0.9761,
9421
+ "step": 1340
9422
+ },
9423
+ {
9424
+ "epoch": 0.6069246435845214,
9425
+ "grad_norm": 0.5676561594009399,
9426
+ "learning_rate": 1.044673973582675e-05,
9427
+ "loss": 0.9059,
9428
+ "step": 1341
9429
+ },
9430
+ {
9431
+ "epoch": 0.607377234668477,
9432
+ "grad_norm": 0.5131206512451172,
9433
+ "learning_rate": 1.0357306270429624e-05,
9434
+ "loss": 0.9466,
9435
+ "step": 1342
9436
+ },
9437
+ {
9438
+ "epoch": 0.6078298257524327,
9439
+ "grad_norm": 0.5565215945243835,
9440
+ "learning_rate": 1.0268236355996341e-05,
9441
+ "loss": 1.1076,
9442
+ "step": 1343
9443
+ },
9444
+ {
9445
+ "epoch": 0.6082824168363883,
9446
+ "grad_norm": 0.5122553110122681,
9447
+ "learning_rate": 1.0179530353754874e-05,
9448
+ "loss": 0.8366,
9449
+ "step": 1344
9450
+ },
9451
+ {
9452
+ "epoch": 0.608735007920344,
9453
+ "grad_norm": 0.4143733084201813,
9454
+ "learning_rate": 1.0091188623457415e-05,
9455
+ "loss": 0.5818,
9456
+ "step": 1345
9457
+ },
9458
+ {
9459
+ "epoch": 0.6091875990042996,
9460
+ "grad_norm": 0.4871106445789337,
9461
+ "learning_rate": 1.0003211523378796e-05,
9462
+ "loss": 0.8138,
9463
+ "step": 1346
9464
+ },
9465
+ {
9466
+ "epoch": 0.6096401900882552,
9467
+ "grad_norm": 0.5023435354232788,
9468
+ "learning_rate": 9.915599410315068e-06,
9469
+ "loss": 0.8095,
9470
+ "step": 1347
9471
+ },
9472
+ {
9473
+ "epoch": 0.6100927811722109,
9474
+ "grad_norm": 0.5308644771575928,
9475
+ "learning_rate": 9.828352639582072e-06,
9476
+ "loss": 0.955,
9477
+ "step": 1348
9478
+ },
9479
+ {
9480
+ "epoch": 0.6105453722561666,
9481
+ "grad_norm": 0.46719393134117126,
9482
+ "learning_rate": 9.74147156501396e-06,
9483
+ "loss": 0.7615,
9484
+ "step": 1349
9485
+ },
9486
+ {
9487
+ "epoch": 0.6109979633401222,
9488
+ "grad_norm": 0.4629960358142853,
9489
+ "learning_rate": 9.65495653896179e-06,
9490
+ "loss": 0.6945,
9491
+ "step": 1350
9492
+ },
9493
+ {
9494
+ "epoch": 0.6114505544240778,
9495
+ "grad_norm": 0.4505084455013275,
9496
+ "learning_rate": 9.568807912292077e-06,
9497
+ "loss": 0.7088,
9498
+ "step": 1351
9499
+ },
9500
+ {
9501
+ "epoch": 0.6119031455080335,
9502
+ "grad_norm": 0.4612928032875061,
9503
+ "learning_rate": 9.483026034385467e-06,
9504
+ "loss": 0.5966,
9505
+ "step": 1352
9506
+ },
9507
+ {
9508
+ "epoch": 0.6123557365919892,
9509
+ "grad_norm": 0.81995689868927,
9510
+ "learning_rate": 9.397611253135118e-06,
9511
+ "loss": 0.751,
9512
+ "step": 1353
9513
+ },
9514
+ {
9515
+ "epoch": 0.6128083276759447,
9516
+ "grad_norm": 0.49387305974960327,
9517
+ "learning_rate": 9.31256391494546e-06,
9518
+ "loss": 0.8838,
9519
+ "step": 1354
9520
+ },
9521
+ {
9522
+ "epoch": 0.6132609187599004,
9523
+ "grad_norm": 0.5457798838615417,
9524
+ "learning_rate": 9.227884364730744e-06,
9525
+ "loss": 0.851,
9526
+ "step": 1355
9527
+ },
9528
+ {
9529
+ "epoch": 0.6137135098438561,
9530
+ "grad_norm": 0.4536944031715393,
9531
+ "learning_rate": 9.143572945913614e-06,
9532
+ "loss": 0.8056,
9533
+ "step": 1356
9534
+ },
9535
+ {
9536
+ "epoch": 0.6141661009278118,
9537
+ "grad_norm": 0.5066845417022705,
9538
+ "learning_rate": 9.05963000042378e-06,
9539
+ "loss": 0.8617,
9540
+ "step": 1357
9541
+ },
9542
+ {
9543
+ "epoch": 0.6146186920117673,
9544
+ "grad_norm": 0.5071107745170593,
9545
+ "learning_rate": 8.976055868696542e-06,
9546
+ "loss": 0.8629,
9547
+ "step": 1358
9548
+ },
9549
+ {
9550
+ "epoch": 0.615071283095723,
9551
+ "grad_norm": 0.5773394107818604,
9552
+ "learning_rate": 8.892850889671455e-06,
9553
+ "loss": 1.1922,
9554
+ "step": 1359
9555
+ },
9556
+ {
9557
+ "epoch": 0.6155238741796787,
9558
+ "grad_norm": 0.48692917823791504,
9559
+ "learning_rate": 8.810015400790994e-06,
9560
+ "loss": 0.7168,
9561
+ "step": 1360
9562
+ },
9563
+ {
9564
+ "epoch": 0.6159764652636343,
9565
+ "grad_norm": 0.4717068374156952,
9566
+ "learning_rate": 8.727549737999097e-06,
9567
+ "loss": 0.8643,
9568
+ "step": 1361
9569
+ },
9570
+ {
9571
+ "epoch": 0.6164290563475899,
9572
+ "grad_norm": 0.5000625252723694,
9573
+ "learning_rate": 8.645454235739903e-06,
9574
+ "loss": 0.7273,
9575
+ "step": 1362
9576
+ },
9577
+ {
9578
+ "epoch": 0.6168816474315456,
9579
+ "grad_norm": 0.43107035756111145,
9580
+ "learning_rate": 8.563729226956319e-06,
9581
+ "loss": 0.6671,
9582
+ "step": 1363
9583
+ },
9584
+ {
9585
+ "epoch": 0.6173342385155013,
9586
+ "grad_norm": 0.6068969368934631,
9587
+ "learning_rate": 8.482375043088665e-06,
9588
+ "loss": 1.2765,
9589
+ "step": 1364
9590
+ },
9591
+ {
9592
+ "epoch": 0.6177868295994569,
9593
+ "grad_norm": 0.5019914507865906,
9594
+ "learning_rate": 8.401392014073405e-06,
9595
+ "loss": 0.9452,
9596
+ "step": 1365
9597
+ },
9598
+ {
9599
+ "epoch": 0.6182394206834125,
9600
+ "grad_norm": 0.5206013917922974,
9601
+ "learning_rate": 8.32078046834176e-06,
9602
+ "loss": 0.9439,
9603
+ "step": 1366
9604
+ },
9605
+ {
9606
+ "epoch": 0.6186920117673682,
9607
+ "grad_norm": 0.4639444649219513,
9608
+ "learning_rate": 8.240540732818347e-06,
9609
+ "loss": 0.802,
9610
+ "step": 1367
9611
+ },
9612
+ {
9613
+ "epoch": 0.6191446028513238,
9614
+ "grad_norm": 0.5678917765617371,
9615
+ "learning_rate": 8.160673132919938e-06,
9616
+ "loss": 1.0333,
9617
+ "step": 1368
9618
+ },
9619
+ {
9620
+ "epoch": 0.6195971939352795,
9621
+ "grad_norm": 0.5168341994285583,
9622
+ "learning_rate": 8.081177992554013e-06,
9623
+ "loss": 0.8182,
9624
+ "step": 1369
9625
+ },
9626
+ {
9627
+ "epoch": 0.6200497850192351,
9628
+ "grad_norm": 0.611613929271698,
9629
+ "learning_rate": 8.002055634117578e-06,
9630
+ "loss": 1.0066,
9631
+ "step": 1370
9632
+ },
9633
+ {
9634
+ "epoch": 0.6205023761031908,
9635
+ "grad_norm": 0.5084848999977112,
9636
+ "learning_rate": 7.923306378495809e-06,
9637
+ "loss": 1.0312,
9638
+ "step": 1371
9639
+ },
9640
+ {
9641
+ "epoch": 0.6209549671871464,
9642
+ "grad_norm": 0.3988780081272125,
9643
+ "learning_rate": 7.844930545060703e-06,
9644
+ "loss": 0.4544,
9645
+ "step": 1372
9646
+ },
9647
+ {
9648
+ "epoch": 0.6214075582711021,
9649
+ "grad_norm": 0.6262059211730957,
9650
+ "learning_rate": 7.766928451669863e-06,
9651
+ "loss": 1.004,
9652
+ "step": 1373
9653
+ },
9654
+ {
9655
+ "epoch": 0.6218601493550577,
9656
+ "grad_norm": 0.5216922760009766,
9657
+ "learning_rate": 7.689300414665124e-06,
9658
+ "loss": 0.7166,
9659
+ "step": 1374
9660
+ },
9661
+ {
9662
+ "epoch": 0.6223127404390133,
9663
+ "grad_norm": 0.5540860891342163,
9664
+ "learning_rate": 7.612046748871327e-06,
9665
+ "loss": 0.8727,
9666
+ "step": 1375
9667
+ },
9668
+ {
9669
+ "epoch": 0.622765331522969,
9670
+ "grad_norm": 0.44992414116859436,
9671
+ "learning_rate": 7.5351677675950635e-06,
9672
+ "loss": 0.6504,
9673
+ "step": 1376
9674
+ },
9675
+ {
9676
+ "epoch": 0.6232179226069247,
9677
+ "grad_norm": 0.4714016616344452,
9678
+ "learning_rate": 7.458663782623343e-06,
9679
+ "loss": 0.8419,
9680
+ "step": 1377
9681
+ },
9682
+ {
9683
+ "epoch": 0.6236705136908803,
9684
+ "grad_norm": 0.49986764788627625,
9685
+ "learning_rate": 7.382535104222366e-06,
9686
+ "loss": 0.7894,
9687
+ "step": 1378
9688
+ },
9689
+ {
9690
+ "epoch": 0.6241231047748359,
9691
+ "grad_norm": 0.5532403588294983,
9692
+ "learning_rate": 7.306782041136218e-06,
9693
+ "loss": 0.901,
9694
+ "step": 1379
9695
+ },
9696
+ {
9697
+ "epoch": 0.6245756958587916,
9698
+ "grad_norm": 0.5384380221366882,
9699
+ "learning_rate": 7.231404900585714e-06,
9700
+ "loss": 0.9753,
9701
+ "step": 1380
9702
+ },
9703
+ {
9704
+ "epoch": 0.6250282869427473,
9705
+ "grad_norm": 0.5002140402793884,
9706
+ "learning_rate": 7.156403988267069e-06,
9707
+ "loss": 0.8285,
9708
+ "step": 1381
9709
+ },
9710
+ {
9711
+ "epoch": 0.6254808780267028,
9712
+ "grad_norm": 0.5726694464683533,
9713
+ "learning_rate": 7.08177960835068e-06,
9714
+ "loss": 0.8634,
9715
+ "step": 1382
9716
+ },
9717
+ {
9718
+ "epoch": 0.6259334691106585,
9719
+ "grad_norm": 0.5283421874046326,
9720
+ "learning_rate": 7.0075320634799045e-06,
9721
+ "loss": 0.7802,
9722
+ "step": 1383
9723
+ },
9724
+ {
9725
+ "epoch": 0.6263860601946142,
9726
+ "grad_norm": 0.4411744475364685,
9727
+ "learning_rate": 6.9336616547697965e-06,
9728
+ "loss": 0.5788,
9729
+ "step": 1384
9730
+ },
9731
+ {
9732
+ "epoch": 0.6268386512785699,
9733
+ "grad_norm": 0.5233549475669861,
9734
+ "learning_rate": 6.860168681805945e-06,
9735
+ "loss": 0.8098,
9736
+ "step": 1385
9737
+ },
9738
+ {
9739
+ "epoch": 0.6272912423625254,
9740
+ "grad_norm": 0.5534676313400269,
9741
+ "learning_rate": 6.787053442643232e-06,
9742
+ "loss": 1.0433,
9743
+ "step": 1386
9744
+ },
9745
+ {
9746
+ "epoch": 0.6277438334464811,
9747
+ "grad_norm": 0.5603635907173157,
9748
+ "learning_rate": 6.714316233804574e-06,
9749
+ "loss": 0.8382,
9750
+ "step": 1387
9751
+ },
9752
+ {
9753
+ "epoch": 0.6281964245304368,
9754
+ "grad_norm": 0.48828354477882385,
9755
+ "learning_rate": 6.6419573502798374e-06,
9756
+ "loss": 0.7261,
9757
+ "step": 1388
9758
+ },
9759
+ {
9760
+ "epoch": 0.6286490156143923,
9761
+ "grad_norm": 0.46339505910873413,
9762
+ "learning_rate": 6.5699770855244815e-06,
9763
+ "loss": 0.6944,
9764
+ "step": 1389
9765
+ },
9766
+ {
9767
+ "epoch": 0.629101606698348,
9768
+ "grad_norm": 0.5434744954109192,
9769
+ "learning_rate": 6.498375731458528e-06,
9770
+ "loss": 0.9542,
9771
+ "step": 1390
9772
+ },
9773
+ {
9774
+ "epoch": 0.6295541977823037,
9775
+ "grad_norm": 0.49759843945503235,
9776
+ "learning_rate": 6.427153578465262e-06,
9777
+ "loss": 0.7949,
9778
+ "step": 1391
9779
+ },
9780
+ {
9781
+ "epoch": 0.6300067888662594,
9782
+ "grad_norm": 0.5009887218475342,
9783
+ "learning_rate": 6.356310915390118e-06,
9784
+ "loss": 0.8088,
9785
+ "step": 1392
9786
+ },
9787
+ {
9788
+ "epoch": 0.630459379950215,
9789
+ "grad_norm": 0.5697285532951355,
9790
+ "learning_rate": 6.28584802953951e-06,
9791
+ "loss": 0.9026,
9792
+ "step": 1393
9793
+ },
9794
+ {
9795
+ "epoch": 0.6309119710341706,
9796
+ "grad_norm": 0.48114413022994995,
9797
+ "learning_rate": 6.215765206679569e-06,
9798
+ "loss": 0.733,
9799
+ "step": 1394
9800
+ },
9801
+ {
9802
+ "epoch": 0.6313645621181263,
9803
+ "grad_norm": 0.5739166140556335,
9804
+ "learning_rate": 6.146062731035129e-06,
9805
+ "loss": 1.1062,
9806
+ "step": 1395
9807
+ },
9808
+ {
9809
+ "epoch": 0.6318171532020819,
9810
+ "grad_norm": 0.475238561630249,
9811
+ "learning_rate": 6.076740885288479e-06,
9812
+ "loss": 0.7195,
9813
+ "step": 1396
9814
+ },
9815
+ {
9816
+ "epoch": 0.6322697442860375,
9817
+ "grad_norm": 0.5954925417900085,
9818
+ "learning_rate": 6.007799950578264e-06,
9819
+ "loss": 1.0236,
9820
+ "step": 1397
9821
+ },
9822
+ {
9823
+ "epoch": 0.6327223353699932,
9824
+ "grad_norm": 0.5766705870628357,
9825
+ "learning_rate": 5.939240206498287e-06,
9826
+ "loss": 0.9263,
9827
+ "step": 1398
9828
+ },
9829
+ {
9830
+ "epoch": 0.6331749264539489,
9831
+ "grad_norm": 0.4999594986438751,
9832
+ "learning_rate": 5.8710619310964445e-06,
9833
+ "loss": 0.696,
9834
+ "step": 1399
9835
+ },
9836
+ {
9837
+ "epoch": 0.6336275175379045,
9838
+ "grad_norm": 0.4891878068447113,
9839
+ "learning_rate": 5.803265400873514e-06,
9840
+ "loss": 0.9738,
9841
+ "step": 1400
9842
+ },
9843
+ {
9844
+ "epoch": 0.6340801086218602,
9845
+ "grad_norm": 0.5583204627037048,
9846
+ "learning_rate": 5.735850890782157e-06,
9847
+ "loss": 1.0242,
9848
+ "step": 1401
9849
+ },
9850
+ {
9851
+ "epoch": 0.6345326997058158,
9852
+ "grad_norm": 0.49677279591560364,
9853
+ "learning_rate": 5.668818674225685e-06,
9854
+ "loss": 0.757,
9855
+ "step": 1402
9856
+ },
9857
+ {
9858
+ "epoch": 0.6349852907897714,
9859
+ "grad_norm": 0.4881908595561981,
9860
+ "learning_rate": 5.602169023057013e-06,
9861
+ "loss": 0.7328,
9862
+ "step": 1403
9863
+ },
9864
+ {
9865
+ "epoch": 0.6354378818737271,
9866
+ "grad_norm": 0.5154109597206116,
9867
+ "learning_rate": 5.5359022075775146e-06,
9868
+ "loss": 0.8986,
9869
+ "step": 1404
9870
+ },
9871
+ {
9872
+ "epoch": 0.6358904729576828,
9873
+ "grad_norm": 0.585472047328949,
9874
+ "learning_rate": 5.470018496535967e-06,
9875
+ "loss": 0.7595,
9876
+ "step": 1405
9877
+ },
9878
+ {
9879
+ "epoch": 0.6363430640416384,
9880
+ "grad_norm": 0.5411213636398315,
9881
+ "learning_rate": 5.40451815712748e-06,
9882
+ "loss": 0.9672,
9883
+ "step": 1406
9884
+ },
9885
+ {
9886
+ "epoch": 0.636795655125594,
9887
+ "grad_norm": 0.4666892886161804,
9888
+ "learning_rate": 5.33940145499231e-06,
9889
+ "loss": 0.6498,
9890
+ "step": 1407
9891
+ },
9892
+ {
9893
+ "epoch": 0.6372482462095497,
9894
+ "grad_norm": 0.4871276319026947,
9895
+ "learning_rate": 5.274668654214932e-06,
9896
+ "loss": 0.6612,
9897
+ "step": 1408
9898
+ },
9899
+ {
9900
+ "epoch": 0.6377008372935054,
9901
+ "grad_norm": 0.6037775874137878,
9902
+ "learning_rate": 5.210320017322812e-06,
9903
+ "loss": 1.0683,
9904
+ "step": 1409
9905
+ },
9906
+ {
9907
+ "epoch": 0.6381534283774609,
9908
+ "grad_norm": 0.5718627572059631,
9909
+ "learning_rate": 5.146355805285452e-06,
9910
+ "loss": 1.0986,
9911
+ "step": 1410
9912
+ },
9913
+ {
9914
+ "epoch": 0.6386060194614166,
9915
+ "grad_norm": 0.5335869789123535,
9916
+ "learning_rate": 5.08277627751329e-06,
9917
+ "loss": 0.6686,
9918
+ "step": 1411
9919
+ },
9920
+ {
9921
+ "epoch": 0.6390586105453723,
9922
+ "grad_norm": 0.44057121872901917,
9923
+ "learning_rate": 5.01958169185669e-06,
9924
+ "loss": 0.587,
9925
+ "step": 1412
9926
+ },
9927
+ {
9928
+ "epoch": 0.639511201629328,
9929
+ "grad_norm": 0.4762479066848755,
9930
+ "learning_rate": 4.956772304604818e-06,
9931
+ "loss": 0.6572,
9932
+ "step": 1413
9933
+ },
9934
+ {
9935
+ "epoch": 0.6399637927132835,
9936
+ "grad_norm": 0.5564635396003723,
9937
+ "learning_rate": 4.8943483704846475e-06,
9938
+ "loss": 1.0787,
9939
+ "step": 1414
9940
+ },
9941
+ {
9942
+ "epoch": 0.6404163837972392,
9943
+ "grad_norm": 0.461028128862381,
9944
+ "learning_rate": 4.832310142659946e-06,
9945
+ "loss": 0.6813,
9946
+ "step": 1415
9947
+ },
9948
+ {
9949
+ "epoch": 0.6408689748811949,
9950
+ "grad_norm": 0.5097917318344116,
9951
+ "learning_rate": 4.7706578727302224e-06,
9952
+ "loss": 0.892,
9953
+ "step": 1416
9954
+ },
9955
+ {
9956
+ "epoch": 0.6413215659651504,
9957
+ "grad_norm": 0.5178496241569519,
9958
+ "learning_rate": 4.709391810729713e-06,
9959
+ "loss": 0.9012,
9960
+ "step": 1417
9961
+ },
9962
+ {
9963
+ "epoch": 0.6417741570491061,
9964
+ "grad_norm": 0.49436789751052856,
9965
+ "learning_rate": 4.648512205126376e-06,
9966
+ "loss": 0.7641,
9967
+ "step": 1418
9968
+ },
9969
+ {
9970
+ "epoch": 0.6422267481330618,
9971
+ "grad_norm": 0.8753583431243896,
9972
+ "learning_rate": 4.588019302820834e-06,
9973
+ "loss": 0.9981,
9974
+ "step": 1419
9975
+ },
9976
+ {
9977
+ "epoch": 0.6426793392170175,
9978
+ "grad_norm": 0.5542361736297607,
9979
+ "learning_rate": 4.527913349145441e-06,
9980
+ "loss": 0.9366,
9981
+ "step": 1420
9982
+ },
9983
+ {
9984
+ "epoch": 0.643131930300973,
9985
+ "grad_norm": 0.41615116596221924,
9986
+ "learning_rate": 4.468194587863273e-06,
9987
+ "loss": 0.5197,
9988
+ "step": 1421
9989
+ },
9990
+ {
9991
+ "epoch": 0.6435845213849287,
9992
+ "grad_norm": 0.5194451212882996,
9993
+ "learning_rate": 4.408863261167096e-06,
9994
+ "loss": 0.7452,
9995
+ "step": 1422
9996
+ },
9997
+ {
9998
+ "epoch": 0.6440371124688844,
9999
+ "grad_norm": 0.43391209840774536,
10000
+ "learning_rate": 4.349919609678455e-06,
10001
+ "loss": 0.6486,
10002
+ "step": 1423
10003
+ },
10004
+ {
10005
+ "epoch": 0.64448970355284,
10006
+ "grad_norm": 0.4887462556362152,
10007
+ "learning_rate": 4.291363872446597e-06,
10008
+ "loss": 0.6932,
10009
+ "step": 1424
10010
+ },
10011
+ {
10012
+ "epoch": 0.6449422946367956,
10013
+ "grad_norm": 0.6093336343765259,
10014
+ "learning_rate": 4.233196286947605e-06,
10015
+ "loss": 1.1266,
10016
+ "step": 1425
10017
+ },
10018
+ {
10019
+ "epoch": 0.6453948857207513,
10020
+ "grad_norm": 0.4346993565559387,
10021
+ "learning_rate": 4.175417089083378e-06,
10022
+ "loss": 0.5141,
10023
+ "step": 1426
10024
+ },
10025
+ {
10026
+ "epoch": 0.645847476804707,
10027
+ "grad_norm": 0.6119694709777832,
10028
+ "learning_rate": 4.118026513180695e-06,
10029
+ "loss": 0.9554,
10030
+ "step": 1427
10031
+ },
10032
+ {
10033
+ "epoch": 0.6463000678886626,
10034
+ "grad_norm": 0.5438103079795837,
10035
+ "learning_rate": 4.061024791990253e-06,
10036
+ "loss": 0.8614,
10037
+ "step": 1428
10038
+ },
10039
+ {
10040
+ "epoch": 0.6467526589726182,
10041
+ "grad_norm": 0.6257548928260803,
10042
+ "learning_rate": 4.004412156685711e-06,
10043
+ "loss": 0.9136,
10044
+ "step": 1429
10045
+ },
10046
+ {
10047
+ "epoch": 0.6472052500565739,
10048
+ "grad_norm": 0.5468961000442505,
10049
+ "learning_rate": 3.948188836862776e-06,
10050
+ "loss": 0.7189,
10051
+ "step": 1430
10052
+ },
10053
+ {
10054
+ "epoch": 0.6476578411405295,
10055
+ "grad_norm": 0.44468608498573303,
10056
+ "learning_rate": 3.892355060538289e-06,
10057
+ "loss": 0.6663,
10058
+ "step": 1431
10059
+ },
10060
+ {
10061
+ "epoch": 0.6481104322244852,
10062
+ "grad_norm": 0.5603303909301758,
10063
+ "learning_rate": 3.836911054149239e-06,
10064
+ "loss": 0.8297,
10065
+ "step": 1432
10066
+ },
10067
+ {
10068
+ "epoch": 0.6485630233084408,
10069
+ "grad_norm": 0.5190114974975586,
10070
+ "learning_rate": 3.7818570425519173e-06,
10071
+ "loss": 0.7831,
10072
+ "step": 1433
10073
+ },
10074
+ {
10075
+ "epoch": 0.6490156143923965,
10076
+ "grad_norm": 0.5360861420631409,
10077
+ "learning_rate": 3.7271932490209328e-06,
10078
+ "loss": 0.9773,
10079
+ "step": 1434
10080
+ },
10081
+ {
10082
+ "epoch": 0.6494682054763521,
10083
+ "grad_norm": 0.5341346263885498,
10084
+ "learning_rate": 3.6729198952483724e-06,
10085
+ "loss": 0.7224,
10086
+ "step": 1435
10087
+ },
10088
+ {
10089
+ "epoch": 0.6499207965603078,
10090
+ "grad_norm": 0.6777652502059937,
10091
+ "learning_rate": 3.6190372013428562e-06,
10092
+ "loss": 1.1764,
10093
+ "step": 1436
10094
+ },
10095
+ {
10096
+ "epoch": 0.6503733876442634,
10097
+ "grad_norm": 0.5760977864265442,
10098
+ "learning_rate": 3.5655453858286614e-06,
10099
+ "loss": 1.1423,
10100
+ "step": 1437
10101
+ },
10102
+ {
10103
+ "epoch": 0.650825978728219,
10104
+ "grad_norm": 0.44717341661453247,
10105
+ "learning_rate": 3.512444665644865e-06,
10106
+ "loss": 0.5806,
10107
+ "step": 1438
10108
+ },
10109
+ {
10110
+ "epoch": 0.6512785698121747,
10111
+ "grad_norm": 0.5548418760299683,
10112
+ "learning_rate": 3.4597352561443807e-06,
10113
+ "loss": 0.8524,
10114
+ "step": 1439
10115
+ },
10116
+ {
10117
+ "epoch": 0.6517311608961304,
10118
+ "grad_norm": 0.5654526948928833,
10119
+ "learning_rate": 3.40741737109318e-06,
10120
+ "loss": 1.0357,
10121
+ "step": 1440
10122
+ },
10123
+ {
10124
+ "epoch": 0.652183751980086,
10125
+ "grad_norm": 0.5160826444625854,
10126
+ "learning_rate": 3.355491222669371e-06,
10127
+ "loss": 0.7621,
10128
+ "step": 1441
10129
+ },
10130
+ {
10131
+ "epoch": 0.6526363430640416,
10132
+ "grad_norm": 0.4938196539878845,
10133
+ "learning_rate": 3.3039570214623782e-06,
10134
+ "loss": 0.7649,
10135
+ "step": 1442
10136
+ },
10137
+ {
10138
+ "epoch": 0.6530889341479973,
10139
+ "grad_norm": 0.5363398790359497,
10140
+ "learning_rate": 3.2528149764720186e-06,
10141
+ "loss": 0.8831,
10142
+ "step": 1443
10143
+ },
10144
+ {
10145
+ "epoch": 0.653541525231953,
10146
+ "grad_norm": 0.6059714555740356,
10147
+ "learning_rate": 3.202065295107726e-06,
10148
+ "loss": 0.9239,
10149
+ "step": 1444
10150
+ },
10151
+ {
10152
+ "epoch": 0.6539941163159085,
10153
+ "grad_norm": 0.5283812284469604,
10154
+ "learning_rate": 3.1517081831876737e-06,
10155
+ "loss": 0.8756,
10156
+ "step": 1445
10157
+ },
10158
+ {
10159
+ "epoch": 0.6544467073998642,
10160
+ "grad_norm": 0.5337501764297485,
10161
+ "learning_rate": 3.1017438449379434e-06,
10162
+ "loss": 1.0205,
10163
+ "step": 1446
10164
+ },
10165
+ {
10166
+ "epoch": 0.6548992984838199,
10167
+ "grad_norm": 0.5220997929573059,
10168
+ "learning_rate": 3.052172482991711e-06,
10169
+ "loss": 1.0293,
10170
+ "step": 1447
10171
+ },
10172
+ {
10173
+ "epoch": 0.6553518895677756,
10174
+ "grad_norm": 0.6735256314277649,
10175
+ "learning_rate": 3.0029942983884173e-06,
10176
+ "loss": 0.9027,
10177
+ "step": 1448
10178
+ },
10179
+ {
10180
+ "epoch": 0.6558044806517311,
10181
+ "grad_norm": 0.5760489106178284,
10182
+ "learning_rate": 2.9542094905729457e-06,
10183
+ "loss": 1.1187,
10184
+ "step": 1449
10185
+ },
10186
+ {
10187
+ "epoch": 0.6562570717356868,
10188
+ "grad_norm": 0.5203390717506409,
10189
+ "learning_rate": 2.905818257394799e-06,
10190
+ "loss": 0.9437,
10191
+ "step": 1450
10192
+ },
10193
+ {
10194
+ "epoch": 0.6567096628196425,
10195
+ "grad_norm": 0.4524308741092682,
10196
+ "learning_rate": 2.8578207951073353e-06,
10197
+ "loss": 0.6448,
10198
+ "step": 1451
10199
+ },
10200
+ {
10201
+ "epoch": 0.6571622539035981,
10202
+ "grad_norm": 0.4728708863258362,
10203
+ "learning_rate": 2.810217298366968e-06,
10204
+ "loss": 0.6843,
10205
+ "step": 1452
10206
+ },
10207
+ {
10208
+ "epoch": 0.6576148449875537,
10209
+ "grad_norm": 0.5334341526031494,
10210
+ "learning_rate": 2.7630079602323442e-06,
10211
+ "loss": 0.8001,
10212
+ "step": 1453
10213
+ },
10214
+ {
10215
+ "epoch": 0.6580674360715094,
10216
+ "grad_norm": 0.484829306602478,
10217
+ "learning_rate": 2.716192972163556e-06,
10218
+ "loss": 0.7185,
10219
+ "step": 1454
10220
+ },
10221
+ {
10222
+ "epoch": 0.658520027155465,
10223
+ "grad_norm": 0.5486847162246704,
10224
+ "learning_rate": 2.6697725240214076e-06,
10225
+ "loss": 0.9219,
10226
+ "step": 1455
10227
+ },
10228
+ {
10229
+ "epoch": 0.6589726182394207,
10230
+ "grad_norm": 0.551567792892456,
10231
+ "learning_rate": 2.6237468040666512e-06,
10232
+ "loss": 1.0728,
10233
+ "step": 1456
10234
+ },
10235
+ {
10236
+ "epoch": 0.6594252093233763,
10237
+ "grad_norm": 0.4729478657245636,
10238
+ "learning_rate": 2.578115998959152e-06,
10239
+ "loss": 0.655,
10240
+ "step": 1457
10241
+ },
10242
+ {
10243
+ "epoch": 0.659877800407332,
10244
+ "grad_norm": 0.5266134738922119,
10245
+ "learning_rate": 2.532880293757223e-06,
10246
+ "loss": 0.9098,
10247
+ "step": 1458
10248
+ },
10249
+ {
10250
+ "epoch": 0.6603303914912876,
10251
+ "grad_norm": 0.408477246761322,
10252
+ "learning_rate": 2.4880398719167586e-06,
10253
+ "loss": 0.644,
10254
+ "step": 1459
10255
+ },
10256
+ {
10257
+ "epoch": 0.6607829825752433,
10258
+ "grad_norm": 0.5005697011947632,
10259
+ "learning_rate": 2.4435949152906145e-06,
10260
+ "loss": 0.8143,
10261
+ "step": 1460
10262
+ },
10263
+ {
10264
+ "epoch": 0.6612355736591989,
10265
+ "grad_norm": 0.5645555257797241,
10266
+ "learning_rate": 2.3995456041278066e-06,
10267
+ "loss": 1.0237,
10268
+ "step": 1461
10269
+ },
10270
+ {
10271
+ "epoch": 0.6616881647431545,
10272
+ "grad_norm": 0.5656578540802002,
10273
+ "learning_rate": 2.3558921170727888e-06,
10274
+ "loss": 0.746,
10275
+ "step": 1462
10276
+ },
10277
+ {
10278
+ "epoch": 0.6621407558271102,
10279
+ "grad_norm": 0.48828980326652527,
10280
+ "learning_rate": 2.312634631164723e-06,
10281
+ "loss": 0.8299,
10282
+ "step": 1463
10283
+ },
10284
+ {
10285
+ "epoch": 0.6625933469110659,
10286
+ "grad_norm": 0.5643355250358582,
10287
+ "learning_rate": 2.2697733218367436e-06,
10288
+ "loss": 0.987,
10289
+ "step": 1464
10290
+ },
10291
+ {
10292
+ "epoch": 0.6630459379950215,
10293
+ "grad_norm": 0.49453938007354736,
10294
+ "learning_rate": 2.2273083629153147e-06,
10295
+ "loss": 0.7006,
10296
+ "step": 1465
10297
+ },
10298
+ {
10299
+ "epoch": 0.6634985290789771,
10300
+ "grad_norm": 0.5133518576622009,
10301
+ "learning_rate": 2.1852399266194314e-06,
10302
+ "loss": 0.8049,
10303
+ "step": 1466
10304
+ },
10305
+ {
10306
+ "epoch": 0.6639511201629328,
10307
+ "grad_norm": 0.456297367811203,
10308
+ "learning_rate": 2.1435681835600184e-06,
10309
+ "loss": 0.5972,
10310
+ "step": 1467
10311
+ },
10312
+ {
10313
+ "epoch": 0.6644037112468885,
10314
+ "grad_norm": 0.5126147270202637,
10315
+ "learning_rate": 2.1022933027391555e-06,
10316
+ "loss": 1.0061,
10317
+ "step": 1468
10318
+ },
10319
+ {
10320
+ "epoch": 0.664856302330844,
10321
+ "grad_norm": 0.5274229645729065,
10322
+ "learning_rate": 2.06141545154942e-06,
10323
+ "loss": 0.8853,
10324
+ "step": 1469
10325
+ },
10326
+ {
10327
+ "epoch": 0.6653088934147997,
10328
+ "grad_norm": 0.4462442100048065,
10329
+ "learning_rate": 2.0209347957732328e-06,
10330
+ "loss": 0.6457,
10331
+ "step": 1470
10332
+ },
10333
+ {
10334
+ "epoch": 0.6657614844987554,
10335
+ "grad_norm": 0.5552085041999817,
10336
+ "learning_rate": 1.9808514995821593e-06,
10337
+ "loss": 0.9793,
10338
+ "step": 1471
10339
+ },
10340
+ {
10341
+ "epoch": 0.6662140755827111,
10342
+ "grad_norm": 0.4626757800579071,
10343
+ "learning_rate": 1.941165725536265e-06,
10344
+ "loss": 0.7582,
10345
+ "step": 1472
10346
+ },
10347
+ {
10348
+ "epoch": 0.6666666666666666,
10349
+ "grad_norm": 0.6225503087043762,
10350
+ "learning_rate": 1.9018776345834155e-06,
10351
+ "loss": 0.5593,
10352
+ "step": 1473
10353
+ },
10354
+ {
10355
+ "epoch": 0.6671192577506223,
10356
+ "grad_norm": 0.4841187000274658,
10357
+ "learning_rate": 1.8629873860586566e-06,
10358
+ "loss": 0.8542,
10359
+ "step": 1474
10360
+ },
10361
+ {
10362
+ "epoch": 0.667571848834578,
10363
+ "grad_norm": 0.636572539806366,
10364
+ "learning_rate": 1.8244951376835906e-06,
10365
+ "loss": 1.1556,
10366
+ "step": 1475
10367
+ },
10368
+ {
10369
+ "epoch": 0.6680244399185336,
10370
+ "grad_norm": 0.5484120845794678,
10371
+ "learning_rate": 1.7864010455656554e-06,
10372
+ "loss": 1.1598,
10373
+ "step": 1476
10374
+ },
10375
+ {
10376
+ "epoch": 0.6684770310024892,
10377
+ "grad_norm": 0.5752256512641907,
10378
+ "learning_rate": 1.7487052641976032e-06,
10379
+ "loss": 0.9162,
10380
+ "step": 1477
10381
+ },
10382
+ {
10383
+ "epoch": 0.6689296220864449,
10384
+ "grad_norm": 0.5652234554290771,
10385
+ "learning_rate": 1.7114079464567888e-06,
10386
+ "loss": 0.8911,
10387
+ "step": 1478
10388
+ },
10389
+ {
10390
+ "epoch": 0.6693822131704006,
10391
+ "grad_norm": 0.4782993495464325,
10392
+ "learning_rate": 1.6745092436045494e-06,
10393
+ "loss": 0.8625,
10394
+ "step": 1479
10395
+ },
10396
+ {
10397
+ "epoch": 0.6698348042543562,
10398
+ "grad_norm": 0.5361889600753784,
10399
+ "learning_rate": 1.6380093052856483e-06,
10400
+ "loss": 0.8206,
10401
+ "step": 1480
10402
+ },
10403
+ {
10404
+ "epoch": 0.6702873953383118,
10405
+ "grad_norm": 0.47865453362464905,
10406
+ "learning_rate": 1.6019082795276307e-06,
10407
+ "loss": 0.7713,
10408
+ "step": 1481
10409
+ },
10410
+ {
10411
+ "epoch": 0.6707399864222675,
10412
+ "grad_norm": 0.457772821187973,
10413
+ "learning_rate": 1.566206312740226e-06,
10414
+ "loss": 0.6757,
10415
+ "step": 1482
10416
+ },
10417
+ {
10418
+ "epoch": 0.6711925775062231,
10419
+ "grad_norm": 0.5296614170074463,
10420
+ "learning_rate": 1.5309035497147684e-06,
10421
+ "loss": 0.9659,
10422
+ "step": 1483
10423
+ },
10424
+ {
10425
+ "epoch": 0.6716451685901788,
10426
+ "grad_norm": 0.496402770280838,
10427
+ "learning_rate": 1.4960001336235875e-06,
10428
+ "loss": 0.8881,
10429
+ "step": 1484
10430
+ },
10431
+ {
10432
+ "epoch": 0.6720977596741344,
10433
+ "grad_norm": 0.43575870990753174,
10434
+ "learning_rate": 1.4614962060194304e-06,
10435
+ "loss": 0.6084,
10436
+ "step": 1485
10437
+ },
10438
+ {
10439
+ "epoch": 0.6725503507580901,
10440
+ "grad_norm": 0.6141435503959656,
10441
+ "learning_rate": 1.4273919068349184e-06,
10442
+ "loss": 0.8805,
10443
+ "step": 1486
10444
+ },
10445
+ {
10446
+ "epoch": 0.6730029418420457,
10447
+ "grad_norm": 0.5889500975608826,
10448
+ "learning_rate": 1.3936873743819357e-06,
10449
+ "loss": 1.056,
10450
+ "step": 1487
10451
+ },
10452
+ {
10453
+ "epoch": 0.6734555329260014,
10454
+ "grad_norm": 0.4447315037250519,
10455
+ "learning_rate": 1.3603827453511186e-06,
10456
+ "loss": 0.6903,
10457
+ "step": 1488
10458
+ },
10459
+ {
10460
+ "epoch": 0.673908124009957,
10461
+ "grad_norm": 0.5051842331886292,
10462
+ "learning_rate": 1.3274781548112458e-06,
10463
+ "loss": 0.7553,
10464
+ "step": 1489
10465
+ },
10466
+ {
10467
+ "epoch": 0.6743607150939126,
10468
+ "grad_norm": 0.5147336721420288,
10469
+ "learning_rate": 1.2949737362087156e-06,
10470
+ "loss": 0.8062,
10471
+ "step": 1490
10472
+ },
10473
+ {
10474
+ "epoch": 0.6748133061778683,
10475
+ "grad_norm": 0.5651899576187134,
10476
+ "learning_rate": 1.2628696213670355e-06,
10477
+ "loss": 0.9131,
10478
+ "step": 1491
10479
+ },
10480
+ {
10481
+ "epoch": 0.675265897261824,
10482
+ "grad_norm": 0.569429337978363,
10483
+ "learning_rate": 1.231165940486234e-06,
10484
+ "loss": 0.9232,
10485
+ "step": 1492
10486
+ },
10487
+ {
10488
+ "epoch": 0.6757184883457796,
10489
+ "grad_norm": 0.5901250839233398,
10490
+ "learning_rate": 1.1998628221423614e-06,
10491
+ "loss": 1.138,
10492
+ "step": 1493
10493
+ },
10494
+ {
10495
+ "epoch": 0.6761710794297352,
10496
+ "grad_norm": 0.47215431928634644,
10497
+ "learning_rate": 1.1689603932869665e-06,
10498
+ "loss": 0.7919,
10499
+ "step": 1494
10500
+ },
10501
+ {
10502
+ "epoch": 0.6766236705136909,
10503
+ "grad_norm": 0.5352398753166199,
10504
+ "learning_rate": 1.1384587792465872e-06,
10505
+ "loss": 0.6431,
10506
+ "step": 1495
10507
+ },
10508
+ {
10509
+ "epoch": 0.6770762615976466,
10510
+ "grad_norm": 0.50892174243927,
10511
+ "learning_rate": 1.1083581037222068e-06,
10512
+ "loss": 0.7254,
10513
+ "step": 1496
10514
+ },
10515
+ {
10516
+ "epoch": 0.6775288526816021,
10517
+ "grad_norm": 0.5627516508102417,
10518
+ "learning_rate": 1.0786584887888307e-06,
10519
+ "loss": 0.949,
10520
+ "step": 1497
10521
+ },
10522
+ {
10523
+ "epoch": 0.6779814437655578,
10524
+ "grad_norm": 0.4339214265346527,
10525
+ "learning_rate": 1.0493600548948878e-06,
10526
+ "loss": 0.6264,
10527
+ "step": 1498
10528
+ },
10529
+ {
10530
+ "epoch": 0.6784340348495135,
10531
+ "grad_norm": 0.42282262444496155,
10532
+ "learning_rate": 1.020462920861831e-06,
10533
+ "loss": 0.5289,
10534
+ "step": 1499
10535
+ },
10536
+ {
10537
+ "epoch": 0.6788866259334692,
10538
+ "grad_norm": 0.46268293261528015,
10539
+ "learning_rate": 9.919672038835925e-07,
10540
+ "loss": 0.6008,
10541
+ "step": 1500
10542
+ },
10543
+ {
10544
+ "epoch": 0.6793392170174247,
10545
+ "grad_norm": 0.5364608764648438,
10546
+ "learning_rate": 9.638730195261625e-07,
10547
+ "loss": 0.6824,
10548
+ "step": 1501
10549
+ },
10550
+ {
10551
+ "epoch": 0.6797918081013804,
10552
+ "grad_norm": 0.5147013664245605,
10553
+ "learning_rate": 9.36180481727067e-07,
10554
+ "loss": 0.8468,
10555
+ "step": 1502
10556
+ },
10557
+ {
10558
+ "epoch": 0.6802443991853361,
10559
+ "grad_norm": 0.5776438117027283,
10560
+ "learning_rate": 9.088897027949462e-07,
10561
+ "loss": 0.7729,
10562
+ "step": 1503
10563
+ },
10564
+ {
10565
+ "epoch": 0.6806969902692916,
10566
+ "grad_norm": 0.47045034170150757,
10567
+ "learning_rate": 8.820007934090879e-07,
10568
+ "loss": 0.8525,
10569
+ "step": 1504
10570
+ },
10571
+ {
10572
+ "epoch": 0.6811495813532473,
10573
+ "grad_norm": 0.554664134979248,
10574
+ "learning_rate": 8.555138626189618e-07,
10575
+ "loss": 0.8944,
10576
+ "step": 1505
10577
+ },
10578
+ {
10579
+ "epoch": 0.681602172437203,
10580
+ "grad_norm": 0.5782079696655273,
10581
+ "learning_rate": 8.294290178437969e-07,
10582
+ "loss": 0.8888,
10583
+ "step": 1506
10584
+ },
10585
+ {
10586
+ "epoch": 0.6820547635211587,
10587
+ "grad_norm": 0.5328008532524109,
10588
+ "learning_rate": 8.037463648721488e-07,
10589
+ "loss": 0.8906,
10590
+ "step": 1507
10591
+ },
10592
+ {
10593
+ "epoch": 0.6825073546051142,
10594
+ "grad_norm": 0.5287159085273743,
10595
+ "learning_rate": 7.78466007861467e-07,
10596
+ "loss": 0.757,
10597
+ "step": 1508
10598
+ },
10599
+ {
10600
+ "epoch": 0.6829599456890699,
10601
+ "grad_norm": 0.5075718760490417,
10602
+ "learning_rate": 7.535880493376279e-07,
10603
+ "loss": 0.8139,
10604
+ "step": 1509
10605
+ },
10606
+ {
10607
+ "epoch": 0.6834125367730256,
10608
+ "grad_norm": 0.5284056067466736,
10609
+ "learning_rate": 7.291125901946027e-07,
10610
+ "loss": 0.9401,
10611
+ "step": 1510
10612
+ },
10613
+ {
10614
+ "epoch": 0.6838651278569812,
10615
+ "grad_norm": 0.7645094394683838,
10616
+ "learning_rate": 7.050397296939792e-07,
10617
+ "loss": 0.7314,
10618
+ "step": 1511
10619
+ },
10620
+ {
10621
+ "epoch": 0.6843177189409368,
10622
+ "grad_norm": 0.602204442024231,
10623
+ "learning_rate": 6.813695654645957e-07,
10624
+ "loss": 1.0208,
10625
+ "step": 1512
10626
+ },
10627
+ {
10628
+ "epoch": 0.6847703100248925,
10629
+ "grad_norm": 0.5655729174613953,
10630
+ "learning_rate": 6.581021935021304e-07,
10631
+ "loss": 1.0402,
10632
+ "step": 1513
10633
+ },
10634
+ {
10635
+ "epoch": 0.6852229011088482,
10636
+ "grad_norm": 0.6173549294471741,
10637
+ "learning_rate": 6.352377081687011e-07,
10638
+ "loss": 1.0761,
10639
+ "step": 1514
10640
+ },
10641
+ {
10642
+ "epoch": 0.6856754921928038,
10643
+ "grad_norm": 0.6081221103668213,
10644
+ "learning_rate": 6.127762021925221e-07,
10645
+ "loss": 1.0481,
10646
+ "step": 1515
10647
+ },
10648
+ {
10649
+ "epoch": 0.6861280832767594,
10650
+ "grad_norm": 0.6282268166542053,
10651
+ "learning_rate": 5.907177666674812e-07,
10652
+ "loss": 1.0363,
10653
+ "step": 1516
10654
+ },
10655
+ {
10656
+ "epoch": 0.6865806743607151,
10657
+ "grad_norm": 0.5493825674057007,
10658
+ "learning_rate": 5.690624910527964e-07,
10659
+ "loss": 0.8727,
10660
+ "step": 1517
10661
+ },
10662
+ {
10663
+ "epoch": 0.6870332654446707,
10664
+ "grad_norm": 0.5676363706588745,
10665
+ "learning_rate": 5.478104631726711e-07,
10666
+ "loss": 0.8488,
10667
+ "step": 1518
10668
+ },
10669
+ {
10670
+ "epoch": 0.6874858565286264,
10671
+ "grad_norm": 0.49481111764907837,
10672
+ "learning_rate": 5.269617692158613e-07,
10673
+ "loss": 0.6527,
10674
+ "step": 1519
10675
+ },
10676
+ {
10677
+ "epoch": 0.687938447612582,
10678
+ "grad_norm": 0.4594171643257141,
10679
+ "learning_rate": 5.065164937354428e-07,
10680
+ "loss": 0.8464,
10681
+ "step": 1520
10682
+ },
10683
+ {
10684
+ "epoch": 0.6883910386965377,
10685
+ "grad_norm": 0.5010755062103271,
10686
+ "learning_rate": 4.864747196483554e-07,
10687
+ "loss": 0.6373,
10688
+ "step": 1521
10689
+ },
10690
+ {
10691
+ "epoch": 0.6888436297804933,
10692
+ "grad_norm": 0.5262997150421143,
10693
+ "learning_rate": 4.668365282351372e-07,
10694
+ "loss": 0.7576,
10695
+ "step": 1522
10696
+ },
10697
+ {
10698
+ "epoch": 0.689296220864449,
10699
+ "grad_norm": 0.4877280592918396,
10700
+ "learning_rate": 4.476019991395908e-07,
10701
+ "loss": 0.7472,
10702
+ "step": 1523
10703
+ },
10704
+ {
10705
+ "epoch": 0.6897488119484046,
10706
+ "grad_norm": 0.5093807578086853,
10707
+ "learning_rate": 4.2877121036840606e-07,
10708
+ "loss": 0.7657,
10709
+ "step": 1524
10710
+ },
10711
+ {
10712
+ "epoch": 0.6902014030323602,
10713
+ "grad_norm": 0.5577916502952576,
10714
+ "learning_rate": 4.103442382909051e-07,
10715
+ "loss": 0.8773,
10716
+ "step": 1525
10717
+ },
10718
+ {
10719
+ "epoch": 0.6906539941163159,
10720
+ "grad_norm": 0.5437626242637634,
10721
+ "learning_rate": 3.923211576387087e-07,
10722
+ "loss": 0.8471,
10723
+ "step": 1526
10724
+ },
10725
+ {
10726
+ "epoch": 0.6911065852002716,
10727
+ "grad_norm": 0.4851123094558716,
10728
+ "learning_rate": 3.74702041505437e-07,
10729
+ "loss": 0.7631,
10730
+ "step": 1527
10731
+ },
10732
+ {
10733
+ "epoch": 0.6915591762842273,
10734
+ "grad_norm": 0.511060893535614,
10735
+ "learning_rate": 3.5748696134639825e-07,
10736
+ "loss": 0.7885,
10737
+ "step": 1528
10738
+ },
10739
+ {
10740
+ "epoch": 0.6920117673681828,
10741
+ "grad_norm": 0.44935715198516846,
10742
+ "learning_rate": 3.406759869783005e-07,
10743
+ "loss": 0.5878,
10744
+ "step": 1529
10745
+ },
10746
+ {
10747
+ "epoch": 0.6924643584521385,
10748
+ "grad_norm": 0.5244868397712708,
10749
+ "learning_rate": 3.2426918657900704e-07,
10750
+ "loss": 0.8548,
10751
+ "step": 1530
10752
+ },
10753
+ {
10754
+ "epoch": 0.6929169495360942,
10755
+ "grad_norm": 0.467731237411499,
10756
+ "learning_rate": 3.0826662668720364e-07,
10757
+ "loss": 0.8462,
10758
+ "step": 1531
10759
+ },
10760
+ {
10761
+ "epoch": 0.6933695406200497,
10762
+ "grad_norm": 0.49825143814086914,
10763
+ "learning_rate": 2.9266837220217613e-07,
10764
+ "loss": 0.6598,
10765
+ "step": 1532
10766
+ },
10767
+ {
10768
+ "epoch": 0.6938221317040054,
10769
+ "grad_norm": 0.48928219079971313,
10770
+ "learning_rate": 2.7747448638352215e-07,
10771
+ "loss": 0.7955,
10772
+ "step": 1533
10773
+ },
10774
+ {
10775
+ "epoch": 0.6942747227879611,
10776
+ "grad_norm": 0.5015487670898438,
10777
+ "learning_rate": 2.6268503085089547e-07,
10778
+ "loss": 0.9172,
10779
+ "step": 1534
10780
+ },
10781
+ {
10782
+ "epoch": 0.6947273138719168,
10783
+ "grad_norm": 0.44026800990104675,
10784
+ "learning_rate": 2.4830006558373973e-07,
10785
+ "loss": 0.6144,
10786
+ "step": 1535
10787
+ },
10788
+ {
10789
+ "epoch": 0.6951799049558723,
10790
+ "grad_norm": 0.5041724443435669,
10791
+ "learning_rate": 2.343196489211219e-07,
10792
+ "loss": 0.6941,
10793
+ "step": 1536
10794
+ },
10795
+ {
10796
+ "epoch": 0.695632496039828,
10797
+ "grad_norm": 0.5276736617088318,
10798
+ "learning_rate": 2.2074383756137686e-07,
10799
+ "loss": 0.9376,
10800
+ "step": 1537
10801
+ },
10802
+ {
10803
+ "epoch": 0.6960850871237837,
10804
+ "grad_norm": 0.539641261100769,
10805
+ "learning_rate": 2.0757268656198537e-07,
10806
+ "loss": 0.8445,
10807
+ "step": 1538
10808
+ },
10809
+ {
10810
+ "epoch": 0.6965376782077393,
10811
+ "grad_norm": 0.6825346350669861,
10812
+ "learning_rate": 1.948062493392744e-07,
10813
+ "loss": 1.1659,
10814
+ "step": 1539
10815
+ },
10816
+ {
10817
+ "epoch": 0.6969902692916949,
10818
+ "grad_norm": 0.5396426320075989,
10819
+ "learning_rate": 1.824445776682504e-07,
10820
+ "loss": 0.8319,
10821
+ "step": 1540
10822
+ },
10823
+ {
10824
+ "epoch": 0.6974428603756506,
10825
+ "grad_norm": 0.47626543045043945,
10826
+ "learning_rate": 1.7048772168237748e-07,
10827
+ "loss": 0.6278,
10828
+ "step": 1541
10829
+ },
10830
+ {
10831
+ "epoch": 0.6978954514596063,
10832
+ "grad_norm": 0.5386638641357422,
10833
+ "learning_rate": 1.5893572987333293e-07,
10834
+ "loss": 0.8372,
10835
+ "step": 1542
10836
+ },
10837
+ {
10838
+ "epoch": 0.6983480425435619,
10839
+ "grad_norm": 0.47005295753479004,
10840
+ "learning_rate": 1.477886490908742e-07,
10841
+ "loss": 0.7101,
10842
+ "step": 1543
10843
+ },
10844
+ {
10845
+ "epoch": 0.6988006336275175,
10846
+ "grad_norm": 0.48174676299095154,
10847
+ "learning_rate": 1.3704652454261668e-07,
10848
+ "loss": 0.6952,
10849
+ "step": 1544
10850
+ },
10851
+ {
10852
+ "epoch": 0.6992532247114732,
10853
+ "grad_norm": 0.46316617727279663,
10854
+ "learning_rate": 1.2670939979384512e-07,
10855
+ "loss": 0.7623,
10856
+ "step": 1545
10857
+ },
10858
+ {
10859
+ "epoch": 0.6997058157954288,
10860
+ "grad_norm": 0.6071258783340454,
10861
+ "learning_rate": 1.1677731676733584e-07,
10862
+ "loss": 1.0641,
10863
+ "step": 1546
10864
+ },
10865
+ {
10866
+ "epoch": 0.7001584068793845,
10867
+ "grad_norm": 0.4970654249191284,
10868
+ "learning_rate": 1.0725031574323474e-07,
10869
+ "loss": 0.7059,
10870
+ "step": 1547
10871
+ },
10872
+ {
10873
+ "epoch": 0.7006109979633401,
10874
+ "grad_norm": 0.4778405725955963,
10875
+ "learning_rate": 9.8128435358813e-08,
10876
+ "loss": 0.7019,
10877
+ "step": 1548
10878
+ },
10879
+ {
10880
+ "epoch": 0.7010635890472958,
10881
+ "grad_norm": 0.653716504573822,
10882
+ "learning_rate": 8.941171260835601e-08,
10883
+ "loss": 1.1438,
10884
+ "step": 1549
10885
+ },
10886
+ {
10887
+ "epoch": 0.7015161801312514,
10888
+ "grad_norm": 0.480570524930954,
10889
+ "learning_rate": 8.110018284304133e-08,
10890
+ "loss": 0.7621,
10891
+ "step": 1550
10892
+ },
10893
+ {
10894
+ "epoch": 0.7019687712152071,
10895
+ "grad_norm": 0.5458505153656006,
10896
+ "learning_rate": 7.319387977072766e-08,
10897
+ "loss": 1.0065,
10898
+ "step": 1551
10899
+ },
10900
+ {
10901
+ "epoch": 0.7024213622991627,
10902
+ "grad_norm": 0.4744986593723297,
10903
+ "learning_rate": 6.569283545587724e-08,
10904
+ "loss": 0.7546,
10905
+ "step": 1552
10906
+ },
10907
+ {
10908
+ "epoch": 0.7028739533831183,
10909
+ "grad_norm": 0.5370805859565735,
10910
+ "learning_rate": 5.8597080319389156e-08,
10911
+ "loss": 1.1206,
10912
+ "step": 1553
10913
+ },
10914
+ {
10915
+ "epoch": 0.703326544467074,
10916
+ "grad_norm": 0.49838805198669434,
10917
+ "learning_rate": 5.190664313851068e-08,
10918
+ "loss": 0.69,
10919
+ "step": 1554
10920
+ },
10921
+ {
10922
+ "epoch": 0.7037791355510297,
10923
+ "grad_norm": 0.5323602557182312,
10924
+ "learning_rate": 4.562155104665955e-08,
10925
+ "loss": 0.7611,
10926
+ "step": 1555
10927
+ },
10928
+ {
10929
+ "epoch": 0.7042317266349853,
10930
+ "grad_norm": 0.5184367299079895,
10931
+ "learning_rate": 3.9741829533401775e-08,
10932
+ "loss": 0.9261,
10933
+ "step": 1556
10934
+ },
10935
+ {
10936
+ "epoch": 0.7046843177189409,
10937
+ "grad_norm": 0.5408939719200134,
10938
+ "learning_rate": 3.4267502444274015e-08,
10939
+ "loss": 0.7767,
10940
+ "step": 1557
10941
+ },
10942
+ {
10943
+ "epoch": 0.7051369088028966,
10944
+ "grad_norm": 0.49419355392456055,
10945
+ "learning_rate": 2.9198591980705848e-08,
10946
+ "loss": 0.8236,
10947
+ "step": 1558
10948
+ },
10949
+ {
10950
+ "epoch": 0.7055894998868523,
10951
+ "grad_norm": 0.5697168111801147,
10952
+ "learning_rate": 2.4535118699953176e-08,
10953
+ "loss": 1.0097,
10954
+ "step": 1559
10955
+ },
10956
+ {
10957
+ "epoch": 0.7060420909708078,
10958
+ "grad_norm": 0.6209654808044434,
10959
+ "learning_rate": 2.0277101514987184e-08,
10960
+ "loss": 1.1677,
10961
+ "step": 1560
10962
+ },
10963
+ {
10964
+ "epoch": 0.7064946820547635,
10965
+ "grad_norm": 0.4766218364238739,
10966
+ "learning_rate": 1.642455769444995e-08,
10967
+ "loss": 0.6273,
10968
+ "step": 1561
10969
+ },
10970
+ {
10971
+ "epoch": 0.7069472731387192,
10972
+ "grad_norm": 0.5815181136131287,
10973
+ "learning_rate": 1.2977502862532297e-08,
10974
+ "loss": 0.8964,
10975
+ "step": 1562
10976
+ },
10977
+ {
10978
+ "epoch": 0.7073998642226749,
10979
+ "grad_norm": 0.5831205248832703,
10980
+ "learning_rate": 9.935950998962717e-09,
10981
+ "loss": 0.8613,
10982
+ "step": 1563
10983
+ },
10984
+ {
10985
+ "epoch": 0.7078524553066304,
10986
+ "grad_norm": 0.4257395565509796,
10987
+ "learning_rate": 7.2999144389296335e-09,
10988
+ "loss": 0.6569,
10989
+ "step": 1564
10990
+ },
10991
+ {
10992
+ "epoch": 0.7083050463905861,
10993
+ "grad_norm": 0.6523287296295166,
10994
+ "learning_rate": 5.069403873025902e-09,
10995
+ "loss": 1.023,
10996
+ "step": 1565
10997
+ },
10998
+ {
10999
+ "epoch": 0.7087576374745418,
11000
+ "grad_norm": 0.5527802109718323,
11001
+ "learning_rate": 3.244428347204398e-09,
11002
+ "loss": 0.8362,
11003
+ "step": 1566
11004
+ },
11005
+ {
11006
+ "epoch": 0.7092102285584974,
11007
+ "grad_norm": 0.42681992053985596,
11008
+ "learning_rate": 1.8249952627669154e-09,
11009
+ "loss": 0.5779,
11010
+ "step": 1567
11011
+ },
11012
+ {
11013
+ "epoch": 0.709662819642453,
11014
+ "grad_norm": 0.489521861076355,
11015
+ "learning_rate": 8.111103762975524e-10,
11016
+ "loss": 0.8613,
11017
+ "step": 1568
11018
+ },
11019
+ {
11020
+ "epoch": 0.7101154107264087,
11021
+ "grad_norm": 0.55333411693573,
11022
+ "learning_rate": 2.027777996738145e-10,
11023
+ "loss": 1.0663,
11024
+ "step": 1569
11025
+ },
11026
+ {
11027
+ "epoch": 0.7105680018103643,
11028
+ "grad_norm": 0.5273703336715698,
11029
+ "learning_rate": 0.0,
11030
+ "loss": 0.7856,
11031
+ "step": 1570
11032
  }
11033
  ],
11034
  "logging_steps": 1,
 
11043
  "should_evaluate": false,
11044
  "should_log": false,
11045
  "should_save": true,
11046
+ "should_training_stop": true
11047
  },
11048
  "attributes": {}
11049
  }
11050
  },
11051
+ "total_flos": 2.8612606783861555e+17,
11052
  "train_batch_size": 2,
11053
  "trial_name": null,
11054
  "trial_params": null