Charlie81 commited on
Commit
5e25cd5
Β·
1 Parent(s): 7199a48

Checkpoint at step 10000

Browse files
checkpoints/{checkpoint-6000 β†’ checkpoint-10000}/config.json RENAMED
File without changes
checkpoints/{checkpoint-6000 β†’ checkpoint-10000}/generation_config.json RENAMED
File without changes
checkpoints/{checkpoint-6000 β†’ checkpoint-10000}/model-00001-of-00003.safetensors RENAMED
File without changes
checkpoints/{checkpoint-6000 β†’ checkpoint-10000}/model-00002-of-00003.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9301453191ca6c9a3a03f12d88cded41dd11d2f875bd10a7d18f8f8d8750f60a
3
  size 4999439616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95ce03424f29b5a5326f65e763bb43ccf25b2d1c2196f5f1ccb6bc94e4e0f597
3
  size 4999439616
checkpoints/{checkpoint-6000 β†’ checkpoint-10000}/model-00003-of-00003.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c547995731e06b15fac744631f82ebdfe5d9a5760e0e2376cb6b37ace1f11c78
3
  size 3892418912
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ee5f2e34146a859edbe91c7eb079daefda1e8d86d170772fa7498590ef7ad45
3
  size 3892418912
checkpoints/{checkpoint-6000 β†’ checkpoint-10000}/model.safetensors.index.json RENAMED
File without changes
checkpoints/{checkpoint-6000 β†’ checkpoint-10000}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d22ffaf56393f265c3fc654f54fc8fcc7739e5ea6ae9ffd11f3339e1cd1a0ee
3
  size 101356346
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfa0f04dbfcbc97d18d5a64bae0e3edc47140b0acf3379561607a2e59c3aa5ef
3
  size 101356346
checkpoints/{checkpoint-6000 β†’ checkpoint-10000}/rng_state.pth RENAMED
File without changes
checkpoints/{checkpoint-6000 β†’ checkpoint-10000}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8820a78a12a9bbd865f91029e6c7ff0f061ecda8f234ec86d844afce34177b03
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4628f65a9b6630de3e3b20d025ee2a85cbc417daf8b84231b5ed59b1aa0015ad
3
  size 1064
checkpoints/{checkpoint-6000 β†’ checkpoint-10000}/trainer_state.json RENAMED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.29433948380213026,
6
  "eval_steps": 500,
7
- "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4208,6 +4208,2806 @@
4208
  "learning_rate": 9.808698495748856e-05,
4209
  "loss": 9.6542,
4210
  "step": 6000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4211
  }
4212
  ],
4213
  "logging_steps": 10,
@@ -4227,7 +7027,7 @@
4227
  "attributes": {}
4228
  }
4229
  },
4230
- "total_flos": 1.614097296654336e+19,
4231
  "train_batch_size": 2,
4232
  "trial_name": null,
4233
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.4905658063368838,
6
  "eval_steps": 500,
7
+ "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4208
  "learning_rate": 9.808698495748856e-05,
4209
  "loss": 9.6542,
4210
  "step": 6000
4211
+ },
4212
+ {
4213
+ "epoch": 0.29483004960846715,
4214
+ "grad_norm": 1.75,
4215
+ "learning_rate": 9.825049051667757e-05,
4216
+ "loss": 9.5273,
4217
+ "step": 6010
4218
+ },
4219
+ {
4220
+ "epoch": 0.29532061541480403,
4221
+ "grad_norm": 2.796875,
4222
+ "learning_rate": 9.841399607586659e-05,
4223
+ "loss": 9.1164,
4224
+ "step": 6020
4225
+ },
4226
+ {
4227
+ "epoch": 0.2958111812211409,
4228
+ "grad_norm": 1.46875,
4229
+ "learning_rate": 9.857750163505559e-05,
4230
+ "loss": 9.6354,
4231
+ "step": 6030
4232
+ },
4233
+ {
4234
+ "epoch": 0.2963017470274778,
4235
+ "grad_norm": 1.4921875,
4236
+ "learning_rate": 9.874100719424461e-05,
4237
+ "loss": 9.8704,
4238
+ "step": 6040
4239
+ },
4240
+ {
4241
+ "epoch": 0.2967923128338147,
4242
+ "grad_norm": 1.25,
4243
+ "learning_rate": 9.890451275343363e-05,
4244
+ "loss": 9.1349,
4245
+ "step": 6050
4246
+ },
4247
+ {
4248
+ "epoch": 0.29728287864015157,
4249
+ "grad_norm": 1.8203125,
4250
+ "learning_rate": 9.906801831262264e-05,
4251
+ "loss": 9.3073,
4252
+ "step": 6060
4253
+ },
4254
+ {
4255
+ "epoch": 0.29777344444648846,
4256
+ "grad_norm": 1.53125,
4257
+ "learning_rate": 9.923152387181165e-05,
4258
+ "loss": 9.479,
4259
+ "step": 6070
4260
+ },
4261
+ {
4262
+ "epoch": 0.29826401025282534,
4263
+ "grad_norm": 3.1875,
4264
+ "learning_rate": 9.939502943100067e-05,
4265
+ "loss": 9.6812,
4266
+ "step": 6080
4267
+ },
4268
+ {
4269
+ "epoch": 0.2987545760591622,
4270
+ "grad_norm": 1.3125,
4271
+ "learning_rate": 9.955853499018967e-05,
4272
+ "loss": 8.847,
4273
+ "step": 6090
4274
+ },
4275
+ {
4276
+ "epoch": 0.2992451418654991,
4277
+ "grad_norm": 1.609375,
4278
+ "learning_rate": 9.972204054937868e-05,
4279
+ "loss": 9.6771,
4280
+ "step": 6100
4281
+ },
4282
+ {
4283
+ "epoch": 0.299735707671836,
4284
+ "grad_norm": 2.296875,
4285
+ "learning_rate": 9.988554610856769e-05,
4286
+ "loss": 9.7602,
4287
+ "step": 6110
4288
+ },
4289
+ {
4290
+ "epoch": 0.3002262734781729,
4291
+ "grad_norm": 0.984375,
4292
+ "learning_rate": 9.999999926693718e-05,
4293
+ "loss": 9.519,
4294
+ "step": 6120
4295
+ },
4296
+ {
4297
+ "epoch": 0.30071683928450976,
4298
+ "grad_norm": 2.015625,
4299
+ "learning_rate": 9.999998623470987e-05,
4300
+ "loss": 9.9142,
4301
+ "step": 6130
4302
+ },
4303
+ {
4304
+ "epoch": 0.30120740509084665,
4305
+ "grad_norm": 1.2109375,
4306
+ "learning_rate": 9.999995691220257e-05,
4307
+ "loss": 9.5174,
4308
+ "step": 6140
4309
+ },
4310
+ {
4311
+ "epoch": 0.30169797089718353,
4312
+ "grad_norm": 3.046875,
4313
+ "learning_rate": 9.999991129942481e-05,
4314
+ "loss": 9.6071,
4315
+ "step": 6150
4316
+ },
4317
+ {
4318
+ "epoch": 0.3021885367035204,
4319
+ "grad_norm": 1.921875,
4320
+ "learning_rate": 9.999984939639145e-05,
4321
+ "loss": 9.5535,
4322
+ "step": 6160
4323
+ },
4324
+ {
4325
+ "epoch": 0.3026791025098573,
4326
+ "grad_norm": 1.6484375,
4327
+ "learning_rate": 9.99997712031227e-05,
4328
+ "loss": 9.0336,
4329
+ "step": 6170
4330
+ },
4331
+ {
4332
+ "epoch": 0.3031696683161942,
4333
+ "grad_norm": 1.3828125,
4334
+ "learning_rate": 9.999967671964399e-05,
4335
+ "loss": 9.4898,
4336
+ "step": 6180
4337
+ },
4338
+ {
4339
+ "epoch": 0.30366023412253107,
4340
+ "grad_norm": 1.8203125,
4341
+ "learning_rate": 9.999956594598612e-05,
4342
+ "loss": 9.3706,
4343
+ "step": 6190
4344
+ },
4345
+ {
4346
+ "epoch": 0.30415079992886795,
4347
+ "grad_norm": 2.875,
4348
+ "learning_rate": 9.99994388821852e-05,
4349
+ "loss": 10.0704,
4350
+ "step": 6200
4351
+ },
4352
+ {
4353
+ "epoch": 0.30464136573520484,
4354
+ "grad_norm": 2.0625,
4355
+ "learning_rate": 9.99992955282826e-05,
4356
+ "loss": 9.6955,
4357
+ "step": 6210
4358
+ },
4359
+ {
4360
+ "epoch": 0.3051319315415417,
4361
+ "grad_norm": 1.21875,
4362
+ "learning_rate": 9.999913588432502e-05,
4363
+ "loss": 9.3493,
4364
+ "step": 6220
4365
+ },
4366
+ {
4367
+ "epoch": 0.3056224973478786,
4368
+ "grad_norm": 194.0,
4369
+ "learning_rate": 9.99989599503645e-05,
4370
+ "loss": 9.1016,
4371
+ "step": 6230
4372
+ },
4373
+ {
4374
+ "epoch": 0.3061130631542155,
4375
+ "grad_norm": 1.1328125,
4376
+ "learning_rate": 9.999876772645835e-05,
4377
+ "loss": 9.5685,
4378
+ "step": 6240
4379
+ },
4380
+ {
4381
+ "epoch": 0.3066036289605524,
4382
+ "grad_norm": 2.28125,
4383
+ "learning_rate": 9.999855921266921e-05,
4384
+ "loss": 9.7715,
4385
+ "step": 6250
4386
+ },
4387
+ {
4388
+ "epoch": 0.30709419476688926,
4389
+ "grad_norm": 2.046875,
4390
+ "learning_rate": 9.999833440906499e-05,
4391
+ "loss": 10.0416,
4392
+ "step": 6260
4393
+ },
4394
+ {
4395
+ "epoch": 0.30758476057322615,
4396
+ "grad_norm": 1.3515625,
4397
+ "learning_rate": 9.999809331571895e-05,
4398
+ "loss": 9.3091,
4399
+ "step": 6270
4400
+ },
4401
+ {
4402
+ "epoch": 0.30807532637956303,
4403
+ "grad_norm": 2.1875,
4404
+ "learning_rate": 9.999783593270962e-05,
4405
+ "loss": 9.6715,
4406
+ "step": 6280
4407
+ },
4408
+ {
4409
+ "epoch": 0.3085658921858999,
4410
+ "grad_norm": 1.5703125,
4411
+ "learning_rate": 9.999756226012087e-05,
4412
+ "loss": 9.9269,
4413
+ "step": 6290
4414
+ },
4415
+ {
4416
+ "epoch": 0.3090564579922368,
4417
+ "grad_norm": 1.7421875,
4418
+ "learning_rate": 9.999727229804187e-05,
4419
+ "loss": 9.7873,
4420
+ "step": 6300
4421
+ },
4422
+ {
4423
+ "epoch": 0.3095470237985737,
4424
+ "grad_norm": 1.2578125,
4425
+ "learning_rate": 9.999696604656709e-05,
4426
+ "loss": 9.9722,
4427
+ "step": 6310
4428
+ },
4429
+ {
4430
+ "epoch": 0.31003758960491057,
4431
+ "grad_norm": 1.3828125,
4432
+ "learning_rate": 9.999664350579631e-05,
4433
+ "loss": 9.2905,
4434
+ "step": 6320
4435
+ },
4436
+ {
4437
+ "epoch": 0.31052815541124745,
4438
+ "grad_norm": 1.1640625,
4439
+ "learning_rate": 9.99963046758346e-05,
4440
+ "loss": 9.6233,
4441
+ "step": 6330
4442
+ },
4443
+ {
4444
+ "epoch": 0.31101872121758434,
4445
+ "grad_norm": 2.4375,
4446
+ "learning_rate": 9.999594955679236e-05,
4447
+ "loss": 9.7319,
4448
+ "step": 6340
4449
+ },
4450
+ {
4451
+ "epoch": 0.3115092870239212,
4452
+ "grad_norm": 1.8828125,
4453
+ "learning_rate": 9.999557814878531e-05,
4454
+ "loss": 10.4584,
4455
+ "step": 6350
4456
+ },
4457
+ {
4458
+ "epoch": 0.3119998528302581,
4459
+ "grad_norm": 1.234375,
4460
+ "learning_rate": 9.999519045193442e-05,
4461
+ "loss": 9.4544,
4462
+ "step": 6360
4463
+ },
4464
+ {
4465
+ "epoch": 0.312490418636595,
4466
+ "grad_norm": 1.7578125,
4467
+ "learning_rate": 9.999478646636602e-05,
4468
+ "loss": 9.6401,
4469
+ "step": 6370
4470
+ },
4471
+ {
4472
+ "epoch": 0.3129809844429319,
4473
+ "grad_norm": 1.7109375,
4474
+ "learning_rate": 9.999436619221175e-05,
4475
+ "loss": 9.2365,
4476
+ "step": 6380
4477
+ },
4478
+ {
4479
+ "epoch": 0.31347155024926876,
4480
+ "grad_norm": 1.6328125,
4481
+ "learning_rate": 9.999392962960851e-05,
4482
+ "loss": 9.4004,
4483
+ "step": 6390
4484
+ },
4485
+ {
4486
+ "epoch": 0.31396211605560564,
4487
+ "grad_norm": 0.8515625,
4488
+ "learning_rate": 9.999347677869854e-05,
4489
+ "loss": 9.4101,
4490
+ "step": 6400
4491
+ },
4492
+ {
4493
+ "epoch": 0.31445268186194253,
4494
+ "grad_norm": 1.8125,
4495
+ "learning_rate": 9.99930076396294e-05,
4496
+ "loss": 9.4732,
4497
+ "step": 6410
4498
+ },
4499
+ {
4500
+ "epoch": 0.3149432476682794,
4501
+ "grad_norm": 1.765625,
4502
+ "learning_rate": 9.999252221255393e-05,
4503
+ "loss": 9.2319,
4504
+ "step": 6420
4505
+ },
4506
+ {
4507
+ "epoch": 0.3154338134746163,
4508
+ "grad_norm": 0.9765625,
4509
+ "learning_rate": 9.999202049763027e-05,
4510
+ "loss": 9.4702,
4511
+ "step": 6430
4512
+ },
4513
+ {
4514
+ "epoch": 0.3159243792809532,
4515
+ "grad_norm": 0.7578125,
4516
+ "learning_rate": 9.99915024950219e-05,
4517
+ "loss": 9.2764,
4518
+ "step": 6440
4519
+ },
4520
+ {
4521
+ "epoch": 0.31641494508729007,
4522
+ "grad_norm": 1.1953125,
4523
+ "learning_rate": 9.999096820489757e-05,
4524
+ "loss": 9.2799,
4525
+ "step": 6450
4526
+ },
4527
+ {
4528
+ "epoch": 0.31690551089362695,
4529
+ "grad_norm": 2.9375,
4530
+ "learning_rate": 9.999041762743137e-05,
4531
+ "loss": 9.2077,
4532
+ "step": 6460
4533
+ },
4534
+ {
4535
+ "epoch": 0.31739607669996384,
4536
+ "grad_norm": 1.234375,
4537
+ "learning_rate": 9.998985076280268e-05,
4538
+ "loss": 9.0989,
4539
+ "step": 6470
4540
+ },
4541
+ {
4542
+ "epoch": 0.3178866425063007,
4543
+ "grad_norm": 0.8515625,
4544
+ "learning_rate": 9.998926761119618e-05,
4545
+ "loss": 9.5567,
4546
+ "step": 6480
4547
+ },
4548
+ {
4549
+ "epoch": 0.3183772083126376,
4550
+ "grad_norm": 1.984375,
4551
+ "learning_rate": 9.998866817280187e-05,
4552
+ "loss": 9.6093,
4553
+ "step": 6490
4554
+ },
4555
+ {
4556
+ "epoch": 0.3188677741189745,
4557
+ "grad_norm": 1.0234375,
4558
+ "learning_rate": 9.998805244781504e-05,
4559
+ "loss": 9.0293,
4560
+ "step": 6500
4561
+ },
4562
+ {
4563
+ "epoch": 0.3193583399253114,
4564
+ "grad_norm": 1.3515625,
4565
+ "learning_rate": 9.998742043643632e-05,
4566
+ "loss": 9.4566,
4567
+ "step": 6510
4568
+ },
4569
+ {
4570
+ "epoch": 0.31984890573164826,
4571
+ "grad_norm": 1.359375,
4572
+ "learning_rate": 9.99867721388716e-05,
4573
+ "loss": 9.5419,
4574
+ "step": 6520
4575
+ },
4576
+ {
4577
+ "epoch": 0.32033947153798514,
4578
+ "grad_norm": 1.2578125,
4579
+ "learning_rate": 9.998610755533211e-05,
4580
+ "loss": 9.3771,
4581
+ "step": 6530
4582
+ },
4583
+ {
4584
+ "epoch": 0.320830037344322,
4585
+ "grad_norm": 1.8203125,
4586
+ "learning_rate": 9.998542668603436e-05,
4587
+ "loss": 10.0576,
4588
+ "step": 6540
4589
+ },
4590
+ {
4591
+ "epoch": 0.3213206031506589,
4592
+ "grad_norm": 0.99609375,
4593
+ "learning_rate": 9.998472953120022e-05,
4594
+ "loss": 8.9065,
4595
+ "step": 6550
4596
+ },
4597
+ {
4598
+ "epoch": 0.3218111689569958,
4599
+ "grad_norm": 1.15625,
4600
+ "learning_rate": 9.998401609105677e-05,
4601
+ "loss": 9.5357,
4602
+ "step": 6560
4603
+ },
4604
+ {
4605
+ "epoch": 0.3223017347633327,
4606
+ "grad_norm": 1.46875,
4607
+ "learning_rate": 9.99832863658365e-05,
4608
+ "loss": 9.4754,
4609
+ "step": 6570
4610
+ },
4611
+ {
4612
+ "epoch": 0.32279230056966957,
4613
+ "grad_norm": 1.7890625,
4614
+ "learning_rate": 9.998254035577714e-05,
4615
+ "loss": 10.0797,
4616
+ "step": 6580
4617
+ },
4618
+ {
4619
+ "epoch": 0.32328286637600645,
4620
+ "grad_norm": 1.0,
4621
+ "learning_rate": 9.998177806112175e-05,
4622
+ "loss": 9.659,
4623
+ "step": 6590
4624
+ },
4625
+ {
4626
+ "epoch": 0.32377343218234333,
4627
+ "grad_norm": 1.4140625,
4628
+ "learning_rate": 9.998099948211868e-05,
4629
+ "loss": 9.5543,
4630
+ "step": 6600
4631
+ },
4632
+ {
4633
+ "epoch": 0.3242639979886802,
4634
+ "grad_norm": 1.25,
4635
+ "learning_rate": 9.99802046190216e-05,
4636
+ "loss": 9.3885,
4637
+ "step": 6610
4638
+ },
4639
+ {
4640
+ "epoch": 0.3247545637950171,
4641
+ "grad_norm": 1.171875,
4642
+ "learning_rate": 9.997939347208949e-05,
4643
+ "loss": 8.6886,
4644
+ "step": 6620
4645
+ },
4646
+ {
4647
+ "epoch": 0.325245129601354,
4648
+ "grad_norm": 0.8515625,
4649
+ "learning_rate": 9.997856604158662e-05,
4650
+ "loss": 10.1025,
4651
+ "step": 6630
4652
+ },
4653
+ {
4654
+ "epoch": 0.3257356954076909,
4655
+ "grad_norm": 1.8671875,
4656
+ "learning_rate": 9.997772232778256e-05,
4657
+ "loss": 9.5884,
4658
+ "step": 6640
4659
+ },
4660
+ {
4661
+ "epoch": 0.3262262612140277,
4662
+ "grad_norm": 1.4375,
4663
+ "learning_rate": 9.997686233095222e-05,
4664
+ "loss": 9.5029,
4665
+ "step": 6650
4666
+ },
4667
+ {
4668
+ "epoch": 0.3267168270203646,
4669
+ "grad_norm": 1.765625,
4670
+ "learning_rate": 9.997598605137577e-05,
4671
+ "loss": 9.418,
4672
+ "step": 6660
4673
+ },
4674
+ {
4675
+ "epoch": 0.32720739282670147,
4676
+ "grad_norm": 1.1640625,
4677
+ "learning_rate": 9.997509348933871e-05,
4678
+ "loss": 9.5372,
4679
+ "step": 6670
4680
+ },
4681
+ {
4682
+ "epoch": 0.32769795863303836,
4683
+ "grad_norm": 2.25,
4684
+ "learning_rate": 9.997418464513187e-05,
4685
+ "loss": 9.3032,
4686
+ "step": 6680
4687
+ },
4688
+ {
4689
+ "epoch": 0.32818852443937524,
4690
+ "grad_norm": 1.71875,
4691
+ "learning_rate": 9.997325951905131e-05,
4692
+ "loss": 9.7677,
4693
+ "step": 6690
4694
+ },
4695
+ {
4696
+ "epoch": 0.3286790902457121,
4697
+ "grad_norm": 2.5625,
4698
+ "learning_rate": 9.997231811139848e-05,
4699
+ "loss": 9.8891,
4700
+ "step": 6700
4701
+ },
4702
+ {
4703
+ "epoch": 0.329169656052049,
4704
+ "grad_norm": 1.5078125,
4705
+ "learning_rate": 9.997136042248008e-05,
4706
+ "loss": 9.475,
4707
+ "step": 6710
4708
+ },
4709
+ {
4710
+ "epoch": 0.3296602218583859,
4711
+ "grad_norm": 1.9453125,
4712
+ "learning_rate": 9.997038645260812e-05,
4713
+ "loss": 9.2955,
4714
+ "step": 6720
4715
+ },
4716
+ {
4717
+ "epoch": 0.3301507876647228,
4718
+ "grad_norm": 1.578125,
4719
+ "learning_rate": 9.996939620209995e-05,
4720
+ "loss": 10.0806,
4721
+ "step": 6730
4722
+ },
4723
+ {
4724
+ "epoch": 0.33064135347105966,
4725
+ "grad_norm": 1.078125,
4726
+ "learning_rate": 9.996838967127818e-05,
4727
+ "loss": 8.6916,
4728
+ "step": 6740
4729
+ },
4730
+ {
4731
+ "epoch": 0.33113191927739655,
4732
+ "grad_norm": 1.734375,
4733
+ "learning_rate": 9.996736686047075e-05,
4734
+ "loss": 9.1876,
4735
+ "step": 6750
4736
+ },
4737
+ {
4738
+ "epoch": 0.33162248508373343,
4739
+ "grad_norm": 1.2578125,
4740
+ "learning_rate": 9.99663277700109e-05,
4741
+ "loss": 9.1853,
4742
+ "step": 6760
4743
+ },
4744
+ {
4745
+ "epoch": 0.3321130508900703,
4746
+ "grad_norm": 1.421875,
4747
+ "learning_rate": 9.996527240023717e-05,
4748
+ "loss": 9.2008,
4749
+ "step": 6770
4750
+ },
4751
+ {
4752
+ "epoch": 0.3326036166964072,
4753
+ "grad_norm": 1.75,
4754
+ "learning_rate": 9.996420075149341e-05,
4755
+ "loss": 9.2655,
4756
+ "step": 6780
4757
+ },
4758
+ {
4759
+ "epoch": 0.3330941825027441,
4760
+ "grad_norm": 1.7265625,
4761
+ "learning_rate": 9.996311282412876e-05,
4762
+ "loss": 9.4802,
4763
+ "step": 6790
4764
+ },
4765
+ {
4766
+ "epoch": 0.33358474830908097,
4767
+ "grad_norm": 1.5234375,
4768
+ "learning_rate": 9.996200861849767e-05,
4769
+ "loss": 9.3536,
4770
+ "step": 6800
4771
+ },
4772
+ {
4773
+ "epoch": 0.33407531411541785,
4774
+ "grad_norm": 1.390625,
4775
+ "learning_rate": 9.99608881349599e-05,
4776
+ "loss": 9.6765,
4777
+ "step": 6810
4778
+ },
4779
+ {
4780
+ "epoch": 0.33456587992175474,
4781
+ "grad_norm": 2.484375,
4782
+ "learning_rate": 9.995975137388052e-05,
4783
+ "loss": 9.1743,
4784
+ "step": 6820
4785
+ },
4786
+ {
4787
+ "epoch": 0.3350564457280916,
4788
+ "grad_norm": 1.171875,
4789
+ "learning_rate": 9.995859833562988e-05,
4790
+ "loss": 8.9971,
4791
+ "step": 6830
4792
+ },
4793
+ {
4794
+ "epoch": 0.3355470115344285,
4795
+ "grad_norm": 2.140625,
4796
+ "learning_rate": 9.995742902058367e-05,
4797
+ "loss": 9.143,
4798
+ "step": 6840
4799
+ },
4800
+ {
4801
+ "epoch": 0.3360375773407654,
4802
+ "grad_norm": 1.859375,
4803
+ "learning_rate": 9.995624342912282e-05,
4804
+ "loss": 9.8555,
4805
+ "step": 6850
4806
+ },
4807
+ {
4808
+ "epoch": 0.3365281431471023,
4809
+ "grad_norm": 2.203125,
4810
+ "learning_rate": 9.995504156163364e-05,
4811
+ "loss": 9.659,
4812
+ "step": 6860
4813
+ },
4814
+ {
4815
+ "epoch": 0.33701870895343916,
4816
+ "grad_norm": 1.9765625,
4817
+ "learning_rate": 9.995382341850769e-05,
4818
+ "loss": 9.178,
4819
+ "step": 6870
4820
+ },
4821
+ {
4822
+ "epoch": 0.33750927475977605,
4823
+ "grad_norm": 1.4765625,
4824
+ "learning_rate": 9.995258900014184e-05,
4825
+ "loss": 9.9946,
4826
+ "step": 6880
4827
+ },
4828
+ {
4829
+ "epoch": 0.33799984056611293,
4830
+ "grad_norm": 1.3046875,
4831
+ "learning_rate": 9.995133830693828e-05,
4832
+ "loss": 9.1998,
4833
+ "step": 6890
4834
+ },
4835
+ {
4836
+ "epoch": 0.3384904063724498,
4837
+ "grad_norm": 1.2265625,
4838
+ "learning_rate": 9.995007133930449e-05,
4839
+ "loss": 9.3834,
4840
+ "step": 6900
4841
+ },
4842
+ {
4843
+ "epoch": 0.3389809721787867,
4844
+ "grad_norm": 1.5078125,
4845
+ "learning_rate": 9.994878809765327e-05,
4846
+ "loss": 9.5709,
4847
+ "step": 6910
4848
+ },
4849
+ {
4850
+ "epoch": 0.3394715379851236,
4851
+ "grad_norm": 1.5859375,
4852
+ "learning_rate": 9.994748858240268e-05,
4853
+ "loss": 9.3729,
4854
+ "step": 6920
4855
+ },
4856
+ {
4857
+ "epoch": 0.33996210379146047,
4858
+ "grad_norm": 1.1484375,
4859
+ "learning_rate": 9.994617279397612e-05,
4860
+ "loss": 10.3316,
4861
+ "step": 6930
4862
+ },
4863
+ {
4864
+ "epoch": 0.34045266959779735,
4865
+ "grad_norm": 1.8125,
4866
+ "learning_rate": 9.99448407328023e-05,
4867
+ "loss": 10.2478,
4868
+ "step": 6940
4869
+ },
4870
+ {
4871
+ "epoch": 0.34094323540413424,
4872
+ "grad_norm": 1.421875,
4873
+ "learning_rate": 9.994349239931517e-05,
4874
+ "loss": 9.4121,
4875
+ "step": 6950
4876
+ },
4877
+ {
4878
+ "epoch": 0.3414338012104711,
4879
+ "grad_norm": 1.8359375,
4880
+ "learning_rate": 9.994212779395409e-05,
4881
+ "loss": 9.5804,
4882
+ "step": 6960
4883
+ },
4884
+ {
4885
+ "epoch": 0.341924367016808,
4886
+ "grad_norm": 1.359375,
4887
+ "learning_rate": 9.99407469171636e-05,
4888
+ "loss": 9.8077,
4889
+ "step": 6970
4890
+ },
4891
+ {
4892
+ "epoch": 0.3424149328231449,
4893
+ "grad_norm": 1.6484375,
4894
+ "learning_rate": 9.99393497693936e-05,
4895
+ "loss": 9.1917,
4896
+ "step": 6980
4897
+ },
4898
+ {
4899
+ "epoch": 0.3429054986294818,
4900
+ "grad_norm": 1.609375,
4901
+ "learning_rate": 9.993793635109933e-05,
4902
+ "loss": 9.6918,
4903
+ "step": 6990
4904
+ },
4905
+ {
4906
+ "epoch": 0.34339606443581866,
4907
+ "grad_norm": 1.1015625,
4908
+ "learning_rate": 9.993650666274126e-05,
4909
+ "loss": 10.0207,
4910
+ "step": 7000
4911
+ },
4912
+ {
4913
+ "epoch": 0.34388663024215554,
4914
+ "grad_norm": 1.5234375,
4915
+ "learning_rate": 9.993506070478519e-05,
4916
+ "loss": 9.6955,
4917
+ "step": 7010
4918
+ },
4919
+ {
4920
+ "epoch": 0.34437719604849243,
4921
+ "grad_norm": 5.125,
4922
+ "learning_rate": 9.993359847770223e-05,
4923
+ "loss": 9.4934,
4924
+ "step": 7020
4925
+ },
4926
+ {
4927
+ "epoch": 0.3448677618548293,
4928
+ "grad_norm": 1.2421875,
4929
+ "learning_rate": 9.993211998196877e-05,
4930
+ "loss": 9.6029,
4931
+ "step": 7030
4932
+ },
4933
+ {
4934
+ "epoch": 0.3453583276611662,
4935
+ "grad_norm": 1.5078125,
4936
+ "learning_rate": 9.993062521806653e-05,
4937
+ "loss": 10.157,
4938
+ "step": 7040
4939
+ },
4940
+ {
4941
+ "epoch": 0.3458488934675031,
4942
+ "grad_norm": 1.4453125,
4943
+ "learning_rate": 9.99291141864825e-05,
4944
+ "loss": 9.3385,
4945
+ "step": 7050
4946
+ },
4947
+ {
4948
+ "epoch": 0.34633945927383997,
4949
+ "grad_norm": 2.125,
4950
+ "learning_rate": 9.9927586887709e-05,
4951
+ "loss": 9.7968,
4952
+ "step": 7060
4953
+ },
4954
+ {
4955
+ "epoch": 0.34683002508017685,
4956
+ "grad_norm": 1.3515625,
4957
+ "learning_rate": 9.99260433222436e-05,
4958
+ "loss": 9.5476,
4959
+ "step": 7070
4960
+ },
4961
+ {
4962
+ "epoch": 0.34732059088651374,
4963
+ "grad_norm": 1.09375,
4964
+ "learning_rate": 9.992448349058923e-05,
4965
+ "loss": 9.7711,
4966
+ "step": 7080
4967
+ },
4968
+ {
4969
+ "epoch": 0.3478111566928506,
4970
+ "grad_norm": 1.296875,
4971
+ "learning_rate": 9.992290739325408e-05,
4972
+ "loss": 9.3846,
4973
+ "step": 7090
4974
+ },
4975
+ {
4976
+ "epoch": 0.3483017224991875,
4977
+ "grad_norm": 1.5,
4978
+ "learning_rate": 9.992131503075166e-05,
4979
+ "loss": 9.5349,
4980
+ "step": 7100
4981
+ },
4982
+ {
4983
+ "epoch": 0.3487922883055244,
4984
+ "grad_norm": 1.5546875,
4985
+ "learning_rate": 9.991970640360075e-05,
4986
+ "loss": 9.5095,
4987
+ "step": 7110
4988
+ },
4989
+ {
4990
+ "epoch": 0.3492828541118613,
4991
+ "grad_norm": 1.28125,
4992
+ "learning_rate": 9.99180815123255e-05,
4993
+ "loss": 9.7473,
4994
+ "step": 7120
4995
+ },
4996
+ {
4997
+ "epoch": 0.34977341991819816,
4998
+ "grad_norm": 2.359375,
4999
+ "learning_rate": 9.991644035745524e-05,
5000
+ "loss": 9.3305,
5001
+ "step": 7130
5002
+ },
5003
+ {
5004
+ "epoch": 0.35026398572453504,
5005
+ "grad_norm": 2.390625,
5006
+ "learning_rate": 9.991478293952472e-05,
5007
+ "loss": 9.5504,
5008
+ "step": 7140
5009
+ },
5010
+ {
5011
+ "epoch": 0.3507545515308719,
5012
+ "grad_norm": 1.9140625,
5013
+ "learning_rate": 9.99131092590739e-05,
5014
+ "loss": 9.4789,
5015
+ "step": 7150
5016
+ },
5017
+ {
5018
+ "epoch": 0.3512451173372088,
5019
+ "grad_norm": 1.2734375,
5020
+ "learning_rate": 9.991141931664811e-05,
5021
+ "loss": 9.5481,
5022
+ "step": 7160
5023
+ },
5024
+ {
5025
+ "epoch": 0.3517356831435457,
5026
+ "grad_norm": 2.328125,
5027
+ "learning_rate": 9.99097131127979e-05,
5028
+ "loss": 9.7359,
5029
+ "step": 7170
5030
+ },
5031
+ {
5032
+ "epoch": 0.3522262489498826,
5033
+ "grad_norm": 1.921875,
5034
+ "learning_rate": 9.990799064807921e-05,
5035
+ "loss": 9.6452,
5036
+ "step": 7180
5037
+ },
5038
+ {
5039
+ "epoch": 0.35271681475621947,
5040
+ "grad_norm": 1.28125,
5041
+ "learning_rate": 9.990625192305321e-05,
5042
+ "loss": 9.7301,
5043
+ "step": 7190
5044
+ },
5045
+ {
5046
+ "epoch": 0.35320738056255635,
5047
+ "grad_norm": 0.81640625,
5048
+ "learning_rate": 9.990449693828636e-05,
5049
+ "loss": 9.1542,
5050
+ "step": 7200
5051
+ },
5052
+ {
5053
+ "epoch": 0.35369794636889323,
5054
+ "grad_norm": 1.5390625,
5055
+ "learning_rate": 9.990272569435048e-05,
5056
+ "loss": 9.6957,
5057
+ "step": 7210
5058
+ },
5059
+ {
5060
+ "epoch": 0.3541885121752301,
5061
+ "grad_norm": 1.4765625,
5062
+ "learning_rate": 9.990093819182264e-05,
5063
+ "loss": 10.0123,
5064
+ "step": 7220
5065
+ },
5066
+ {
5067
+ "epoch": 0.354679077981567,
5068
+ "grad_norm": 1.0,
5069
+ "learning_rate": 9.989913443128521e-05,
5070
+ "loss": 9.1995,
5071
+ "step": 7230
5072
+ },
5073
+ {
5074
+ "epoch": 0.3551696437879039,
5075
+ "grad_norm": 1.046875,
5076
+ "learning_rate": 9.989731441332587e-05,
5077
+ "loss": 9.1959,
5078
+ "step": 7240
5079
+ },
5080
+ {
5081
+ "epoch": 0.3556602095942408,
5082
+ "grad_norm": 1.3828125,
5083
+ "learning_rate": 9.989547813853758e-05,
5084
+ "loss": 9.8696,
5085
+ "step": 7250
5086
+ },
5087
+ {
5088
+ "epoch": 0.35615077540057766,
5089
+ "grad_norm": 1.6328125,
5090
+ "learning_rate": 9.989362560751864e-05,
5091
+ "loss": 9.9184,
5092
+ "step": 7260
5093
+ },
5094
+ {
5095
+ "epoch": 0.35664134120691454,
5096
+ "grad_norm": 1.53125,
5097
+ "learning_rate": 9.989175682087261e-05,
5098
+ "loss": 8.766,
5099
+ "step": 7270
5100
+ },
5101
+ {
5102
+ "epoch": 0.3571319070132514,
5103
+ "grad_norm": 1.3046875,
5104
+ "learning_rate": 9.988987177920832e-05,
5105
+ "loss": 9.919,
5106
+ "step": 7280
5107
+ },
5108
+ {
5109
+ "epoch": 0.3576224728195883,
5110
+ "grad_norm": 2.03125,
5111
+ "learning_rate": 9.988797048313995e-05,
5112
+ "loss": 9.6369,
5113
+ "step": 7290
5114
+ },
5115
+ {
5116
+ "epoch": 0.3581130386259252,
5117
+ "grad_norm": 1.484375,
5118
+ "learning_rate": 9.988605293328694e-05,
5119
+ "loss": 9.3527,
5120
+ "step": 7300
5121
+ },
5122
+ {
5123
+ "epoch": 0.3586036044322621,
5124
+ "grad_norm": 1.0703125,
5125
+ "learning_rate": 9.988411913027405e-05,
5126
+ "loss": 9.7346,
5127
+ "step": 7310
5128
+ },
5129
+ {
5130
+ "epoch": 0.35909417023859896,
5131
+ "grad_norm": 0.9921875,
5132
+ "learning_rate": 9.988216907473134e-05,
5133
+ "loss": 9.8303,
5134
+ "step": 7320
5135
+ },
5136
+ {
5137
+ "epoch": 0.35958473604493585,
5138
+ "grad_norm": 1.03125,
5139
+ "learning_rate": 9.988020276729411e-05,
5140
+ "loss": 9.6438,
5141
+ "step": 7330
5142
+ },
5143
+ {
5144
+ "epoch": 0.36007530185127273,
5145
+ "grad_norm": 1.1015625,
5146
+ "learning_rate": 9.987822020860303e-05,
5147
+ "loss": 9.2781,
5148
+ "step": 7340
5149
+ },
5150
+ {
5151
+ "epoch": 0.3605658676576096,
5152
+ "grad_norm": 1.3515625,
5153
+ "learning_rate": 9.987622139930399e-05,
5154
+ "loss": 9.7961,
5155
+ "step": 7350
5156
+ },
5157
+ {
5158
+ "epoch": 0.3610564334639465,
5159
+ "grad_norm": 2.078125,
5160
+ "learning_rate": 9.987420634004826e-05,
5161
+ "loss": 9.5671,
5162
+ "step": 7360
5163
+ },
5164
+ {
5165
+ "epoch": 0.3615469992702834,
5166
+ "grad_norm": 1.53125,
5167
+ "learning_rate": 9.987217503149233e-05,
5168
+ "loss": 9.6844,
5169
+ "step": 7370
5170
+ },
5171
+ {
5172
+ "epoch": 0.36203756507662027,
5173
+ "grad_norm": 1.8203125,
5174
+ "learning_rate": 9.987012747429801e-05,
5175
+ "loss": 10.0758,
5176
+ "step": 7380
5177
+ },
5178
+ {
5179
+ "epoch": 0.36252813088295716,
5180
+ "grad_norm": 1.8828125,
5181
+ "learning_rate": 9.986806366913241e-05,
5182
+ "loss": 9.2339,
5183
+ "step": 7390
5184
+ },
5185
+ {
5186
+ "epoch": 0.36301869668929404,
5187
+ "grad_norm": 2.1875,
5188
+ "learning_rate": 9.986598361666794e-05,
5189
+ "loss": 9.7534,
5190
+ "step": 7400
5191
+ },
5192
+ {
5193
+ "epoch": 0.3635092624956309,
5194
+ "grad_norm": 1.4453125,
5195
+ "learning_rate": 9.98638873175823e-05,
5196
+ "loss": 9.531,
5197
+ "step": 7410
5198
+ },
5199
+ {
5200
+ "epoch": 0.3639998283019678,
5201
+ "grad_norm": 1.125,
5202
+ "learning_rate": 9.986177477255843e-05,
5203
+ "loss": 9.281,
5204
+ "step": 7420
5205
+ },
5206
+ {
5207
+ "epoch": 0.36449039410830464,
5208
+ "grad_norm": 4.5,
5209
+ "learning_rate": 9.985964598228465e-05,
5210
+ "loss": 9.3941,
5211
+ "step": 7430
5212
+ },
5213
+ {
5214
+ "epoch": 0.3649809599146415,
5215
+ "grad_norm": 2.640625,
5216
+ "learning_rate": 9.985750094745454e-05,
5217
+ "loss": 9.8157,
5218
+ "step": 7440
5219
+ },
5220
+ {
5221
+ "epoch": 0.3654715257209784,
5222
+ "grad_norm": 1.3515625,
5223
+ "learning_rate": 9.985533966876693e-05,
5224
+ "loss": 9.3986,
5225
+ "step": 7450
5226
+ },
5227
+ {
5228
+ "epoch": 0.3659620915273153,
5229
+ "grad_norm": 2.578125,
5230
+ "learning_rate": 9.9853162146926e-05,
5231
+ "loss": 10.3461,
5232
+ "step": 7460
5233
+ },
5234
+ {
5235
+ "epoch": 0.3664526573336522,
5236
+ "grad_norm": 1.5703125,
5237
+ "learning_rate": 9.985096838264118e-05,
5238
+ "loss": 9.5886,
5239
+ "step": 7470
5240
+ },
5241
+ {
5242
+ "epoch": 0.36694322313998906,
5243
+ "grad_norm": 1.1015625,
5244
+ "learning_rate": 9.984875837662725e-05,
5245
+ "loss": 9.5306,
5246
+ "step": 7480
5247
+ },
5248
+ {
5249
+ "epoch": 0.36743378894632595,
5250
+ "grad_norm": 1.8515625,
5251
+ "learning_rate": 9.98465321296042e-05,
5252
+ "loss": 9.589,
5253
+ "step": 7490
5254
+ },
5255
+ {
5256
+ "epoch": 0.36792435475266283,
5257
+ "grad_norm": 2.078125,
5258
+ "learning_rate": 9.984428964229737e-05,
5259
+ "loss": 9.4794,
5260
+ "step": 7500
5261
+ },
5262
+ {
5263
+ "epoch": 0.3684149205589997,
5264
+ "grad_norm": 1.640625,
5265
+ "learning_rate": 9.984203091543738e-05,
5266
+ "loss": 9.4859,
5267
+ "step": 7510
5268
+ },
5269
+ {
5270
+ "epoch": 0.3689054863653366,
5271
+ "grad_norm": 0.84375,
5272
+ "learning_rate": 9.983975594976012e-05,
5273
+ "loss": 9.3247,
5274
+ "step": 7520
5275
+ },
5276
+ {
5277
+ "epoch": 0.3693960521716735,
5278
+ "grad_norm": 1.40625,
5279
+ "learning_rate": 9.983746474600682e-05,
5280
+ "loss": 9.3625,
5281
+ "step": 7530
5282
+ },
5283
+ {
5284
+ "epoch": 0.36988661797801037,
5285
+ "grad_norm": 1.6640625,
5286
+ "learning_rate": 9.983515730492392e-05,
5287
+ "loss": 8.9516,
5288
+ "step": 7540
5289
+ },
5290
+ {
5291
+ "epoch": 0.37037718378434725,
5292
+ "grad_norm": 1.421875,
5293
+ "learning_rate": 9.983283362726324e-05,
5294
+ "loss": 9.6106,
5295
+ "step": 7550
5296
+ },
5297
+ {
5298
+ "epoch": 0.37086774959068414,
5299
+ "grad_norm": 1.4921875,
5300
+ "learning_rate": 9.983049371378182e-05,
5301
+ "loss": 9.892,
5302
+ "step": 7560
5303
+ },
5304
+ {
5305
+ "epoch": 0.371358315397021,
5306
+ "grad_norm": 0.9921875,
5307
+ "learning_rate": 9.982813756524203e-05,
5308
+ "loss": 9.1626,
5309
+ "step": 7570
5310
+ },
5311
+ {
5312
+ "epoch": 0.3718488812033579,
5313
+ "grad_norm": 1.3671875,
5314
+ "learning_rate": 9.98257651824115e-05,
5315
+ "loss": 9.6846,
5316
+ "step": 7580
5317
+ },
5318
+ {
5319
+ "epoch": 0.3723394470096948,
5320
+ "grad_norm": 1.0390625,
5321
+ "learning_rate": 9.982337656606323e-05,
5322
+ "loss": 9.525,
5323
+ "step": 7590
5324
+ },
5325
+ {
5326
+ "epoch": 0.3728300128160317,
5327
+ "grad_norm": 286.0,
5328
+ "learning_rate": 9.982097171697535e-05,
5329
+ "loss": 9.5155,
5330
+ "step": 7600
5331
+ },
5332
+ {
5333
+ "epoch": 0.37332057862236856,
5334
+ "grad_norm": 1.7265625,
5335
+ "learning_rate": 9.981855063593142e-05,
5336
+ "loss": 10.0883,
5337
+ "step": 7610
5338
+ },
5339
+ {
5340
+ "epoch": 0.37381114442870544,
5341
+ "grad_norm": 1.2265625,
5342
+ "learning_rate": 9.981611332372025e-05,
5343
+ "loss": 8.9297,
5344
+ "step": 7620
5345
+ },
5346
+ {
5347
+ "epoch": 0.37430171023504233,
5348
+ "grad_norm": 1.5234375,
5349
+ "learning_rate": 9.981365978113592e-05,
5350
+ "loss": 9.2481,
5351
+ "step": 7630
5352
+ },
5353
+ {
5354
+ "epoch": 0.3747922760413792,
5355
+ "grad_norm": 2.21875,
5356
+ "learning_rate": 9.98111900089778e-05,
5357
+ "loss": 9.2244,
5358
+ "step": 7640
5359
+ },
5360
+ {
5361
+ "epoch": 0.3752828418477161,
5362
+ "grad_norm": 1.5859375,
5363
+ "learning_rate": 9.980870400805058e-05,
5364
+ "loss": 9.7969,
5365
+ "step": 7650
5366
+ },
5367
+ {
5368
+ "epoch": 0.375773407654053,
5369
+ "grad_norm": 1.015625,
5370
+ "learning_rate": 9.980620177916419e-05,
5371
+ "loss": 9.0249,
5372
+ "step": 7660
5373
+ },
5374
+ {
5375
+ "epoch": 0.37626397346038987,
5376
+ "grad_norm": 1.59375,
5377
+ "learning_rate": 9.980368332313388e-05,
5378
+ "loss": 9.4258,
5379
+ "step": 7670
5380
+ },
5381
+ {
5382
+ "epoch": 0.37675453926672675,
5383
+ "grad_norm": 1.71875,
5384
+ "learning_rate": 9.980114864078016e-05,
5385
+ "loss": 9.5474,
5386
+ "step": 7680
5387
+ },
5388
+ {
5389
+ "epoch": 0.37724510507306364,
5390
+ "grad_norm": 2.28125,
5391
+ "learning_rate": 9.979859773292888e-05,
5392
+ "loss": 9.557,
5393
+ "step": 7690
5394
+ },
5395
+ {
5396
+ "epoch": 0.3777356708794005,
5397
+ "grad_norm": 0.98046875,
5398
+ "learning_rate": 9.979603060041111e-05,
5399
+ "loss": 9.2212,
5400
+ "step": 7700
5401
+ },
5402
+ {
5403
+ "epoch": 0.3782262366857374,
5404
+ "grad_norm": 1.265625,
5405
+ "learning_rate": 9.979344724406323e-05,
5406
+ "loss": 9.5588,
5407
+ "step": 7710
5408
+ },
5409
+ {
5410
+ "epoch": 0.3787168024920743,
5411
+ "grad_norm": 1.34375,
5412
+ "learning_rate": 9.979084766472695e-05,
5413
+ "loss": 9.7479,
5414
+ "step": 7720
5415
+ },
5416
+ {
5417
+ "epoch": 0.3792073682984112,
5418
+ "grad_norm": 1.53125,
5419
+ "learning_rate": 9.978823186324921e-05,
5420
+ "loss": 8.9423,
5421
+ "step": 7730
5422
+ },
5423
+ {
5424
+ "epoch": 0.37969793410474806,
5425
+ "grad_norm": 0.9921875,
5426
+ "learning_rate": 9.978559984048224e-05,
5427
+ "loss": 9.62,
5428
+ "step": 7740
5429
+ },
5430
+ {
5431
+ "epoch": 0.38018849991108494,
5432
+ "grad_norm": 1.421875,
5433
+ "learning_rate": 9.978295159728357e-05,
5434
+ "loss": 9.4276,
5435
+ "step": 7750
5436
+ },
5437
+ {
5438
+ "epoch": 0.3806790657174218,
5439
+ "grad_norm": 1.4140625,
5440
+ "learning_rate": 9.978028713451602e-05,
5441
+ "loss": 9.5612,
5442
+ "step": 7760
5443
+ },
5444
+ {
5445
+ "epoch": 0.3811696315237587,
5446
+ "grad_norm": 1.0859375,
5447
+ "learning_rate": 9.97776064530477e-05,
5448
+ "loss": 9.5241,
5449
+ "step": 7770
5450
+ },
5451
+ {
5452
+ "epoch": 0.3816601973300956,
5453
+ "grad_norm": 1.4140625,
5454
+ "learning_rate": 9.977490955375195e-05,
5455
+ "loss": 9.4522,
5456
+ "step": 7780
5457
+ },
5458
+ {
5459
+ "epoch": 0.3821507631364325,
5460
+ "grad_norm": 1.0390625,
5461
+ "learning_rate": 9.977219643750747e-05,
5462
+ "loss": 9.0719,
5463
+ "step": 7790
5464
+ },
5465
+ {
5466
+ "epoch": 0.38264132894276937,
5467
+ "grad_norm": 0.98046875,
5468
+ "learning_rate": 9.976946710519823e-05,
5469
+ "loss": 9.0643,
5470
+ "step": 7800
5471
+ },
5472
+ {
5473
+ "epoch": 0.38313189474910625,
5474
+ "grad_norm": 1.3984375,
5475
+ "learning_rate": 9.976672155771342e-05,
5476
+ "loss": 9.7797,
5477
+ "step": 7810
5478
+ },
5479
+ {
5480
+ "epoch": 0.38362246055544313,
5481
+ "grad_norm": 1.0546875,
5482
+ "learning_rate": 9.976395979594755e-05,
5483
+ "loss": 9.2137,
5484
+ "step": 7820
5485
+ },
5486
+ {
5487
+ "epoch": 0.38411302636178,
5488
+ "grad_norm": 1.375,
5489
+ "learning_rate": 9.976118182080045e-05,
5490
+ "loss": 9.1207,
5491
+ "step": 7830
5492
+ },
5493
+ {
5494
+ "epoch": 0.3846035921681169,
5495
+ "grad_norm": 1.578125,
5496
+ "learning_rate": 9.97583876331772e-05,
5497
+ "loss": 9.828,
5498
+ "step": 7840
5499
+ },
5500
+ {
5501
+ "epoch": 0.3850941579744538,
5502
+ "grad_norm": 1.265625,
5503
+ "learning_rate": 9.975557723398814e-05,
5504
+ "loss": 9.2545,
5505
+ "step": 7850
5506
+ },
5507
+ {
5508
+ "epoch": 0.3855847237807907,
5509
+ "grad_norm": 96.0,
5510
+ "learning_rate": 9.975275062414891e-05,
5511
+ "loss": 9.3358,
5512
+ "step": 7860
5513
+ },
5514
+ {
5515
+ "epoch": 0.38607528958712756,
5516
+ "grad_norm": 0.88671875,
5517
+ "learning_rate": 9.974990780458047e-05,
5518
+ "loss": 9.3523,
5519
+ "step": 7870
5520
+ },
5521
+ {
5522
+ "epoch": 0.38656585539346444,
5523
+ "grad_norm": 1.0625,
5524
+ "learning_rate": 9.974704877620899e-05,
5525
+ "loss": 8.9798,
5526
+ "step": 7880
5527
+ },
5528
+ {
5529
+ "epoch": 0.3870564211998013,
5530
+ "grad_norm": 1.8203125,
5531
+ "learning_rate": 9.974417353996597e-05,
5532
+ "loss": 9.2332,
5533
+ "step": 7890
5534
+ },
5535
+ {
5536
+ "epoch": 0.3875469870061382,
5537
+ "grad_norm": 1.828125,
5538
+ "learning_rate": 9.97412820967882e-05,
5539
+ "loss": 9.6917,
5540
+ "step": 7900
5541
+ },
5542
+ {
5543
+ "epoch": 0.3880375528124751,
5544
+ "grad_norm": 1.8125,
5545
+ "learning_rate": 9.973837444761767e-05,
5546
+ "loss": 10.2114,
5547
+ "step": 7910
5548
+ },
5549
+ {
5550
+ "epoch": 0.388528118618812,
5551
+ "grad_norm": 1.1328125,
5552
+ "learning_rate": 9.973545059340179e-05,
5553
+ "loss": 8.8373,
5554
+ "step": 7920
5555
+ },
5556
+ {
5557
+ "epoch": 0.38901868442514886,
5558
+ "grad_norm": 2.125,
5559
+ "learning_rate": 9.973251053509311e-05,
5560
+ "loss": 9.2673,
5561
+ "step": 7930
5562
+ },
5563
+ {
5564
+ "epoch": 0.38950925023148575,
5565
+ "grad_norm": 1.4609375,
5566
+ "learning_rate": 9.972955427364953e-05,
5567
+ "loss": 9.5442,
5568
+ "step": 7940
5569
+ },
5570
+ {
5571
+ "epoch": 0.38999981603782263,
5572
+ "grad_norm": 1.2421875,
5573
+ "learning_rate": 9.972658181003422e-05,
5574
+ "loss": 9.3631,
5575
+ "step": 7950
5576
+ },
5577
+ {
5578
+ "epoch": 0.3904903818441595,
5579
+ "grad_norm": 1.46875,
5580
+ "learning_rate": 9.972359314521565e-05,
5581
+ "loss": 9.2575,
5582
+ "step": 7960
5583
+ },
5584
+ {
5585
+ "epoch": 0.3909809476504964,
5586
+ "grad_norm": 1.4453125,
5587
+ "learning_rate": 9.972058828016749e-05,
5588
+ "loss": 9.5496,
5589
+ "step": 7970
5590
+ },
5591
+ {
5592
+ "epoch": 0.3914715134568333,
5593
+ "grad_norm": 2.21875,
5594
+ "learning_rate": 9.971756721586879e-05,
5595
+ "loss": 9.5364,
5596
+ "step": 7980
5597
+ },
5598
+ {
5599
+ "epoch": 0.39196207926317017,
5600
+ "grad_norm": 1.3359375,
5601
+ "learning_rate": 9.971452995330382e-05,
5602
+ "loss": 9.7381,
5603
+ "step": 7990
5604
+ },
5605
+ {
5606
+ "epoch": 0.39245264506950706,
5607
+ "grad_norm": 1.328125,
5608
+ "learning_rate": 9.971147649346211e-05,
5609
+ "loss": 9.1089,
5610
+ "step": 8000
5611
+ },
5612
+ {
5613
+ "epoch": 0.39294321087584394,
5614
+ "grad_norm": 1.0078125,
5615
+ "learning_rate": 9.970840683733853e-05,
5616
+ "loss": 9.3147,
5617
+ "step": 8010
5618
+ },
5619
+ {
5620
+ "epoch": 0.3934337766821808,
5621
+ "grad_norm": 1.2421875,
5622
+ "learning_rate": 9.970532098593317e-05,
5623
+ "loss": 9.3678,
5624
+ "step": 8020
5625
+ },
5626
+ {
5627
+ "epoch": 0.3939243424885177,
5628
+ "grad_norm": 1.453125,
5629
+ "learning_rate": 9.970221894025143e-05,
5630
+ "loss": 8.9748,
5631
+ "step": 8030
5632
+ },
5633
+ {
5634
+ "epoch": 0.3944149082948546,
5635
+ "grad_norm": 0.92578125,
5636
+ "learning_rate": 9.969910070130395e-05,
5637
+ "loss": 9.4,
5638
+ "step": 8040
5639
+ },
5640
+ {
5641
+ "epoch": 0.3949054741011915,
5642
+ "grad_norm": 0.76171875,
5643
+ "learning_rate": 9.969596627010671e-05,
5644
+ "loss": 9.2384,
5645
+ "step": 8050
5646
+ },
5647
+ {
5648
+ "epoch": 0.39539603990752836,
5649
+ "grad_norm": 1.53125,
5650
+ "learning_rate": 9.969281564768089e-05,
5651
+ "loss": 9.7778,
5652
+ "step": 8060
5653
+ },
5654
+ {
5655
+ "epoch": 0.39588660571386525,
5656
+ "grad_norm": 1.0234375,
5657
+ "learning_rate": 9.9689648835053e-05,
5658
+ "loss": 9.2588,
5659
+ "step": 8070
5660
+ },
5661
+ {
5662
+ "epoch": 0.39637717152020213,
5663
+ "grad_norm": 1.5078125,
5664
+ "learning_rate": 9.96864658332548e-05,
5665
+ "loss": 9.7844,
5666
+ "step": 8080
5667
+ },
5668
+ {
5669
+ "epoch": 0.396867737326539,
5670
+ "grad_norm": 1.09375,
5671
+ "learning_rate": 9.968326664332333e-05,
5672
+ "loss": 8.7478,
5673
+ "step": 8090
5674
+ },
5675
+ {
5676
+ "epoch": 0.3973583031328759,
5677
+ "grad_norm": 2.40625,
5678
+ "learning_rate": 9.96800512663009e-05,
5679
+ "loss": 10.1442,
5680
+ "step": 8100
5681
+ },
5682
+ {
5683
+ "epoch": 0.3978488689392128,
5684
+ "grad_norm": 1.40625,
5685
+ "learning_rate": 9.967681970323512e-05,
5686
+ "loss": 9.3243,
5687
+ "step": 8110
5688
+ },
5689
+ {
5690
+ "epoch": 0.39833943474554967,
5691
+ "grad_norm": 1.3359375,
5692
+ "learning_rate": 9.967357195517881e-05,
5693
+ "loss": 9.4967,
5694
+ "step": 8120
5695
+ },
5696
+ {
5697
+ "epoch": 0.39883000055188655,
5698
+ "grad_norm": 1.453125,
5699
+ "learning_rate": 9.967030802319015e-05,
5700
+ "loss": 9.3689,
5701
+ "step": 8130
5702
+ },
5703
+ {
5704
+ "epoch": 0.39932056635822344,
5705
+ "grad_norm": 1.2578125,
5706
+ "learning_rate": 9.966702790833253e-05,
5707
+ "loss": 9.0353,
5708
+ "step": 8140
5709
+ },
5710
+ {
5711
+ "epoch": 0.3998111321645603,
5712
+ "grad_norm": 0.76171875,
5713
+ "learning_rate": 9.96637316116746e-05,
5714
+ "loss": 9.3215,
5715
+ "step": 8150
5716
+ },
5717
+ {
5718
+ "epoch": 0.4003016979708972,
5719
+ "grad_norm": 1.15625,
5720
+ "learning_rate": 9.966041913429036e-05,
5721
+ "loss": 10.0265,
5722
+ "step": 8160
5723
+ },
5724
+ {
5725
+ "epoch": 0.4007922637772341,
5726
+ "grad_norm": 1.1640625,
5727
+ "learning_rate": 9.965709047725901e-05,
5728
+ "loss": 8.743,
5729
+ "step": 8170
5730
+ },
5731
+ {
5732
+ "epoch": 0.401282829583571,
5733
+ "grad_norm": 0.82421875,
5734
+ "learning_rate": 9.965374564166505e-05,
5735
+ "loss": 8.9593,
5736
+ "step": 8180
5737
+ },
5738
+ {
5739
+ "epoch": 0.40177339538990786,
5740
+ "grad_norm": 1.0390625,
5741
+ "learning_rate": 9.965038462859824e-05,
5742
+ "loss": 8.961,
5743
+ "step": 8190
5744
+ },
5745
+ {
5746
+ "epoch": 0.40226396119624475,
5747
+ "grad_norm": 1.2890625,
5748
+ "learning_rate": 9.964700743915361e-05,
5749
+ "loss": 9.808,
5750
+ "step": 8200
5751
+ },
5752
+ {
5753
+ "epoch": 0.4027545270025816,
5754
+ "grad_norm": 8.375,
5755
+ "learning_rate": 9.96436140744315e-05,
5756
+ "loss": 9.4756,
5757
+ "step": 8210
5758
+ },
5759
+ {
5760
+ "epoch": 0.40324509280891846,
5761
+ "grad_norm": 0.99609375,
5762
+ "learning_rate": 9.964020453553746e-05,
5763
+ "loss": 9.3352,
5764
+ "step": 8220
5765
+ },
5766
+ {
5767
+ "epoch": 0.40373565861525534,
5768
+ "grad_norm": 1.7421875,
5769
+ "learning_rate": 9.963677882358233e-05,
5770
+ "loss": 9.3511,
5771
+ "step": 8230
5772
+ },
5773
+ {
5774
+ "epoch": 0.40422622442159223,
5775
+ "grad_norm": 1.203125,
5776
+ "learning_rate": 9.963333693968226e-05,
5777
+ "loss": 9.5166,
5778
+ "step": 8240
5779
+ },
5780
+ {
5781
+ "epoch": 0.4047167902279291,
5782
+ "grad_norm": 1.53125,
5783
+ "learning_rate": 9.962987888495862e-05,
5784
+ "loss": 9.3433,
5785
+ "step": 8250
5786
+ },
5787
+ {
5788
+ "epoch": 0.405207356034266,
5789
+ "grad_norm": 2.59375,
5790
+ "learning_rate": 9.962640466053804e-05,
5791
+ "loss": 9.9907,
5792
+ "step": 8260
5793
+ },
5794
+ {
5795
+ "epoch": 0.4056979218406029,
5796
+ "grad_norm": 1.625,
5797
+ "learning_rate": 9.962291426755248e-05,
5798
+ "loss": 9.5791,
5799
+ "step": 8270
5800
+ },
5801
+ {
5802
+ "epoch": 0.40618848764693977,
5803
+ "grad_norm": 1.125,
5804
+ "learning_rate": 9.96194077071391e-05,
5805
+ "loss": 9.2634,
5806
+ "step": 8280
5807
+ },
5808
+ {
5809
+ "epoch": 0.40667905345327665,
5810
+ "grad_norm": 1.1484375,
5811
+ "learning_rate": 9.961588498044037e-05,
5812
+ "loss": 9.2259,
5813
+ "step": 8290
5814
+ },
5815
+ {
5816
+ "epoch": 0.40716961925961354,
5817
+ "grad_norm": 1.6875,
5818
+ "learning_rate": 9.961234608860402e-05,
5819
+ "loss": 9.1666,
5820
+ "step": 8300
5821
+ },
5822
+ {
5823
+ "epoch": 0.4076601850659504,
5824
+ "grad_norm": 1.3359375,
5825
+ "learning_rate": 9.960879103278303e-05,
5826
+ "loss": 9.4266,
5827
+ "step": 8310
5828
+ },
5829
+ {
5830
+ "epoch": 0.4081507508722873,
5831
+ "grad_norm": 1.0703125,
5832
+ "learning_rate": 9.960521981413566e-05,
5833
+ "loss": 9.2596,
5834
+ "step": 8320
5835
+ },
5836
+ {
5837
+ "epoch": 0.4086413166786242,
5838
+ "grad_norm": 1.9296875,
5839
+ "learning_rate": 9.960163243382545e-05,
5840
+ "loss": 8.4992,
5841
+ "step": 8330
5842
+ },
5843
+ {
5844
+ "epoch": 0.4091318824849611,
5845
+ "grad_norm": 1.5234375,
5846
+ "learning_rate": 9.959802889302117e-05,
5847
+ "loss": 9.1112,
5848
+ "step": 8340
5849
+ },
5850
+ {
5851
+ "epoch": 0.40962244829129796,
5852
+ "grad_norm": 1.421875,
5853
+ "learning_rate": 9.959440919289686e-05,
5854
+ "loss": 9.6092,
5855
+ "step": 8350
5856
+ },
5857
+ {
5858
+ "epoch": 0.41011301409763484,
5859
+ "grad_norm": 2.09375,
5860
+ "learning_rate": 9.959077333463187e-05,
5861
+ "loss": 9.1144,
5862
+ "step": 8360
5863
+ },
5864
+ {
5865
+ "epoch": 0.4106035799039717,
5866
+ "grad_norm": 1.265625,
5867
+ "learning_rate": 9.958712131941077e-05,
5868
+ "loss": 9.007,
5869
+ "step": 8370
5870
+ },
5871
+ {
5872
+ "epoch": 0.4110941457103086,
5873
+ "grad_norm": 1.1328125,
5874
+ "learning_rate": 9.95834531484234e-05,
5875
+ "loss": 9.4561,
5876
+ "step": 8380
5877
+ },
5878
+ {
5879
+ "epoch": 0.4115847115166455,
5880
+ "grad_norm": 1.6484375,
5881
+ "learning_rate": 9.95797688228649e-05,
5882
+ "loss": 9.0578,
5883
+ "step": 8390
5884
+ },
5885
+ {
5886
+ "epoch": 0.4120752773229824,
5887
+ "grad_norm": 1.5703125,
5888
+ "learning_rate": 9.957606834393561e-05,
5889
+ "loss": 9.0047,
5890
+ "step": 8400
5891
+ },
5892
+ {
5893
+ "epoch": 0.41256584312931927,
5894
+ "grad_norm": 1.1328125,
5895
+ "learning_rate": 9.957235171284118e-05,
5896
+ "loss": 9.0116,
5897
+ "step": 8410
5898
+ },
5899
+ {
5900
+ "epoch": 0.41305640893565615,
5901
+ "grad_norm": 1.09375,
5902
+ "learning_rate": 9.956861893079253e-05,
5903
+ "loss": 9.5539,
5904
+ "step": 8420
5905
+ },
5906
+ {
5907
+ "epoch": 0.41354697474199303,
5908
+ "grad_norm": 2.375,
5909
+ "learning_rate": 9.956486999900578e-05,
5910
+ "loss": 9.4898,
5911
+ "step": 8430
5912
+ },
5913
+ {
5914
+ "epoch": 0.4140375405483299,
5915
+ "grad_norm": 1.0703125,
5916
+ "learning_rate": 9.956110491870237e-05,
5917
+ "loss": 9.4607,
5918
+ "step": 8440
5919
+ },
5920
+ {
5921
+ "epoch": 0.4145281063546668,
5922
+ "grad_norm": 0.9765625,
5923
+ "learning_rate": 9.9557323691109e-05,
5924
+ "loss": 9.0116,
5925
+ "step": 8450
5926
+ },
5927
+ {
5928
+ "epoch": 0.4150186721610037,
5929
+ "grad_norm": 1.0625,
5930
+ "learning_rate": 9.955352631745761e-05,
5931
+ "loss": 8.9376,
5932
+ "step": 8460
5933
+ },
5934
+ {
5935
+ "epoch": 0.4155092379673406,
5936
+ "grad_norm": 0.84375,
5937
+ "learning_rate": 9.954971279898538e-05,
5938
+ "loss": 9.4654,
5939
+ "step": 8470
5940
+ },
5941
+ {
5942
+ "epoch": 0.41599980377367746,
5943
+ "grad_norm": 1.65625,
5944
+ "learning_rate": 9.954588313693482e-05,
5945
+ "loss": 9.6625,
5946
+ "step": 8480
5947
+ },
5948
+ {
5949
+ "epoch": 0.41649036958001434,
5950
+ "grad_norm": 1.84375,
5951
+ "learning_rate": 9.954203733255362e-05,
5952
+ "loss": 9.5381,
5953
+ "step": 8490
5954
+ },
5955
+ {
5956
+ "epoch": 0.4169809353863512,
5957
+ "grad_norm": 5.375,
5958
+ "learning_rate": 9.953817538709478e-05,
5959
+ "loss": 9.4922,
5960
+ "step": 8500
5961
+ },
5962
+ {
5963
+ "epoch": 0.4174715011926881,
5964
+ "grad_norm": 0.9921875,
5965
+ "learning_rate": 9.953429730181653e-05,
5966
+ "loss": 9.4195,
5967
+ "step": 8510
5968
+ },
5969
+ {
5970
+ "epoch": 0.417962066999025,
5971
+ "grad_norm": 1.71875,
5972
+ "learning_rate": 9.95304030779824e-05,
5973
+ "loss": 9.0456,
5974
+ "step": 8520
5975
+ },
5976
+ {
5977
+ "epoch": 0.4184526328053619,
5978
+ "grad_norm": 1.3671875,
5979
+ "learning_rate": 9.952649271686114e-05,
5980
+ "loss": 9.5672,
5981
+ "step": 8530
5982
+ },
5983
+ {
5984
+ "epoch": 0.41894319861169876,
5985
+ "grad_norm": 0.83203125,
5986
+ "learning_rate": 9.952256621972676e-05,
5987
+ "loss": 8.7467,
5988
+ "step": 8540
5989
+ },
5990
+ {
5991
+ "epoch": 0.41943376441803565,
5992
+ "grad_norm": 1.515625,
5993
+ "learning_rate": 9.951862358785852e-05,
5994
+ "loss": 9.6638,
5995
+ "step": 8550
5996
+ },
5997
+ {
5998
+ "epoch": 0.41992433022437253,
5999
+ "grad_norm": 1.28125,
6000
+ "learning_rate": 9.951466482254097e-05,
6001
+ "loss": 9.5221,
6002
+ "step": 8560
6003
+ },
6004
+ {
6005
+ "epoch": 0.4204148960307094,
6006
+ "grad_norm": 1.6015625,
6007
+ "learning_rate": 9.951068992506391e-05,
6008
+ "loss": 9.3728,
6009
+ "step": 8570
6010
+ },
6011
+ {
6012
+ "epoch": 0.4209054618370463,
6013
+ "grad_norm": 1.6640625,
6014
+ "learning_rate": 9.950669889672238e-05,
6015
+ "loss": 9.0309,
6016
+ "step": 8580
6017
+ },
6018
+ {
6019
+ "epoch": 0.4213960276433832,
6020
+ "grad_norm": 0.77734375,
6021
+ "learning_rate": 9.950269173881663e-05,
6022
+ "loss": 9.546,
6023
+ "step": 8590
6024
+ },
6025
+ {
6026
+ "epoch": 0.42188659344972007,
6027
+ "grad_norm": 1.390625,
6028
+ "learning_rate": 9.94986684526523e-05,
6029
+ "loss": 9.3854,
6030
+ "step": 8600
6031
+ },
6032
+ {
6033
+ "epoch": 0.42237715925605696,
6034
+ "grad_norm": 1.734375,
6035
+ "learning_rate": 9.949462903954014e-05,
6036
+ "loss": 10.0409,
6037
+ "step": 8610
6038
+ },
6039
+ {
6040
+ "epoch": 0.42286772506239384,
6041
+ "grad_norm": 1.1875,
6042
+ "learning_rate": 9.949057350079622e-05,
6043
+ "loss": 10.0401,
6044
+ "step": 8620
6045
+ },
6046
+ {
6047
+ "epoch": 0.4233582908687307,
6048
+ "grad_norm": 1.171875,
6049
+ "learning_rate": 9.948650183774187e-05,
6050
+ "loss": 9.3531,
6051
+ "step": 8630
6052
+ },
6053
+ {
6054
+ "epoch": 0.4238488566750676,
6055
+ "grad_norm": 1.046875,
6056
+ "learning_rate": 9.948241405170367e-05,
6057
+ "loss": 9.6503,
6058
+ "step": 8640
6059
+ },
6060
+ {
6061
+ "epoch": 0.4243394224814045,
6062
+ "grad_norm": 1.03125,
6063
+ "learning_rate": 9.947831014401342e-05,
6064
+ "loss": 10.1025,
6065
+ "step": 8650
6066
+ },
6067
+ {
6068
+ "epoch": 0.4248299882877414,
6069
+ "grad_norm": 1.8515625,
6070
+ "learning_rate": 9.94741901160082e-05,
6071
+ "loss": 9.1217,
6072
+ "step": 8660
6073
+ },
6074
+ {
6075
+ "epoch": 0.42532055409407826,
6076
+ "grad_norm": 1.296875,
6077
+ "learning_rate": 9.947005396903036e-05,
6078
+ "loss": 9.4203,
6079
+ "step": 8670
6080
+ },
6081
+ {
6082
+ "epoch": 0.42581111990041515,
6083
+ "grad_norm": 1.3046875,
6084
+ "learning_rate": 9.946590170442747e-05,
6085
+ "loss": 8.9508,
6086
+ "step": 8680
6087
+ },
6088
+ {
6089
+ "epoch": 0.42630168570675203,
6090
+ "grad_norm": 1.78125,
6091
+ "learning_rate": 9.946173332355236e-05,
6092
+ "loss": 9.3826,
6093
+ "step": 8690
6094
+ },
6095
+ {
6096
+ "epoch": 0.4267922515130889,
6097
+ "grad_norm": 1.1796875,
6098
+ "learning_rate": 9.945754882776311e-05,
6099
+ "loss": 9.6903,
6100
+ "step": 8700
6101
+ },
6102
+ {
6103
+ "epoch": 0.4272828173194258,
6104
+ "grad_norm": 0.984375,
6105
+ "learning_rate": 9.945334821842303e-05,
6106
+ "loss": 9.1157,
6107
+ "step": 8710
6108
+ },
6109
+ {
6110
+ "epoch": 0.4277733831257627,
6111
+ "grad_norm": 3.171875,
6112
+ "learning_rate": 9.944913149690075e-05,
6113
+ "loss": 9.7155,
6114
+ "step": 8720
6115
+ },
6116
+ {
6117
+ "epoch": 0.42826394893209957,
6118
+ "grad_norm": 1.421875,
6119
+ "learning_rate": 9.944489866457008e-05,
6120
+ "loss": 9.4667,
6121
+ "step": 8730
6122
+ },
6123
+ {
6124
+ "epoch": 0.42875451473843645,
6125
+ "grad_norm": 1.53125,
6126
+ "learning_rate": 9.944064972281007e-05,
6127
+ "loss": 9.5808,
6128
+ "step": 8740
6129
+ },
6130
+ {
6131
+ "epoch": 0.42924508054477334,
6132
+ "grad_norm": 1.1328125,
6133
+ "learning_rate": 9.943638467300511e-05,
6134
+ "loss": 9.3587,
6135
+ "step": 8750
6136
+ },
6137
+ {
6138
+ "epoch": 0.4297356463511102,
6139
+ "grad_norm": 2.921875,
6140
+ "learning_rate": 9.943210351654473e-05,
6141
+ "loss": 9.6425,
6142
+ "step": 8760
6143
+ },
6144
+ {
6145
+ "epoch": 0.4302262121574471,
6146
+ "grad_norm": 1.40625,
6147
+ "learning_rate": 9.942780625482376e-05,
6148
+ "loss": 9.6145,
6149
+ "step": 8770
6150
+ },
6151
+ {
6152
+ "epoch": 0.430716777963784,
6153
+ "grad_norm": 1.703125,
6154
+ "learning_rate": 9.942349288924229e-05,
6155
+ "loss": 9.6646,
6156
+ "step": 8780
6157
+ },
6158
+ {
6159
+ "epoch": 0.4312073437701209,
6160
+ "grad_norm": 1.546875,
6161
+ "learning_rate": 9.941916342120564e-05,
6162
+ "loss": 9.7029,
6163
+ "step": 8790
6164
+ },
6165
+ {
6166
+ "epoch": 0.43169790957645776,
6167
+ "grad_norm": 1.71875,
6168
+ "learning_rate": 9.941481785212437e-05,
6169
+ "loss": 9.1658,
6170
+ "step": 8800
6171
+ },
6172
+ {
6173
+ "epoch": 0.43218847538279465,
6174
+ "grad_norm": 1.09375,
6175
+ "learning_rate": 9.941045618341427e-05,
6176
+ "loss": 9.9594,
6177
+ "step": 8810
6178
+ },
6179
+ {
6180
+ "epoch": 0.43267904118913153,
6181
+ "grad_norm": 1.0390625,
6182
+ "learning_rate": 9.940607841649643e-05,
6183
+ "loss": 9.3896,
6184
+ "step": 8820
6185
+ },
6186
+ {
6187
+ "epoch": 0.4331696069954684,
6188
+ "grad_norm": 1.4375,
6189
+ "learning_rate": 9.940168455279713e-05,
6190
+ "loss": 8.9524,
6191
+ "step": 8830
6192
+ },
6193
+ {
6194
+ "epoch": 0.4336601728018053,
6195
+ "grad_norm": 1.203125,
6196
+ "learning_rate": 9.939727459374792e-05,
6197
+ "loss": 9.8094,
6198
+ "step": 8840
6199
+ },
6200
+ {
6201
+ "epoch": 0.4341507386081422,
6202
+ "grad_norm": 2.734375,
6203
+ "learning_rate": 9.93928485407856e-05,
6204
+ "loss": 9.6122,
6205
+ "step": 8850
6206
+ },
6207
+ {
6208
+ "epoch": 0.43464130441447907,
6209
+ "grad_norm": 0.88671875,
6210
+ "learning_rate": 9.938840639535219e-05,
6211
+ "loss": 9.5027,
6212
+ "step": 8860
6213
+ },
6214
+ {
6215
+ "epoch": 0.43513187022081595,
6216
+ "grad_norm": 1.1015625,
6217
+ "learning_rate": 9.938394815889497e-05,
6218
+ "loss": 8.9665,
6219
+ "step": 8870
6220
+ },
6221
+ {
6222
+ "epoch": 0.43562243602715284,
6223
+ "grad_norm": 0.96484375,
6224
+ "learning_rate": 9.937947383286646e-05,
6225
+ "loss": 9.6251,
6226
+ "step": 8880
6227
+ },
6228
+ {
6229
+ "epoch": 0.4361130018334897,
6230
+ "grad_norm": 1.7265625,
6231
+ "learning_rate": 9.937498341872443e-05,
6232
+ "loss": 10.0964,
6233
+ "step": 8890
6234
+ },
6235
+ {
6236
+ "epoch": 0.4366035676398266,
6237
+ "grad_norm": 0.91796875,
6238
+ "learning_rate": 9.937047691793186e-05,
6239
+ "loss": 9.6408,
6240
+ "step": 8900
6241
+ },
6242
+ {
6243
+ "epoch": 0.4370941334461635,
6244
+ "grad_norm": 121.5,
6245
+ "learning_rate": 9.936595433195701e-05,
6246
+ "loss": 9.911,
6247
+ "step": 8910
6248
+ },
6249
+ {
6250
+ "epoch": 0.4375846992525004,
6251
+ "grad_norm": 1.328125,
6252
+ "learning_rate": 9.936141566227335e-05,
6253
+ "loss": 9.3583,
6254
+ "step": 8920
6255
+ },
6256
+ {
6257
+ "epoch": 0.43807526505883726,
6258
+ "grad_norm": 1.1171875,
6259
+ "learning_rate": 9.935686091035963e-05,
6260
+ "loss": 9.074,
6261
+ "step": 8930
6262
+ },
6263
+ {
6264
+ "epoch": 0.43856583086517414,
6265
+ "grad_norm": 1.4765625,
6266
+ "learning_rate": 9.93522900776998e-05,
6267
+ "loss": 9.409,
6268
+ "step": 8940
6269
+ },
6270
+ {
6271
+ "epoch": 0.43905639667151103,
6272
+ "grad_norm": 1.3515625,
6273
+ "learning_rate": 9.934770316578306e-05,
6274
+ "loss": 9.5203,
6275
+ "step": 8950
6276
+ },
6277
+ {
6278
+ "epoch": 0.4395469624778479,
6279
+ "grad_norm": 1.2421875,
6280
+ "learning_rate": 9.934310017610385e-05,
6281
+ "loss": 9.6801,
6282
+ "step": 8960
6283
+ },
6284
+ {
6285
+ "epoch": 0.4400375282841848,
6286
+ "grad_norm": 1.1015625,
6287
+ "learning_rate": 9.933848111016186e-05,
6288
+ "loss": 9.5165,
6289
+ "step": 8970
6290
+ },
6291
+ {
6292
+ "epoch": 0.4405280940905217,
6293
+ "grad_norm": 1.5234375,
6294
+ "learning_rate": 9.933384596946201e-05,
6295
+ "loss": 9.6044,
6296
+ "step": 8980
6297
+ },
6298
+ {
6299
+ "epoch": 0.44101865989685857,
6300
+ "grad_norm": 1.1875,
6301
+ "learning_rate": 9.932919475551443e-05,
6302
+ "loss": 9.2709,
6303
+ "step": 8990
6304
+ },
6305
+ {
6306
+ "epoch": 0.4415092257031954,
6307
+ "grad_norm": 1.234375,
6308
+ "learning_rate": 9.932452746983455e-05,
6309
+ "loss": 9.0887,
6310
+ "step": 9000
6311
+ },
6312
+ {
6313
+ "epoch": 0.4419997915095323,
6314
+ "grad_norm": 0.94140625,
6315
+ "learning_rate": 9.931984411394297e-05,
6316
+ "loss": 8.9249,
6317
+ "step": 9010
6318
+ },
6319
+ {
6320
+ "epoch": 0.44249035731586916,
6321
+ "grad_norm": 1.234375,
6322
+ "learning_rate": 9.931514468936556e-05,
6323
+ "loss": 9.3311,
6324
+ "step": 9020
6325
+ },
6326
+ {
6327
+ "epoch": 0.44298092312220605,
6328
+ "grad_norm": 1.5625,
6329
+ "learning_rate": 9.931042919763343e-05,
6330
+ "loss": 9.1204,
6331
+ "step": 9030
6332
+ },
6333
+ {
6334
+ "epoch": 0.44347148892854293,
6335
+ "grad_norm": 1.0078125,
6336
+ "learning_rate": 9.930569764028289e-05,
6337
+ "loss": 8.936,
6338
+ "step": 9040
6339
+ },
6340
+ {
6341
+ "epoch": 0.4439620547348798,
6342
+ "grad_norm": 1.5078125,
6343
+ "learning_rate": 9.930095001885554e-05,
6344
+ "loss": 9.8463,
6345
+ "step": 9050
6346
+ },
6347
+ {
6348
+ "epoch": 0.4444526205412167,
6349
+ "grad_norm": 1.3125,
6350
+ "learning_rate": 9.929618633489815e-05,
6351
+ "loss": 9.4598,
6352
+ "step": 9060
6353
+ },
6354
+ {
6355
+ "epoch": 0.4449431863475536,
6356
+ "grad_norm": 1.390625,
6357
+ "learning_rate": 9.929140658996278e-05,
6358
+ "loss": 9.1565,
6359
+ "step": 9070
6360
+ },
6361
+ {
6362
+ "epoch": 0.44543375215389047,
6363
+ "grad_norm": 1.0625,
6364
+ "learning_rate": 9.928661078560669e-05,
6365
+ "loss": 9.5713,
6366
+ "step": 9080
6367
+ },
6368
+ {
6369
+ "epoch": 0.44592431796022736,
6370
+ "grad_norm": 1.265625,
6371
+ "learning_rate": 9.928179892339238e-05,
6372
+ "loss": 9.5154,
6373
+ "step": 9090
6374
+ },
6375
+ {
6376
+ "epoch": 0.44641488376656424,
6377
+ "grad_norm": 2.671875,
6378
+ "learning_rate": 9.927697100488757e-05,
6379
+ "loss": 9.8792,
6380
+ "step": 9100
6381
+ },
6382
+ {
6383
+ "epoch": 0.4469054495729011,
6384
+ "grad_norm": 1.1640625,
6385
+ "learning_rate": 9.927212703166526e-05,
6386
+ "loss": 8.8977,
6387
+ "step": 9110
6388
+ },
6389
+ {
6390
+ "epoch": 0.447396015379238,
6391
+ "grad_norm": 1.2109375,
6392
+ "learning_rate": 9.92672670053036e-05,
6393
+ "loss": 9.3305,
6394
+ "step": 9120
6395
+ },
6396
+ {
6397
+ "epoch": 0.4478865811855749,
6398
+ "grad_norm": 1.5390625,
6399
+ "learning_rate": 9.926239092738606e-05,
6400
+ "loss": 9.4327,
6401
+ "step": 9130
6402
+ },
6403
+ {
6404
+ "epoch": 0.4483771469919118,
6405
+ "grad_norm": 1.0859375,
6406
+ "learning_rate": 9.925749879950123e-05,
6407
+ "loss": 9.3707,
6408
+ "step": 9140
6409
+ },
6410
+ {
6411
+ "epoch": 0.44886771279824866,
6412
+ "grad_norm": 3.578125,
6413
+ "learning_rate": 9.925259062324305e-05,
6414
+ "loss": 9.2381,
6415
+ "step": 9150
6416
+ },
6417
+ {
6418
+ "epoch": 0.44935827860458555,
6419
+ "grad_norm": 6.03125,
6420
+ "learning_rate": 9.924766640021061e-05,
6421
+ "loss": 8.6538,
6422
+ "step": 9160
6423
+ },
6424
+ {
6425
+ "epoch": 0.44984884441092243,
6426
+ "grad_norm": 2.25,
6427
+ "learning_rate": 9.924272613200825e-05,
6428
+ "loss": 9.581,
6429
+ "step": 9170
6430
+ },
6431
+ {
6432
+ "epoch": 0.4503394102172593,
6433
+ "grad_norm": 1.9921875,
6434
+ "learning_rate": 9.923776982024554e-05,
6435
+ "loss": 9.4617,
6436
+ "step": 9180
6437
+ },
6438
+ {
6439
+ "epoch": 0.4508299760235962,
6440
+ "grad_norm": 1.59375,
6441
+ "learning_rate": 9.923279746653728e-05,
6442
+ "loss": 9.5088,
6443
+ "step": 9190
6444
+ },
6445
+ {
6446
+ "epoch": 0.4513205418299331,
6447
+ "grad_norm": 1.3125,
6448
+ "learning_rate": 9.922780907250348e-05,
6449
+ "loss": 8.9247,
6450
+ "step": 9200
6451
+ },
6452
+ {
6453
+ "epoch": 0.45181110763626997,
6454
+ "grad_norm": 1.4921875,
6455
+ "learning_rate": 9.922280463976938e-05,
6456
+ "loss": 9.1817,
6457
+ "step": 9210
6458
+ },
6459
+ {
6460
+ "epoch": 0.45230167344260686,
6461
+ "grad_norm": 1.6953125,
6462
+ "learning_rate": 9.921778416996549e-05,
6463
+ "loss": 9.3479,
6464
+ "step": 9220
6465
+ },
6466
+ {
6467
+ "epoch": 0.45279223924894374,
6468
+ "grad_norm": 1.3125,
6469
+ "learning_rate": 9.921274766472748e-05,
6470
+ "loss": 10.0808,
6471
+ "step": 9230
6472
+ },
6473
+ {
6474
+ "epoch": 0.4532828050552806,
6475
+ "grad_norm": 1.2421875,
6476
+ "learning_rate": 9.920769512569625e-05,
6477
+ "loss": 9.1088,
6478
+ "step": 9240
6479
+ },
6480
+ {
6481
+ "epoch": 0.4537733708616175,
6482
+ "grad_norm": 1.171875,
6483
+ "learning_rate": 9.9202626554518e-05,
6484
+ "loss": 10.0015,
6485
+ "step": 9250
6486
+ },
6487
+ {
6488
+ "epoch": 0.4542639366679544,
6489
+ "grad_norm": 1.8046875,
6490
+ "learning_rate": 9.919754195284406e-05,
6491
+ "loss": 10.0798,
6492
+ "step": 9260
6493
+ },
6494
+ {
6495
+ "epoch": 0.4547545024742913,
6496
+ "grad_norm": 1.09375,
6497
+ "learning_rate": 9.919244132233104e-05,
6498
+ "loss": 9.0094,
6499
+ "step": 9270
6500
+ },
6501
+ {
6502
+ "epoch": 0.45524506828062816,
6503
+ "grad_norm": 2.640625,
6504
+ "learning_rate": 9.918732466464072e-05,
6505
+ "loss": 9.6669,
6506
+ "step": 9280
6507
+ },
6508
+ {
6509
+ "epoch": 0.45573563408696505,
6510
+ "grad_norm": 1.484375,
6511
+ "learning_rate": 9.918219198144019e-05,
6512
+ "loss": 9.5382,
6513
+ "step": 9290
6514
+ },
6515
+ {
6516
+ "epoch": 0.45622619989330193,
6517
+ "grad_norm": 1.421875,
6518
+ "learning_rate": 9.917704327440166e-05,
6519
+ "loss": 9.4784,
6520
+ "step": 9300
6521
+ },
6522
+ {
6523
+ "epoch": 0.4567167656996388,
6524
+ "grad_norm": 1.1484375,
6525
+ "learning_rate": 9.917187854520264e-05,
6526
+ "loss": 10.2777,
6527
+ "step": 9310
6528
+ },
6529
+ {
6530
+ "epoch": 0.4572073315059757,
6531
+ "grad_norm": 1.1953125,
6532
+ "learning_rate": 9.916669779552581e-05,
6533
+ "loss": 9.0473,
6534
+ "step": 9320
6535
+ },
6536
+ {
6537
+ "epoch": 0.4576978973123126,
6538
+ "grad_norm": 1.46875,
6539
+ "learning_rate": 9.91615010270591e-05,
6540
+ "loss": 9.1827,
6541
+ "step": 9330
6542
+ },
6543
+ {
6544
+ "epoch": 0.45818846311864947,
6545
+ "grad_norm": 1.8828125,
6546
+ "learning_rate": 9.915628824149564e-05,
6547
+ "loss": 9.3418,
6548
+ "step": 9340
6549
+ },
6550
+ {
6551
+ "epoch": 0.45867902892498635,
6552
+ "grad_norm": 1.0546875,
6553
+ "learning_rate": 9.91510594405338e-05,
6554
+ "loss": 9.5231,
6555
+ "step": 9350
6556
+ },
6557
+ {
6558
+ "epoch": 0.45916959473132324,
6559
+ "grad_norm": 0.93359375,
6560
+ "learning_rate": 9.914581462587712e-05,
6561
+ "loss": 9.2647,
6562
+ "step": 9360
6563
+ },
6564
+ {
6565
+ "epoch": 0.4596601605376601,
6566
+ "grad_norm": 1.1484375,
6567
+ "learning_rate": 9.914055379923442e-05,
6568
+ "loss": 9.6544,
6569
+ "step": 9370
6570
+ },
6571
+ {
6572
+ "epoch": 0.460150726343997,
6573
+ "grad_norm": 2.015625,
6574
+ "learning_rate": 9.913527696231969e-05,
6575
+ "loss": 9.1867,
6576
+ "step": 9380
6577
+ },
6578
+ {
6579
+ "epoch": 0.4606412921503339,
6580
+ "grad_norm": 1.125,
6581
+ "learning_rate": 9.912998411685216e-05,
6582
+ "loss": 9.5706,
6583
+ "step": 9390
6584
+ },
6585
+ {
6586
+ "epoch": 0.4611318579566708,
6587
+ "grad_norm": 0.9375,
6588
+ "learning_rate": 9.912467526455626e-05,
6589
+ "loss": 9.2712,
6590
+ "step": 9400
6591
+ },
6592
+ {
6593
+ "epoch": 0.46162242376300766,
6594
+ "grad_norm": 1.15625,
6595
+ "learning_rate": 9.911935040716169e-05,
6596
+ "loss": 9.6751,
6597
+ "step": 9410
6598
+ },
6599
+ {
6600
+ "epoch": 0.46211298956934455,
6601
+ "grad_norm": 1.265625,
6602
+ "learning_rate": 9.911400954640325e-05,
6603
+ "loss": 9.2709,
6604
+ "step": 9420
6605
+ },
6606
+ {
6607
+ "epoch": 0.46260355537568143,
6608
+ "grad_norm": 1.6953125,
6609
+ "learning_rate": 9.910865268402106e-05,
6610
+ "loss": 9.6717,
6611
+ "step": 9430
6612
+ },
6613
+ {
6614
+ "epoch": 0.4630941211820183,
6615
+ "grad_norm": 1.0625,
6616
+ "learning_rate": 9.910327982176042e-05,
6617
+ "loss": 8.833,
6618
+ "step": 9440
6619
+ },
6620
+ {
6621
+ "epoch": 0.4635846869883552,
6622
+ "grad_norm": 1.3125,
6623
+ "learning_rate": 9.909789096137184e-05,
6624
+ "loss": 9.7037,
6625
+ "step": 9450
6626
+ },
6627
+ {
6628
+ "epoch": 0.4640752527946921,
6629
+ "grad_norm": 1.21875,
6630
+ "learning_rate": 9.909248610461101e-05,
6631
+ "loss": 9.3282,
6632
+ "step": 9460
6633
+ },
6634
+ {
6635
+ "epoch": 0.46456581860102897,
6636
+ "grad_norm": 1.4921875,
6637
+ "learning_rate": 9.90870652532389e-05,
6638
+ "loss": 9.7322,
6639
+ "step": 9470
6640
+ },
6641
+ {
6642
+ "epoch": 0.46505638440736585,
6643
+ "grad_norm": 2.609375,
6644
+ "learning_rate": 9.908162840902163e-05,
6645
+ "loss": 9.5577,
6646
+ "step": 9480
6647
+ },
6648
+ {
6649
+ "epoch": 0.46554695021370274,
6650
+ "grad_norm": 1.3671875,
6651
+ "learning_rate": 9.907617557373057e-05,
6652
+ "loss": 9.5126,
6653
+ "step": 9490
6654
+ },
6655
+ {
6656
+ "epoch": 0.4660375160200396,
6657
+ "grad_norm": 1.5546875,
6658
+ "learning_rate": 9.907070674914228e-05,
6659
+ "loss": 9.4535,
6660
+ "step": 9500
6661
+ },
6662
+ {
6663
+ "epoch": 0.4665280818263765,
6664
+ "grad_norm": 1.0390625,
6665
+ "learning_rate": 9.906522193703853e-05,
6666
+ "loss": 9.1627,
6667
+ "step": 9510
6668
+ },
6669
+ {
6670
+ "epoch": 0.4670186476327134,
6671
+ "grad_norm": 1.15625,
6672
+ "learning_rate": 9.905972113920632e-05,
6673
+ "loss": 8.9282,
6674
+ "step": 9520
6675
+ },
6676
+ {
6677
+ "epoch": 0.4675092134390503,
6678
+ "grad_norm": 1.8828125,
6679
+ "learning_rate": 9.905420435743782e-05,
6680
+ "loss": 9.3116,
6681
+ "step": 9530
6682
+ },
6683
+ {
6684
+ "epoch": 0.46799977924538716,
6685
+ "grad_norm": 1.1875,
6686
+ "learning_rate": 9.904867159353041e-05,
6687
+ "loss": 9.3445,
6688
+ "step": 9540
6689
+ },
6690
+ {
6691
+ "epoch": 0.46849034505172404,
6692
+ "grad_norm": 1.3984375,
6693
+ "learning_rate": 9.904312284928677e-05,
6694
+ "loss": 8.8062,
6695
+ "step": 9550
6696
+ },
6697
+ {
6698
+ "epoch": 0.46898091085806093,
6699
+ "grad_norm": 1.7109375,
6700
+ "learning_rate": 9.903755812651463e-05,
6701
+ "loss": 9.8019,
6702
+ "step": 9560
6703
+ },
6704
+ {
6705
+ "epoch": 0.4694714766643978,
6706
+ "grad_norm": 1.4140625,
6707
+ "learning_rate": 9.903197742702706e-05,
6708
+ "loss": 9.7276,
6709
+ "step": 9570
6710
+ },
6711
+ {
6712
+ "epoch": 0.4699620424707347,
6713
+ "grad_norm": 1.4140625,
6714
+ "learning_rate": 9.902638075264226e-05,
6715
+ "loss": 8.9828,
6716
+ "step": 9580
6717
+ },
6718
+ {
6719
+ "epoch": 0.4704526082770716,
6720
+ "grad_norm": 1.0234375,
6721
+ "learning_rate": 9.902076810518366e-05,
6722
+ "loss": 9.4312,
6723
+ "step": 9590
6724
+ },
6725
+ {
6726
+ "epoch": 0.47094317408340847,
6727
+ "grad_norm": 1.6875,
6728
+ "learning_rate": 9.90151394864799e-05,
6729
+ "loss": 9.4076,
6730
+ "step": 9600
6731
+ },
6732
+ {
6733
+ "epoch": 0.47143373988974535,
6734
+ "grad_norm": 1.265625,
6735
+ "learning_rate": 9.900949489836482e-05,
6736
+ "loss": 9.443,
6737
+ "step": 9610
6738
+ },
6739
+ {
6740
+ "epoch": 0.47192430569608224,
6741
+ "grad_norm": 0.9765625,
6742
+ "learning_rate": 9.900383434267745e-05,
6743
+ "loss": 9.1485,
6744
+ "step": 9620
6745
+ },
6746
+ {
6747
+ "epoch": 0.4724148715024191,
6748
+ "grad_norm": 1.390625,
6749
+ "learning_rate": 9.899815782126203e-05,
6750
+ "loss": 9.0047,
6751
+ "step": 9630
6752
+ },
6753
+ {
6754
+ "epoch": 0.472905437308756,
6755
+ "grad_norm": 1.2421875,
6756
+ "learning_rate": 9.8992465335968e-05,
6757
+ "loss": 9.5106,
6758
+ "step": 9640
6759
+ },
6760
+ {
6761
+ "epoch": 0.4733960031150929,
6762
+ "grad_norm": 1.2734375,
6763
+ "learning_rate": 9.898675688865004e-05,
6764
+ "loss": 8.8625,
6765
+ "step": 9650
6766
+ },
6767
+ {
6768
+ "epoch": 0.4738865689214298,
6769
+ "grad_norm": 1.578125,
6770
+ "learning_rate": 9.898103248116795e-05,
6771
+ "loss": 9.7804,
6772
+ "step": 9660
6773
+ },
6774
+ {
6775
+ "epoch": 0.47437713472776666,
6776
+ "grad_norm": 1.3203125,
6777
+ "learning_rate": 9.897529211538678e-05,
6778
+ "loss": 9.1752,
6779
+ "step": 9670
6780
+ },
6781
+ {
6782
+ "epoch": 0.47486770053410354,
6783
+ "grad_norm": 1.6015625,
6784
+ "learning_rate": 9.896953579317679e-05,
6785
+ "loss": 9.4863,
6786
+ "step": 9680
6787
+ },
6788
+ {
6789
+ "epoch": 0.4753582663404404,
6790
+ "grad_norm": 0.9296875,
6791
+ "learning_rate": 9.89637635164134e-05,
6792
+ "loss": 9.5613,
6793
+ "step": 9690
6794
+ },
6795
+ {
6796
+ "epoch": 0.4758488321467773,
6797
+ "grad_norm": 1.6796875,
6798
+ "learning_rate": 9.89579752869773e-05,
6799
+ "loss": 9.3063,
6800
+ "step": 9700
6801
+ },
6802
+ {
6803
+ "epoch": 0.4763393979531142,
6804
+ "grad_norm": 1.28125,
6805
+ "learning_rate": 9.895217110675428e-05,
6806
+ "loss": 9.356,
6807
+ "step": 9710
6808
+ },
6809
+ {
6810
+ "epoch": 0.4768299637594511,
6811
+ "grad_norm": 1.4609375,
6812
+ "learning_rate": 9.894635097763538e-05,
6813
+ "loss": 9.4519,
6814
+ "step": 9720
6815
+ },
6816
+ {
6817
+ "epoch": 0.47732052956578797,
6818
+ "grad_norm": 1.1484375,
6819
+ "learning_rate": 9.894051490151686e-05,
6820
+ "loss": 9.8951,
6821
+ "step": 9730
6822
+ },
6823
+ {
6824
+ "epoch": 0.47781109537212485,
6825
+ "grad_norm": 1.703125,
6826
+ "learning_rate": 9.893466288030011e-05,
6827
+ "loss": 9.0165,
6828
+ "step": 9740
6829
+ },
6830
+ {
6831
+ "epoch": 0.47830166117846173,
6832
+ "grad_norm": 1.625,
6833
+ "learning_rate": 9.892879491589179e-05,
6834
+ "loss": 9.2397,
6835
+ "step": 9750
6836
+ },
6837
+ {
6838
+ "epoch": 0.4787922269847986,
6839
+ "grad_norm": 1.875,
6840
+ "learning_rate": 9.892291101020368e-05,
6841
+ "loss": 9.1272,
6842
+ "step": 9760
6843
+ },
6844
+ {
6845
+ "epoch": 0.4792827927911355,
6846
+ "grad_norm": 1.1796875,
6847
+ "learning_rate": 9.891701116515282e-05,
6848
+ "loss": 9.2885,
6849
+ "step": 9770
6850
+ },
6851
+ {
6852
+ "epoch": 0.47977335859747233,
6853
+ "grad_norm": 1.5390625,
6854
+ "learning_rate": 9.89110953826614e-05,
6855
+ "loss": 10.1661,
6856
+ "step": 9780
6857
+ },
6858
+ {
6859
+ "epoch": 0.4802639244038092,
6860
+ "grad_norm": 0.8515625,
6861
+ "learning_rate": 9.890516366465678e-05,
6862
+ "loss": 9.3322,
6863
+ "step": 9790
6864
+ },
6865
+ {
6866
+ "epoch": 0.4807544902101461,
6867
+ "grad_norm": 1.28125,
6868
+ "learning_rate": 9.889921601307161e-05,
6869
+ "loss": 9.2185,
6870
+ "step": 9800
6871
+ },
6872
+ {
6873
+ "epoch": 0.481245056016483,
6874
+ "grad_norm": 1.046875,
6875
+ "learning_rate": 9.889325242984365e-05,
6876
+ "loss": 9.7809,
6877
+ "step": 9810
6878
+ },
6879
+ {
6880
+ "epoch": 0.48173562182281987,
6881
+ "grad_norm": 0.9765625,
6882
+ "learning_rate": 9.888727291691584e-05,
6883
+ "loss": 9.2243,
6884
+ "step": 9820
6885
+ },
6886
+ {
6887
+ "epoch": 0.48222618762915676,
6888
+ "grad_norm": 1.6875,
6889
+ "learning_rate": 9.888127747623637e-05,
6890
+ "loss": 9.6819,
6891
+ "step": 9830
6892
+ },
6893
+ {
6894
+ "epoch": 0.48271675343549364,
6895
+ "grad_norm": 1.7578125,
6896
+ "learning_rate": 9.887526610975857e-05,
6897
+ "loss": 9.172,
6898
+ "step": 9840
6899
+ },
6900
+ {
6901
+ "epoch": 0.4832073192418305,
6902
+ "grad_norm": 1.1171875,
6903
+ "learning_rate": 9.8869238819441e-05,
6904
+ "loss": 9.2786,
6905
+ "step": 9850
6906
+ },
6907
+ {
6908
+ "epoch": 0.4836978850481674,
6909
+ "grad_norm": 1.9296875,
6910
+ "learning_rate": 9.886319560724735e-05,
6911
+ "loss": 9.4504,
6912
+ "step": 9860
6913
+ },
6914
+ {
6915
+ "epoch": 0.4841884508545043,
6916
+ "grad_norm": 1.6640625,
6917
+ "learning_rate": 9.885713647514658e-05,
6918
+ "loss": 10.0407,
6919
+ "step": 9870
6920
+ },
6921
+ {
6922
+ "epoch": 0.4846790166608412,
6923
+ "grad_norm": 0.9765625,
6924
+ "learning_rate": 9.885106142511275e-05,
6925
+ "loss": 8.9902,
6926
+ "step": 9880
6927
+ },
6928
+ {
6929
+ "epoch": 0.48516958246717806,
6930
+ "grad_norm": 1.0234375,
6931
+ "learning_rate": 9.884497045912515e-05,
6932
+ "loss": 8.8224,
6933
+ "step": 9890
6934
+ },
6935
+ {
6936
+ "epoch": 0.48566014827351495,
6937
+ "grad_norm": 1.4453125,
6938
+ "learning_rate": 9.883886357916828e-05,
6939
+ "loss": 9.893,
6940
+ "step": 9900
6941
+ },
6942
+ {
6943
+ "epoch": 0.48615071407985183,
6944
+ "grad_norm": 1.2890625,
6945
+ "learning_rate": 9.883274078723177e-05,
6946
+ "loss": 9.0176,
6947
+ "step": 9910
6948
+ },
6949
+ {
6950
+ "epoch": 0.4866412798861887,
6951
+ "grad_norm": 3.359375,
6952
+ "learning_rate": 9.882660208531046e-05,
6953
+ "loss": 9.8512,
6954
+ "step": 9920
6955
+ },
6956
+ {
6957
+ "epoch": 0.4871318456925256,
6958
+ "grad_norm": 1.1640625,
6959
+ "learning_rate": 9.882044747540439e-05,
6960
+ "loss": 9.3034,
6961
+ "step": 9930
6962
+ },
6963
+ {
6964
+ "epoch": 0.4876224114988625,
6965
+ "grad_norm": 1.265625,
6966
+ "learning_rate": 9.881427695951875e-05,
6967
+ "loss": 9.5191,
6968
+ "step": 9940
6969
+ },
6970
+ {
6971
+ "epoch": 0.48811297730519937,
6972
+ "grad_norm": 1.1796875,
6973
+ "learning_rate": 9.880809053966395e-05,
6974
+ "loss": 9.4207,
6975
+ "step": 9950
6976
+ },
6977
+ {
6978
+ "epoch": 0.48860354311153625,
6979
+ "grad_norm": 1.3828125,
6980
+ "learning_rate": 9.880188821785554e-05,
6981
+ "loss": 9.2694,
6982
+ "step": 9960
6983
+ },
6984
+ {
6985
+ "epoch": 0.48909410891787314,
6986
+ "grad_norm": 1.609375,
6987
+ "learning_rate": 9.879566999611429e-05,
6988
+ "loss": 9.7303,
6989
+ "step": 9970
6990
+ },
6991
+ {
6992
+ "epoch": 0.48958467472421,
6993
+ "grad_norm": 1.3515625,
6994
+ "learning_rate": 9.878943587646611e-05,
6995
+ "loss": 9.449,
6996
+ "step": 9980
6997
+ },
6998
+ {
6999
+ "epoch": 0.4900752405305469,
7000
+ "grad_norm": 1.6484375,
7001
+ "learning_rate": 9.878318586094213e-05,
7002
+ "loss": 9.3069,
7003
+ "step": 9990
7004
+ },
7005
+ {
7006
+ "epoch": 0.4905658063368838,
7007
+ "grad_norm": 1.2734375,
7008
+ "learning_rate": 9.877691995157862e-05,
7009
+ "loss": 9.0935,
7010
+ "step": 10000
7011
  }
7012
  ],
7013
  "logging_steps": 10,
 
7027
  "attributes": {}
7028
  }
7029
  },
7030
+ "total_flos": 2.69016216109056e+19,
7031
  "train_batch_size": 2,
7032
  "trial_name": null,
7033
  "trial_params": null
checkpoints/{checkpoint-6000 β†’ checkpoint-10000}/training_args.bin RENAMED
File without changes
logs/events.out.tfevents.1752345702.5f638d684589.1618.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71fe7fff984ce23a79dba89725d66ffe54461fe305b846e476ba39c7254d0071
3
- size 174047
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6ccf911ef8ac8ed9bc3e58f256b3fe099ed727d89e49446ed0edb60994b81b3
3
+ size 216247