Upload 10 files
Browse files- model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +1095 -3
- training_args.bin +1 -1
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 598635032
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f341e7c0d50547a5d48a2244cc30330ab7ed2ceaff5186455a531e8c69a77105
|
| 3 |
size 598635032
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1197359627
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2b47ad41e6a351695e914e54e9b102721f48985ba894d88bae93aad1de73672f
|
| 3 |
size 1197359627
|
rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0065f5fa67d21a3e3251b9235347d2a9d93494140e986cefd3a276ca1160a3e0
|
| 3 |
size 14645
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ad1df73ab0092710b52025da1ad2250f73bf46d66d45d561b7da8dfce44525e
|
| 3 |
size 1465
|
trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 1000,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -28719,6 +28719,1098 @@
|
|
| 28719 |
"eval_samples_per_second": 196.867,
|
| 28720 |
"eval_steps_per_second": 1.545,
|
| 28721 |
"step": 368000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28722 |
}
|
| 28723 |
],
|
| 28724 |
"logging_steps": 100,
|
|
@@ -28738,7 +29830,7 @@
|
|
| 28738 |
"attributes": {}
|
| 28739 |
}
|
| 28740 |
},
|
| 28741 |
-
"total_flos": 3.
|
| 28742 |
"train_batch_size": 128,
|
| 28743 |
"trial_name": null,
|
| 28744 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 117.000154,
|
| 6 |
"eval_steps": 1000,
|
| 7 |
+
"global_step": 382000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 28719 |
"eval_samples_per_second": 196.867,
|
| 28720 |
"eval_steps_per_second": 1.545,
|
| 28721 |
"step": 368000
|
| 28722 |
+
},
|
| 28723 |
+
{
|
| 28724 |
+
"epoch": 0.0002,
|
| 28725 |
+
"grad_norm": 1.7583304643630981,
|
| 28726 |
+
"learning_rate": 8.378246507831702e-06,
|
| 28727 |
+
"loss": 2.0821,
|
| 28728 |
+
"step": 368100
|
| 28729 |
+
},
|
| 28730 |
+
{
|
| 28731 |
+
"epoch": 1.000162,
|
| 28732 |
+
"grad_norm": 1.6928149461746216,
|
| 28733 |
+
"learning_rate": 8.366398135029847e-06,
|
| 28734 |
+
"loss": 1.9175,
|
| 28735 |
+
"step": 368200
|
| 28736 |
+
},
|
| 28737 |
+
{
|
| 28738 |
+
"epoch": 2.000124,
|
| 28739 |
+
"grad_norm": 1.6941829919815063,
|
| 28740 |
+
"learning_rate": 8.354556462240829e-06,
|
| 28741 |
+
"loss": 1.8645,
|
| 28742 |
+
"step": 368300
|
| 28743 |
+
},
|
| 28744 |
+
{
|
| 28745 |
+
"epoch": 3.000086,
|
| 28746 |
+
"grad_norm": 1.6606141328811646,
|
| 28747 |
+
"learning_rate": 8.342721494234487e-06,
|
| 28748 |
+
"loss": 1.8296,
|
| 28749 |
+
"step": 368400
|
| 28750 |
+
},
|
| 28751 |
+
{
|
| 28752 |
+
"epoch": 4.000048,
|
| 28753 |
+
"grad_norm": 1.508047342300415,
|
| 28754 |
+
"learning_rate": 8.330893235777929e-06,
|
| 28755 |
+
"loss": 1.7982,
|
| 28756 |
+
"step": 368500
|
| 28757 |
+
},
|
| 28758 |
+
{
|
| 28759 |
+
"epoch": 5.00001,
|
| 28760 |
+
"grad_norm": 1.6567221879959106,
|
| 28761 |
+
"learning_rate": 8.31907169163558e-06,
|
| 28762 |
+
"loss": 1.776,
|
| 28763 |
+
"step": 368600
|
| 28764 |
+
},
|
| 28765 |
+
{
|
| 28766 |
+
"epoch": 5.00021,
|
| 28767 |
+
"grad_norm": 1.5388526916503906,
|
| 28768 |
+
"learning_rate": 8.30725686656916e-06,
|
| 28769 |
+
"loss": 1.7492,
|
| 28770 |
+
"step": 368700
|
| 28771 |
+
},
|
| 28772 |
+
{
|
| 28773 |
+
"epoch": 6.000172,
|
| 28774 |
+
"grad_norm": 1.6148278713226318,
|
| 28775 |
+
"learning_rate": 8.295448765337685e-06,
|
| 28776 |
+
"loss": 1.7284,
|
| 28777 |
+
"step": 368800
|
| 28778 |
+
},
|
| 28779 |
+
{
|
| 28780 |
+
"epoch": 7.000134,
|
| 28781 |
+
"grad_norm": 1.5249569416046143,
|
| 28782 |
+
"learning_rate": 8.28364739269744e-06,
|
| 28783 |
+
"loss": 1.7221,
|
| 28784 |
+
"step": 368900
|
| 28785 |
+
},
|
| 28786 |
+
{
|
| 28787 |
+
"epoch": 8.000096,
|
| 28788 |
+
"grad_norm": 1.5550845861434937,
|
| 28789 |
+
"learning_rate": 8.271852753402028e-06,
|
| 28790 |
+
"loss": 1.7079,
|
| 28791 |
+
"step": 369000
|
| 28792 |
+
},
|
| 28793 |
+
{
|
| 28794 |
+
"epoch": 8.000096,
|
| 28795 |
+
"eval_loss": 1.9369168281555176,
|
| 28796 |
+
"eval_runtime": 55.0617,
|
| 28797 |
+
"eval_samples_per_second": 185.138,
|
| 28798 |
+
"eval_steps_per_second": 1.453,
|
| 28799 |
+
"step": 369000
|
| 28800 |
+
},
|
| 28801 |
+
{
|
| 28802 |
+
"epoch": 9.000058,
|
| 28803 |
+
"grad_norm": 2.09401273727417,
|
| 28804 |
+
"learning_rate": 8.260064852202329e-06,
|
| 28805 |
+
"loss": 3.9424,
|
| 28806 |
+
"step": 369100
|
| 28807 |
+
},
|
| 28808 |
+
{
|
| 28809 |
+
"epoch": 10.00002,
|
| 28810 |
+
"grad_norm": 1.9706476926803589,
|
| 28811 |
+
"learning_rate": 8.248283693846509e-06,
|
| 28812 |
+
"loss": 2.7687,
|
| 28813 |
+
"step": 369200
|
| 28814 |
+
},
|
| 28815 |
+
{
|
| 28816 |
+
"epoch": 10.00022,
|
| 28817 |
+
"grad_norm": 2.0509135723114014,
|
| 28818 |
+
"learning_rate": 8.23650928308001e-06,
|
| 28819 |
+
"loss": 2.546,
|
| 28820 |
+
"step": 369300
|
| 28821 |
+
},
|
| 28822 |
+
{
|
| 28823 |
+
"epoch": 11.000182,
|
| 28824 |
+
"grad_norm": 1.9125868082046509,
|
| 28825 |
+
"learning_rate": 8.224741624645565e-06,
|
| 28826 |
+
"loss": 2.4164,
|
| 28827 |
+
"step": 369400
|
| 28828 |
+
},
|
| 28829 |
+
{
|
| 28830 |
+
"epoch": 12.000144,
|
| 28831 |
+
"grad_norm": 2.175070285797119,
|
| 28832 |
+
"learning_rate": 8.212980723283186e-06,
|
| 28833 |
+
"loss": 2.3405,
|
| 28834 |
+
"step": 369500
|
| 28835 |
+
},
|
| 28836 |
+
{
|
| 28837 |
+
"epoch": 13.000106,
|
| 28838 |
+
"grad_norm": 1.9154648780822754,
|
| 28839 |
+
"learning_rate": 8.201226583730175e-06,
|
| 28840 |
+
"loss": 2.2729,
|
| 28841 |
+
"step": 369600
|
| 28842 |
+
},
|
| 28843 |
+
{
|
| 28844 |
+
"epoch": 14.000068,
|
| 28845 |
+
"grad_norm": 2.021451711654663,
|
| 28846 |
+
"learning_rate": 8.189479210721076e-06,
|
| 28847 |
+
"loss": 2.2268,
|
| 28848 |
+
"step": 369700
|
| 28849 |
+
},
|
| 28850 |
+
{
|
| 28851 |
+
"epoch": 15.00003,
|
| 28852 |
+
"grad_norm": 2.0009710788726807,
|
| 28853 |
+
"learning_rate": 8.177738608987745e-06,
|
| 28854 |
+
"loss": 2.1859,
|
| 28855 |
+
"step": 369800
|
| 28856 |
+
},
|
| 28857 |
+
{
|
| 28858 |
+
"epoch": 15.00023,
|
| 28859 |
+
"grad_norm": 1.9311867952346802,
|
| 28860 |
+
"learning_rate": 8.166004783259295e-06,
|
| 28861 |
+
"loss": 2.1494,
|
| 28862 |
+
"step": 369900
|
| 28863 |
+
},
|
| 28864 |
+
{
|
| 28865 |
+
"epoch": 16.000192,
|
| 28866 |
+
"grad_norm": 1.967115044593811,
|
| 28867 |
+
"learning_rate": 8.154277738262097e-06,
|
| 28868 |
+
"loss": 2.1181,
|
| 28869 |
+
"step": 370000
|
| 28870 |
+
},
|
| 28871 |
+
{
|
| 28872 |
+
"epoch": 16.000192,
|
| 28873 |
+
"eval_loss": 2.407406806945801,
|
| 28874 |
+
"eval_runtime": 54.9275,
|
| 28875 |
+
"eval_samples_per_second": 185.59,
|
| 28876 |
+
"eval_steps_per_second": 1.456,
|
| 28877 |
+
"step": 370000
|
| 28878 |
+
},
|
| 28879 |
+
{
|
| 28880 |
+
"epoch": 17.000154,
|
| 28881 |
+
"grad_norm": 2.050703525543213,
|
| 28882 |
+
"learning_rate": 8.142557478719814e-06,
|
| 28883 |
+
"loss": 2.496,
|
| 28884 |
+
"step": 370100
|
| 28885 |
+
},
|
| 28886 |
+
{
|
| 28887 |
+
"epoch": 18.000116,
|
| 28888 |
+
"grad_norm": 2.053346872329712,
|
| 28889 |
+
"learning_rate": 8.130844009353362e-06,
|
| 28890 |
+
"loss": 2.3323,
|
| 28891 |
+
"step": 370200
|
| 28892 |
+
},
|
| 28893 |
+
{
|
| 28894 |
+
"epoch": 19.000078,
|
| 28895 |
+
"grad_norm": 1.9913196563720703,
|
| 28896 |
+
"learning_rate": 8.119137334880933e-06,
|
| 28897 |
+
"loss": 2.2625,
|
| 28898 |
+
"step": 370300
|
| 28899 |
+
},
|
| 28900 |
+
{
|
| 28901 |
+
"epoch": 20.00004,
|
| 28902 |
+
"grad_norm": 2.018827438354492,
|
| 28903 |
+
"learning_rate": 8.107437460017958e-06,
|
| 28904 |
+
"loss": 2.2166,
|
| 28905 |
+
"step": 370400
|
| 28906 |
+
},
|
| 28907 |
+
{
|
| 28908 |
+
"epoch": 21.000002,
|
| 28909 |
+
"grad_norm": 2.2157461643218994,
|
| 28910 |
+
"learning_rate": 8.095744389477155e-06,
|
| 28911 |
+
"loss": 2.1759,
|
| 28912 |
+
"step": 370500
|
| 28913 |
+
},
|
| 28914 |
+
{
|
| 28915 |
+
"epoch": 21.000202,
|
| 28916 |
+
"grad_norm": 1.975722312927246,
|
| 28917 |
+
"learning_rate": 8.084058127968497e-06,
|
| 28918 |
+
"loss": 2.1349,
|
| 28919 |
+
"step": 370600
|
| 28920 |
+
},
|
| 28921 |
+
{
|
| 28922 |
+
"epoch": 22.000164,
|
| 28923 |
+
"grad_norm": 2.118351459503174,
|
| 28924 |
+
"learning_rate": 8.072378680199197e-06,
|
| 28925 |
+
"loss": 2.1051,
|
| 28926 |
+
"step": 370700
|
| 28927 |
+
},
|
| 28928 |
+
{
|
| 28929 |
+
"epoch": 23.000126,
|
| 28930 |
+
"grad_norm": 1.9632095098495483,
|
| 28931 |
+
"learning_rate": 8.060706050873746e-06,
|
| 28932 |
+
"loss": 2.0781,
|
| 28933 |
+
"step": 370800
|
| 28934 |
+
},
|
| 28935 |
+
{
|
| 28936 |
+
"epoch": 24.000088,
|
| 28937 |
+
"grad_norm": 2.0141265392303467,
|
| 28938 |
+
"learning_rate": 8.049040244693864e-06,
|
| 28939 |
+
"loss": 2.0583,
|
| 28940 |
+
"step": 370900
|
| 28941 |
+
},
|
| 28942 |
+
{
|
| 28943 |
+
"epoch": 25.00005,
|
| 28944 |
+
"grad_norm": 2.0297467708587646,
|
| 28945 |
+
"learning_rate": 8.037381266358546e-06,
|
| 28946 |
+
"loss": 2.0323,
|
| 28947 |
+
"step": 371000
|
| 28948 |
+
},
|
| 28949 |
+
{
|
| 28950 |
+
"epoch": 25.00005,
|
| 28951 |
+
"eval_loss": 2.3608412742614746,
|
| 28952 |
+
"eval_runtime": 55.107,
|
| 28953 |
+
"eval_samples_per_second": 184.986,
|
| 28954 |
+
"eval_steps_per_second": 1.452,
|
| 28955 |
+
"step": 371000
|
| 28956 |
+
},
|
| 28957 |
+
{
|
| 28958 |
+
"epoch": 26.000012,
|
| 28959 |
+
"grad_norm": 2.0017659664154053,
|
| 28960 |
+
"learning_rate": 8.025729120564025e-06,
|
| 28961 |
+
"loss": 2.2111,
|
| 28962 |
+
"step": 371100
|
| 28963 |
+
},
|
| 28964 |
+
{
|
| 28965 |
+
"epoch": 26.000212,
|
| 28966 |
+
"grad_norm": 2.087977409362793,
|
| 28967 |
+
"learning_rate": 8.01408381200379e-06,
|
| 28968 |
+
"loss": 2.1626,
|
| 28969 |
+
"step": 371200
|
| 28970 |
+
},
|
| 28971 |
+
{
|
| 28972 |
+
"epoch": 27.000174,
|
| 28973 |
+
"grad_norm": 1.9115463495254517,
|
| 28974 |
+
"learning_rate": 8.002445345368556e-06,
|
| 28975 |
+
"loss": 2.1198,
|
| 28976 |
+
"step": 371300
|
| 28977 |
+
},
|
| 28978 |
+
{
|
| 28979 |
+
"epoch": 28.000136,
|
| 28980 |
+
"grad_norm": 2.075347423553467,
|
| 28981 |
+
"learning_rate": 7.990813725346307e-06,
|
| 28982 |
+
"loss": 2.0987,
|
| 28983 |
+
"step": 371400
|
| 28984 |
+
},
|
| 28985 |
+
{
|
| 28986 |
+
"epoch": 29.000098,
|
| 28987 |
+
"grad_norm": 2.004270553588867,
|
| 28988 |
+
"learning_rate": 7.979188956622263e-06,
|
| 28989 |
+
"loss": 2.0634,
|
| 28990 |
+
"step": 371500
|
| 28991 |
+
},
|
| 28992 |
+
{
|
| 28993 |
+
"epoch": 30.00006,
|
| 28994 |
+
"grad_norm": 2.0730834007263184,
|
| 28995 |
+
"learning_rate": 7.967571043878863e-06,
|
| 28996 |
+
"loss": 2.0421,
|
| 28997 |
+
"step": 371600
|
| 28998 |
+
},
|
| 28999 |
+
{
|
| 29000 |
+
"epoch": 31.000022,
|
| 29001 |
+
"grad_norm": 2.0204977989196777,
|
| 29002 |
+
"learning_rate": 7.955959991795809e-06,
|
| 29003 |
+
"loss": 2.0191,
|
| 29004 |
+
"step": 371700
|
| 29005 |
+
},
|
| 29006 |
+
{
|
| 29007 |
+
"epoch": 31.000222,
|
| 29008 |
+
"grad_norm": 1.9809165000915527,
|
| 29009 |
+
"learning_rate": 7.944355805050032e-06,
|
| 29010 |
+
"loss": 1.9979,
|
| 29011 |
+
"step": 371800
|
| 29012 |
+
},
|
| 29013 |
+
{
|
| 29014 |
+
"epoch": 32.000184,
|
| 29015 |
+
"grad_norm": 1.8896480798721313,
|
| 29016 |
+
"learning_rate": 7.932758488315705e-06,
|
| 29017 |
+
"loss": 1.9788,
|
| 29018 |
+
"step": 371900
|
| 29019 |
+
},
|
| 29020 |
+
{
|
| 29021 |
+
"epoch": 33.000146,
|
| 29022 |
+
"grad_norm": 1.8905068635940552,
|
| 29023 |
+
"learning_rate": 7.921168046264213e-06,
|
| 29024 |
+
"loss": 1.9646,
|
| 29025 |
+
"step": 372000
|
| 29026 |
+
},
|
| 29027 |
+
{
|
| 29028 |
+
"epoch": 33.000146,
|
| 29029 |
+
"eval_loss": 2.3312835693359375,
|
| 29030 |
+
"eval_runtime": 55.0336,
|
| 29031 |
+
"eval_samples_per_second": 185.232,
|
| 29032 |
+
"eval_steps_per_second": 1.454,
|
| 29033 |
+
"step": 372000
|
| 29034 |
+
},
|
| 29035 |
+
{
|
| 29036 |
+
"epoch": 34.000108,
|
| 29037 |
+
"grad_norm": 2.0993173122406006,
|
| 29038 |
+
"learning_rate": 7.909584483564187e-06,
|
| 29039 |
+
"loss": 2.0813,
|
| 29040 |
+
"step": 372100
|
| 29041 |
+
},
|
| 29042 |
+
{
|
| 29043 |
+
"epoch": 35.00007,
|
| 29044 |
+
"grad_norm": 2.0958781242370605,
|
| 29045 |
+
"learning_rate": 7.898007804881485e-06,
|
| 29046 |
+
"loss": 2.0596,
|
| 29047 |
+
"step": 372200
|
| 29048 |
+
},
|
| 29049 |
+
{
|
| 29050 |
+
"epoch": 36.000032,
|
| 29051 |
+
"grad_norm": 1.9180951118469238,
|
| 29052 |
+
"learning_rate": 7.886438014879205e-06,
|
| 29053 |
+
"loss": 2.0353,
|
| 29054 |
+
"step": 372300
|
| 29055 |
+
},
|
| 29056 |
+
{
|
| 29057 |
+
"epoch": 36.000232,
|
| 29058 |
+
"grad_norm": 2.0129170417785645,
|
| 29059 |
+
"learning_rate": 7.874875118217639e-06,
|
| 29060 |
+
"loss": 2.007,
|
| 29061 |
+
"step": 372400
|
| 29062 |
+
},
|
| 29063 |
+
{
|
| 29064 |
+
"epoch": 37.000194,
|
| 29065 |
+
"grad_norm": 1.9586989879608154,
|
| 29066 |
+
"learning_rate": 7.863319119554325e-06,
|
| 29067 |
+
"loss": 1.9911,
|
| 29068 |
+
"step": 372500
|
| 29069 |
+
},
|
| 29070 |
+
{
|
| 29071 |
+
"epoch": 38.000156,
|
| 29072 |
+
"grad_norm": 2.0036728382110596,
|
| 29073 |
+
"learning_rate": 7.851770023544022e-06,
|
| 29074 |
+
"loss": 1.97,
|
| 29075 |
+
"step": 372600
|
| 29076 |
+
},
|
| 29077 |
+
{
|
| 29078 |
+
"epoch": 39.000118,
|
| 29079 |
+
"grad_norm": 2.0655548572540283,
|
| 29080 |
+
"learning_rate": 7.840227834838709e-06,
|
| 29081 |
+
"loss": 1.9609,
|
| 29082 |
+
"step": 372700
|
| 29083 |
+
},
|
| 29084 |
+
{
|
| 29085 |
+
"epoch": 40.00008,
|
| 29086 |
+
"grad_norm": 1.8536264896392822,
|
| 29087 |
+
"learning_rate": 7.828692558087566e-06,
|
| 29088 |
+
"loss": 1.9389,
|
| 29089 |
+
"step": 372800
|
| 29090 |
+
},
|
| 29091 |
+
{
|
| 29092 |
+
"epoch": 41.000042,
|
| 29093 |
+
"grad_norm": 2.0123019218444824,
|
| 29094 |
+
"learning_rate": 7.817164197937006e-06,
|
| 29095 |
+
"loss": 1.9311,
|
| 29096 |
+
"step": 372900
|
| 29097 |
+
},
|
| 29098 |
+
{
|
| 29099 |
+
"epoch": 42.000004,
|
| 29100 |
+
"grad_norm": 1.9356095790863037,
|
| 29101 |
+
"learning_rate": 7.80564275903066e-06,
|
| 29102 |
+
"loss": 1.9157,
|
| 29103 |
+
"step": 373000
|
| 29104 |
+
},
|
| 29105 |
+
{
|
| 29106 |
+
"epoch": 42.000004,
|
| 29107 |
+
"eval_loss": 2.2908835411071777,
|
| 29108 |
+
"eval_runtime": 54.8694,
|
| 29109 |
+
"eval_samples_per_second": 185.787,
|
| 29110 |
+
"eval_steps_per_second": 1.458,
|
| 29111 |
+
"step": 373000
|
| 29112 |
+
},
|
| 29113 |
+
{
|
| 29114 |
+
"epoch": 42.000204,
|
| 29115 |
+
"grad_norm": 1.9983534812927246,
|
| 29116 |
+
"learning_rate": 7.794128246009346e-06,
|
| 29117 |
+
"loss": 1.9932,
|
| 29118 |
+
"step": 373100
|
| 29119 |
+
},
|
| 29120 |
+
{
|
| 29121 |
+
"epoch": 43.000166,
|
| 29122 |
+
"grad_norm": 2.0036892890930176,
|
| 29123 |
+
"learning_rate": 7.782620663511117e-06,
|
| 29124 |
+
"loss": 1.9803,
|
| 29125 |
+
"step": 373200
|
| 29126 |
+
},
|
| 29127 |
+
{
|
| 29128 |
+
"epoch": 44.000128,
|
| 29129 |
+
"grad_norm": 1.9349839687347412,
|
| 29130 |
+
"learning_rate": 7.771120016171227e-06,
|
| 29131 |
+
"loss": 1.9687,
|
| 29132 |
+
"step": 373300
|
| 29133 |
+
},
|
| 29134 |
+
{
|
| 29135 |
+
"epoch": 45.00009,
|
| 29136 |
+
"grad_norm": 1.8848403692245483,
|
| 29137 |
+
"learning_rate": 7.759626308622142e-06,
|
| 29138 |
+
"loss": 1.9474,
|
| 29139 |
+
"step": 373400
|
| 29140 |
+
},
|
| 29141 |
+
{
|
| 29142 |
+
"epoch": 46.000052,
|
| 29143 |
+
"grad_norm": 1.9943233728408813,
|
| 29144 |
+
"learning_rate": 7.74813954549351e-06,
|
| 29145 |
+
"loss": 1.9319,
|
| 29146 |
+
"step": 373500
|
| 29147 |
+
},
|
| 29148 |
+
{
|
| 29149 |
+
"epoch": 47.000014,
|
| 29150 |
+
"grad_norm": 1.9002938270568848,
|
| 29151 |
+
"learning_rate": 7.736659731412204e-06,
|
| 29152 |
+
"loss": 1.9217,
|
| 29153 |
+
"step": 373600
|
| 29154 |
+
},
|
| 29155 |
+
{
|
| 29156 |
+
"epoch": 47.000214,
|
| 29157 |
+
"grad_norm": 1.9708117246627808,
|
| 29158 |
+
"learning_rate": 7.725186871002296e-06,
|
| 29159 |
+
"loss": 1.9083,
|
| 29160 |
+
"step": 373700
|
| 29161 |
+
},
|
| 29162 |
+
{
|
| 29163 |
+
"epoch": 48.000176,
|
| 29164 |
+
"grad_norm": 1.9721884727478027,
|
| 29165 |
+
"learning_rate": 7.713720968885057e-06,
|
| 29166 |
+
"loss": 1.8956,
|
| 29167 |
+
"step": 373800
|
| 29168 |
+
},
|
| 29169 |
+
{
|
| 29170 |
+
"epoch": 49.000138,
|
| 29171 |
+
"grad_norm": 1.9223700761795044,
|
| 29172 |
+
"learning_rate": 7.702262029678939e-06,
|
| 29173 |
+
"loss": 1.8808,
|
| 29174 |
+
"step": 373900
|
| 29175 |
+
},
|
| 29176 |
+
{
|
| 29177 |
+
"epoch": 50.0001,
|
| 29178 |
+
"grad_norm": 2.03428316116333,
|
| 29179 |
+
"learning_rate": 7.690810057999607e-06,
|
| 29180 |
+
"loss": 1.868,
|
| 29181 |
+
"step": 374000
|
| 29182 |
+
},
|
| 29183 |
+
{
|
| 29184 |
+
"epoch": 50.0001,
|
| 29185 |
+
"eval_loss": 2.2805299758911133,
|
| 29186 |
+
"eval_runtime": 55.0731,
|
| 29187 |
+
"eval_samples_per_second": 185.099,
|
| 29188 |
+
"eval_steps_per_second": 1.453,
|
| 29189 |
+
"step": 374000
|
| 29190 |
+
},
|
| 29191 |
+
{
|
| 29192 |
+
"epoch": 51.000062,
|
| 29193 |
+
"grad_norm": 1.947739601135254,
|
| 29194 |
+
"learning_rate": 7.67936505845991e-06,
|
| 29195 |
+
"loss": 1.9356,
|
| 29196 |
+
"step": 374100
|
| 29197 |
+
},
|
| 29198 |
+
{
|
| 29199 |
+
"epoch": 52.000024,
|
| 29200 |
+
"grad_norm": 1.939833164215088,
|
| 29201 |
+
"learning_rate": 7.667927035669906e-06,
|
| 29202 |
+
"loss": 1.9287,
|
| 29203 |
+
"step": 374200
|
| 29204 |
+
},
|
| 29205 |
+
{
|
| 29206 |
+
"epoch": 52.000224,
|
| 29207 |
+
"grad_norm": 2.120412588119507,
|
| 29208 |
+
"learning_rate": 7.656495994236813e-06,
|
| 29209 |
+
"loss": 1.9083,
|
| 29210 |
+
"step": 374300
|
| 29211 |
+
},
|
| 29212 |
+
{
|
| 29213 |
+
"epoch": 53.000186,
|
| 29214 |
+
"grad_norm": 1.9514408111572266,
|
| 29215 |
+
"learning_rate": 7.645071938765055e-06,
|
| 29216 |
+
"loss": 1.9005,
|
| 29217 |
+
"step": 374400
|
| 29218 |
+
},
|
| 29219 |
+
{
|
| 29220 |
+
"epoch": 54.000148,
|
| 29221 |
+
"grad_norm": 1.9537405967712402,
|
| 29222 |
+
"learning_rate": 7.633654873856258e-06,
|
| 29223 |
+
"loss": 1.8885,
|
| 29224 |
+
"step": 374500
|
| 29225 |
+
},
|
| 29226 |
+
{
|
| 29227 |
+
"epoch": 55.00011,
|
| 29228 |
+
"grad_norm": 1.9912673234939575,
|
| 29229 |
+
"learning_rate": 7.6222448041091884e-06,
|
| 29230 |
+
"loss": 1.8727,
|
| 29231 |
+
"step": 374600
|
| 29232 |
+
},
|
| 29233 |
+
{
|
| 29234 |
+
"epoch": 56.000072,
|
| 29235 |
+
"grad_norm": 2.0160086154937744,
|
| 29236 |
+
"learning_rate": 7.6108417341198366e-06,
|
| 29237 |
+
"loss": 1.8652,
|
| 29238 |
+
"step": 374700
|
| 29239 |
+
},
|
| 29240 |
+
{
|
| 29241 |
+
"epoch": 57.000034,
|
| 29242 |
+
"grad_norm": 1.962786078453064,
|
| 29243 |
+
"learning_rate": 7.599445668481353e-06,
|
| 29244 |
+
"loss": 1.8495,
|
| 29245 |
+
"step": 374800
|
| 29246 |
+
},
|
| 29247 |
+
{
|
| 29248 |
+
"epoch": 57.000234,
|
| 29249 |
+
"grad_norm": 2.0677285194396973,
|
| 29250 |
+
"learning_rate": 7.588056611784084e-06,
|
| 29251 |
+
"loss": 1.8414,
|
| 29252 |
+
"step": 374900
|
| 29253 |
+
},
|
| 29254 |
+
{
|
| 29255 |
+
"epoch": 58.000196,
|
| 29256 |
+
"grad_norm": 1.923409104347229,
|
| 29257 |
+
"learning_rate": 7.576674568615519e-06,
|
| 29258 |
+
"loss": 1.8278,
|
| 29259 |
+
"step": 375000
|
| 29260 |
+
},
|
| 29261 |
+
{
|
| 29262 |
+
"epoch": 58.000196,
|
| 29263 |
+
"eval_loss": 2.2644314765930176,
|
| 29264 |
+
"eval_runtime": 54.7576,
|
| 29265 |
+
"eval_samples_per_second": 186.166,
|
| 29266 |
+
"eval_steps_per_second": 1.461,
|
| 29267 |
+
"step": 375000
|
| 29268 |
+
},
|
| 29269 |
+
{
|
| 29270 |
+
"epoch": 59.000158,
|
| 29271 |
+
"grad_norm": 2.0004312992095947,
|
| 29272 |
+
"learning_rate": 7.565299543560353e-06,
|
| 29273 |
+
"loss": 1.8848,
|
| 29274 |
+
"step": 375100
|
| 29275 |
+
},
|
| 29276 |
+
{
|
| 29277 |
+
"epoch": 60.00012,
|
| 29278 |
+
"grad_norm": 2.0457980632781982,
|
| 29279 |
+
"learning_rate": 7.553931541200448e-06,
|
| 29280 |
+
"loss": 1.8788,
|
| 29281 |
+
"step": 375200
|
| 29282 |
+
},
|
| 29283 |
+
{
|
| 29284 |
+
"epoch": 61.000082,
|
| 29285 |
+
"grad_norm": 1.9472349882125854,
|
| 29286 |
+
"learning_rate": 7.54257056611484e-06,
|
| 29287 |
+
"loss": 1.8666,
|
| 29288 |
+
"step": 375300
|
| 29289 |
+
},
|
| 29290 |
+
{
|
| 29291 |
+
"epoch": 62.000044,
|
| 29292 |
+
"grad_norm": 2.019150733947754,
|
| 29293 |
+
"learning_rate": 7.531216622879711e-06,
|
| 29294 |
+
"loss": 1.8555,
|
| 29295 |
+
"step": 375400
|
| 29296 |
+
},
|
| 29297 |
+
{
|
| 29298 |
+
"epoch": 63.000006,
|
| 29299 |
+
"grad_norm": 1.9674944877624512,
|
| 29300 |
+
"learning_rate": 7.5198697160684365e-06,
|
| 29301 |
+
"loss": 1.8495,
|
| 29302 |
+
"step": 375500
|
| 29303 |
+
},
|
| 29304 |
+
{
|
| 29305 |
+
"epoch": 63.000206,
|
| 29306 |
+
"grad_norm": 1.959089756011963,
|
| 29307 |
+
"learning_rate": 7.5085298502515525e-06,
|
| 29308 |
+
"loss": 1.8353,
|
| 29309 |
+
"step": 375600
|
| 29310 |
+
},
|
| 29311 |
+
{
|
| 29312 |
+
"epoch": 64.000168,
|
| 29313 |
+
"grad_norm": 1.9350240230560303,
|
| 29314 |
+
"learning_rate": 7.4971970299967605e-06,
|
| 29315 |
+
"loss": 1.8257,
|
| 29316 |
+
"step": 375700
|
| 29317 |
+
},
|
| 29318 |
+
{
|
| 29319 |
+
"epoch": 65.00013,
|
| 29320 |
+
"grad_norm": 1.9134896993637085,
|
| 29321 |
+
"learning_rate": 7.4858712598689014e-06,
|
| 29322 |
+
"loss": 1.8124,
|
| 29323 |
+
"step": 375800
|
| 29324 |
+
},
|
| 29325 |
+
{
|
| 29326 |
+
"epoch": 66.000092,
|
| 29327 |
+
"grad_norm": 2.0086705684661865,
|
| 29328 |
+
"learning_rate": 7.474552544430008e-06,
|
| 29329 |
+
"loss": 1.8052,
|
| 29330 |
+
"step": 375900
|
| 29331 |
+
},
|
| 29332 |
+
{
|
| 29333 |
+
"epoch": 67.000054,
|
| 29334 |
+
"grad_norm": 1.9945427179336548,
|
| 29335 |
+
"learning_rate": 7.4632408882392504e-06,
|
| 29336 |
+
"loss": 1.8005,
|
| 29337 |
+
"step": 376000
|
| 29338 |
+
},
|
| 29339 |
+
{
|
| 29340 |
+
"epoch": 67.000054,
|
| 29341 |
+
"eval_loss": 2.248349189758301,
|
| 29342 |
+
"eval_runtime": 54.5876,
|
| 29343 |
+
"eval_samples_per_second": 186.746,
|
| 29344 |
+
"eval_steps_per_second": 1.466,
|
| 29345 |
+
"step": 376000
|
| 29346 |
+
},
|
| 29347 |
+
{
|
| 29348 |
+
"epoch": 68.000016,
|
| 29349 |
+
"grad_norm": 1.9743598699569702,
|
| 29350 |
+
"learning_rate": 7.451936295852976e-06,
|
| 29351 |
+
"loss": 1.8454,
|
| 29352 |
+
"step": 376100
|
| 29353 |
+
},
|
| 29354 |
+
{
|
| 29355 |
+
"epoch": 68.000216,
|
| 29356 |
+
"grad_norm": 1.898568034172058,
|
| 29357 |
+
"learning_rate": 7.440638771824654e-06,
|
| 29358 |
+
"loss": 1.8431,
|
| 29359 |
+
"step": 376200
|
| 29360 |
+
},
|
| 29361 |
+
{
|
| 29362 |
+
"epoch": 69.000178,
|
| 29363 |
+
"grad_norm": 2.142463445663452,
|
| 29364 |
+
"learning_rate": 7.429348320704935e-06,
|
| 29365 |
+
"loss": 1.8277,
|
| 29366 |
+
"step": 376300
|
| 29367 |
+
},
|
| 29368 |
+
{
|
| 29369 |
+
"epoch": 70.00014,
|
| 29370 |
+
"grad_norm": 1.9892468452453613,
|
| 29371 |
+
"learning_rate": 7.41806494704162e-06,
|
| 29372 |
+
"loss": 1.8119,
|
| 29373 |
+
"step": 376400
|
| 29374 |
+
},
|
| 29375 |
+
{
|
| 29376 |
+
"epoch": 71.000102,
|
| 29377 |
+
"grad_norm": 2.005885601043701,
|
| 29378 |
+
"learning_rate": 7.406788655379634e-06,
|
| 29379 |
+
"loss": 1.8086,
|
| 29380 |
+
"step": 376500
|
| 29381 |
+
},
|
| 29382 |
+
{
|
| 29383 |
+
"epoch": 72.000064,
|
| 29384 |
+
"grad_norm": 1.9385697841644287,
|
| 29385 |
+
"learning_rate": 7.395519450261074e-06,
|
| 29386 |
+
"loss": 1.8024,
|
| 29387 |
+
"step": 376600
|
| 29388 |
+
},
|
| 29389 |
+
{
|
| 29390 |
+
"epoch": 73.000026,
|
| 29391 |
+
"grad_norm": 1.9773157835006714,
|
| 29392 |
+
"learning_rate": 7.384257336225173e-06,
|
| 29393 |
+
"loss": 1.7934,
|
| 29394 |
+
"step": 376700
|
| 29395 |
+
},
|
| 29396 |
+
{
|
| 29397 |
+
"epoch": 73.000226,
|
| 29398 |
+
"grad_norm": 1.8618143796920776,
|
| 29399 |
+
"learning_rate": 7.373002317808317e-06,
|
| 29400 |
+
"loss": 1.7824,
|
| 29401 |
+
"step": 376800
|
| 29402 |
+
},
|
| 29403 |
+
{
|
| 29404 |
+
"epoch": 74.000188,
|
| 29405 |
+
"grad_norm": 1.9531538486480713,
|
| 29406 |
+
"learning_rate": 7.361754399544013e-06,
|
| 29407 |
+
"loss": 1.7727,
|
| 29408 |
+
"step": 376900
|
| 29409 |
+
},
|
| 29410 |
+
{
|
| 29411 |
+
"epoch": 75.00015,
|
| 29412 |
+
"grad_norm": 1.931515097618103,
|
| 29413 |
+
"learning_rate": 7.350513585962926e-06,
|
| 29414 |
+
"loss": 1.764,
|
| 29415 |
+
"step": 377000
|
| 29416 |
+
},
|
| 29417 |
+
{
|
| 29418 |
+
"epoch": 75.00015,
|
| 29419 |
+
"eval_loss": 2.2430500984191895,
|
| 29420 |
+
"eval_runtime": 54.6415,
|
| 29421 |
+
"eval_samples_per_second": 186.561,
|
| 29422 |
+
"eval_steps_per_second": 1.464,
|
| 29423 |
+
"step": 377000
|
| 29424 |
+
},
|
| 29425 |
+
{
|
| 29426 |
+
"epoch": 76.000112,
|
| 29427 |
+
"grad_norm": 1.9521348476409912,
|
| 29428 |
+
"learning_rate": 7.339279881592859e-06,
|
| 29429 |
+
"loss": 1.8087,
|
| 29430 |
+
"step": 377100
|
| 29431 |
+
},
|
| 29432 |
+
{
|
| 29433 |
+
"epoch": 77.000074,
|
| 29434 |
+
"grad_norm": 2.0013513565063477,
|
| 29435 |
+
"learning_rate": 7.32805329095875e-06,
|
| 29436 |
+
"loss": 1.8023,
|
| 29437 |
+
"step": 377200
|
| 29438 |
+
},
|
| 29439 |
+
{
|
| 29440 |
+
"epoch": 78.000036,
|
| 29441 |
+
"grad_norm": 1.8955408334732056,
|
| 29442 |
+
"learning_rate": 7.316833818582652e-06,
|
| 29443 |
+
"loss": 1.7943,
|
| 29444 |
+
"step": 377300
|
| 29445 |
+
},
|
| 29446 |
+
{
|
| 29447 |
+
"epoch": 78.000236,
|
| 29448 |
+
"grad_norm": 2.0025761127471924,
|
| 29449 |
+
"learning_rate": 7.305621468983781e-06,
|
| 29450 |
+
"loss": 1.7903,
|
| 29451 |
+
"step": 377400
|
| 29452 |
+
},
|
| 29453 |
+
{
|
| 29454 |
+
"epoch": 79.000198,
|
| 29455 |
+
"grad_norm": 1.9769165515899658,
|
| 29456 |
+
"learning_rate": 7.294416246678462e-06,
|
| 29457 |
+
"loss": 1.7774,
|
| 29458 |
+
"step": 377500
|
| 29459 |
+
},
|
| 29460 |
+
{
|
| 29461 |
+
"epoch": 80.00016,
|
| 29462 |
+
"grad_norm": 1.8650860786437988,
|
| 29463 |
+
"learning_rate": 7.283218156180174e-06,
|
| 29464 |
+
"loss": 1.7698,
|
| 29465 |
+
"step": 377600
|
| 29466 |
+
},
|
| 29467 |
+
{
|
| 29468 |
+
"epoch": 81.000122,
|
| 29469 |
+
"grad_norm": 1.9133366346359253,
|
| 29470 |
+
"learning_rate": 7.272027201999484e-06,
|
| 29471 |
+
"loss": 1.7658,
|
| 29472 |
+
"step": 377700
|
| 29473 |
+
},
|
| 29474 |
+
{
|
| 29475 |
+
"epoch": 82.000084,
|
| 29476 |
+
"grad_norm": 1.9629889726638794,
|
| 29477 |
+
"learning_rate": 7.260843388644117e-06,
|
| 29478 |
+
"loss": 1.7552,
|
| 29479 |
+
"step": 377800
|
| 29480 |
+
},
|
| 29481 |
+
{
|
| 29482 |
+
"epoch": 83.000046,
|
| 29483 |
+
"grad_norm": 1.9844943284988403,
|
| 29484 |
+
"learning_rate": 7.249666720618919e-06,
|
| 29485 |
+
"loss": 1.7539,
|
| 29486 |
+
"step": 377900
|
| 29487 |
+
},
|
| 29488 |
+
{
|
| 29489 |
+
"epoch": 84.000008,
|
| 29490 |
+
"grad_norm": 1.9470826387405396,
|
| 29491 |
+
"learning_rate": 7.238497202425834e-06,
|
| 29492 |
+
"loss": 1.7404,
|
| 29493 |
+
"step": 378000
|
| 29494 |
+
},
|
| 29495 |
+
{
|
| 29496 |
+
"epoch": 84.000008,
|
| 29497 |
+
"eval_loss": 2.234076499938965,
|
| 29498 |
+
"eval_runtime": 54.5427,
|
| 29499 |
+
"eval_samples_per_second": 186.9,
|
| 29500 |
+
"eval_steps_per_second": 1.467,
|
| 29501 |
+
"step": 378000
|
| 29502 |
+
},
|
| 29503 |
+
{
|
| 29504 |
+
"epoch": 84.000208,
|
| 29505 |
+
"grad_norm": 2.091539144515991,
|
| 29506 |
+
"learning_rate": 7.2273348385639535e-06,
|
| 29507 |
+
"loss": 1.7783,
|
| 29508 |
+
"step": 378100
|
| 29509 |
+
},
|
| 29510 |
+
{
|
| 29511 |
+
"epoch": 85.00017,
|
| 29512 |
+
"grad_norm": 1.9156265258789062,
|
| 29513 |
+
"learning_rate": 7.216179633529477e-06,
|
| 29514 |
+
"loss": 1.7714,
|
| 29515 |
+
"step": 378200
|
| 29516 |
+
},
|
| 29517 |
+
{
|
| 29518 |
+
"epoch": 86.000132,
|
| 29519 |
+
"grad_norm": 2.0570554733276367,
|
| 29520 |
+
"learning_rate": 7.205031591815723e-06,
|
| 29521 |
+
"loss": 1.7658,
|
| 29522 |
+
"step": 378300
|
| 29523 |
+
},
|
| 29524 |
+
{
|
| 29525 |
+
"epoch": 87.000094,
|
| 29526 |
+
"grad_norm": 2.0413947105407715,
|
| 29527 |
+
"learning_rate": 7.193890717913107e-06,
|
| 29528 |
+
"loss": 1.7564,
|
| 29529 |
+
"step": 378400
|
| 29530 |
+
},
|
| 29531 |
+
{
|
| 29532 |
+
"epoch": 88.000056,
|
| 29533 |
+
"grad_norm": 1.91609787940979,
|
| 29534 |
+
"learning_rate": 7.18275701630918e-06,
|
| 29535 |
+
"loss": 1.7538,
|
| 29536 |
+
"step": 378500
|
| 29537 |
+
},
|
| 29538 |
+
{
|
| 29539 |
+
"epoch": 89.000018,
|
| 29540 |
+
"grad_norm": 1.8070498704910278,
|
| 29541 |
+
"learning_rate": 7.171630491488598e-06,
|
| 29542 |
+
"loss": 1.7439,
|
| 29543 |
+
"step": 378600
|
| 29544 |
+
},
|
| 29545 |
+
{
|
| 29546 |
+
"epoch": 89.000218,
|
| 29547 |
+
"grad_norm": 1.9066287279129028,
|
| 29548 |
+
"learning_rate": 7.16051114793313e-06,
|
| 29549 |
+
"loss": 1.7382,
|
| 29550 |
+
"step": 378700
|
| 29551 |
+
},
|
| 29552 |
+
{
|
| 29553 |
+
"epoch": 90.00018,
|
| 29554 |
+
"grad_norm": 1.8805670738220215,
|
| 29555 |
+
"learning_rate": 7.149398990121628e-06,
|
| 29556 |
+
"loss": 1.7322,
|
| 29557 |
+
"step": 378800
|
| 29558 |
+
},
|
| 29559 |
+
{
|
| 29560 |
+
"epoch": 91.000142,
|
| 29561 |
+
"grad_norm": 1.93112313747406,
|
| 29562 |
+
"learning_rate": 7.138294022530081e-06,
|
| 29563 |
+
"loss": 1.7221,
|
| 29564 |
+
"step": 378900
|
| 29565 |
+
},
|
| 29566 |
+
{
|
| 29567 |
+
"epoch": 92.000104,
|
| 29568 |
+
"grad_norm": 1.9273699522018433,
|
| 29569 |
+
"learning_rate": 7.127196249631565e-06,
|
| 29570 |
+
"loss": 1.717,
|
| 29571 |
+
"step": 379000
|
| 29572 |
+
},
|
| 29573 |
+
{
|
| 29574 |
+
"epoch": 92.000104,
|
| 29575 |
+
"eval_loss": 2.222762107849121,
|
| 29576 |
+
"eval_runtime": 54.5793,
|
| 29577 |
+
"eval_samples_per_second": 186.774,
|
| 29578 |
+
"eval_steps_per_second": 1.466,
|
| 29579 |
+
"step": 379000
|
| 29580 |
+
},
|
| 29581 |
+
{
|
| 29582 |
+
"epoch": 93.000066,
|
| 29583 |
+
"grad_norm": 1.9170584678649902,
|
| 29584 |
+
"learning_rate": 7.116105675896276e-06,
|
| 29585 |
+
"loss": 1.7486,
|
| 29586 |
+
"step": 379100
|
| 29587 |
+
},
|
| 29588 |
+
{
|
| 29589 |
+
"epoch": 94.000028,
|
| 29590 |
+
"grad_norm": 1.886796474456787,
|
| 29591 |
+
"learning_rate": 7.105022305791467e-06,
|
| 29592 |
+
"loss": 1.7455,
|
| 29593 |
+
"step": 379200
|
| 29594 |
+
},
|
| 29595 |
+
{
|
| 29596 |
+
"epoch": 94.000228,
|
| 29597 |
+
"grad_norm": 1.9963804483413696,
|
| 29598 |
+
"learning_rate": 7.0939461437815354e-06,
|
| 29599 |
+
"loss": 1.744,
|
| 29600 |
+
"step": 379300
|
| 29601 |
+
},
|
| 29602 |
+
{
|
| 29603 |
+
"epoch": 95.00019,
|
| 29604 |
+
"grad_norm": 1.9092683792114258,
|
| 29605 |
+
"learning_rate": 7.082877194327953e-06,
|
| 29606 |
+
"loss": 1.7332,
|
| 29607 |
+
"step": 379400
|
| 29608 |
+
},
|
| 29609 |
+
{
|
| 29610 |
+
"epoch": 96.000152,
|
| 29611 |
+
"grad_norm": 1.9792388677597046,
|
| 29612 |
+
"learning_rate": 7.071815461889303e-06,
|
| 29613 |
+
"loss": 1.728,
|
| 29614 |
+
"step": 379500
|
| 29615 |
+
},
|
| 29616 |
+
{
|
| 29617 |
+
"epoch": 97.000114,
|
| 29618 |
+
"grad_norm": 1.9630019664764404,
|
| 29619 |
+
"learning_rate": 7.060760950921233e-06,
|
| 29620 |
+
"loss": 1.7224,
|
| 29621 |
+
"step": 379600
|
| 29622 |
+
},
|
| 29623 |
+
{
|
| 29624 |
+
"epoch": 98.000076,
|
| 29625 |
+
"grad_norm": 1.9032080173492432,
|
| 29626 |
+
"learning_rate": 7.049713665876509e-06,
|
| 29627 |
+
"loss": 1.7176,
|
| 29628 |
+
"step": 379700
|
| 29629 |
+
},
|
| 29630 |
+
{
|
| 29631 |
+
"epoch": 99.000038,
|
| 29632 |
+
"grad_norm": 1.9760445356369019,
|
| 29633 |
+
"learning_rate": 7.038673611204971e-06,
|
| 29634 |
+
"loss": 1.7142,
|
| 29635 |
+
"step": 379800
|
| 29636 |
+
},
|
| 29637 |
+
{
|
| 29638 |
+
"epoch": 99.000238,
|
| 29639 |
+
"grad_norm": 2.5537993907928467,
|
| 29640 |
+
"learning_rate": 7.027640791353562e-06,
|
| 29641 |
+
"loss": 1.7043,
|
| 29642 |
+
"step": 379900
|
| 29643 |
+
},
|
| 29644 |
+
{
|
| 29645 |
+
"epoch": 100.0002,
|
| 29646 |
+
"grad_norm": 1.9134443998336792,
|
| 29647 |
+
"learning_rate": 7.016615210766287e-06,
|
| 29648 |
+
"loss": 1.6935,
|
| 29649 |
+
"step": 380000
|
| 29650 |
+
},
|
| 29651 |
+
{
|
| 29652 |
+
"epoch": 100.0002,
|
| 29653 |
+
"eval_loss": 2.2129366397857666,
|
| 29654 |
+
"eval_runtime": 54.6255,
|
| 29655 |
+
"eval_samples_per_second": 186.616,
|
| 29656 |
+
"eval_steps_per_second": 1.465,
|
| 29657 |
+
"step": 380000
|
| 29658 |
+
},
|
| 29659 |
+
{
|
| 29660 |
+
"epoch": 101.000162,
|
| 29661 |
+
"grad_norm": 1.8621317148208618,
|
| 29662 |
+
"learning_rate": 7.005596873884254e-06,
|
| 29663 |
+
"loss": 1.7287,
|
| 29664 |
+
"step": 380100
|
| 29665 |
+
},
|
| 29666 |
+
{
|
| 29667 |
+
"epoch": 102.000124,
|
| 29668 |
+
"grad_norm": 2.0007071495056152,
|
| 29669 |
+
"learning_rate": 6.994585785145647e-06,
|
| 29670 |
+
"loss": 1.7216,
|
| 29671 |
+
"step": 380200
|
| 29672 |
+
},
|
| 29673 |
+
{
|
| 29674 |
+
"epoch": 103.000086,
|
| 29675 |
+
"grad_norm": 1.981418490409851,
|
| 29676 |
+
"learning_rate": 6.98358194898574e-06,
|
| 29677 |
+
"loss": 1.7192,
|
| 29678 |
+
"step": 380300
|
| 29679 |
+
},
|
| 29680 |
+
{
|
| 29681 |
+
"epoch": 104.000048,
|
| 29682 |
+
"grad_norm": 1.7912635803222656,
|
| 29683 |
+
"learning_rate": 6.972585369836865e-06,
|
| 29684 |
+
"loss": 1.7046,
|
| 29685 |
+
"step": 380400
|
| 29686 |
+
},
|
| 29687 |
+
{
|
| 29688 |
+
"epoch": 105.00001,
|
| 29689 |
+
"grad_norm": 1.9558844566345215,
|
| 29690 |
+
"learning_rate": 6.961596052128444e-06,
|
| 29691 |
+
"loss": 1.708,
|
| 29692 |
+
"step": 380500
|
| 29693 |
+
},
|
| 29694 |
+
{
|
| 29695 |
+
"epoch": 105.00021,
|
| 29696 |
+
"grad_norm": 1.9592783451080322,
|
| 29697 |
+
"learning_rate": 6.9506140002869756e-06,
|
| 29698 |
+
"loss": 1.699,
|
| 29699 |
+
"step": 380600
|
| 29700 |
+
},
|
| 29701 |
+
{
|
| 29702 |
+
"epoch": 106.000172,
|
| 29703 |
+
"grad_norm": 1.9580655097961426,
|
| 29704 |
+
"learning_rate": 6.939639218736041e-06,
|
| 29705 |
+
"loss": 1.6912,
|
| 29706 |
+
"step": 380700
|
| 29707 |
+
},
|
| 29708 |
+
{
|
| 29709 |
+
"epoch": 107.000134,
|
| 29710 |
+
"grad_norm": 1.9187573194503784,
|
| 29711 |
+
"learning_rate": 6.928671711896259e-06,
|
| 29712 |
+
"loss": 1.6864,
|
| 29713 |
+
"step": 380800
|
| 29714 |
+
},
|
| 29715 |
+
{
|
| 29716 |
+
"epoch": 108.000096,
|
| 29717 |
+
"grad_norm": 2.0804340839385986,
|
| 29718 |
+
"learning_rate": 6.917711484185349e-06,
|
| 29719 |
+
"loss": 1.6843,
|
| 29720 |
+
"step": 380900
|
| 29721 |
+
},
|
| 29722 |
+
{
|
| 29723 |
+
"epoch": 109.000058,
|
| 29724 |
+
"grad_norm": 1.9156286716461182,
|
| 29725 |
+
"learning_rate": 6.906758540018099e-06,
|
| 29726 |
+
"loss": 1.6788,
|
| 29727 |
+
"step": 381000
|
| 29728 |
+
},
|
| 29729 |
+
{
|
| 29730 |
+
"epoch": 109.000058,
|
| 29731 |
+
"eval_loss": 2.2096140384674072,
|
| 29732 |
+
"eval_runtime": 54.6776,
|
| 29733 |
+
"eval_samples_per_second": 186.438,
|
| 29734 |
+
"eval_steps_per_second": 1.463,
|
| 29735 |
+
"step": 381000
|
| 29736 |
+
},
|
| 29737 |
+
{
|
| 29738 |
+
"epoch": 110.00002,
|
| 29739 |
+
"grad_norm": 1.8327763080596924,
|
| 29740 |
+
"learning_rate": 6.895812883806341e-06,
|
| 29741 |
+
"loss": 1.703,
|
| 29742 |
+
"step": 381100
|
| 29743 |
+
},
|
| 29744 |
+
{
|
| 29745 |
+
"epoch": 110.00022,
|
| 29746 |
+
"grad_norm": 1.9155895709991455,
|
| 29747 |
+
"learning_rate": 6.884874519958984e-06,
|
| 29748 |
+
"loss": 1.6962,
|
| 29749 |
+
"step": 381200
|
| 29750 |
+
},
|
| 29751 |
+
{
|
| 29752 |
+
"epoch": 111.000182,
|
| 29753 |
+
"grad_norm": 1.8222503662109375,
|
| 29754 |
+
"learning_rate": 6.873943452882006e-06,
|
| 29755 |
+
"loss": 1.6917,
|
| 29756 |
+
"step": 381300
|
| 29757 |
+
},
|
| 29758 |
+
{
|
| 29759 |
+
"epoch": 112.000144,
|
| 29760 |
+
"grad_norm": 1.8987947702407837,
|
| 29761 |
+
"learning_rate": 6.863019686978445e-06,
|
| 29762 |
+
"loss": 1.6892,
|
| 29763 |
+
"step": 381400
|
| 29764 |
+
},
|
| 29765 |
+
{
|
| 29766 |
+
"epoch": 113.000106,
|
| 29767 |
+
"grad_norm": 1.8653353452682495,
|
| 29768 |
+
"learning_rate": 6.85210322664838e-06,
|
| 29769 |
+
"loss": 1.6867,
|
| 29770 |
+
"step": 381500
|
| 29771 |
+
},
|
| 29772 |
+
{
|
| 29773 |
+
"epoch": 114.000068,
|
| 29774 |
+
"grad_norm": 1.8713948726654053,
|
| 29775 |
+
"learning_rate": 6.841194076288962e-06,
|
| 29776 |
+
"loss": 1.6777,
|
| 29777 |
+
"step": 381600
|
| 29778 |
+
},
|
| 29779 |
+
{
|
| 29780 |
+
"epoch": 115.00003,
|
| 29781 |
+
"grad_norm": 1.9354687929153442,
|
| 29782 |
+
"learning_rate": 6.830292240294398e-06,
|
| 29783 |
+
"loss": 1.6756,
|
| 29784 |
+
"step": 381700
|
| 29785 |
+
},
|
| 29786 |
+
{
|
| 29787 |
+
"epoch": 115.00023,
|
| 29788 |
+
"grad_norm": 1.8539812564849854,
|
| 29789 |
+
"learning_rate": 6.8193977230559565e-06,
|
| 29790 |
+
"loss": 1.669,
|
| 29791 |
+
"step": 381800
|
| 29792 |
+
},
|
| 29793 |
+
{
|
| 29794 |
+
"epoch": 116.000192,
|
| 29795 |
+
"grad_norm": 1.913901448249817,
|
| 29796 |
+
"learning_rate": 6.808510528961928e-06,
|
| 29797 |
+
"loss": 1.6632,
|
| 29798 |
+
"step": 381900
|
| 29799 |
+
},
|
| 29800 |
+
{
|
| 29801 |
+
"epoch": 117.000154,
|
| 29802 |
+
"grad_norm": 1.8366894721984863,
|
| 29803 |
+
"learning_rate": 6.797630662397683e-06,
|
| 29804 |
+
"loss": 1.6619,
|
| 29805 |
+
"step": 382000
|
| 29806 |
+
},
|
| 29807 |
+
{
|
| 29808 |
+
"epoch": 117.000154,
|
| 29809 |
+
"eval_loss": 2.1981077194213867,
|
| 29810 |
+
"eval_runtime": 54.646,
|
| 29811 |
+
"eval_samples_per_second": 186.546,
|
| 29812 |
+
"eval_steps_per_second": 1.464,
|
| 29813 |
+
"step": 382000
|
| 29814 |
}
|
| 29815 |
],
|
| 29816 |
"logging_steps": 100,
|
|
|
|
| 29830 |
"attributes": {}
|
| 29831 |
}
|
| 29832 |
},
|
| 29833 |
+
"total_flos": 3.333426940465899e+19,
|
| 29834 |
"train_batch_size": 128,
|
| 29835 |
"trial_name": null,
|
| 29836 |
"trial_params": null
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5777
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a19fa79233fd468fcb689b7b8c5f704161aecb10646540b1133405c7c866d2ff
|
| 3 |
size 5777
|