Upload 8 files
Browse files- model.safetensors +1 -1
- optimizer.pt +1 -1
- scheduler.pt +1 -1
- trainer_state.json +884 -4
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 18494040
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f64a3cafa47c9ba3e54437d1f9852c222a0087b81b8ce6e387c02057cb1bfd3
|
| 3 |
size 18494040
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 37035002
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be93c1be2bc3f33f7d84eeeeb4d8c4d995ed64199a72fdbe553b1f003bc30445
|
| 3 |
size 37035002
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60cbdd85cf6bcb7c6140c88eacbc709e5746be6620fc2427f93d0a9c73d83631
|
| 3 |
size 1064
|
trainer_state.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 1.
|
| 5 |
"eval_steps": 500,
|
| 6 |
-
"global_step":
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
@@ -5311,11 +5311,891 @@
|
|
| 5311 |
"loss": 4.3878,
|
| 5312 |
"num_input_tokens_seen": 581214146,
|
| 5313 |
"step": 99450
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5314 |
}
|
| 5315 |
],
|
| 5316 |
"logging_steps": 150,
|
| 5317 |
"max_steps": 272232,
|
| 5318 |
-
"num_input_tokens_seen":
|
| 5319 |
"num_train_epochs": 3,
|
| 5320 |
"save_steps": 500,
|
| 5321 |
"stateful_callbacks": {
|
|
@@ -5330,7 +6210,7 @@
|
|
| 5330 |
"attributes": {}
|
| 5331 |
}
|
| 5332 |
},
|
| 5333 |
-
"total_flos":
|
| 5334 |
"train_batch_size": 32,
|
| 5335 |
"trial_name": null,
|
| 5336 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 1.2783143881998358,
|
| 5 |
"eval_steps": 500,
|
| 6 |
+
"global_step": 116000,
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
|
|
| 5311 |
"loss": 4.3878,
|
| 5312 |
"num_input_tokens_seen": 581214146,
|
| 5313 |
"step": 99450
|
| 5314 |
+
},
|
| 5315 |
+
{
|
| 5316 |
+
"epoch": 1.0975871815922729,
|
| 5317 |
+
"grad_norm": 1.9078856706619263,
|
| 5318 |
+
"learning_rate": 0.00010148434400575989,
|
| 5319 |
+
"loss": 4.3976,
|
| 5320 |
+
"num_input_tokens_seen": 582094658,
|
| 5321 |
+
"step": 99600
|
| 5322 |
+
},
|
| 5323 |
+
{
|
| 5324 |
+
"epoch": 1.0992401743356346,
|
| 5325 |
+
"grad_norm": 1.8750337362289429,
|
| 5326 |
+
"learning_rate": 0.00010139618261163437,
|
| 5327 |
+
"loss": 4.3999,
|
| 5328 |
+
"num_input_tokens_seen": 582981922,
|
| 5329 |
+
"step": 99750
|
| 5330 |
+
},
|
| 5331 |
+
{
|
| 5332 |
+
"epoch": 1.1008931670789965,
|
| 5333 |
+
"grad_norm": 1.9243488311767578,
|
| 5334 |
+
"learning_rate": 0.00010130802121750887,
|
| 5335 |
+
"loss": 4.3879,
|
| 5336 |
+
"num_input_tokens_seen": 583869026,
|
| 5337 |
+
"step": 99900
|
| 5338 |
+
},
|
| 5339 |
+
{
|
| 5340 |
+
"epoch": 1.1025461598223583,
|
| 5341 |
+
"grad_norm": 1.8446391820907593,
|
| 5342 |
+
"learning_rate": 0.00010121985982338334,
|
| 5343 |
+
"loss": 4.3894,
|
| 5344 |
+
"num_input_tokens_seen": 584749826,
|
| 5345 |
+
"step": 100050
|
| 5346 |
+
},
|
| 5347 |
+
{
|
| 5348 |
+
"epoch": 1.1041991525657202,
|
| 5349 |
+
"grad_norm": 1.726158857345581,
|
| 5350 |
+
"learning_rate": 0.00010113169842925785,
|
| 5351 |
+
"loss": 4.3985,
|
| 5352 |
+
"num_input_tokens_seen": 585630274,
|
| 5353 |
+
"step": 100200
|
| 5354 |
+
},
|
| 5355 |
+
{
|
| 5356 |
+
"epoch": 1.1058521453090822,
|
| 5357 |
+
"grad_norm": 1.8227604627609253,
|
| 5358 |
+
"learning_rate": 0.00010104353703513232,
|
| 5359 |
+
"loss": 4.3906,
|
| 5360 |
+
"num_input_tokens_seen": 586484930,
|
| 5361 |
+
"step": 100350
|
| 5362 |
+
},
|
| 5363 |
+
{
|
| 5364 |
+
"epoch": 1.1075051380524439,
|
| 5365 |
+
"grad_norm": 1.9156420230865479,
|
| 5366 |
+
"learning_rate": 0.00010095537564100682,
|
| 5367 |
+
"loss": 4.3893,
|
| 5368 |
+
"num_input_tokens_seen": 587352738,
|
| 5369 |
+
"step": 100500
|
| 5370 |
+
},
|
| 5371 |
+
{
|
| 5372 |
+
"epoch": 1.1091581307958058,
|
| 5373 |
+
"grad_norm": 1.8385225534439087,
|
| 5374 |
+
"learning_rate": 0.0001008678019895088,
|
| 5375 |
+
"loss": 4.3994,
|
| 5376 |
+
"num_input_tokens_seen": 588239810,
|
| 5377 |
+
"step": 100650
|
| 5378 |
+
},
|
| 5379 |
+
{
|
| 5380 |
+
"epoch": 1.1108111235391678,
|
| 5381 |
+
"grad_norm": 1.9076261520385742,
|
| 5382 |
+
"learning_rate": 0.00010077964059538329,
|
| 5383 |
+
"loss": 4.3922,
|
| 5384 |
+
"num_input_tokens_seen": 589116514,
|
| 5385 |
+
"step": 100800
|
| 5386 |
+
},
|
| 5387 |
+
{
|
| 5388 |
+
"epoch": 1.1124641162825295,
|
| 5389 |
+
"grad_norm": 1.8701651096343994,
|
| 5390 |
+
"learning_rate": 0.00010069147920125778,
|
| 5391 |
+
"loss": 4.4015,
|
| 5392 |
+
"num_input_tokens_seen": 589983426,
|
| 5393 |
+
"step": 100950
|
| 5394 |
+
},
|
| 5395 |
+
{
|
| 5396 |
+
"epoch": 1.1141171090258915,
|
| 5397 |
+
"grad_norm": 1.9545180797576904,
|
| 5398 |
+
"learning_rate": 0.00010060331780713227,
|
| 5399 |
+
"loss": 4.3978,
|
| 5400 |
+
"num_input_tokens_seen": 590856994,
|
| 5401 |
+
"step": 101100
|
| 5402 |
+
},
|
| 5403 |
+
{
|
| 5404 |
+
"epoch": 1.1157701017692532,
|
| 5405 |
+
"grad_norm": 1.9418137073516846,
|
| 5406 |
+
"learning_rate": 0.00010051515641300676,
|
| 5407 |
+
"loss": 4.3893,
|
| 5408 |
+
"num_input_tokens_seen": 591735490,
|
| 5409 |
+
"step": 101250
|
| 5410 |
+
},
|
| 5411 |
+
{
|
| 5412 |
+
"epoch": 1.1174230945126151,
|
| 5413 |
+
"grad_norm": 1.892683982849121,
|
| 5414 |
+
"learning_rate": 0.00010042699501888123,
|
| 5415 |
+
"loss": 4.3833,
|
| 5416 |
+
"num_input_tokens_seen": 592622626,
|
| 5417 |
+
"step": 101400
|
| 5418 |
+
},
|
| 5419 |
+
{
|
| 5420 |
+
"epoch": 1.1190760872559768,
|
| 5421 |
+
"grad_norm": 1.830404281616211,
|
| 5422 |
+
"learning_rate": 0.00010033883362475573,
|
| 5423 |
+
"loss": 4.3939,
|
| 5424 |
+
"num_input_tokens_seen": 593500354,
|
| 5425 |
+
"step": 101550
|
| 5426 |
+
},
|
| 5427 |
+
{
|
| 5428 |
+
"epoch": 1.1207290799993388,
|
| 5429 |
+
"grad_norm": 1.8536481857299805,
|
| 5430 |
+
"learning_rate": 0.00010025067223063021,
|
| 5431 |
+
"loss": 4.3826,
|
| 5432 |
+
"num_input_tokens_seen": 594383234,
|
| 5433 |
+
"step": 101700
|
| 5434 |
+
},
|
| 5435 |
+
{
|
| 5436 |
+
"epoch": 1.1223820727427007,
|
| 5437 |
+
"grad_norm": 1.84872567653656,
|
| 5438 |
+
"learning_rate": 0.00010016251083650471,
|
| 5439 |
+
"loss": 4.3847,
|
| 5440 |
+
"num_input_tokens_seen": 595255266,
|
| 5441 |
+
"step": 101850
|
| 5442 |
+
},
|
| 5443 |
+
{
|
| 5444 |
+
"epoch": 1.1240350654860625,
|
| 5445 |
+
"grad_norm": 1.8653180599212646,
|
| 5446 |
+
"learning_rate": 0.00010007434944237918,
|
| 5447 |
+
"loss": 4.392,
|
| 5448 |
+
"num_input_tokens_seen": 596135586,
|
| 5449 |
+
"step": 102000
|
| 5450 |
+
},
|
| 5451 |
+
{
|
| 5452 |
+
"epoch": 1.1256880582294244,
|
| 5453 |
+
"grad_norm": 1.8534561395645142,
|
| 5454 |
+
"learning_rate": 9.998618804825369e-05,
|
| 5455 |
+
"loss": 4.3862,
|
| 5456 |
+
"num_input_tokens_seen": 597009218,
|
| 5457 |
+
"step": 102150
|
| 5458 |
+
},
|
| 5459 |
+
{
|
| 5460 |
+
"epoch": 1.1273410509727861,
|
| 5461 |
+
"grad_norm": 1.8982864618301392,
|
| 5462 |
+
"learning_rate": 9.989802665412816e-05,
|
| 5463 |
+
"loss": 4.3969,
|
| 5464 |
+
"num_input_tokens_seen": 597873026,
|
| 5465 |
+
"step": 102300
|
| 5466 |
+
},
|
| 5467 |
+
{
|
| 5468 |
+
"epoch": 1.128994043716148,
|
| 5469 |
+
"grad_norm": 1.9212620258331299,
|
| 5470 |
+
"learning_rate": 9.980986526000266e-05,
|
| 5471 |
+
"loss": 4.3872,
|
| 5472 |
+
"num_input_tokens_seen": 598748322,
|
| 5473 |
+
"step": 102450
|
| 5474 |
+
},
|
| 5475 |
+
{
|
| 5476 |
+
"epoch": 1.13064703645951,
|
| 5477 |
+
"grad_norm": 1.8133482933044434,
|
| 5478 |
+
"learning_rate": 9.972170386587714e-05,
|
| 5479 |
+
"loss": 4.3801,
|
| 5480 |
+
"num_input_tokens_seen": 599625410,
|
| 5481 |
+
"step": 102600
|
| 5482 |
+
},
|
| 5483 |
+
{
|
| 5484 |
+
"epoch": 1.1323000292028718,
|
| 5485 |
+
"grad_norm": 1.8521312475204468,
|
| 5486 |
+
"learning_rate": 9.963354247175164e-05,
|
| 5487 |
+
"loss": 4.3867,
|
| 5488 |
+
"num_input_tokens_seen": 600489762,
|
| 5489 |
+
"step": 102750
|
| 5490 |
+
},
|
| 5491 |
+
{
|
| 5492 |
+
"epoch": 1.1339530219462337,
|
| 5493 |
+
"grad_norm": 2.050074577331543,
|
| 5494 |
+
"learning_rate": 9.954538107762612e-05,
|
| 5495 |
+
"loss": 4.3813,
|
| 5496 |
+
"num_input_tokens_seen": 601357666,
|
| 5497 |
+
"step": 102900
|
| 5498 |
+
},
|
| 5499 |
+
{
|
| 5500 |
+
"epoch": 1.1356060146895954,
|
| 5501 |
+
"grad_norm": 1.8785549402236938,
|
| 5502 |
+
"learning_rate": 9.945721968350062e-05,
|
| 5503 |
+
"loss": 4.3799,
|
| 5504 |
+
"num_input_tokens_seen": 602239362,
|
| 5505 |
+
"step": 103050
|
| 5506 |
+
},
|
| 5507 |
+
{
|
| 5508 |
+
"epoch": 1.1372590074329574,
|
| 5509 |
+
"grad_norm": 1.9237360954284668,
|
| 5510 |
+
"learning_rate": 9.93690582893751e-05,
|
| 5511 |
+
"loss": 4.3902,
|
| 5512 |
+
"num_input_tokens_seen": 603119650,
|
| 5513 |
+
"step": 103200
|
| 5514 |
+
},
|
| 5515 |
+
{
|
| 5516 |
+
"epoch": 1.1389120001763193,
|
| 5517 |
+
"grad_norm": 1.8664278984069824,
|
| 5518 |
+
"learning_rate": 9.928089689524957e-05,
|
| 5519 |
+
"loss": 4.3905,
|
| 5520 |
+
"num_input_tokens_seen": 603985666,
|
| 5521 |
+
"step": 103350
|
| 5522 |
+
},
|
| 5523 |
+
{
|
| 5524 |
+
"epoch": 1.140564992919681,
|
| 5525 |
+
"grad_norm": 1.812515139579773,
|
| 5526 |
+
"learning_rate": 9.919273550112407e-05,
|
| 5527 |
+
"loss": 4.3757,
|
| 5528 |
+
"num_input_tokens_seen": 604874530,
|
| 5529 |
+
"step": 103500
|
| 5530 |
+
},
|
| 5531 |
+
{
|
| 5532 |
+
"epoch": 1.142217985663043,
|
| 5533 |
+
"grad_norm": 1.9093918800354004,
|
| 5534 |
+
"learning_rate": 9.910457410699855e-05,
|
| 5535 |
+
"loss": 4.4058,
|
| 5536 |
+
"num_input_tokens_seen": 605755394,
|
| 5537 |
+
"step": 103650
|
| 5538 |
+
},
|
| 5539 |
+
{
|
| 5540 |
+
"epoch": 1.1438709784064047,
|
| 5541 |
+
"grad_norm": 1.9712496995925903,
|
| 5542 |
+
"learning_rate": 9.901641271287305e-05,
|
| 5543 |
+
"loss": 4.3848,
|
| 5544 |
+
"num_input_tokens_seen": 606649794,
|
| 5545 |
+
"step": 103800
|
| 5546 |
+
},
|
| 5547 |
+
{
|
| 5548 |
+
"epoch": 1.1455239711497667,
|
| 5549 |
+
"grad_norm": 1.9102181196212769,
|
| 5550 |
+
"learning_rate": 9.892825131874752e-05,
|
| 5551 |
+
"loss": 4.3926,
|
| 5552 |
+
"num_input_tokens_seen": 607513858,
|
| 5553 |
+
"step": 103950
|
| 5554 |
+
},
|
| 5555 |
+
{
|
| 5556 |
+
"epoch": 1.1471769638931284,
|
| 5557 |
+
"grad_norm": 1.7749512195587158,
|
| 5558 |
+
"learning_rate": 9.884008992462201e-05,
|
| 5559 |
+
"loss": 4.3906,
|
| 5560 |
+
"num_input_tokens_seen": 608391202,
|
| 5561 |
+
"step": 104100
|
| 5562 |
+
},
|
| 5563 |
+
{
|
| 5564 |
+
"epoch": 1.1488299566364903,
|
| 5565 |
+
"grad_norm": 1.8394023180007935,
|
| 5566 |
+
"learning_rate": 9.87519285304965e-05,
|
| 5567 |
+
"loss": 4.3814,
|
| 5568 |
+
"num_input_tokens_seen": 609282018,
|
| 5569 |
+
"step": 104250
|
| 5570 |
+
},
|
| 5571 |
+
{
|
| 5572 |
+
"epoch": 1.1504829493798523,
|
| 5573 |
+
"grad_norm": 1.9161593914031982,
|
| 5574 |
+
"learning_rate": 9.866376713637099e-05,
|
| 5575 |
+
"loss": 4.3947,
|
| 5576 |
+
"num_input_tokens_seen": 610168514,
|
| 5577 |
+
"step": 104400
|
| 5578 |
+
},
|
| 5579 |
+
{
|
| 5580 |
+
"epoch": 1.152135942123214,
|
| 5581 |
+
"grad_norm": 1.930790901184082,
|
| 5582 |
+
"learning_rate": 9.857560574224548e-05,
|
| 5583 |
+
"loss": 4.3928,
|
| 5584 |
+
"num_input_tokens_seen": 611052354,
|
| 5585 |
+
"step": 104550
|
| 5586 |
+
},
|
| 5587 |
+
{
|
| 5588 |
+
"epoch": 1.153788934866576,
|
| 5589 |
+
"grad_norm": 1.836146354675293,
|
| 5590 |
+
"learning_rate": 9.848803209074748e-05,
|
| 5591 |
+
"loss": 4.3977,
|
| 5592 |
+
"num_input_tokens_seen": 611926498,
|
| 5593 |
+
"step": 104700
|
| 5594 |
+
},
|
| 5595 |
+
{
|
| 5596 |
+
"epoch": 1.155441927609938,
|
| 5597 |
+
"grad_norm": 1.7802364826202393,
|
| 5598 |
+
"learning_rate": 9.839987069662196e-05,
|
| 5599 |
+
"loss": 4.3921,
|
| 5600 |
+
"num_input_tokens_seen": 612818210,
|
| 5601 |
+
"step": 104850
|
| 5602 |
+
},
|
| 5603 |
+
{
|
| 5604 |
+
"epoch": 1.1570949203532996,
|
| 5605 |
+
"grad_norm": 1.9587794542312622,
|
| 5606 |
+
"learning_rate": 9.831170930249643e-05,
|
| 5607 |
+
"loss": 4.3925,
|
| 5608 |
+
"num_input_tokens_seen": 613694850,
|
| 5609 |
+
"step": 105000
|
| 5610 |
+
},
|
| 5611 |
+
{
|
| 5612 |
+
"epoch": 1.1587479130966616,
|
| 5613 |
+
"grad_norm": 1.9676165580749512,
|
| 5614 |
+
"learning_rate": 9.822354790837093e-05,
|
| 5615 |
+
"loss": 4.3782,
|
| 5616 |
+
"num_input_tokens_seen": 614583618,
|
| 5617 |
+
"step": 105150
|
| 5618 |
+
},
|
| 5619 |
+
{
|
| 5620 |
+
"epoch": 1.1604009058400233,
|
| 5621 |
+
"grad_norm": 1.8942914009094238,
|
| 5622 |
+
"learning_rate": 9.813538651424541e-05,
|
| 5623 |
+
"loss": 4.3792,
|
| 5624 |
+
"num_input_tokens_seen": 615478530,
|
| 5625 |
+
"step": 105300
|
| 5626 |
+
},
|
| 5627 |
+
{
|
| 5628 |
+
"epoch": 1.1620538985833853,
|
| 5629 |
+
"grad_norm": 1.8436447381973267,
|
| 5630 |
+
"learning_rate": 9.804722512011991e-05,
|
| 5631 |
+
"loss": 4.3848,
|
| 5632 |
+
"num_input_tokens_seen": 616374914,
|
| 5633 |
+
"step": 105450
|
| 5634 |
+
},
|
| 5635 |
+
{
|
| 5636 |
+
"epoch": 1.163706891326747,
|
| 5637 |
+
"grad_norm": 1.9150909185409546,
|
| 5638 |
+
"learning_rate": 9.795906372599439e-05,
|
| 5639 |
+
"loss": 4.381,
|
| 5640 |
+
"num_input_tokens_seen": 617260162,
|
| 5641 |
+
"step": 105600
|
| 5642 |
+
},
|
| 5643 |
+
{
|
| 5644 |
+
"epoch": 1.165359884070109,
|
| 5645 |
+
"grad_norm": 2.0403525829315186,
|
| 5646 |
+
"learning_rate": 9.787090233186889e-05,
|
| 5647 |
+
"loss": 4.3835,
|
| 5648 |
+
"num_input_tokens_seen": 618136386,
|
| 5649 |
+
"step": 105750
|
| 5650 |
+
},
|
| 5651 |
+
{
|
| 5652 |
+
"epoch": 1.1670128768134709,
|
| 5653 |
+
"grad_norm": 1.8062185049057007,
|
| 5654 |
+
"learning_rate": 9.778274093774336e-05,
|
| 5655 |
+
"loss": 4.3821,
|
| 5656 |
+
"num_input_tokens_seen": 619009282,
|
| 5657 |
+
"step": 105900
|
| 5658 |
+
},
|
| 5659 |
+
{
|
| 5660 |
+
"epoch": 1.1686658695568326,
|
| 5661 |
+
"grad_norm": 1.9948753118515015,
|
| 5662 |
+
"learning_rate": 9.769457954361787e-05,
|
| 5663 |
+
"loss": 4.3911,
|
| 5664 |
+
"num_input_tokens_seen": 619886722,
|
| 5665 |
+
"step": 106050
|
| 5666 |
+
},
|
| 5667 |
+
{
|
| 5668 |
+
"epoch": 1.1703188623001946,
|
| 5669 |
+
"grad_norm": 1.8109992742538452,
|
| 5670 |
+
"learning_rate": 9.760641814949234e-05,
|
| 5671 |
+
"loss": 4.3791,
|
| 5672 |
+
"num_input_tokens_seen": 620758178,
|
| 5673 |
+
"step": 106200
|
| 5674 |
+
},
|
| 5675 |
+
{
|
| 5676 |
+
"epoch": 1.1719718550435563,
|
| 5677 |
+
"grad_norm": 1.9707014560699463,
|
| 5678 |
+
"learning_rate": 9.751825675536684e-05,
|
| 5679 |
+
"loss": 4.3809,
|
| 5680 |
+
"num_input_tokens_seen": 621629506,
|
| 5681 |
+
"step": 106350
|
| 5682 |
+
},
|
| 5683 |
+
{
|
| 5684 |
+
"epoch": 1.1736248477869182,
|
| 5685 |
+
"grad_norm": 1.9458143711090088,
|
| 5686 |
+
"learning_rate": 9.743009536124132e-05,
|
| 5687 |
+
"loss": 4.3952,
|
| 5688 |
+
"num_input_tokens_seen": 622496418,
|
| 5689 |
+
"step": 106500
|
| 5690 |
+
},
|
| 5691 |
+
{
|
| 5692 |
+
"epoch": 1.17527784053028,
|
| 5693 |
+
"grad_norm": 1.9349957704544067,
|
| 5694 |
+
"learning_rate": 9.734310945237081e-05,
|
| 5695 |
+
"loss": 4.379,
|
| 5696 |
+
"num_input_tokens_seen": 623395010,
|
| 5697 |
+
"step": 106650
|
| 5698 |
+
},
|
| 5699 |
+
{
|
| 5700 |
+
"epoch": 1.176930833273642,
|
| 5701 |
+
"grad_norm": 1.9133590459823608,
|
| 5702 |
+
"learning_rate": 9.725494805824531e-05,
|
| 5703 |
+
"loss": 4.3689,
|
| 5704 |
+
"num_input_tokens_seen": 624262434,
|
| 5705 |
+
"step": 106800
|
| 5706 |
+
},
|
| 5707 |
+
{
|
| 5708 |
+
"epoch": 1.1785838260170038,
|
| 5709 |
+
"grad_norm": 1.9451539516448975,
|
| 5710 |
+
"learning_rate": 9.716678666411979e-05,
|
| 5711 |
+
"loss": 4.3863,
|
| 5712 |
+
"num_input_tokens_seen": 625153506,
|
| 5713 |
+
"step": 106950
|
| 5714 |
+
},
|
| 5715 |
+
{
|
| 5716 |
+
"epoch": 1.1802368187603656,
|
| 5717 |
+
"grad_norm": 2.0072357654571533,
|
| 5718 |
+
"learning_rate": 9.707862526999429e-05,
|
| 5719 |
+
"loss": 4.378,
|
| 5720 |
+
"num_input_tokens_seen": 626026690,
|
| 5721 |
+
"step": 107100
|
| 5722 |
+
},
|
| 5723 |
+
{
|
| 5724 |
+
"epoch": 1.1818898115037275,
|
| 5725 |
+
"grad_norm": 1.7655397653579712,
|
| 5726 |
+
"learning_rate": 9.699046387586877e-05,
|
| 5727 |
+
"loss": 4.3801,
|
| 5728 |
+
"num_input_tokens_seen": 626902594,
|
| 5729 |
+
"step": 107250
|
| 5730 |
+
},
|
| 5731 |
+
{
|
| 5732 |
+
"epoch": 1.1835428042470895,
|
| 5733 |
+
"grad_norm": 1.9583156108856201,
|
| 5734 |
+
"learning_rate": 9.690230248174325e-05,
|
| 5735 |
+
"loss": 4.3902,
|
| 5736 |
+
"num_input_tokens_seen": 627796194,
|
| 5737 |
+
"step": 107400
|
| 5738 |
+
},
|
| 5739 |
+
{
|
| 5740 |
+
"epoch": 1.1851957969904512,
|
| 5741 |
+
"grad_norm": 1.7717612981796265,
|
| 5742 |
+
"learning_rate": 9.681414108761774e-05,
|
| 5743 |
+
"loss": 4.3812,
|
| 5744 |
+
"num_input_tokens_seen": 628675970,
|
| 5745 |
+
"step": 107550
|
| 5746 |
+
},
|
| 5747 |
+
{
|
| 5748 |
+
"epoch": 1.1868487897338131,
|
| 5749 |
+
"grad_norm": 1.9090009927749634,
|
| 5750 |
+
"learning_rate": 9.672597969349223e-05,
|
| 5751 |
+
"loss": 4.3889,
|
| 5752 |
+
"num_input_tokens_seen": 629549794,
|
| 5753 |
+
"step": 107700
|
| 5754 |
+
},
|
| 5755 |
+
{
|
| 5756 |
+
"epoch": 1.1885017824771749,
|
| 5757 |
+
"grad_norm": 1.8910843133926392,
|
| 5758 |
+
"learning_rate": 9.663781829936672e-05,
|
| 5759 |
+
"loss": 4.3913,
|
| 5760 |
+
"num_input_tokens_seen": 630437378,
|
| 5761 |
+
"step": 107850
|
| 5762 |
+
},
|
| 5763 |
+
{
|
| 5764 |
+
"epoch": 1.1901547752205368,
|
| 5765 |
+
"grad_norm": 1.840728521347046,
|
| 5766 |
+
"learning_rate": 9.654965690524121e-05,
|
| 5767 |
+
"loss": 4.3792,
|
| 5768 |
+
"num_input_tokens_seen": 631313666,
|
| 5769 |
+
"step": 108000
|
| 5770 |
+
},
|
| 5771 |
+
{
|
| 5772 |
+
"epoch": 1.1918077679638985,
|
| 5773 |
+
"grad_norm": 1.8772791624069214,
|
| 5774 |
+
"learning_rate": 9.64614955111157e-05,
|
| 5775 |
+
"loss": 4.3813,
|
| 5776 |
+
"num_input_tokens_seen": 632194466,
|
| 5777 |
+
"step": 108150
|
| 5778 |
+
},
|
| 5779 |
+
{
|
| 5780 |
+
"epoch": 1.1934607607072605,
|
| 5781 |
+
"grad_norm": 1.9666273593902588,
|
| 5782 |
+
"learning_rate": 9.637333411699017e-05,
|
| 5783 |
+
"loss": 4.3716,
|
| 5784 |
+
"num_input_tokens_seen": 633058978,
|
| 5785 |
+
"step": 108300
|
| 5786 |
+
},
|
| 5787 |
+
{
|
| 5788 |
+
"epoch": 1.1951137534506224,
|
| 5789 |
+
"grad_norm": 1.930409550666809,
|
| 5790 |
+
"learning_rate": 9.628517272286466e-05,
|
| 5791 |
+
"loss": 4.3934,
|
| 5792 |
+
"num_input_tokens_seen": 633935458,
|
| 5793 |
+
"step": 108450
|
| 5794 |
+
},
|
| 5795 |
+
{
|
| 5796 |
+
"epoch": 1.1967667461939842,
|
| 5797 |
+
"grad_norm": 1.8000093698501587,
|
| 5798 |
+
"learning_rate": 9.619701132873915e-05,
|
| 5799 |
+
"loss": 4.3794,
|
| 5800 |
+
"num_input_tokens_seen": 634825634,
|
| 5801 |
+
"step": 108600
|
| 5802 |
+
},
|
| 5803 |
+
{
|
| 5804 |
+
"epoch": 1.198419738937346,
|
| 5805 |
+
"grad_norm": 1.8369793891906738,
|
| 5806 |
+
"learning_rate": 9.610884993461364e-05,
|
| 5807 |
+
"loss": 4.386,
|
| 5808 |
+
"num_input_tokens_seen": 635701666,
|
| 5809 |
+
"step": 108750
|
| 5810 |
+
},
|
| 5811 |
+
{
|
| 5812 |
+
"epoch": 1.2000727316807078,
|
| 5813 |
+
"grad_norm": 1.9381849765777588,
|
| 5814 |
+
"learning_rate": 9.602068854048813e-05,
|
| 5815 |
+
"loss": 4.3824,
|
| 5816 |
+
"num_input_tokens_seen": 636568994,
|
| 5817 |
+
"step": 108900
|
| 5818 |
+
},
|
| 5819 |
+
{
|
| 5820 |
+
"epoch": 1.2017257244240698,
|
| 5821 |
+
"grad_norm": 1.8089631795883179,
|
| 5822 |
+
"learning_rate": 9.593252714636261e-05,
|
| 5823 |
+
"loss": 4.3733,
|
| 5824 |
+
"num_input_tokens_seen": 637444034,
|
| 5825 |
+
"step": 109050
|
| 5826 |
+
},
|
| 5827 |
+
{
|
| 5828 |
+
"epoch": 1.2033787171674317,
|
| 5829 |
+
"grad_norm": 1.7429847717285156,
|
| 5830 |
+
"learning_rate": 9.584436575223709e-05,
|
| 5831 |
+
"loss": 4.3766,
|
| 5832 |
+
"num_input_tokens_seen": 638321634,
|
| 5833 |
+
"step": 109200
|
| 5834 |
+
},
|
| 5835 |
+
{
|
| 5836 |
+
"epoch": 1.2050317099107934,
|
| 5837 |
+
"grad_norm": 1.9182720184326172,
|
| 5838 |
+
"learning_rate": 9.575620435811159e-05,
|
| 5839 |
+
"loss": 4.3724,
|
| 5840 |
+
"num_input_tokens_seen": 639189538,
|
| 5841 |
+
"step": 109350
|
| 5842 |
+
},
|
| 5843 |
+
{
|
| 5844 |
+
"epoch": 1.2066847026541554,
|
| 5845 |
+
"grad_norm": 1.9700244665145874,
|
| 5846 |
+
"learning_rate": 9.566804296398607e-05,
|
| 5847 |
+
"loss": 4.3859,
|
| 5848 |
+
"num_input_tokens_seen": 640080354,
|
| 5849 |
+
"step": 109500
|
| 5850 |
+
},
|
| 5851 |
+
{
|
| 5852 |
+
"epoch": 1.2083376953975171,
|
| 5853 |
+
"grad_norm": 1.86391019821167,
|
| 5854 |
+
"learning_rate": 9.557988156986057e-05,
|
| 5855 |
+
"loss": 4.3875,
|
| 5856 |
+
"num_input_tokens_seen": 640977634,
|
| 5857 |
+
"step": 109650
|
| 5858 |
+
},
|
| 5859 |
+
{
|
| 5860 |
+
"epoch": 1.209990688140879,
|
| 5861 |
+
"grad_norm": 1.9451704025268555,
|
| 5862 |
+
"learning_rate": 9.549230791836256e-05,
|
| 5863 |
+
"loss": 4.3928,
|
| 5864 |
+
"num_input_tokens_seen": 641871874,
|
| 5865 |
+
"step": 109800
|
| 5866 |
+
},
|
| 5867 |
+
{
|
| 5868 |
+
"epoch": 1.211643680884241,
|
| 5869 |
+
"grad_norm": 2.063884735107422,
|
| 5870 |
+
"learning_rate": 9.540414652423704e-05,
|
| 5871 |
+
"loss": 4.3704,
|
| 5872 |
+
"num_input_tokens_seen": 642751170,
|
| 5873 |
+
"step": 109950
|
| 5874 |
+
},
|
| 5875 |
+
{
|
| 5876 |
+
"epoch": 1.2132966736276027,
|
| 5877 |
+
"grad_norm": 1.8499351739883423,
|
| 5878 |
+
"learning_rate": 9.531598513011154e-05,
|
| 5879 |
+
"loss": 4.3886,
|
| 5880 |
+
"num_input_tokens_seen": 643629698,
|
| 5881 |
+
"step": 110100
|
| 5882 |
+
},
|
| 5883 |
+
{
|
| 5884 |
+
"epoch": 1.2149496663709647,
|
| 5885 |
+
"grad_norm": 1.9735474586486816,
|
| 5886 |
+
"learning_rate": 9.522782373598601e-05,
|
| 5887 |
+
"loss": 4.3854,
|
| 5888 |
+
"num_input_tokens_seen": 644509698,
|
| 5889 |
+
"step": 110250
|
| 5890 |
+
},
|
| 5891 |
+
{
|
| 5892 |
+
"epoch": 1.2166026591143264,
|
| 5893 |
+
"grad_norm": 1.9430962800979614,
|
| 5894 |
+
"learning_rate": 9.513966234186051e-05,
|
| 5895 |
+
"loss": 4.3905,
|
| 5896 |
+
"num_input_tokens_seen": 645395394,
|
| 5897 |
+
"step": 110400
|
| 5898 |
+
},
|
| 5899 |
+
{
|
| 5900 |
+
"epoch": 1.2182556518576884,
|
| 5901 |
+
"grad_norm": 1.9608047008514404,
|
| 5902 |
+
"learning_rate": 9.505150094773499e-05,
|
| 5903 |
+
"loss": 4.383,
|
| 5904 |
+
"num_input_tokens_seen": 646254626,
|
| 5905 |
+
"step": 110550
|
| 5906 |
+
},
|
| 5907 |
+
{
|
| 5908 |
+
"epoch": 1.21990864460105,
|
| 5909 |
+
"grad_norm": 1.9237737655639648,
|
| 5910 |
+
"learning_rate": 9.4963927296237e-05,
|
| 5911 |
+
"loss": 4.3886,
|
| 5912 |
+
"num_input_tokens_seen": 647146658,
|
| 5913 |
+
"step": 110700
|
| 5914 |
+
},
|
| 5915 |
+
{
|
| 5916 |
+
"epoch": 1.221561637344412,
|
| 5917 |
+
"grad_norm": 1.9678759574890137,
|
| 5918 |
+
"learning_rate": 9.487576590211147e-05,
|
| 5919 |
+
"loss": 4.3858,
|
| 5920 |
+
"num_input_tokens_seen": 648004962,
|
| 5921 |
+
"step": 110850
|
| 5922 |
+
},
|
| 5923 |
+
{
|
| 5924 |
+
"epoch": 1.223214630087774,
|
| 5925 |
+
"grad_norm": 1.8643629550933838,
|
| 5926 |
+
"learning_rate": 9.478760450798597e-05,
|
| 5927 |
+
"loss": 4.3718,
|
| 5928 |
+
"num_input_tokens_seen": 648877602,
|
| 5929 |
+
"step": 111000
|
| 5930 |
+
},
|
| 5931 |
+
{
|
| 5932 |
+
"epoch": 1.2248676228311357,
|
| 5933 |
+
"grad_norm": 1.8100017309188843,
|
| 5934 |
+
"learning_rate": 9.469944311386045e-05,
|
| 5935 |
+
"loss": 4.38,
|
| 5936 |
+
"num_input_tokens_seen": 649743970,
|
| 5937 |
+
"step": 111150
|
| 5938 |
+
},
|
| 5939 |
+
{
|
| 5940 |
+
"epoch": 1.2265206155744977,
|
| 5941 |
+
"grad_norm": 1.8271883726119995,
|
| 5942 |
+
"learning_rate": 9.461128171973495e-05,
|
| 5943 |
+
"loss": 4.3911,
|
| 5944 |
+
"num_input_tokens_seen": 650620130,
|
| 5945 |
+
"step": 111300
|
| 5946 |
+
},
|
| 5947 |
+
{
|
| 5948 |
+
"epoch": 1.2281736083178596,
|
| 5949 |
+
"grad_norm": 1.9749687910079956,
|
| 5950 |
+
"learning_rate": 9.452312032560942e-05,
|
| 5951 |
+
"loss": 4.3715,
|
| 5952 |
+
"num_input_tokens_seen": 651492738,
|
| 5953 |
+
"step": 111450
|
| 5954 |
+
},
|
| 5955 |
+
{
|
| 5956 |
+
"epoch": 1.2298266010612213,
|
| 5957 |
+
"grad_norm": 1.9666537046432495,
|
| 5958 |
+
"learning_rate": 9.44349589314839e-05,
|
| 5959 |
+
"loss": 4.3823,
|
| 5960 |
+
"num_input_tokens_seen": 652359170,
|
| 5961 |
+
"step": 111600
|
| 5962 |
+
},
|
| 5963 |
+
{
|
| 5964 |
+
"epoch": 1.2314795938045833,
|
| 5965 |
+
"grad_norm": 1.9260027408599854,
|
| 5966 |
+
"learning_rate": 9.43467975373584e-05,
|
| 5967 |
+
"loss": 4.3862,
|
| 5968 |
+
"num_input_tokens_seen": 653229570,
|
| 5969 |
+
"step": 111750
|
| 5970 |
+
},
|
| 5971 |
+
{
|
| 5972 |
+
"epoch": 1.233132586547945,
|
| 5973 |
+
"grad_norm": 1.8240337371826172,
|
| 5974 |
+
"learning_rate": 9.425863614323288e-05,
|
| 5975 |
+
"loss": 4.3771,
|
| 5976 |
+
"num_input_tokens_seen": 654109090,
|
| 5977 |
+
"step": 111900
|
| 5978 |
+
},
|
| 5979 |
+
{
|
| 5980 |
+
"epoch": 1.234785579291307,
|
| 5981 |
+
"grad_norm": 1.957507848739624,
|
| 5982 |
+
"learning_rate": 9.417047474910738e-05,
|
| 5983 |
+
"loss": 4.3817,
|
| 5984 |
+
"num_input_tokens_seen": 654980482,
|
| 5985 |
+
"step": 112050
|
| 5986 |
+
},
|
| 5987 |
+
{
|
| 5988 |
+
"epoch": 1.2364385720346687,
|
| 5989 |
+
"grad_norm": 1.8944330215454102,
|
| 5990 |
+
"learning_rate": 9.408231335498185e-05,
|
| 5991 |
+
"loss": 4.3812,
|
| 5992 |
+
"num_input_tokens_seen": 655849634,
|
| 5993 |
+
"step": 112200
|
| 5994 |
+
},
|
| 5995 |
+
{
|
| 5996 |
+
"epoch": 1.2380915647780306,
|
| 5997 |
+
"grad_norm": 1.8677889108657837,
|
| 5998 |
+
"learning_rate": 9.399415196085636e-05,
|
| 5999 |
+
"loss": 4.3803,
|
| 6000 |
+
"num_input_tokens_seen": 656736738,
|
| 6001 |
+
"step": 112350
|
| 6002 |
+
},
|
| 6003 |
+
{
|
| 6004 |
+
"epoch": 1.2397445575213926,
|
| 6005 |
+
"grad_norm": 1.8283082246780396,
|
| 6006 |
+
"learning_rate": 9.390599056673083e-05,
|
| 6007 |
+
"loss": 4.3933,
|
| 6008 |
+
"num_input_tokens_seen": 657615938,
|
| 6009 |
+
"step": 112500
|
| 6010 |
+
},
|
| 6011 |
+
{
|
| 6012 |
+
"epoch": 1.2413975502647543,
|
| 6013 |
+
"grad_norm": 1.9106853008270264,
|
| 6014 |
+
"learning_rate": 9.381782917260533e-05,
|
| 6015 |
+
"loss": 4.3847,
|
| 6016 |
+
"num_input_tokens_seen": 658494850,
|
| 6017 |
+
"step": 112650
|
| 6018 |
+
},
|
| 6019 |
+
{
|
| 6020 |
+
"epoch": 1.2430505430081162,
|
| 6021 |
+
"grad_norm": 1.8882030248641968,
|
| 6022 |
+
"learning_rate": 9.372966777847981e-05,
|
| 6023 |
+
"loss": 4.3862,
|
| 6024 |
+
"num_input_tokens_seen": 659363618,
|
| 6025 |
+
"step": 112800
|
| 6026 |
+
},
|
| 6027 |
+
{
|
| 6028 |
+
"epoch": 1.244703535751478,
|
| 6029 |
+
"grad_norm": 1.964934229850769,
|
| 6030 |
+
"learning_rate": 9.36415063843543e-05,
|
| 6031 |
+
"loss": 4.3805,
|
| 6032 |
+
"num_input_tokens_seen": 660234946,
|
| 6033 |
+
"step": 112950
|
| 6034 |
+
},
|
| 6035 |
+
{
|
| 6036 |
+
"epoch": 1.24635652849484,
|
| 6037 |
+
"grad_norm": 1.8856420516967773,
|
| 6038 |
+
"learning_rate": 9.355334499022878e-05,
|
| 6039 |
+
"loss": 4.3794,
|
| 6040 |
+
"num_input_tokens_seen": 661115810,
|
| 6041 |
+
"step": 113100
|
| 6042 |
+
},
|
| 6043 |
+
{
|
| 6044 |
+
"epoch": 1.2480095212382019,
|
| 6045 |
+
"grad_norm": 1.8618583679199219,
|
| 6046 |
+
"learning_rate": 9.346518359610327e-05,
|
| 6047 |
+
"loss": 4.3883,
|
| 6048 |
+
"num_input_tokens_seen": 661994434,
|
| 6049 |
+
"step": 113250
|
| 6050 |
+
},
|
| 6051 |
+
{
|
| 6052 |
+
"epoch": 1.2496625139815636,
|
| 6053 |
+
"grad_norm": 1.9158508777618408,
|
| 6054 |
+
"learning_rate": 9.337702220197776e-05,
|
| 6055 |
+
"loss": 4.3739,
|
| 6056 |
+
"num_input_tokens_seen": 662868834,
|
| 6057 |
+
"step": 113400
|
| 6058 |
+
},
|
| 6059 |
+
{
|
| 6060 |
+
"epoch": 1.2513155067249255,
|
| 6061 |
+
"grad_norm": 1.8499860763549805,
|
| 6062 |
+
"learning_rate": 9.328886080785225e-05,
|
| 6063 |
+
"loss": 4.379,
|
| 6064 |
+
"num_input_tokens_seen": 663752002,
|
| 6065 |
+
"step": 113550
|
| 6066 |
+
},
|
| 6067 |
+
{
|
| 6068 |
+
"epoch": 1.2529684994682873,
|
| 6069 |
+
"grad_norm": 1.8565645217895508,
|
| 6070 |
+
"learning_rate": 9.320069941372673e-05,
|
| 6071 |
+
"loss": 4.3854,
|
| 6072 |
+
"num_input_tokens_seen": 664622402,
|
| 6073 |
+
"step": 113700
|
| 6074 |
+
},
|
| 6075 |
+
{
|
| 6076 |
+
"epoch": 1.2546214922116492,
|
| 6077 |
+
"grad_norm": 2.060188055038452,
|
| 6078 |
+
"learning_rate": 9.311253801960123e-05,
|
| 6079 |
+
"loss": 4.3758,
|
| 6080 |
+
"num_input_tokens_seen": 665495618,
|
| 6081 |
+
"step": 113850
|
| 6082 |
+
},
|
| 6083 |
+
{
|
| 6084 |
+
"epoch": 1.2562744849550112,
|
| 6085 |
+
"grad_norm": 1.892635464668274,
|
| 6086 |
+
"learning_rate": 9.30243766254757e-05,
|
| 6087 |
+
"loss": 4.3884,
|
| 6088 |
+
"num_input_tokens_seen": 666361922,
|
| 6089 |
+
"step": 114000
|
| 6090 |
+
},
|
| 6091 |
+
{
|
| 6092 |
+
"epoch": 1.2579274776983729,
|
| 6093 |
+
"grad_norm": 1.9154144525527954,
|
| 6094 |
+
"learning_rate": 9.29362152313502e-05,
|
| 6095 |
+
"loss": 4.3752,
|
| 6096 |
+
"num_input_tokens_seen": 667241410,
|
| 6097 |
+
"step": 114150
|
| 6098 |
+
},
|
| 6099 |
+
{
|
| 6100 |
+
"epoch": 1.2595804704417348,
|
| 6101 |
+
"grad_norm": 1.9253753423690796,
|
| 6102 |
+
"learning_rate": 9.284805383722468e-05,
|
| 6103 |
+
"loss": 4.3875,
|
| 6104 |
+
"num_input_tokens_seen": 668132226,
|
| 6105 |
+
"step": 114300
|
| 6106 |
+
},
|
| 6107 |
+
{
|
| 6108 |
+
"epoch": 1.2612334631850965,
|
| 6109 |
+
"grad_norm": 1.9465709924697876,
|
| 6110 |
+
"learning_rate": 9.275989244309918e-05,
|
| 6111 |
+
"loss": 4.3742,
|
| 6112 |
+
"num_input_tokens_seen": 669015202,
|
| 6113 |
+
"step": 114450
|
| 6114 |
+
},
|
| 6115 |
+
{
|
| 6116 |
+
"epoch": 1.2628864559284585,
|
| 6117 |
+
"grad_norm": 1.9070016145706177,
|
| 6118 |
+
"learning_rate": 9.267173104897366e-05,
|
| 6119 |
+
"loss": 4.3737,
|
| 6120 |
+
"num_input_tokens_seen": 669892578,
|
| 6121 |
+
"step": 114600
|
| 6122 |
+
},
|
| 6123 |
+
{
|
| 6124 |
+
"epoch": 1.2645394486718202,
|
| 6125 |
+
"grad_norm": 1.9075013399124146,
|
| 6126 |
+
"learning_rate": 9.258356965484816e-05,
|
| 6127 |
+
"loss": 4.3789,
|
| 6128 |
+
"num_input_tokens_seen": 670773314,
|
| 6129 |
+
"step": 114750
|
| 6130 |
+
},
|
| 6131 |
+
{
|
| 6132 |
+
"epoch": 1.2661924414151822,
|
| 6133 |
+
"grad_norm": 1.8648816347122192,
|
| 6134 |
+
"learning_rate": 9.249540826072263e-05,
|
| 6135 |
+
"loss": 4.3583,
|
| 6136 |
+
"num_input_tokens_seen": 671644514,
|
| 6137 |
+
"step": 114900
|
| 6138 |
+
},
|
| 6139 |
+
{
|
| 6140 |
+
"epoch": 1.2678454341585441,
|
| 6141 |
+
"grad_norm": 1.9572055339813232,
|
| 6142 |
+
"learning_rate": 9.240724686659714e-05,
|
| 6143 |
+
"loss": 4.3871,
|
| 6144 |
+
"num_input_tokens_seen": 672523202,
|
| 6145 |
+
"step": 115050
|
| 6146 |
+
},
|
| 6147 |
+
{
|
| 6148 |
+
"epoch": 1.2694984269019058,
|
| 6149 |
+
"grad_norm": 1.9419187307357788,
|
| 6150 |
+
"learning_rate": 9.231908547247161e-05,
|
| 6151 |
+
"loss": 4.3802,
|
| 6152 |
+
"num_input_tokens_seen": 673387298,
|
| 6153 |
+
"step": 115200
|
| 6154 |
+
},
|
| 6155 |
+
{
|
| 6156 |
+
"epoch": 1.2711514196452678,
|
| 6157 |
+
"grad_norm": 1.9556363821029663,
|
| 6158 |
+
"learning_rate": 9.223092407834611e-05,
|
| 6159 |
+
"loss": 4.3922,
|
| 6160 |
+
"num_input_tokens_seen": 674262786,
|
| 6161 |
+
"step": 115350
|
| 6162 |
+
},
|
| 6163 |
+
{
|
| 6164 |
+
"epoch": 1.2728044123886297,
|
| 6165 |
+
"grad_norm": 1.8693435192108154,
|
| 6166 |
+
"learning_rate": 9.214276268422059e-05,
|
| 6167 |
+
"loss": 4.3719,
|
| 6168 |
+
"num_input_tokens_seen": 675145058,
|
| 6169 |
+
"step": 115500
|
| 6170 |
+
},
|
| 6171 |
+
{
|
| 6172 |
+
"epoch": 1.2744574051319915,
|
| 6173 |
+
"grad_norm": 1.9475206136703491,
|
| 6174 |
+
"learning_rate": 9.205460129009508e-05,
|
| 6175 |
+
"loss": 4.38,
|
| 6176 |
+
"num_input_tokens_seen": 676008962,
|
| 6177 |
+
"step": 115650
|
| 6178 |
+
},
|
| 6179 |
+
{
|
| 6180 |
+
"epoch": 1.2761103978753534,
|
| 6181 |
+
"grad_norm": 1.8718332052230835,
|
| 6182 |
+
"learning_rate": 9.196643989596957e-05,
|
| 6183 |
+
"loss": 4.3734,
|
| 6184 |
+
"num_input_tokens_seen": 676887042,
|
| 6185 |
+
"step": 115800
|
| 6186 |
+
},
|
| 6187 |
+
{
|
| 6188 |
+
"epoch": 1.2777633906187151,
|
| 6189 |
+
"grad_norm": 1.8318613767623901,
|
| 6190 |
+
"learning_rate": 9.187827850184405e-05,
|
| 6191 |
+
"loss": 4.3857,
|
| 6192 |
+
"num_input_tokens_seen": 677766690,
|
| 6193 |
+
"step": 115950
|
| 6194 |
}
|
| 6195 |
],
|
| 6196 |
"logging_steps": 150,
|
| 6197 |
"max_steps": 272232,
|
| 6198 |
+
"num_input_tokens_seen": 678060130,
|
| 6199 |
"num_train_epochs": 3,
|
| 6200 |
"save_steps": 500,
|
| 6201 |
"stateful_callbacks": {
|
|
|
|
| 6210 |
"attributes": {}
|
| 6211 |
}
|
| 6212 |
},
|
| 6213 |
+
"total_flos": 1.04696823656832e+16,
|
| 6214 |
"train_batch_size": 32,
|
| 6215 |
"trial_name": null,
|
| 6216 |
"trial_params": null
|