Upload checkpoint-600
Browse files- model-00001-of-00002.safetensors +1 -1
- model-00002-of-00002.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +49 -6
model-00001-of-00002.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4966315264
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7c0eb8256abd61f22bcffbcec610d3c210e32ec6939a01419fa52e72abc831c3
|
| 3 |
size 4966315264
|
model-00002-of-00002.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1183919744
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:61bb1d2426ffeffdb7b41bb9f8ad70de7b954c8d105546b657b0ecd82e90e083
|
| 3 |
size 1183919744
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 12300683155
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:da32da7f2111b60955ad152993e4e09f5ff9ef40ee4fc472e1a78a637e27a28b
|
| 3 |
size 12300683155
|
rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:236304ae89e49aae8260113165ee63419b9b745f79120014328a7fa31ed79b42
|
| 3 |
size 14645
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23756909803a9fdd0e1e0f7083cd1a77d433834eb7dafbf07ba536044bccc4fc
|
| 3 |
size 1465
|
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 3.
|
| 4 |
-
"best_model_checkpoint": "KBayoud/SmolLM3-3B-bs-16-lr-0.0001-ep-2-wp-0.1-gacc-32-gnm-1.0-FP16-mx-2048-v0.1/checkpoint-
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 50,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -481,6 +481,49 @@
|
|
| 481 |
"eval_samples_per_second": 13.051,
|
| 482 |
"eval_steps_per_second": 0.816,
|
| 483 |
"step": 550
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 484 |
}
|
| 485 |
],
|
| 486 |
"logging_steps": 10,
|
|
@@ -500,7 +543,7 @@
|
|
| 500 |
"attributes": {}
|
| 501 |
}
|
| 502 |
},
|
| 503 |
-
"total_flos":
|
| 504 |
"train_batch_size": 16,
|
| 505 |
"trial_name": null,
|
| 506 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 600,
|
| 3 |
+
"best_metric": 3.139875650405884,
|
| 4 |
+
"best_model_checkpoint": "KBayoud/SmolLM3-3B-bs-16-lr-0.0001-ep-2-wp-0.1-gacc-32-gnm-1.0-FP16-mx-2048-v0.1/checkpoint-600",
|
| 5 |
+
"epoch": 0.09776315856471463,
|
| 6 |
"eval_steps": 50,
|
| 7 |
+
"global_step": 600,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 481 |
"eval_samples_per_second": 13.051,
|
| 482 |
"eval_steps_per_second": 0.816,
|
| 483 |
"step": 550
|
| 484 |
+
},
|
| 485 |
+
{
|
| 486 |
+
"epoch": 0.09124561466040032,
|
| 487 |
+
"grad_norm": 2.625,
|
| 488 |
+
"learning_rate": 4.5521172638436485e-05,
|
| 489 |
+
"loss": 2.9931,
|
| 490 |
+
"step": 560
|
| 491 |
+
},
|
| 492 |
+
{
|
| 493 |
+
"epoch": 0.0928750006364789,
|
| 494 |
+
"grad_norm": 2.859375,
|
| 495 |
+
"learning_rate": 4.633550488599348e-05,
|
| 496 |
+
"loss": 3.0269,
|
| 497 |
+
"step": 570
|
| 498 |
+
},
|
| 499 |
+
{
|
| 500 |
+
"epoch": 0.09450438661255747,
|
| 501 |
+
"grad_norm": 2.53125,
|
| 502 |
+
"learning_rate": 4.714983713355049e-05,
|
| 503 |
+
"loss": 3.0161,
|
| 504 |
+
"step": 580
|
| 505 |
+
},
|
| 506 |
+
{
|
| 507 |
+
"epoch": 0.09613377258863605,
|
| 508 |
+
"grad_norm": 2.453125,
|
| 509 |
+
"learning_rate": 4.796416938110749e-05,
|
| 510 |
+
"loss": 2.9851,
|
| 511 |
+
"step": 590
|
| 512 |
+
},
|
| 513 |
+
{
|
| 514 |
+
"epoch": 0.09776315856471463,
|
| 515 |
+
"grad_norm": 2.6875,
|
| 516 |
+
"learning_rate": 4.8778501628664496e-05,
|
| 517 |
+
"loss": 2.9918,
|
| 518 |
+
"step": 600
|
| 519 |
+
},
|
| 520 |
+
{
|
| 521 |
+
"epoch": 0.09776315856471463,
|
| 522 |
+
"eval_loss": 3.139875650405884,
|
| 523 |
+
"eval_runtime": 59897.1239,
|
| 524 |
+
"eval_samples_per_second": 13.115,
|
| 525 |
+
"eval_steps_per_second": 0.82,
|
| 526 |
+
"step": 600
|
| 527 |
}
|
| 528 |
],
|
| 529 |
"logging_steps": 10,
|
|
|
|
| 543 |
"attributes": {}
|
| 544 |
}
|
| 545 |
},
|
| 546 |
+
"total_flos": 1.061656902720553e+19,
|
| 547 |
"train_batch_size": 16,
|
| 548 |
"trial_name": null,
|
| 549 |
"trial_params": null
|