sft-6900-step
Browse files- adapter_model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +228 -6
adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 698419728
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2ffc6552dcc71c20350838f8a219506181ba46c38f659ec852bba2bddda4dfc
|
| 3 |
size 698419728
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1397136587
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:42b8045c238ff4f74f9e3fe7c94d27857f2fadb3c697f6661bb73fb9bb04a576
|
| 3 |
size 1397136587
|
rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be73d303d67b9e1d37ae52f58cd2c7c7c5aeb597a44b9c72b8875cd9acb7be14
|
| 3 |
size 14645
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fff36b79323a1d28015f7255a86b9e604fcdba024c3e96bcdca5e4c7054b0293
|
| 3 |
size 1465
|
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 1.
|
| 4 |
-
"best_model_checkpoint": "/workspace/project_2026_1/checkpoints/sft/checkpoint-
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -7445,6 +7445,228 @@
|
|
| 7445 |
"eval_samples_per_second": 26.029,
|
| 7446 |
"eval_steps_per_second": 3.257,
|
| 7447 |
"step": 6700
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7448 |
}
|
| 7449 |
],
|
| 7450 |
"logging_steps": 10,
|
|
@@ -7464,7 +7686,7 @@
|
|
| 7464 |
"attributes": {}
|
| 7465 |
}
|
| 7466 |
},
|
| 7467 |
-
"total_flos": 1.
|
| 7468 |
"train_batch_size": 8,
|
| 7469 |
"trial_name": null,
|
| 7470 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 6800,
|
| 3 |
+
"best_metric": 1.1395292282104492,
|
| 4 |
+
"best_model_checkpoint": "/workspace/project_2026_1/checkpoints/sft/checkpoint-6800",
|
| 5 |
+
"epoch": 2.030008826125331,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 6900,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 7445 |
"eval_samples_per_second": 26.029,
|
| 7446 |
"eval_steps_per_second": 3.257,
|
| 7447 |
"step": 6700
|
| 7448 |
+
},
|
| 7449 |
+
{
|
| 7450 |
+
"entropy": 0.9509491920471191,
|
| 7451 |
+
"epoch": 1.9741100323624594,
|
| 7452 |
+
"grad_norm": 0.6006605625152588,
|
| 7453 |
+
"learning_rate": 5.744202611276379e-05,
|
| 7454 |
+
"loss": 0.9503057479858399,
|
| 7455 |
+
"mean_token_accuracy": 0.7786516189575196,
|
| 7456 |
+
"num_tokens": 27097949.0,
|
| 7457 |
+
"step": 6710
|
| 7458 |
+
},
|
| 7459 |
+
{
|
| 7460 |
+
"entropy": 1.0138035595417023,
|
| 7461 |
+
"epoch": 1.9770520741394528,
|
| 7462 |
+
"grad_norm": 0.5902991890907288,
|
| 7463 |
+
"learning_rate": 5.7148775369783694e-05,
|
| 7464 |
+
"loss": 1.0296749114990233,
|
| 7465 |
+
"mean_token_accuracy": 0.7590757310390472,
|
| 7466 |
+
"num_tokens": 27138453.0,
|
| 7467 |
+
"step": 6720
|
| 7468 |
+
},
|
| 7469 |
+
{
|
| 7470 |
+
"entropy": 0.933520519733429,
|
| 7471 |
+
"epoch": 1.979994115916446,
|
| 7472 |
+
"grad_norm": 0.5484936833381653,
|
| 7473 |
+
"learning_rate": 5.685597532311455e-05,
|
| 7474 |
+
"loss": 0.957374095916748,
|
| 7475 |
+
"mean_token_accuracy": 0.7793904483318329,
|
| 7476 |
+
"num_tokens": 27178805.0,
|
| 7477 |
+
"step": 6730
|
| 7478 |
+
},
|
| 7479 |
+
{
|
| 7480 |
+
"entropy": 0.9440324783325196,
|
| 7481 |
+
"epoch": 1.9829361576934392,
|
| 7482 |
+
"grad_norm": 0.5826029777526855,
|
| 7483 |
+
"learning_rate": 5.656362905233923e-05,
|
| 7484 |
+
"loss": 0.9262220382690429,
|
| 7485 |
+
"mean_token_accuracy": 0.7845340669155121,
|
| 7486 |
+
"num_tokens": 27219347.0,
|
| 7487 |
+
"step": 6740
|
| 7488 |
+
},
|
| 7489 |
+
{
|
| 7490 |
+
"entropy": 0.9071877419948577,
|
| 7491 |
+
"epoch": 1.9858781994704324,
|
| 7492 |
+
"grad_norm": 0.5721964836120605,
|
| 7493 |
+
"learning_rate": 5.6271739632268094e-05,
|
| 7494 |
+
"loss": 0.9060114860534668,
|
| 7495 |
+
"mean_token_accuracy": 0.7890908360481262,
|
| 7496 |
+
"num_tokens": 27258890.0,
|
| 7497 |
+
"step": 6750
|
| 7498 |
+
},
|
| 7499 |
+
{
|
| 7500 |
+
"entropy": 0.9562793612480164,
|
| 7501 |
+
"epoch": 1.9888202412474256,
|
| 7502 |
+
"grad_norm": 0.614380955696106,
|
| 7503 |
+
"learning_rate": 5.598031013290631e-05,
|
| 7504 |
+
"loss": 0.9876157760620117,
|
| 7505 |
+
"mean_token_accuracy": 0.768429833650589,
|
| 7506 |
+
"num_tokens": 27299053.0,
|
| 7507 |
+
"step": 6760
|
| 7508 |
+
},
|
| 7509 |
+
{
|
| 7510 |
+
"entropy": 0.9924969553947449,
|
| 7511 |
+
"epoch": 1.991762283024419,
|
| 7512 |
+
"grad_norm": 0.6030513644218445,
|
| 7513 |
+
"learning_rate": 5.5689343619421906e-05,
|
| 7514 |
+
"loss": 0.9977625846862793,
|
| 7515 |
+
"mean_token_accuracy": 0.7658666670322418,
|
| 7516 |
+
"num_tokens": 27339515.0,
|
| 7517 |
+
"step": 6770
|
| 7518 |
+
},
|
| 7519 |
+
{
|
| 7520 |
+
"entropy": 0.9534170269966126,
|
| 7521 |
+
"epoch": 1.994704324801412,
|
| 7522 |
+
"grad_norm": 0.5039950609207153,
|
| 7523 |
+
"learning_rate": 5.539884315211321e-05,
|
| 7524 |
+
"loss": 0.9545814514160156,
|
| 7525 |
+
"mean_token_accuracy": 0.7779964745044708,
|
| 7526 |
+
"num_tokens": 27379693.0,
|
| 7527 |
+
"step": 6780
|
| 7528 |
+
},
|
| 7529 |
+
{
|
| 7530 |
+
"entropy": 0.9789716601371765,
|
| 7531 |
+
"epoch": 1.9976463665784054,
|
| 7532 |
+
"grad_norm": 0.5822030305862427,
|
| 7533 |
+
"learning_rate": 5.5108811786376925e-05,
|
| 7534 |
+
"loss": 0.9928366661071777,
|
| 7535 |
+
"mean_token_accuracy": 0.7682704031467438,
|
| 7536 |
+
"num_tokens": 27419734.0,
|
| 7537 |
+
"step": 6790
|
| 7538 |
+
},
|
| 7539 |
+
{
|
| 7540 |
+
"entropy": 0.915216040611267,
|
| 7541 |
+
"epoch": 2.000588408355399,
|
| 7542 |
+
"grad_norm": 0.4654218554496765,
|
| 7543 |
+
"learning_rate": 5.481925257267589e-05,
|
| 7544 |
+
"loss": 0.8871613502502441,
|
| 7545 |
+
"mean_token_accuracy": 0.7920856356620789,
|
| 7546 |
+
"num_tokens": 27458303.0,
|
| 7547 |
+
"step": 6800
|
| 7548 |
+
},
|
| 7549 |
+
{
|
| 7550 |
+
"epoch": 2.000588408355399,
|
| 7551 |
+
"eval_entropy": 0.9942471109663095,
|
| 7552 |
+
"eval_loss": 1.1395292282104492,
|
| 7553 |
+
"eval_mean_token_accuracy": 0.7511763375575148,
|
| 7554 |
+
"eval_num_tokens": 27458303.0,
|
| 7555 |
+
"eval_runtime": 116.8845,
|
| 7556 |
+
"eval_samples_per_second": 26.051,
|
| 7557 |
+
"eval_steps_per_second": 3.26,
|
| 7558 |
+
"step": 6800
|
| 7559 |
+
},
|
| 7560 |
+
{
|
| 7561 |
+
"entropy": 0.7543269693851471,
|
| 7562 |
+
"epoch": 2.003530450132392,
|
| 7563 |
+
"grad_norm": 0.6209985613822937,
|
| 7564 |
+
"learning_rate": 5.4530168556506875e-05,
|
| 7565 |
+
"loss": 0.6749869823455811,
|
| 7566 |
+
"mean_token_accuracy": 0.8347735464572906,
|
| 7567 |
+
"num_tokens": 27498607.0,
|
| 7568 |
+
"step": 6810
|
| 7569 |
+
},
|
| 7570 |
+
{
|
| 7571 |
+
"entropy": 0.6835850536823272,
|
| 7572 |
+
"epoch": 2.0064724919093853,
|
| 7573 |
+
"grad_norm": 0.781541109085083,
|
| 7574 |
+
"learning_rate": 5.424156277836881e-05,
|
| 7575 |
+
"loss": 0.6951170921325683,
|
| 7576 |
+
"mean_token_accuracy": 0.8288436651229858,
|
| 7577 |
+
"num_tokens": 27538904.0,
|
| 7578 |
+
"step": 6820
|
| 7579 |
+
},
|
| 7580 |
+
{
|
| 7581 |
+
"entropy": 0.6437631964683532,
|
| 7582 |
+
"epoch": 2.0094145336863782,
|
| 7583 |
+
"grad_norm": 0.8998324871063232,
|
| 7584 |
+
"learning_rate": 5.395343827373053e-05,
|
| 7585 |
+
"loss": 0.6296420574188233,
|
| 7586 |
+
"mean_token_accuracy": 0.8461188077926636,
|
| 7587 |
+
"num_tokens": 27579223.0,
|
| 7588 |
+
"step": 6830
|
| 7589 |
+
},
|
| 7590 |
+
{
|
| 7591 |
+
"entropy": 0.6127074956893921,
|
| 7592 |
+
"epoch": 2.0123565754633717,
|
| 7593 |
+
"grad_norm": 0.6167740225791931,
|
| 7594 |
+
"learning_rate": 5.366579807299909e-05,
|
| 7595 |
+
"loss": 0.5965664386749268,
|
| 7596 |
+
"mean_token_accuracy": 0.850104957818985,
|
| 7597 |
+
"num_tokens": 27619638.0,
|
| 7598 |
+
"step": 6840
|
| 7599 |
+
},
|
| 7600 |
+
{
|
| 7601 |
+
"entropy": 0.6964607417583466,
|
| 7602 |
+
"epoch": 2.0152986172403646,
|
| 7603 |
+
"grad_norm": 0.637476921081543,
|
| 7604 |
+
"learning_rate": 5.337864520148768e-05,
|
| 7605 |
+
"loss": 0.6968545913696289,
|
| 7606 |
+
"mean_token_accuracy": 0.8300110459327698,
|
| 7607 |
+
"num_tokens": 27660158.0,
|
| 7608 |
+
"step": 6850
|
| 7609 |
+
},
|
| 7610 |
+
{
|
| 7611 |
+
"entropy": 0.6738093435764313,
|
| 7612 |
+
"epoch": 2.018240659017358,
|
| 7613 |
+
"grad_norm": 0.7894798517227173,
|
| 7614 |
+
"learning_rate": 5.309198267938402e-05,
|
| 7615 |
+
"loss": 0.6670093059539794,
|
| 7616 |
+
"mean_token_accuracy": 0.8377935826778412,
|
| 7617 |
+
"num_tokens": 27700212.0,
|
| 7618 |
+
"step": 6860
|
| 7619 |
+
},
|
| 7620 |
+
{
|
| 7621 |
+
"entropy": 0.6280623555183411,
|
| 7622 |
+
"epoch": 2.0211827007943515,
|
| 7623 |
+
"grad_norm": 0.80244380235672,
|
| 7624 |
+
"learning_rate": 5.280581352171836e-05,
|
| 7625 |
+
"loss": 0.6267249107360839,
|
| 7626 |
+
"mean_token_accuracy": 0.8437743067741394,
|
| 7627 |
+
"num_tokens": 27740554.0,
|
| 7628 |
+
"step": 6870
|
| 7629 |
+
},
|
| 7630 |
+
{
|
| 7631 |
+
"entropy": 0.6882079899311065,
|
| 7632 |
+
"epoch": 2.0241247425713444,
|
| 7633 |
+
"grad_norm": 0.7488958835601807,
|
| 7634 |
+
"learning_rate": 5.2520140738332025e-05,
|
| 7635 |
+
"loss": 0.6897297382354737,
|
| 7636 |
+
"mean_token_accuracy": 0.8309988558292389,
|
| 7637 |
+
"num_tokens": 27781034.0,
|
| 7638 |
+
"step": 6880
|
| 7639 |
+
},
|
| 7640 |
+
{
|
| 7641 |
+
"entropy": 0.676528149843216,
|
| 7642 |
+
"epoch": 2.027066784348338,
|
| 7643 |
+
"grad_norm": 0.8301676511764526,
|
| 7644 |
+
"learning_rate": 5.2234967333845466e-05,
|
| 7645 |
+
"loss": 0.6622447490692138,
|
| 7646 |
+
"mean_token_accuracy": 0.8345989942550659,
|
| 7647 |
+
"num_tokens": 27821579.0,
|
| 7648 |
+
"step": 6890
|
| 7649 |
+
},
|
| 7650 |
+
{
|
| 7651 |
+
"entropy": 0.6388787865638733,
|
| 7652 |
+
"epoch": 2.030008826125331,
|
| 7653 |
+
"grad_norm": 0.7029614448547363,
|
| 7654 |
+
"learning_rate": 5.1950296307626956e-05,
|
| 7655 |
+
"loss": 0.6487605571746826,
|
| 7656 |
+
"mean_token_accuracy": 0.8400563955307007,
|
| 7657 |
+
"num_tokens": 27861899.0,
|
| 7658 |
+
"step": 6900
|
| 7659 |
+
},
|
| 7660 |
+
{
|
| 7661 |
+
"epoch": 2.030008826125331,
|
| 7662 |
+
"eval_entropy": 0.8397969613707285,
|
| 7663 |
+
"eval_loss": 1.2236672639846802,
|
| 7664 |
+
"eval_mean_token_accuracy": 0.7469185830101254,
|
| 7665 |
+
"eval_num_tokens": 27861899.0,
|
| 7666 |
+
"eval_runtime": 116.8259,
|
| 7667 |
+
"eval_samples_per_second": 26.064,
|
| 7668 |
+
"eval_steps_per_second": 3.261,
|
| 7669 |
+
"step": 6900
|
| 7670 |
}
|
| 7671 |
],
|
| 7672 |
"logging_steps": 10,
|
|
|
|
| 7686 |
"attributes": {}
|
| 7687 |
}
|
| 7688 |
},
|
| 7689 |
+
"total_flos": 1.2944070017481708e+18,
|
| 7690 |
"train_batch_size": 8,
|
| 7691 |
"trial_name": null,
|
| 7692 |
"trial_params": null
|