stage2 size-250k ckpt-100 weights 2026-06-05T06:41:39+02:00
Browse files- size-250k/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +2 -2
- size-250k/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2 -2
- size-250k/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2 -2
- size-250k/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2 -2
- size-250k/checkpoint-100/global_step100/mp_rank_00_model_states.pt +1 -1
- size-250k/checkpoint-100/model-00001-of-00004.safetensors +1 -1
- size-250k/checkpoint-100/model-00002-of-00004.safetensors +1 -1
- size-250k/checkpoint-100/model-00003-of-00004.safetensors +1 -1
- size-250k/checkpoint-100/model-00004-of-00004.safetensors +1 -1
- size-250k/checkpoint-100/trainer_state.json +62 -62
- size-250k/checkpoint-100/training_args.bin +1 -1
size-250k/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aefc71d94b209f7cd29c0116a29f05ca2e3f6eef5ea711434fc9cd79662393e0
|
| 3 |
+
size 5709600220
|
size-250k/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebf99788094f8c9bb48d4a2fda249406ba047d0be5078b1f68212e035633299b
|
| 3 |
+
size 5709601436
|
size-250k/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dba23acc32dc6029bd269f5d5227a741c2b1415937b618b3867385c67f0aa30e
|
| 3 |
+
size 5709601372
|
size-250k/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eeb5200a4ae28ede6faccac6fe0a0e03bf9e1eaaa302dde6f2780eaf8e8e8a3b
|
| 3 |
+
size 5709601436
|
size-250k/checkpoint-100/global_step100/mp_rank_00_model_states.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 17932200534
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cd35decc59726c9d8341004e7d9129796996e2536a1e636f4d4943f95a4459a5
|
| 3 |
size 17932200534
|
size-250k/checkpoint-100/model-00001-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4965419112
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e13adca318a79871de5ecfba6647f2755364d93dbec9569ce8627d6b5f06263e
|
| 3 |
size 4965419112
|
size-250k/checkpoint-100/model-00002-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4991495816
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4d53ded58f4de861066edafa7447316f1f19e22c0c65bd73cb44070b7e48cbfc
|
| 3 |
size 4991495816
|
size-250k/checkpoint-100/model-00003-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4932751040
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c23abe77a027e2733c5f0a82127c9e9d6c03ed5f54bd46e27545a2aeeef62473
|
| 3 |
size 4932751040
|
size-250k/checkpoint-100/model-00004-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1689100192
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:636226e0264f48f6f489a47650baae72527f51d1a4773e7e4697f2232c422f5a
|
| 3 |
size 1689100192
|
size-250k/checkpoint-100/trainer_state.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
"global_step": 100,
|
| 8 |
"is_hyper_param_search": false,
|
|
@@ -10,114 +10,114 @@
|
|
| 10 |
"is_world_process_zero": true,
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
-
"alignment_loss": 0.
|
| 14 |
-
"epoch": 0.
|
| 15 |
-
"grad_norm":
|
| 16 |
"learning_rate": 9e-06,
|
| 17 |
-
"loss":
|
| 18 |
-
"mean_token_accuracy": 0.
|
| 19 |
"num_tokens": 1084700.0,
|
| 20 |
"step": 10,
|
| 21 |
-
"teacher_ce_loss":
|
| 22 |
},
|
| 23 |
{
|
| 24 |
-
"alignment_loss": 0.
|
| 25 |
-
"epoch": 0.
|
| 26 |
-
"grad_norm":
|
| 27 |
"learning_rate": 9.97655028660761e-06,
|
| 28 |
-
"loss": 163.
|
| 29 |
-
"mean_token_accuracy": 0.
|
| 30 |
"num_tokens": 2173682.0,
|
| 31 |
"step": 20,
|
| 32 |
-
"teacher_ce_loss":
|
| 33 |
},
|
| 34 |
{
|
| 35 |
-
"alignment_loss": 0.
|
| 36 |
-
"epoch": 0.
|
| 37 |
-
"grad_norm":
|
| 38 |
"learning_rate": 9.950495049504951e-06,
|
| 39 |
-
"loss": 112.
|
| 40 |
-
"mean_token_accuracy": 0.
|
| 41 |
"num_tokens": 3254544.0,
|
| 42 |
"step": 30,
|
| 43 |
-
"teacher_ce_loss":
|
| 44 |
},
|
| 45 |
{
|
| 46 |
-
"alignment_loss": 0.
|
| 47 |
-
"epoch": 0.
|
| 48 |
-
"grad_norm":
|
| 49 |
"learning_rate": 9.924439812402293e-06,
|
| 50 |
-
"loss": 80.
|
| 51 |
-
"mean_token_accuracy": 0.
|
| 52 |
"num_tokens": 4366899.0,
|
| 53 |
"step": 40,
|
| 54 |
-
"teacher_ce_loss":
|
| 55 |
},
|
| 56 |
{
|
| 57 |
-
"alignment_loss": 0.
|
| 58 |
-
"epoch": 0.
|
| 59 |
-
"grad_norm":
|
| 60 |
"learning_rate": 9.898384575299636e-06,
|
| 61 |
-
"loss": 55.
|
| 62 |
-
"mean_token_accuracy": 0.
|
| 63 |
"num_tokens": 5454040.0,
|
| 64 |
"step": 50,
|
| 65 |
-
"teacher_ce_loss":
|
| 66 |
},
|
| 67 |
{
|
| 68 |
-
"alignment_loss": 0.
|
| 69 |
-
"epoch": 0.
|
| 70 |
-
"grad_norm":
|
| 71 |
"learning_rate": 9.872329338196979e-06,
|
| 72 |
-
"loss": 47.
|
| 73 |
-
"mean_token_accuracy": 0.
|
| 74 |
"num_tokens": 6535542.0,
|
| 75 |
"step": 60,
|
| 76 |
-
"teacher_ce_loss":
|
| 77 |
},
|
| 78 |
{
|
| 79 |
-
"alignment_loss": 0.
|
| 80 |
-
"epoch": 0.
|
| 81 |
-
"grad_norm":
|
| 82 |
"learning_rate": 9.84627410109432e-06,
|
| 83 |
-
"loss": 45.
|
| 84 |
-
"mean_token_accuracy": 0.
|
| 85 |
"num_tokens": 7587360.0,
|
| 86 |
"step": 70,
|
| 87 |
-
"teacher_ce_loss":
|
| 88 |
},
|
| 89 |
{
|
| 90 |
-
"alignment_loss": 0.
|
| 91 |
-
"epoch": 0.
|
| 92 |
-
"grad_norm":
|
| 93 |
"learning_rate": 9.820218863991662e-06,
|
| 94 |
-
"loss": 43.
|
| 95 |
-
"mean_token_accuracy": 0.
|
| 96 |
"num_tokens": 8677343.0,
|
| 97 |
"step": 80,
|
| 98 |
-
"teacher_ce_loss":
|
| 99 |
},
|
| 100 |
{
|
| 101 |
-
"alignment_loss": 0.
|
| 102 |
-
"epoch": 0.
|
| 103 |
-
"grad_norm":
|
| 104 |
"learning_rate": 9.794163626889005e-06,
|
| 105 |
-
"loss": 42.
|
| 106 |
-
"mean_token_accuracy": 0.
|
| 107 |
"num_tokens": 9750881.0,
|
| 108 |
"step": 90,
|
| 109 |
-
"teacher_ce_loss":
|
| 110 |
},
|
| 111 |
{
|
| 112 |
-
"alignment_loss": 0.
|
| 113 |
-
"epoch": 0.
|
| 114 |
-
"grad_norm":
|
| 115 |
"learning_rate": 9.768108389786348e-06,
|
| 116 |
-
"loss":
|
| 117 |
-
"mean_token_accuracy": 0.
|
| 118 |
"num_tokens": 10855717.0,
|
| 119 |
"step": 100,
|
| 120 |
-
"teacher_ce_loss":
|
| 121 |
}
|
| 122 |
],
|
| 123 |
"logging_steps": 10,
|
|
@@ -137,7 +137,7 @@
|
|
| 137 |
"attributes": {}
|
| 138 |
}
|
| 139 |
},
|
| 140 |
-
"total_flos": 5.
|
| 141 |
"train_batch_size": 1,
|
| 142 |
"trial_name": null,
|
| 143 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.05197505197505198,
|
| 6 |
"eval_steps": 500,
|
| 7 |
"global_step": 100,
|
| 8 |
"is_hyper_param_search": false,
|
|
|
|
| 10 |
"is_world_process_zero": true,
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
+
"alignment_loss": 0.795723,
|
| 14 |
+
"epoch": 0.005197505197505198,
|
| 15 |
+
"grad_norm": 9890.51953125,
|
| 16 |
"learning_rate": 9e-06,
|
| 17 |
+
"loss": 595.1739,
|
| 18 |
+
"mean_token_accuracy": 0.6256996631622315,
|
| 19 |
"num_tokens": 1084700.0,
|
| 20 |
"step": 10,
|
| 21 |
+
"teacher_ce_loss": 72.813206
|
| 22 |
},
|
| 23 |
{
|
| 24 |
+
"alignment_loss": 0.901524,
|
| 25 |
+
"epoch": 0.010395010395010396,
|
| 26 |
+
"grad_norm": 3162.57958984375,
|
| 27 |
"learning_rate": 9.97655028660761e-06,
|
| 28 |
+
"loss": 163.0194,
|
| 29 |
+
"mean_token_accuracy": 0.7504611149430275,
|
| 30 |
"num_tokens": 2173682.0,
|
| 31 |
"step": 20,
|
| 32 |
+
"teacher_ce_loss": 19.007352
|
| 33 |
},
|
| 34 |
{
|
| 35 |
+
"alignment_loss": 0.910772,
|
| 36 |
+
"epoch": 0.015592515592515593,
|
| 37 |
+
"grad_norm": 3037.57958984375,
|
| 38 |
"learning_rate": 9.950495049504951e-06,
|
| 39 |
+
"loss": 112.238,
|
| 40 |
+
"mean_token_accuracy": 0.7754241786897182,
|
| 41 |
"num_tokens": 3254544.0,
|
| 42 |
"step": 30,
|
| 43 |
+
"teacher_ce_loss": 14.432665
|
| 44 |
},
|
| 45 |
{
|
| 46 |
+
"alignment_loss": 0.885737,
|
| 47 |
+
"epoch": 0.02079002079002079,
|
| 48 |
+
"grad_norm": 2462.610595703125,
|
| 49 |
"learning_rate": 9.924439812402293e-06,
|
| 50 |
+
"loss": 80.2902,
|
| 51 |
+
"mean_token_accuracy": 0.8474745027720928,
|
| 52 |
"num_tokens": 4366899.0,
|
| 53 |
"step": 40,
|
| 54 |
+
"teacher_ce_loss": 9.699339
|
| 55 |
},
|
| 56 |
{
|
| 57 |
+
"alignment_loss": 0.867464,
|
| 58 |
+
"epoch": 0.02598752598752599,
|
| 59 |
+
"grad_norm": 817.1935424804688,
|
| 60 |
"learning_rate": 9.898384575299636e-06,
|
| 61 |
+
"loss": 55.5484,
|
| 62 |
+
"mean_token_accuracy": 0.8571618065237999,
|
| 63 |
"num_tokens": 5454040.0,
|
| 64 |
"step": 50,
|
| 65 |
+
"teacher_ce_loss": 7.613436
|
| 66 |
},
|
| 67 |
{
|
| 68 |
+
"alignment_loss": 0.869904,
|
| 69 |
+
"epoch": 0.031185031185031187,
|
| 70 |
+
"grad_norm": 497.1650085449219,
|
| 71 |
"learning_rate": 9.872329338196979e-06,
|
| 72 |
+
"loss": 47.8751,
|
| 73 |
+
"mean_token_accuracy": 0.8671012565493583,
|
| 74 |
"num_tokens": 6535542.0,
|
| 75 |
"step": 60,
|
| 76 |
+
"teacher_ce_loss": 5.361066
|
| 77 |
},
|
| 78 |
{
|
| 79 |
+
"alignment_loss": 0.882581,
|
| 80 |
+
"epoch": 0.036382536382536385,
|
| 81 |
+
"grad_norm": 504.9124755859375,
|
| 82 |
"learning_rate": 9.84627410109432e-06,
|
| 83 |
+
"loss": 45.7668,
|
| 84 |
+
"mean_token_accuracy": 0.8666120573878289,
|
| 85 |
"num_tokens": 7587360.0,
|
| 86 |
"step": 70,
|
| 87 |
+
"teacher_ce_loss": 5.355153
|
| 88 |
},
|
| 89 |
{
|
| 90 |
+
"alignment_loss": 0.882264,
|
| 91 |
+
"epoch": 0.04158004158004158,
|
| 92 |
+
"grad_norm": 413.635498046875,
|
| 93 |
"learning_rate": 9.820218863991662e-06,
|
| 94 |
+
"loss": 43.0118,
|
| 95 |
+
"mean_token_accuracy": 0.868953762203455,
|
| 96 |
"num_tokens": 8677343.0,
|
| 97 |
"step": 80,
|
| 98 |
+
"teacher_ce_loss": 4.836935
|
| 99 |
},
|
| 100 |
{
|
| 101 |
+
"alignment_loss": 0.846325,
|
| 102 |
+
"epoch": 0.04677754677754678,
|
| 103 |
+
"grad_norm": 446.23834228515625,
|
| 104 |
"learning_rate": 9.794163626889005e-06,
|
| 105 |
+
"loss": 42.9171,
|
| 106 |
+
"mean_token_accuracy": 0.8668660171329975,
|
| 107 |
"num_tokens": 9750881.0,
|
| 108 |
"step": 90,
|
| 109 |
+
"teacher_ce_loss": 5.504833
|
| 110 |
},
|
| 111 |
{
|
| 112 |
+
"alignment_loss": 0.849805,
|
| 113 |
+
"epoch": 0.05197505197505198,
|
| 114 |
+
"grad_norm": 515.4571533203125,
|
| 115 |
"learning_rate": 9.768108389786348e-06,
|
| 116 |
+
"loss": 47.1901,
|
| 117 |
+
"mean_token_accuracy": 0.8482094191014766,
|
| 118 |
"num_tokens": 10855717.0,
|
| 119 |
"step": 100,
|
| 120 |
+
"teacher_ce_loss": 5.504737
|
| 121 |
}
|
| 122 |
],
|
| 123 |
"logging_steps": 10,
|
|
|
|
| 137 |
"attributes": {}
|
| 138 |
}
|
| 139 |
},
|
| 140 |
+
"total_flos": 5.045144795228406e+17,
|
| 141 |
"train_batch_size": 1,
|
| 142 |
"trial_name": null,
|
| 143 |
"trial_params": null
|
size-250k/checkpoint-100/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 10570
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca14d8a6b91e7eb5dce22e1701967a5af6244e13996a2c7240340eeaaeed1e14
|
| 3 |
size 10570
|