Training in progress, step 200, checkpoint
Browse files
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 201880976
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aeda7b371ff2e1752bf1aff362fa660259c343ff41adf1ebf4a35769f07ce5e5
|
| 3 |
size 201880976
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 102771467
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21b4cbbc7c7e47ac572d0611695777730d7795b30ce4422d923e37f4c43b2d15
|
| 3 |
size 102771467
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:de2f6fd8a366989100bcb570e1fd69da9deb6a29ce5bba1d2c8889118062705c
|
| 3 |
size 14645
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:63ca6d6866d748b90a4b2173e0ca24db709af27b45b8531207b094cb85539103
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 0.
|
| 4 |
-
"best_model_checkpoint":
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 50,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -134,6 +134,132 @@
|
|
| 134 |
"eval_samples_per_second": 27.224,
|
| 135 |
"eval_steps_per_second": 1.703,
|
| 136 |
"step": 100
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
}
|
| 138 |
],
|
| 139 |
"logging_steps": 10,
|
|
@@ -153,7 +279,7 @@
|
|
| 153 |
"attributes": {}
|
| 154 |
}
|
| 155 |
},
|
| 156 |
-
"total_flos":
|
| 157 |
"train_batch_size": 16,
|
| 158 |
"trial_name": null,
|
| 159 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 200,
|
| 3 |
+
"best_metric": 0.7492690359164101,
|
| 4 |
+
"best_model_checkpoint": "./qwen2.5-7b-sft-qlora/checkpoint-200",
|
| 5 |
+
"epoch": 0.935672514619883,
|
| 6 |
"eval_steps": 50,
|
| 7 |
+
"global_step": 200,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 134 |
"eval_samples_per_second": 27.224,
|
| 135 |
"eval_steps_per_second": 1.703,
|
| 136 |
"step": 100
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"entropy": 0.6501711800694465,
|
| 140 |
+
"epoch": 0.5146198830409356,
|
| 141 |
+
"grad_norm": 0.1625615507364273,
|
| 142 |
+
"learning_rate": 0.00018584487936018661,
|
| 143 |
+
"loss": 0.6484,
|
| 144 |
+
"mean_token_accuracy": 0.8180312633514404,
|
| 145 |
+
"num_tokens": 2659238.0,
|
| 146 |
+
"step": 110
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"entropy": 0.6405581876635551,
|
| 150 |
+
"epoch": 0.5614035087719298,
|
| 151 |
+
"grad_norm": 0.17417997121810913,
|
| 152 |
+
"learning_rate": 0.00018137863234250347,
|
| 153 |
+
"loss": 0.6404,
|
| 154 |
+
"mean_token_accuracy": 0.819054339826107,
|
| 155 |
+
"num_tokens": 2897816.0,
|
| 156 |
+
"step": 120
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"entropy": 0.6380819544196129,
|
| 160 |
+
"epoch": 0.6081871345029239,
|
| 161 |
+
"grad_norm": 0.17349691689014435,
|
| 162 |
+
"learning_rate": 0.00017637082395311024,
|
| 163 |
+
"loss": 0.6366,
|
| 164 |
+
"mean_token_accuracy": 0.820624266564846,
|
| 165 |
+
"num_tokens": 3136294.0,
|
| 166 |
+
"step": 130
|
| 167 |
+
},
|
| 168 |
+
{
|
| 169 |
+
"entropy": 0.6500405013561249,
|
| 170 |
+
"epoch": 0.6549707602339181,
|
| 171 |
+
"grad_norm": 0.18412715196609497,
|
| 172 |
+
"learning_rate": 0.00017085478033060806,
|
| 173 |
+
"loss": 0.6426,
|
| 174 |
+
"mean_token_accuracy": 0.8185427248477936,
|
| 175 |
+
"num_tokens": 3375202.0,
|
| 176 |
+
"step": 140
|
| 177 |
+
},
|
| 178 |
+
{
|
| 179 |
+
"entropy": 0.6269903033971786,
|
| 180 |
+
"epoch": 0.7017543859649122,
|
| 181 |
+
"grad_norm": 0.1778886765241623,
|
| 182 |
+
"learning_rate": 0.00016486720983522156,
|
| 183 |
+
"loss": 0.6279,
|
| 184 |
+
"mean_token_accuracy": 0.8219256103038788,
|
| 185 |
+
"num_tokens": 3614721.0,
|
| 186 |
+
"step": 150
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"epoch": 0.7017543859649122,
|
| 190 |
+
"eval_bleu": 61.15829556167586,
|
| 191 |
+
"eval_entropy": 0.5959388177703928,
|
| 192 |
+
"eval_loss": 0.6073054671287537,
|
| 193 |
+
"eval_mean_token_accuracy": 0.8267559442255232,
|
| 194 |
+
"eval_num_tokens": 3614721.0,
|
| 195 |
+
"eval_rougeL": 0.7485533859740823,
|
| 196 |
+
"eval_runtime": 63.4672,
|
| 197 |
+
"eval_samples_per_second": 27.195,
|
| 198 |
+
"eval_steps_per_second": 1.702,
|
| 199 |
+
"step": 150
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"entropy": 0.6273025006055832,
|
| 203 |
+
"epoch": 0.7485380116959064,
|
| 204 |
+
"grad_norm": 0.17554914951324463,
|
| 205 |
+
"learning_rate": 0.000158447958760718,
|
| 206 |
+
"loss": 0.6235,
|
| 207 |
+
"mean_token_accuracy": 0.8232012897729873,
|
| 208 |
+
"num_tokens": 3852615.0,
|
| 209 |
+
"step": 160
|
| 210 |
+
},
|
| 211 |
+
{
|
| 212 |
+
"entropy": 0.6264464437961579,
|
| 213 |
+
"epoch": 0.7953216374269005,
|
| 214 |
+
"grad_norm": 0.17685498297214508,
|
| 215 |
+
"learning_rate": 0.0001516397461638962,
|
| 216 |
+
"loss": 0.6223,
|
| 217 |
+
"mean_token_accuracy": 0.8228656515479088,
|
| 218 |
+
"num_tokens": 4085589.0,
|
| 219 |
+
"step": 170
|
| 220 |
+
},
|
| 221 |
+
{
|
| 222 |
+
"entropy": 0.623998960852623,
|
| 223 |
+
"epoch": 0.8421052631578947,
|
| 224 |
+
"grad_norm": 0.1789834052324295,
|
| 225 |
+
"learning_rate": 0.0001444878795763121,
|
| 226 |
+
"loss": 0.6191,
|
| 227 |
+
"mean_token_accuracy": 0.8224357396364212,
|
| 228 |
+
"num_tokens": 4327626.0,
|
| 229 |
+
"step": 180
|
| 230 |
+
},
|
| 231 |
+
{
|
| 232 |
+
"entropy": 0.6093558698892594,
|
| 233 |
+
"epoch": 0.8888888888888888,
|
| 234 |
+
"grad_norm": 0.17523610591888428,
|
| 235 |
+
"learning_rate": 0.00013703995349013113,
|
| 236 |
+
"loss": 0.61,
|
| 237 |
+
"mean_token_accuracy": 0.8264237254858017,
|
| 238 |
+
"num_tokens": 4570278.0,
|
| 239 |
+
"step": 190
|
| 240 |
+
},
|
| 241 |
+
{
|
| 242 |
+
"entropy": 0.6039168611168861,
|
| 243 |
+
"epoch": 0.935672514619883,
|
| 244 |
+
"grad_norm": 0.18692275881767273,
|
| 245 |
+
"learning_rate": 0.00012934553262463548,
|
| 246 |
+
"loss": 0.6032,
|
| 247 |
+
"mean_token_accuracy": 0.828160648047924,
|
| 248 |
+
"num_tokens": 4806172.0,
|
| 249 |
+
"step": 200
|
| 250 |
+
},
|
| 251 |
+
{
|
| 252 |
+
"epoch": 0.935672514619883,
|
| 253 |
+
"eval_bleu": 60.260312927941236,
|
| 254 |
+
"eval_entropy": 0.5826076859677279,
|
| 255 |
+
"eval_loss": 0.6021928787231445,
|
| 256 |
+
"eval_mean_token_accuracy": 0.8273030961001361,
|
| 257 |
+
"eval_num_tokens": 4806172.0,
|
| 258 |
+
"eval_rougeL": 0.7492690359164101,
|
| 259 |
+
"eval_runtime": 63.3853,
|
| 260 |
+
"eval_samples_per_second": 27.23,
|
| 261 |
+
"eval_steps_per_second": 1.704,
|
| 262 |
+
"step": 200
|
| 263 |
}
|
| 264 |
],
|
| 265 |
"logging_steps": 10,
|
|
|
|
| 279 |
"attributes": {}
|
| 280 |
}
|
| 281 |
},
|
| 282 |
+
"total_flos": 4.157882340289413e+17,
|
| 283 |
"train_batch_size": 16,
|
| 284 |
"trial_name": null,
|
| 285 |
"trial_params": null
|