Training in progress, step 13500, checkpoint
Browse files- last-checkpoint/adapter_model.safetensors +1 -1
- last-checkpoint/global_step13500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step13500/mp_rank_00_model_states.pt +3 -0
- last-checkpoint/latest +1 -1
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/trainer_state.json +206 -6
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 12017472
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1311c9a69e5604b2001ceda10c832e98119547c0e33d82afe5989665de514c3e
|
| 3 |
size 12017472
|
last-checkpoint/global_step13500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e5b7fd370b88c57ef6538390266dc426bccc73daf55376f38bfe8614c792f79
|
| 3 |
+
size 71982309
|
last-checkpoint/global_step13500/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d2751dde639a5d12f58ec51183d6aef63115a33b7c76078f4a229de16b57b14e
|
| 3 |
+
size 146356645
|
last-checkpoint/latest
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
|
|
|
|
| 1 |
+
global_step13500
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14709
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:76d48473cf121167cd401e1842d406e7e5686b60208f0336b7552832934ccc04
|
| 3 |
size 14709
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 0.
|
| 4 |
-
"best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-
|
| 5 |
-
"epoch": 9.
|
| 6 |
"eval_steps": 250,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -5217,6 +5217,206 @@
|
|
| 5217 |
"eval_samples_per_second": 42.989,
|
| 5218 |
"eval_steps_per_second": 5.38,
|
| 5219 |
"step": 13000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5220 |
}
|
| 5221 |
],
|
| 5222 |
"logging_steps": 25,
|
|
@@ -5236,7 +5436,7 @@
|
|
| 5236 |
"attributes": {}
|
| 5237 |
}
|
| 5238 |
},
|
| 5239 |
-
"total_flos": 7.
|
| 5240 |
"train_batch_size": 4,
|
| 5241 |
"trial_name": null,
|
| 5242 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 13500,
|
| 3 |
+
"best_metric": 0.5390045046806335,
|
| 4 |
+
"best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-13500",
|
| 5 |
+
"epoch": 9.811488820214507,
|
| 6 |
"eval_steps": 250,
|
| 7 |
+
"global_step": 13500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 5217 |
"eval_samples_per_second": 42.989,
|
| 5218 |
"eval_steps_per_second": 5.38,
|
| 5219 |
"step": 13000
|
| 5220 |
+
},
|
| 5221 |
+
{
|
| 5222 |
+
"epoch": 9.466097073259407,
|
| 5223 |
+
"grad_norm": 0.8422950506210327,
|
| 5224 |
+
"learning_rate": 8.584715779344832e-06,
|
| 5225 |
+
"loss": 0.5187,
|
| 5226 |
+
"mean_token_accuracy": 0.8383197170495987,
|
| 5227 |
+
"num_tokens": 286863014.0,
|
| 5228 |
+
"step": 13025
|
| 5229 |
+
},
|
| 5230 |
+
{
|
| 5231 |
+
"epoch": 9.484275586257045,
|
| 5232 |
+
"grad_norm": 0.7978519797325134,
|
| 5233 |
+
"learning_rate": 8.466584529700203e-06,
|
| 5234 |
+
"loss": 0.515,
|
| 5235 |
+
"mean_token_accuracy": 0.839700258076191,
|
| 5236 |
+
"num_tokens": 287406511.0,
|
| 5237 |
+
"step": 13050
|
| 5238 |
+
},
|
| 5239 |
+
{
|
| 5240 |
+
"epoch": 9.502454099254681,
|
| 5241 |
+
"grad_norm": 0.8645240664482117,
|
| 5242 |
+
"learning_rate": 8.349175429248554e-06,
|
| 5243 |
+
"loss": 0.5238,
|
| 5244 |
+
"mean_token_accuracy": 0.8366273155808449,
|
| 5245 |
+
"num_tokens": 287962024.0,
|
| 5246 |
+
"step": 13075
|
| 5247 |
+
},
|
| 5248 |
+
{
|
| 5249 |
+
"epoch": 9.520632612252317,
|
| 5250 |
+
"grad_norm": 0.8597573041915894,
|
| 5251 |
+
"learning_rate": 8.232491166784782e-06,
|
| 5252 |
+
"loss": 0.5159,
|
| 5253 |
+
"mean_token_accuracy": 0.8379004463553429,
|
| 5254 |
+
"num_tokens": 288527560.0,
|
| 5255 |
+
"step": 13100
|
| 5256 |
+
},
|
| 5257 |
+
{
|
| 5258 |
+
"epoch": 9.538811125249955,
|
| 5259 |
+
"grad_norm": 0.8828545808792114,
|
| 5260 |
+
"learning_rate": 8.116534414504232e-06,
|
| 5261 |
+
"loss": 0.5118,
|
| 5262 |
+
"mean_token_accuracy": 0.8406583109498024,
|
| 5263 |
+
"num_tokens": 289060843.0,
|
| 5264 |
+
"step": 13125
|
| 5265 |
+
},
|
| 5266 |
+
{
|
| 5267 |
+
"epoch": 9.556989638247591,
|
| 5268 |
+
"grad_norm": 0.8724490404129028,
|
| 5269 |
+
"learning_rate": 8.00130782794148e-06,
|
| 5270 |
+
"loss": 0.5239,
|
| 5271 |
+
"mean_token_accuracy": 0.8369137379527092,
|
| 5272 |
+
"num_tokens": 289603965.0,
|
| 5273 |
+
"step": 13150
|
| 5274 |
+
},
|
| 5275 |
+
{
|
| 5276 |
+
"epoch": 9.575168151245228,
|
| 5277 |
+
"grad_norm": 0.8818336129188538,
|
| 5278 |
+
"learning_rate": 7.886814045909515e-06,
|
| 5279 |
+
"loss": 0.5244,
|
| 5280 |
+
"mean_token_accuracy": 0.8372589892148972,
|
| 5281 |
+
"num_tokens": 290146905.0,
|
| 5282 |
+
"step": 13175
|
| 5283 |
+
},
|
| 5284 |
+
{
|
| 5285 |
+
"epoch": 9.593346664242866,
|
| 5286 |
+
"grad_norm": 0.9488387703895569,
|
| 5287 |
+
"learning_rate": 7.773055690439326e-06,
|
| 5288 |
+
"loss": 0.5131,
|
| 5289 |
+
"mean_token_accuracy": 0.8400958624482154,
|
| 5290 |
+
"num_tokens": 290702107.0,
|
| 5291 |
+
"step": 13200
|
| 5292 |
+
},
|
| 5293 |
+
{
|
| 5294 |
+
"epoch": 9.611525177240502,
|
| 5295 |
+
"grad_norm": 0.8438289165496826,
|
| 5296 |
+
"learning_rate": 7.66003536671982e-06,
|
| 5297 |
+
"loss": 0.5131,
|
| 5298 |
+
"mean_token_accuracy": 0.8400224041938782,
|
| 5299 |
+
"num_tokens": 291241779.0,
|
| 5300 |
+
"step": 13225
|
| 5301 |
+
},
|
| 5302 |
+
{
|
| 5303 |
+
"epoch": 9.629703690238138,
|
| 5304 |
+
"grad_norm": 0.8664806485176086,
|
| 5305 |
+
"learning_rate": 7.547755663038212e-06,
|
| 5306 |
+
"loss": 0.5107,
|
| 5307 |
+
"mean_token_accuracy": 0.8407774633169174,
|
| 5308 |
+
"num_tokens": 291796633.0,
|
| 5309 |
+
"step": 13250
|
| 5310 |
+
},
|
| 5311 |
+
{
|
| 5312 |
+
"epoch": 9.629703690238138,
|
| 5313 |
+
"eval_loss": 0.5401590466499329,
|
| 5314 |
+
"eval_mean_token_accuracy": 0.8320043968414169,
|
| 5315 |
+
"eval_num_tokens": 291796633.0,
|
| 5316 |
+
"eval_runtime": 112.5867,
|
| 5317 |
+
"eval_samples_per_second": 43.433,
|
| 5318 |
+
"eval_steps_per_second": 5.436,
|
| 5319 |
+
"step": 13250
|
| 5320 |
+
},
|
| 5321 |
+
{
|
| 5322 |
+
"epoch": 9.647882203235776,
|
| 5323 |
+
"grad_norm": 0.8282386064529419,
|
| 5324 |
+
"learning_rate": 7.436219150720698e-06,
|
| 5325 |
+
"loss": 0.5155,
|
| 5326 |
+
"mean_token_accuracy": 0.84046880453825,
|
| 5327 |
+
"num_tokens": 292340922.0,
|
| 5328 |
+
"step": 13275
|
| 5329 |
+
},
|
| 5330 |
+
{
|
| 5331 |
+
"epoch": 9.666060716233412,
|
| 5332 |
+
"grad_norm": 0.872983455657959,
|
| 5333 |
+
"learning_rate": 7.325428384073592e-06,
|
| 5334 |
+
"loss": 0.5231,
|
| 5335 |
+
"mean_token_accuracy": 0.8363588589429856,
|
| 5336 |
+
"num_tokens": 292895625.0,
|
| 5337 |
+
"step": 13300
|
| 5338 |
+
},
|
| 5339 |
+
{
|
| 5340 |
+
"epoch": 9.684239229231048,
|
| 5341 |
+
"grad_norm": 0.8708329200744629,
|
| 5342 |
+
"learning_rate": 7.215385900324832e-06,
|
| 5343 |
+
"loss": 0.5144,
|
| 5344 |
+
"mean_token_accuracy": 0.8397229793667793,
|
| 5345 |
+
"num_tokens": 293448542.0,
|
| 5346 |
+
"step": 13325
|
| 5347 |
+
},
|
| 5348 |
+
{
|
| 5349 |
+
"epoch": 9.702417742228686,
|
| 5350 |
+
"grad_norm": 0.8467702269554138,
|
| 5351 |
+
"learning_rate": 7.106094219565869e-06,
|
| 5352 |
+
"loss": 0.5171,
|
| 5353 |
+
"mean_token_accuracy": 0.8385615301132202,
|
| 5354 |
+
"num_tokens": 294000478.0,
|
| 5355 |
+
"step": 13350
|
| 5356 |
+
},
|
| 5357 |
+
{
|
| 5358 |
+
"epoch": 9.720596255226322,
|
| 5359 |
+
"grad_norm": 0.8231089115142822,
|
| 5360 |
+
"learning_rate": 6.9975558446939665e-06,
|
| 5361 |
+
"loss": 0.5132,
|
| 5362 |
+
"mean_token_accuracy": 0.8399266812205315,
|
| 5363 |
+
"num_tokens": 294557047.0,
|
| 5364 |
+
"step": 13375
|
| 5365 |
+
},
|
| 5366 |
+
{
|
| 5367 |
+
"epoch": 9.738774768223958,
|
| 5368 |
+
"grad_norm": 0.9206160306930542,
|
| 5369 |
+
"learning_rate": 6.8897732613548526e-06,
|
| 5370 |
+
"loss": 0.5096,
|
| 5371 |
+
"mean_token_accuracy": 0.8407321670651435,
|
| 5372 |
+
"num_tokens": 295104353.0,
|
| 5373 |
+
"step": 13400
|
| 5374 |
+
},
|
| 5375 |
+
{
|
| 5376 |
+
"epoch": 9.756953281221596,
|
| 5377 |
+
"grad_norm": 0.8946228623390198,
|
| 5378 |
+
"learning_rate": 6.782748937885842e-06,
|
| 5379 |
+
"loss": 0.5157,
|
| 5380 |
+
"mean_token_accuracy": 0.8397801405191422,
|
| 5381 |
+
"num_tokens": 295655574.0,
|
| 5382 |
+
"step": 13425
|
| 5383 |
+
},
|
| 5384 |
+
{
|
| 5385 |
+
"epoch": 9.775131794219233,
|
| 5386 |
+
"grad_norm": 0.7474434971809387,
|
| 5387 |
+
"learning_rate": 6.6764853252592585e-06,
|
| 5388 |
+
"loss": 0.5217,
|
| 5389 |
+
"mean_token_accuracy": 0.8362213695049285,
|
| 5390 |
+
"num_tokens": 296223611.0,
|
| 5391 |
+
"step": 13450
|
| 5392 |
+
},
|
| 5393 |
+
{
|
| 5394 |
+
"epoch": 9.79331030721687,
|
| 5395 |
+
"grad_norm": 0.8649734258651733,
|
| 5396 |
+
"learning_rate": 6.5709848570263324e-06,
|
| 5397 |
+
"loss": 0.5151,
|
| 5398 |
+
"mean_token_accuracy": 0.838211068212986,
|
| 5399 |
+
"num_tokens": 296787088.0,
|
| 5400 |
+
"step": 13475
|
| 5401 |
+
},
|
| 5402 |
+
{
|
| 5403 |
+
"epoch": 9.811488820214507,
|
| 5404 |
+
"grad_norm": 0.7948579788208008,
|
| 5405 |
+
"learning_rate": 6.466249949261474e-06,
|
| 5406 |
+
"loss": 0.5165,
|
| 5407 |
+
"mean_token_accuracy": 0.8387623742222786,
|
| 5408 |
+
"num_tokens": 297344033.0,
|
| 5409 |
+
"step": 13500
|
| 5410 |
+
},
|
| 5411 |
+
{
|
| 5412 |
+
"epoch": 9.811488820214507,
|
| 5413 |
+
"eval_loss": 0.5390045046806335,
|
| 5414 |
+
"eval_mean_token_accuracy": 0.8321733054966708,
|
| 5415 |
+
"eval_num_tokens": 297344033.0,
|
| 5416 |
+
"eval_runtime": 113.1601,
|
| 5417 |
+
"eval_samples_per_second": 43.213,
|
| 5418 |
+
"eval_steps_per_second": 5.408,
|
| 5419 |
+
"step": 13500
|
| 5420 |
}
|
| 5421 |
],
|
| 5422 |
"logging_steps": 25,
|
|
|
|
| 5436 |
"attributes": {}
|
| 5437 |
}
|
| 5438 |
},
|
| 5439 |
+
"total_flos": 7.499312044798116e+17,
|
| 5440 |
"train_batch_size": 4,
|
| 5441 |
"trial_name": null,
|
| 5442 |
"trial_params": null
|