guava-05-14 / trainer_state.json
AIcell's picture
Initial upload: v6 SFT checkpoint-105 (best, val loss 0.5393)
0d16021 verified
{
"best_global_step": 105,
"best_metric": 0.53931481,
"best_model_checkpoint": "/workspace/xiruili_temporary/guava/runs/qwen35-4b-train-v6-full-r16-lr2e-5/v1-20260513-234931/checkpoint-105",
"epoch": 3.0,
"eval_steps": 25,
"global_step": 105,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.029197080291970802,
"grad_norm": 1.995608925819397,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.0870462656021118,
"step": 1,
"token_acc": 0.7222708272577039
},
{
"epoch": 0.145985401459854,
"grad_norm": 1.630089521408081,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.0861663818359375,
"step": 5,
"token_acc": 0.7234353454485584
},
{
"epoch": 0.291970802919708,
"grad_norm": 0.810945987701416,
"learning_rate": 1.9919548128307954e-05,
"loss": 1.0100549697875976,
"step": 10,
"token_acc": 0.7320401812893188
},
{
"epoch": 0.43795620437956206,
"grad_norm": 0.6168099641799927,
"learning_rate": 1.9594929736144978e-05,
"loss": 0.8932361602783203,
"step": 15,
"token_acc": 0.752801364142539
},
{
"epoch": 0.583941605839416,
"grad_norm": 0.5895264744758606,
"learning_rate": 1.9029265382866216e-05,
"loss": 0.8415294647216797,
"step": 20,
"token_acc": 0.7655036474215341
},
{
"epoch": 0.7299270072992701,
"grad_norm": 0.5221685767173767,
"learning_rate": 1.8236765814298328e-05,
"loss": 0.7946267127990723,
"step": 25,
"token_acc": 0.7786007141912445
},
{
"epoch": 0.7299270072992701,
"eval_loss": 0.7223935127258301,
"eval_runtime": 19.6664,
"eval_samples_per_second": 1.424,
"eval_steps_per_second": 0.712,
"eval_token_acc": 0.7937243620478398,
"step": 25
},
{
"epoch": 0.8759124087591241,
"grad_norm": 0.505389392375946,
"learning_rate": 1.72373403810507e-05,
"loss": 0.727280855178833,
"step": 30,
"token_acc": 0.7940088439236739
},
{
"epoch": 1.0,
"grad_norm": 0.8667115569114685,
"learning_rate": 1.6056096871376667e-05,
"loss": 0.6953943252563477,
"step": 35,
"token_acc": 0.8008854009457692
},
{
"epoch": 1.145985401459854,
"grad_norm": 0.4732973575592041,
"learning_rate": 1.472271074772683e-05,
"loss": 0.6909239768981934,
"step": 40,
"token_acc": 0.802294099742074
},
{
"epoch": 1.2919708029197081,
"grad_norm": 0.4912143349647522,
"learning_rate": 1.3270679633174219e-05,
"loss": 0.6232150554656982,
"step": 45,
"token_acc": 0.8173081972378302
},
{
"epoch": 1.437956204379562,
"grad_norm": 0.4965362846851349,
"learning_rate": 1.1736481776669307e-05,
"loss": 0.6203184127807617,
"step": 50,
"token_acc": 0.817360592270597
},
{
"epoch": 1.437956204379562,
"eval_loss": 0.5858569741249084,
"eval_runtime": 19.4944,
"eval_samples_per_second": 1.436,
"eval_steps_per_second": 0.718,
"eval_token_acc": 0.8254754674764264,
"step": 50
},
{
"epoch": 1.583941605839416,
"grad_norm": 0.5230202078819275,
"learning_rate": 1.015865963834808e-05,
"loss": 0.6270622253417969,
"step": 55,
"token_acc": 0.8175454142667258
},
{
"epoch": 1.72992700729927,
"grad_norm": 0.5222014784812927,
"learning_rate": 8.576851617267151e-06,
"loss": 0.5988405227661133,
"step": 60,
"token_acc": 0.8216454622561493
},
{
"epoch": 1.8759124087591241,
"grad_norm": 0.5679988265037537,
"learning_rate": 7.0307962467172555e-06,
"loss": 0.6027673721313477,
"step": 65,
"token_acc": 0.8212479748637636
},
{
"epoch": 2.0,
"grad_norm": 0.9797302484512329,
"learning_rate": 5.559333873942259e-06,
"loss": 0.5929806709289551,
"step": 70,
"token_acc": 0.828269567391848
},
{
"epoch": 2.145985401459854,
"grad_norm": 0.5088374018669128,
"learning_rate": 4.19943090428802e-06,
"loss": 0.5630727767944336,
"step": 75,
"token_acc": 0.830645705417988
},
{
"epoch": 2.145985401459854,
"eval_loss": 0.5463234782218933,
"eval_runtime": 19.6522,
"eval_samples_per_second": 1.425,
"eval_steps_per_second": 0.712,
"eval_token_acc": 0.8345319908369293,
"step": 75
},
{
"epoch": 2.291970802919708,
"grad_norm": 0.49748751521110535,
"learning_rate": 2.9852511229367862e-06,
"loss": 0.5831465721130371,
"step": 80,
"token_acc": 0.8253881640954412
},
{
"epoch": 2.437956204379562,
"grad_norm": 0.5103219747543335,
"learning_rate": 1.947297424689414e-06,
"loss": 0.5736560821533203,
"step": 85,
"token_acc": 0.8275780189959294
},
{
"epoch": 2.5839416058394162,
"grad_norm": 0.5354544520378113,
"learning_rate": 1.1116455134507665e-06,
"loss": 0.5565204620361328,
"step": 90,
"token_acc": 0.8317895383059783
},
{
"epoch": 2.72992700729927,
"grad_norm": 0.5385181903839111,
"learning_rate": 4.992888225905467e-07,
"loss": 0.5768057823181152,
"step": 95,
"token_acc": 0.8267348161960575
},
{
"epoch": 2.875912408759124,
"grad_norm": 0.5486910343170166,
"learning_rate": 1.2561111323605714e-07,
"loss": 0.5681517601013184,
"step": 100,
"token_acc": 0.8298371468398327
},
{
"epoch": 2.875912408759124,
"eval_loss": 0.5395660400390625,
"eval_runtime": 19.736,
"eval_samples_per_second": 1.419,
"eval_steps_per_second": 0.709,
"eval_token_acc": 0.8356507378402855,
"step": 100
},
{
"epoch": 3.0,
"grad_norm": 1.13607656955719,
"learning_rate": 0.0,
"loss": 0.5709176063537598,
"step": 105,
"token_acc": 0.8310829760807724
},
{
"epoch": 3.0,
"eval_loss": 0.5393148064613342,
"eval_runtime": 19.8847,
"eval_samples_per_second": 1.408,
"eval_steps_per_second": 0.704,
"eval_token_acc": 0.835810558840765,
"step": 105
}
],
"logging_steps": 5,
"max_steps": 105,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.994522975314903e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}