phi-tiny-moe-math-lean-sft / trainer_state.json
rkumar1999's picture
Model save
eb73939 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 290,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.7135222218930721,
"epoch": 0.1386481802426343,
"grad_norm": 2.765625,
"learning_rate": 3.2758620689655175e-06,
"loss": 1.0037,
"mean_token_accuracy": 0.766643451154232,
"num_tokens": 6832690.0,
"step": 20
},
{
"entropy": 0.6983387872576714,
"epoch": 0.2772963604852686,
"grad_norm": 1.171875,
"learning_rate": 6.724137931034484e-06,
"loss": 0.7837,
"mean_token_accuracy": 0.7993580244481564,
"num_tokens": 13664933.0,
"step": 40
},
{
"entropy": 0.5203136764466763,
"epoch": 0.41594454072790293,
"grad_norm": 0.44140625,
"learning_rate": 9.999541586764836e-06,
"loss": 0.5293,
"mean_token_accuracy": 0.8474333696067333,
"num_tokens": 20500452.0,
"step": 60
},
{
"entropy": 0.4632941197603941,
"epoch": 0.5545927209705372,
"grad_norm": 0.34765625,
"learning_rate": 9.799195340909569e-06,
"loss": 0.4664,
"mean_token_accuracy": 0.8605481564998627,
"num_tokens": 27342335.0,
"step": 80
},
{
"entropy": 0.44385356418788435,
"epoch": 0.6932409012131716,
"grad_norm": 0.384765625,
"learning_rate": 9.248987682898576e-06,
"loss": 0.4448,
"mean_token_accuracy": 0.8655192881822587,
"num_tokens": 34182590.0,
"step": 100
},
{
"entropy": 0.44103220105171204,
"epoch": 0.8318890814558059,
"grad_norm": 0.341796875,
"learning_rate": 8.389028759232816e-06,
"loss": 0.4425,
"mean_token_accuracy": 0.8660077638924122,
"num_tokens": 41024570.0,
"step": 120
},
{
"entropy": 0.4328003875911236,
"epoch": 0.9705372616984402,
"grad_norm": 0.318359375,
"learning_rate": 7.2820095883138456e-06,
"loss": 0.4334,
"mean_token_accuracy": 0.8682045668363572,
"num_tokens": 47861377.0,
"step": 140
},
{
"entropy": 0.4284165660282234,
"epoch": 1.1039861351819757,
"grad_norm": 0.326171875,
"learning_rate": 6.008631884264387e-06,
"loss": 0.4289,
"mean_token_accuracy": 0.868948469688366,
"num_tokens": 54391813.0,
"step": 160
},
{
"entropy": 0.4236792534589767,
"epoch": 1.24263431542461,
"grad_norm": 0.341796875,
"learning_rate": 4.661724900761355e-06,
"loss": 0.4239,
"mean_token_accuracy": 0.8704770557582379,
"num_tokens": 61227970.0,
"step": 180
},
{
"entropy": 0.42442810237407685,
"epoch": 1.3812824956672443,
"grad_norm": 0.33984375,
"learning_rate": 3.3394781770539406e-06,
"loss": 0.4245,
"mean_token_accuracy": 0.8702129699289799,
"num_tokens": 68065726.0,
"step": 200
},
{
"entropy": 0.42482112273573874,
"epoch": 1.5199306759098787,
"grad_norm": 0.318359375,
"learning_rate": 2.138283519083281e-06,
"loss": 0.4249,
"mean_token_accuracy": 0.8700512439012528,
"num_tokens": 74903041.0,
"step": 220
},
{
"entropy": 0.4213219854980707,
"epoch": 1.658578856152513,
"grad_norm": 0.32421875,
"learning_rate": 1.145708035387177e-06,
"loss": 0.4219,
"mean_token_accuracy": 0.8707552805542946,
"num_tokens": 81743295.0,
"step": 240
},
{
"entropy": 0.422089908644557,
"epoch": 1.7972270363951472,
"grad_norm": 0.322265625,
"learning_rate": 4.341104935775442e-07,
"loss": 0.4229,
"mean_token_accuracy": 0.8708024740219116,
"num_tokens": 88577789.0,
"step": 260
},
{
"entropy": 0.4224289160221815,
"epoch": 1.9358752166377817,
"grad_norm": 0.322265625,
"learning_rate": 5.536636509891225e-08,
"loss": 0.4232,
"mean_token_accuracy": 0.8705480195581913,
"num_tokens": 95417722.0,
"step": 280
},
{
"entropy": 0.4213579764237275,
"epoch": 2.0,
"mean_token_accuracy": 0.8710838395196039,
"num_tokens": 98528536.0,
"step": 290,
"total_flos": 2.1561577524323942e+18,
"train_loss": 0.5024350297862086,
"train_runtime": 9862.6956,
"train_samples_per_second": 9.819,
"train_steps_per_second": 0.029
}
],
"logging_steps": 20,
"max_steps": 290,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1561577524323942e+18,
"train_batch_size": 84,
"trial_name": null,
"trial_params": null
}