shulijia's picture
Training in progress, step 267, checkpoint
0c5fdec verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9990645463049579,
"eval_steps": 100,
"global_step": 267,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.037418147801683815,
"grad_norm": 3.605011224746704,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.3411,
"mean_token_accuracy": 0.7953277874737978,
"num_tokens": 40960.0,
"step": 10
},
{
"epoch": 0.07483629560336763,
"grad_norm": 3.451061964035034,
"learning_rate": 3.518518518518519e-05,
"loss": 0.1369,
"mean_token_accuracy": 0.9616682939231396,
"num_tokens": 81920.0,
"step": 20
},
{
"epoch": 0.11225444340505145,
"grad_norm": 2.5540621280670166,
"learning_rate": 4.958333333333334e-05,
"loss": 0.1633,
"mean_token_accuracy": 0.954109588265419,
"num_tokens": 122880.0,
"step": 30
},
{
"epoch": 0.14967259120673526,
"grad_norm": 4.433518886566162,
"learning_rate": 4.75e-05,
"loss": 0.1509,
"mean_token_accuracy": 0.9584148712456226,
"num_tokens": 163840.0,
"step": 40
},
{
"epoch": 0.18709073900841908,
"grad_norm": 2.500624179840088,
"learning_rate": 4.541666666666667e-05,
"loss": 0.1638,
"mean_token_accuracy": 0.9535224996507168,
"num_tokens": 204800.0,
"step": 50
},
{
"epoch": 0.2245088868101029,
"grad_norm": 1.5342501401901245,
"learning_rate": 4.3333333333333334e-05,
"loss": 0.1634,
"mean_token_accuracy": 0.9534246526658535,
"num_tokens": 245760.0,
"step": 60
},
{
"epoch": 0.26192703461178674,
"grad_norm": 1.9312411546707153,
"learning_rate": 4.125e-05,
"loss": 0.141,
"mean_token_accuracy": 0.9598091915249825,
"num_tokens": 286720.0,
"step": 70
},
{
"epoch": 0.2993451824134705,
"grad_norm": 1.3097331523895264,
"learning_rate": 3.9166666666666665e-05,
"loss": 0.1598,
"mean_token_accuracy": 0.954647745192051,
"num_tokens": 327680.0,
"step": 80
},
{
"epoch": 0.33676333021515437,
"grad_norm": 1.5079143047332764,
"learning_rate": 3.708333333333334e-05,
"loss": 0.142,
"mean_token_accuracy": 0.958488255739212,
"num_tokens": 368640.0,
"step": 90
},
{
"epoch": 0.37418147801683815,
"grad_norm": 1.202209234237671,
"learning_rate": 3.5e-05,
"loss": 0.1459,
"mean_token_accuracy": 0.9584882512688637,
"num_tokens": 409600.0,
"step": 100
},
{
"epoch": 0.411599625818522,
"grad_norm": 1.2866814136505127,
"learning_rate": 3.291666666666667e-05,
"loss": 0.1466,
"mean_token_accuracy": 0.9581213280558586,
"num_tokens": 450560.0,
"step": 110
},
{
"epoch": 0.4490177736202058,
"grad_norm": 1.4433410167694092,
"learning_rate": 3.0833333333333335e-05,
"loss": 0.1553,
"mean_token_accuracy": 0.957167312502861,
"num_tokens": 491520.0,
"step": 120
},
{
"epoch": 0.4864359214218896,
"grad_norm": 1.7865726947784424,
"learning_rate": 2.8749999999999997e-05,
"loss": 0.1292,
"mean_token_accuracy": 0.9638209342956543,
"num_tokens": 532480.0,
"step": 130
},
{
"epoch": 0.5238540692235735,
"grad_norm": 1.4343348741531372,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.1454,
"mean_token_accuracy": 0.9584393322467804,
"num_tokens": 573440.0,
"step": 140
},
{
"epoch": 0.5612722170252572,
"grad_norm": 1.2116364240646362,
"learning_rate": 2.4583333333333332e-05,
"loss": 0.1336,
"mean_token_accuracy": 0.9633561626076699,
"num_tokens": 614400.0,
"step": 150
},
{
"epoch": 0.598690364826941,
"grad_norm": 1.7633224725723267,
"learning_rate": 2.25e-05,
"loss": 0.1319,
"mean_token_accuracy": 0.9626712270081044,
"num_tokens": 655360.0,
"step": 160
},
{
"epoch": 0.6361085126286249,
"grad_norm": 1.3809901475906372,
"learning_rate": 2.0416666666666667e-05,
"loss": 0.129,
"mean_token_accuracy": 0.963209392875433,
"num_tokens": 696320.0,
"step": 170
},
{
"epoch": 0.6735266604303087,
"grad_norm": 1.4324010610580444,
"learning_rate": 1.8333333333333333e-05,
"loss": 0.1278,
"mean_token_accuracy": 0.9635273940861225,
"num_tokens": 737280.0,
"step": 180
},
{
"epoch": 0.7109448082319925,
"grad_norm": 1.3217487335205078,
"learning_rate": 1.6250000000000002e-05,
"loss": 0.2005,
"mean_token_accuracy": 0.9594178041443229,
"num_tokens": 778240.0,
"step": 190
},
{
"epoch": 0.7483629560336763,
"grad_norm": 1.0903115272521973,
"learning_rate": 1.4166666666666668e-05,
"loss": 0.1028,
"mean_token_accuracy": 0.9706457890570164,
"num_tokens": 819200.0,
"step": 200
},
{
"epoch": 0.7857811038353602,
"grad_norm": 1.6808840036392212,
"learning_rate": 1.2083333333333333e-05,
"loss": 0.108,
"mean_token_accuracy": 0.9681751407682896,
"num_tokens": 860160.0,
"step": 210
},
{
"epoch": 0.823199251637044,
"grad_norm": 1.3859535455703735,
"learning_rate": 1e-05,
"loss": 0.1081,
"mean_token_accuracy": 0.9688111506402493,
"num_tokens": 901120.0,
"step": 220
},
{
"epoch": 0.8606173994387278,
"grad_norm": 0.9109633564949036,
"learning_rate": 7.916666666666667e-06,
"loss": 0.1121,
"mean_token_accuracy": 0.9680039115250111,
"num_tokens": 942080.0,
"step": 230
},
{
"epoch": 0.8980355472404116,
"grad_norm": 1.237545132637024,
"learning_rate": 5.833333333333334e-06,
"loss": 0.1042,
"mean_token_accuracy": 0.9695939309895039,
"num_tokens": 983040.0,
"step": 240
},
{
"epoch": 0.9354536950420954,
"grad_norm": 1.4165068864822388,
"learning_rate": 3.75e-06,
"loss": 0.0941,
"mean_token_accuracy": 0.9729452036321163,
"num_tokens": 1024000.0,
"step": 250
},
{
"epoch": 0.9728718428437793,
"grad_norm": 1.478573203086853,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0988,
"mean_token_accuracy": 0.9710616409778595,
"num_tokens": 1064960.0,
"step": 260
}
],
"logging_steps": 10,
"max_steps": 267,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2890255829041152.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}