{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0041928721174003,
"eval_steps": 500,
"global_step": 239,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.041928721174004195,
"grad_norm": 0.011030412279069424,
"learning_rate": 0.00025,
"loss": 11.9318,
"step": 5
},
{
"epoch": 0.08385744234800839,
"grad_norm": 0.014100322499871254,
"learning_rate": 0.00024465811965811965,
"loss": 11.9305,
"step": 10
},
{
"epoch": 0.12578616352201258,
"grad_norm": 0.017396269366145134,
"learning_rate": 0.00023931623931623932,
"loss": 11.9291,
"step": 15
},
{
"epoch": 0.16771488469601678,
"grad_norm": 0.022825436666607857,
"learning_rate": 0.000233974358974359,
"loss": 11.9293,
"step": 20
},
{
"epoch": 0.20964360587002095,
"grad_norm": 0.030763259157538414,
"learning_rate": 0.00022863247863247864,
"loss": 11.928,
"step": 25
},
{
"epoch": 0.25157232704402516,
"grad_norm": 0.05623968690633774,
"learning_rate": 0.0002232905982905983,
"loss": 11.9273,
"step": 30
},
{
"epoch": 0.29350104821802936,
"grad_norm": 0.0468871183693409,
"learning_rate": 0.00021794871794871795,
"loss": 11.9263,
"step": 35
},
{
"epoch": 0.33542976939203356,
"grad_norm": 0.05555358901619911,
"learning_rate": 0.0002126068376068376,
"loss": 11.9248,
"step": 40
},
{
"epoch": 0.37735849056603776,
"grad_norm": 0.0784514918923378,
"learning_rate": 0.00020726495726495727,
"loss": 11.9244,
"step": 45
},
{
"epoch": 0.4192872117400419,
"grad_norm": 0.05951184406876564,
"learning_rate": 0.00020192307692307694,
"loss": 11.9228,
"step": 50
},
{
"epoch": 0.4612159329140461,
"grad_norm": 0.057042159140110016,
"learning_rate": 0.00019658119658119659,
"loss": 11.9221,
"step": 55
},
{
"epoch": 0.5031446540880503,
"grad_norm": 0.04163195937871933,
"learning_rate": 0.00019123931623931623,
"loss": 11.9225,
"step": 60
},
{
"epoch": 0.5450733752620545,
"grad_norm": 0.03262303024530411,
"learning_rate": 0.0001858974358974359,
"loss": 11.9226,
"step": 65
},
{
"epoch": 0.5870020964360587,
"grad_norm": 0.05241989716887474,
"learning_rate": 0.00018055555555555555,
"loss": 11.922,
"step": 70
},
{
"epoch": 0.6289308176100629,
"grad_norm": 0.06784799695014954,
"learning_rate": 0.00017521367521367522,
"loss": 11.9214,
"step": 75
},
{
"epoch": 0.6708595387840671,
"grad_norm": 0.042793747037649155,
"learning_rate": 0.0001698717948717949,
"loss": 11.9183,
"step": 80
},
{
"epoch": 0.7127882599580713,
"grad_norm": 0.0430237241089344,
"learning_rate": 0.00016452991452991454,
"loss": 11.9216,
"step": 85
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.03868071734905243,
"learning_rate": 0.00015918803418803418,
"loss": 11.9194,
"step": 90
},
{
"epoch": 0.7966457023060797,
"grad_norm": 0.024328265339136124,
"learning_rate": 0.00015384615384615385,
"loss": 11.9217,
"step": 95
},
{
"epoch": 0.8385744234800838,
"grad_norm": 0.04353172332048416,
"learning_rate": 0.0001485042735042735,
"loss": 11.9212,
"step": 100
},
{
"epoch": 0.8805031446540881,
"grad_norm": 0.057023949921131134,
"learning_rate": 0.00014316239316239317,
"loss": 11.92,
"step": 105
},
{
"epoch": 0.9224318658280922,
"grad_norm": 0.039732299745082855,
"learning_rate": 0.00013782051282051284,
"loss": 11.9183,
"step": 110
},
{
"epoch": 0.9643605870020965,
"grad_norm": 0.0544021911919117,
"learning_rate": 0.00013247863247863248,
"loss": 11.9203,
"step": 115
},
{
"epoch": 0.9979035639412998,
"eval_loss": 11.919066429138184,
"eval_runtime": 0.416,
"eval_samples_per_second": 242.779,
"eval_steps_per_second": 62.498,
"step": 119
},
{
"epoch": 1.0062893081761006,
"grad_norm": 0.10388734191656113,
"learning_rate": 0.00012713675213675213,
"loss": 13.7207,
"step": 120
},
{
"epoch": 1.0482180293501049,
"grad_norm": 0.028476731851696968,
"learning_rate": 0.0001217948717948718,
"loss": 11.9223,
"step": 125
},
{
"epoch": 1.090146750524109,
"grad_norm": 0.0434449277818203,
"learning_rate": 0.00011645299145299146,
"loss": 11.9223,
"step": 130
},
{
"epoch": 1.1320754716981132,
"grad_norm": 0.09088350832462311,
"learning_rate": 0.0001111111111111111,
"loss": 11.9366,
"step": 135
},
{
"epoch": 1.1740041928721174,
"grad_norm": 0.07184627652168274,
"learning_rate": 0.00010576923076923077,
"loss": 11.5661,
"step": 140
},
{
"epoch": 1.2159329140461215,
"grad_norm": 0.04500441253185272,
"learning_rate": 0.00010042735042735043,
"loss": 12.3022,
"step": 145
},
{
"epoch": 1.2578616352201257,
"grad_norm": 0.029749717563390732,
"learning_rate": 9.508547008547008e-05,
"loss": 11.9321,
"step": 150
},
{
"epoch": 1.29979035639413,
"grad_norm": 0.046956080943346024,
"learning_rate": 8.974358974358975e-05,
"loss": 11.8407,
"step": 155
},
{
"epoch": 1.3417190775681342,
"grad_norm": 0.06576091051101685,
"learning_rate": 8.440170940170941e-05,
"loss": 11.9356,
"step": 160
},
{
"epoch": 1.3836477987421385,
"grad_norm": 0.056520890444517136,
"learning_rate": 7.905982905982905e-05,
"loss": 11.9497,
"step": 165
},
{
"epoch": 1.4255765199161425,
"grad_norm": 0.05084730684757233,
"learning_rate": 7.371794871794872e-05,
"loss": 11.5847,
"step": 170
},
{
"epoch": 1.4675052410901468,
"grad_norm": 0.03961843624711037,
"learning_rate": 6.837606837606838e-05,
"loss": 12.261,
"step": 175
},
{
"epoch": 1.509433962264151,
"grad_norm": 0.03475997969508171,
"learning_rate": 6.303418803418804e-05,
"loss": 11.8777,
"step": 180
},
{
"epoch": 1.551362683438155,
"grad_norm": 0.028086921200156212,
"learning_rate": 5.76923076923077e-05,
"loss": 11.9761,
"step": 185
},
{
"epoch": 1.5932914046121593,
"grad_norm": 0.046144578605890274,
"learning_rate": 5.2350427350427356e-05,
"loss": 11.8616,
"step": 190
},
{
"epoch": 1.6352201257861636,
"grad_norm": 0.05854855850338936,
"learning_rate": 4.700854700854701e-05,
"loss": 11.9751,
"step": 195
},
{
"epoch": 1.6771488469601676,
"grad_norm": 0.035215962678194046,
"learning_rate": 4.1666666666666665e-05,
"loss": 11.9493,
"step": 200
},
{
"epoch": 1.719077568134172,
"grad_norm": 0.06034635007381439,
"learning_rate": 3.632478632478633e-05,
"loss": 11.8182,
"step": 205
},
{
"epoch": 1.7610062893081762,
"grad_norm": 0.027154497802257538,
"learning_rate": 3.098290598290598e-05,
"loss": 11.9445,
"step": 210
},
{
"epoch": 1.8029350104821802,
"grad_norm": 0.055567361414432526,
"learning_rate": 2.564102564102564e-05,
"loss": 11.9099,
"step": 215
},
{
"epoch": 1.8448637316561844,
"grad_norm": 0.04124658182263374,
"learning_rate": 2.02991452991453e-05,
"loss": 11.7515,
"step": 220
},
{
"epoch": 1.8867924528301887,
"grad_norm": 0.047468505799770355,
"learning_rate": 1.4957264957264958e-05,
"loss": 12.3221,
"step": 225
},
{
"epoch": 1.9287211740041927,
"grad_norm": 0.04315986856818199,
"learning_rate": 9.615384615384616e-06,
"loss": 11.6918,
"step": 230
},
{
"epoch": 1.9706498951781972,
"grad_norm": 0.08445514738559723,
"learning_rate": 4.273504273504274e-06,
"loss": 12.0431,
"step": 235
},
{
"epoch": 1.9958071278825997,
"eval_loss": 11.917438507080078,
"eval_runtime": 0.4203,
"eval_samples_per_second": 240.321,
"eval_steps_per_second": 61.865,
"step": 238
},
{
"epoch": 2.0041928721174003,
"eval_loss": 11.917825698852539,
"eval_runtime": 0.4191,
"eval_samples_per_second": 241.009,
"eval_steps_per_second": 62.042,
"step": 239
}
],
"logging_steps": 5,
"max_steps": 239,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 269012385792.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}