outputs / last-checkpoint /trainer_state.json
bitsoko's picture
Training in progress, step 600, checkpoint
588ce04 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.03681207436039021,
"eval_steps": 20,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012270691453463403,
"grad_norm": 0.5895100831985474,
"learning_rate": 0.00019993864026834657,
"loss": 2.5283,
"step": 20
},
{
"epoch": 0.0012270691453463403,
"eval_loss": 2.1053755283355713,
"eval_runtime": 20.2067,
"eval_samples_per_second": 4.949,
"eval_steps_per_second": 0.643,
"step": 20
},
{
"epoch": 0.0024541382906926807,
"grad_norm": 0.5672204494476318,
"learning_rate": 0.00019985682729280866,
"loss": 2.0202,
"step": 40
},
{
"epoch": 0.0024541382906926807,
"eval_loss": 1.9263768196105957,
"eval_runtime": 19.6426,
"eval_samples_per_second": 5.091,
"eval_steps_per_second": 0.662,
"step": 40
},
{
"epoch": 0.003681207436039021,
"grad_norm": 0.8112285733222961,
"learning_rate": 0.00019977501431727072,
"loss": 1.9429,
"step": 60
},
{
"epoch": 0.003681207436039021,
"eval_loss": 1.8300584554672241,
"eval_runtime": 19.6764,
"eval_samples_per_second": 5.082,
"eval_steps_per_second": 0.661,
"step": 60
},
{
"epoch": 0.004908276581385361,
"grad_norm": 0.7514470219612122,
"learning_rate": 0.0001996932013417328,
"loss": 1.8545,
"step": 80
},
{
"epoch": 0.004908276581385361,
"eval_loss": 1.7825714349746704,
"eval_runtime": 19.7307,
"eval_samples_per_second": 5.068,
"eval_steps_per_second": 0.659,
"step": 80
},
{
"epoch": 0.006135345726731701,
"grad_norm": 0.6156793832778931,
"learning_rate": 0.00019961138836619487,
"loss": 1.8015,
"step": 100
},
{
"epoch": 0.006135345726731701,
"eval_loss": 1.7314132452011108,
"eval_runtime": 19.6578,
"eval_samples_per_second": 5.087,
"eval_steps_per_second": 0.661,
"step": 100
},
{
"epoch": 0.007362414872078042,
"grad_norm": 0.8181987404823303,
"learning_rate": 0.000199529575390657,
"loss": 1.7025,
"step": 120
},
{
"epoch": 0.007362414872078042,
"eval_loss": 1.6928819417953491,
"eval_runtime": 19.5713,
"eval_samples_per_second": 5.11,
"eval_steps_per_second": 0.664,
"step": 120
},
{
"epoch": 0.008589484017424381,
"grad_norm": 0.6988233327865601,
"learning_rate": 0.00019944776241511905,
"loss": 1.7335,
"step": 140
},
{
"epoch": 0.008589484017424381,
"eval_loss": 1.6641780138015747,
"eval_runtime": 19.6822,
"eval_samples_per_second": 5.081,
"eval_steps_per_second": 0.66,
"step": 140
},
{
"epoch": 0.009816553162770723,
"grad_norm": 0.623275637626648,
"learning_rate": 0.00019936594943958114,
"loss": 1.6708,
"step": 160
},
{
"epoch": 0.009816553162770723,
"eval_loss": 1.629647970199585,
"eval_runtime": 19.5744,
"eval_samples_per_second": 5.109,
"eval_steps_per_second": 0.664,
"step": 160
},
{
"epoch": 0.011043622308117063,
"grad_norm": 0.6912758350372314,
"learning_rate": 0.0001992841364640432,
"loss": 1.7161,
"step": 180
},
{
"epoch": 0.011043622308117063,
"eval_loss": 1.6033389568328857,
"eval_runtime": 19.6839,
"eval_samples_per_second": 5.08,
"eval_steps_per_second": 0.66,
"step": 180
},
{
"epoch": 0.012270691453463402,
"grad_norm": 0.6177836656570435,
"learning_rate": 0.0001992023234885053,
"loss": 1.7165,
"step": 200
},
{
"epoch": 0.012270691453463402,
"eval_loss": 1.5879381895065308,
"eval_runtime": 19.7091,
"eval_samples_per_second": 5.074,
"eval_steps_per_second": 0.66,
"step": 200
},
{
"epoch": 0.013497760598809742,
"grad_norm": 0.8630465269088745,
"learning_rate": 0.00019912051051296735,
"loss": 1.64,
"step": 220
},
{
"epoch": 0.013497760598809742,
"eval_loss": 1.5652003288269043,
"eval_runtime": 19.9102,
"eval_samples_per_second": 5.023,
"eval_steps_per_second": 0.653,
"step": 220
},
{
"epoch": 0.014724829744156084,
"grad_norm": 0.7266297936439514,
"learning_rate": 0.00019903869753742944,
"loss": 1.6705,
"step": 240
},
{
"epoch": 0.014724829744156084,
"eval_loss": 1.5418590307235718,
"eval_runtime": 19.7135,
"eval_samples_per_second": 5.073,
"eval_steps_per_second": 0.659,
"step": 240
},
{
"epoch": 0.015951898889502422,
"grad_norm": 0.7300752997398376,
"learning_rate": 0.00019895688456189153,
"loss": 1.669,
"step": 260
},
{
"epoch": 0.015951898889502422,
"eval_loss": 1.5231231451034546,
"eval_runtime": 19.522,
"eval_samples_per_second": 5.122,
"eval_steps_per_second": 0.666,
"step": 260
},
{
"epoch": 0.017178968034848762,
"grad_norm": 0.7053245306015015,
"learning_rate": 0.00019887507158635362,
"loss": 1.6513,
"step": 280
},
{
"epoch": 0.017178968034848762,
"eval_loss": 1.514104962348938,
"eval_runtime": 19.6874,
"eval_samples_per_second": 5.079,
"eval_steps_per_second": 0.66,
"step": 280
},
{
"epoch": 0.018406037180195105,
"grad_norm": 0.8148968815803528,
"learning_rate": 0.00019879325861081568,
"loss": 1.5712,
"step": 300
},
{
"epoch": 0.018406037180195105,
"eval_loss": 1.4980120658874512,
"eval_runtime": 19.7232,
"eval_samples_per_second": 5.07,
"eval_steps_per_second": 0.659,
"step": 300
},
{
"epoch": 0.019633106325541445,
"grad_norm": 0.5613670349121094,
"learning_rate": 0.00019871144563527777,
"loss": 1.5492,
"step": 320
},
{
"epoch": 0.019633106325541445,
"eval_loss": 1.4802027940750122,
"eval_runtime": 19.7147,
"eval_samples_per_second": 5.072,
"eval_steps_per_second": 0.659,
"step": 320
},
{
"epoch": 0.020860175470887785,
"grad_norm": 0.7558555603027344,
"learning_rate": 0.00019862963265973983,
"loss": 1.6268,
"step": 340
},
{
"epoch": 0.020860175470887785,
"eval_loss": 1.4685406684875488,
"eval_runtime": 19.7336,
"eval_samples_per_second": 5.068,
"eval_steps_per_second": 0.659,
"step": 340
},
{
"epoch": 0.022087244616234125,
"grad_norm": 0.6657942533493042,
"learning_rate": 0.00019854781968420192,
"loss": 1.5955,
"step": 360
},
{
"epoch": 0.022087244616234125,
"eval_loss": 1.4536309242248535,
"eval_runtime": 19.6042,
"eval_samples_per_second": 5.101,
"eval_steps_per_second": 0.663,
"step": 360
},
{
"epoch": 0.023314313761580465,
"grad_norm": 0.8438799977302551,
"learning_rate": 0.00019846600670866399,
"loss": 1.5271,
"step": 380
},
{
"epoch": 0.023314313761580465,
"eval_loss": 1.4461709260940552,
"eval_runtime": 19.7178,
"eval_samples_per_second": 5.072,
"eval_steps_per_second": 0.659,
"step": 380
},
{
"epoch": 0.024541382906926805,
"grad_norm": 0.6734594702720642,
"learning_rate": 0.0001983841937331261,
"loss": 1.4713,
"step": 400
},
{
"epoch": 0.024541382906926805,
"eval_loss": 1.4335358142852783,
"eval_runtime": 19.7501,
"eval_samples_per_second": 5.063,
"eval_steps_per_second": 0.658,
"step": 400
},
{
"epoch": 0.025768452052273145,
"grad_norm": 0.8461142778396606,
"learning_rate": 0.00019830238075758816,
"loss": 1.5175,
"step": 420
},
{
"epoch": 0.025768452052273145,
"eval_loss": 1.4290988445281982,
"eval_runtime": 19.703,
"eval_samples_per_second": 5.075,
"eval_steps_per_second": 0.66,
"step": 420
},
{
"epoch": 0.026995521197619485,
"grad_norm": 0.7308184504508972,
"learning_rate": 0.00019822056778205025,
"loss": 1.4878,
"step": 440
},
{
"epoch": 0.026995521197619485,
"eval_loss": 1.4188473224639893,
"eval_runtime": 19.6677,
"eval_samples_per_second": 5.084,
"eval_steps_per_second": 0.661,
"step": 440
},
{
"epoch": 0.028222590342965825,
"grad_norm": 0.7773933410644531,
"learning_rate": 0.00019813875480651232,
"loss": 1.5046,
"step": 460
},
{
"epoch": 0.028222590342965825,
"eval_loss": 1.4094576835632324,
"eval_runtime": 19.6378,
"eval_samples_per_second": 5.092,
"eval_steps_per_second": 0.662,
"step": 460
},
{
"epoch": 0.029449659488312168,
"grad_norm": 0.6018341779708862,
"learning_rate": 0.0001980569418309744,
"loss": 1.508,
"step": 480
},
{
"epoch": 0.029449659488312168,
"eval_loss": 1.396529197692871,
"eval_runtime": 19.7008,
"eval_samples_per_second": 5.076,
"eval_steps_per_second": 0.66,
"step": 480
},
{
"epoch": 0.030676728633658508,
"grad_norm": 0.6028321981430054,
"learning_rate": 0.00019797512885543647,
"loss": 1.5018,
"step": 500
},
{
"epoch": 0.030676728633658508,
"eval_loss": 1.3898202180862427,
"eval_runtime": 19.7395,
"eval_samples_per_second": 5.066,
"eval_steps_per_second": 0.659,
"step": 500
},
{
"epoch": 0.031903797779004844,
"grad_norm": 0.7919607162475586,
"learning_rate": 0.00019789331587989856,
"loss": 1.4158,
"step": 520
},
{
"epoch": 0.031903797779004844,
"eval_loss": 1.385123610496521,
"eval_runtime": 19.7083,
"eval_samples_per_second": 5.074,
"eval_steps_per_second": 0.66,
"step": 520
},
{
"epoch": 0.03313086692435119,
"grad_norm": 0.7193537354469299,
"learning_rate": 0.00019781150290436065,
"loss": 1.4829,
"step": 540
},
{
"epoch": 0.03313086692435119,
"eval_loss": 1.3717445135116577,
"eval_runtime": 19.7284,
"eval_samples_per_second": 5.069,
"eval_steps_per_second": 0.659,
"step": 540
},
{
"epoch": 0.034357936069697524,
"grad_norm": 0.623745322227478,
"learning_rate": 0.00019772968992882274,
"loss": 1.5216,
"step": 560
},
{
"epoch": 0.034357936069697524,
"eval_loss": 1.3708571195602417,
"eval_runtime": 19.6215,
"eval_samples_per_second": 5.096,
"eval_steps_per_second": 0.663,
"step": 560
},
{
"epoch": 0.03558500521504387,
"grad_norm": 0.7613083124160767,
"learning_rate": 0.0001976478769532848,
"loss": 1.4677,
"step": 580
},
{
"epoch": 0.03558500521504387,
"eval_loss": 1.3612563610076904,
"eval_runtime": 19.7315,
"eval_samples_per_second": 5.068,
"eval_steps_per_second": 0.659,
"step": 580
},
{
"epoch": 0.03681207436039021,
"grad_norm": 0.6662244200706482,
"learning_rate": 0.00019756606397774689,
"loss": 1.4336,
"step": 600
},
{
"epoch": 0.03681207436039021,
"eval_loss": 1.3519067764282227,
"eval_runtime": 19.6923,
"eval_samples_per_second": 5.078,
"eval_steps_per_second": 0.66,
"step": 600
}
],
"logging_steps": 20,
"max_steps": 48897,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"total_flos": 3.892256129028096e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}