{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.36542072025943,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03,
"grad_norm": 0.41249775886535645,
"learning_rate": 0.00011363636363636365,
"loss": 1.6991,
"step": 50
},
{
"epoch": 0.07,
"grad_norm": 0.4261733889579773,
"learning_rate": 0.00019999118973067944,
"loss": 1.463,
"step": 100
},
{
"epoch": 0.1,
"grad_norm": 0.46912631392478943,
"learning_rate": 0.00019976490347667713,
"loss": 1.4387,
"step": 150
},
{
"epoch": 0.14,
"grad_norm": 0.4052003026008606,
"learning_rate": 0.00019923349759697913,
"loss": 1.4552,
"step": 200
},
{
"epoch": 0.17,
"grad_norm": 0.41492465138435364,
"learning_rate": 0.00019839859733610985,
"loss": 1.432,
"step": 250
},
{
"epoch": 0.2,
"grad_norm": 0.4481758177280426,
"learning_rate": 0.0001972627561416867,
"loss": 1.4365,
"step": 300
},
{
"epoch": 0.24,
"grad_norm": 0.5057295560836792,
"learning_rate": 0.00019582944785499073,
"loss": 1.4398,
"step": 350
},
{
"epoch": 0.27,
"grad_norm": 0.4403720796108246,
"learning_rate": 0.00019410305608661744,
"loss": 1.4236,
"step": 400
},
{
"epoch": 0.31,
"grad_norm": 0.4727613627910614,
"learning_rate": 0.00019208886080970187,
"loss": 1.4109,
"step": 450
},
{
"epoch": 0.34,
"grad_norm": 0.5325965285301208,
"learning_rate": 0.0001897930222117203,
"loss": 1.4556,
"step": 500
},
{
"epoch": 0.38,
"grad_norm": 0.5341053605079651,
"learning_rate": 0.00018722256185425655,
"loss": 1.426,
"step": 550
},
{
"epoch": 0.41,
"grad_norm": 0.6165050268173218,
"learning_rate": 0.00018438534119835363,
"loss": 1.3777,
"step": 600
},
{
"epoch": 0.44,
"grad_norm": 0.455497682094574,
"learning_rate": 0.0001812900375611279,
"loss": 1.4352,
"step": 650
},
{
"epoch": 0.48,
"grad_norm": 0.4459982216358185,
"learning_rate": 0.00017794611757718015,
"loss": 1.3917,
"step": 700
},
{
"epoch": 0.51,
"grad_norm": 0.488208144903183,
"learning_rate": 0.00017436380824596915,
"loss": 1.3801,
"step": 750
},
{
"epoch": 0.55,
"grad_norm": 0.6411360502243042,
"learning_rate": 0.00017055406565369532,
"loss": 1.4015,
"step": 800
},
{
"epoch": 0.58,
"grad_norm": 0.6391683220863342,
"learning_rate": 0.0001665285414653555,
"loss": 1.3578,
"step": 850
},
{
"epoch": 0.61,
"grad_norm": 0.4634678363800049,
"learning_rate": 0.00016229954728944895,
"loss": 1.3958,
"step": 900
},
{
"epoch": 0.65,
"grad_norm": 0.5685352087020874,
"learning_rate": 0.00015788001702432132,
"loss": 1.3824,
"step": 950
},
{
"epoch": 0.68,
"grad_norm": 0.48762017488479614,
"learning_rate": 0.0001532834673013053,
"loss": 1.3651,
"step": 1000
},
{
"epoch": 0.72,
"grad_norm": 0.4447900354862213,
"learning_rate": 0.00014852395614563935,
"loss": 1.3928,
"step": 1050
},
{
"epoch": 0.75,
"grad_norm": 0.5502452850341797,
"learning_rate": 0.00014361603998159388,
"loss": 1.3901,
"step": 1100
},
{
"epoch": 0.79,
"grad_norm": 0.5311463475227356,
"learning_rate": 0.00013857472911330131,
"loss": 1.3506,
"step": 1150
},
{
"epoch": 0.82,
"grad_norm": 0.5501254796981812,
"learning_rate": 0.0001334154418174456,
"loss": 1.3611,
"step": 1200
},
{
"epoch": 0.85,
"grad_norm": 0.6120589375495911,
"learning_rate": 0.00012815395718821415,
"loss": 1.3879,
"step": 1250
},
{
"epoch": 0.89,
"grad_norm": 0.6142286658287048,
"learning_rate": 0.0001228063668787309,
"loss": 1.3633,
"step": 1300
},
{
"epoch": 0.92,
"grad_norm": 0.610168993473053,
"learning_rate": 0.00011738902588656242,
"loss": 1.3522,
"step": 1350
},
{
"epoch": 0.96,
"grad_norm": 0.5812483429908752,
"learning_rate": 0.00011191850253381601,
"loss": 1.3938,
"step": 1400
},
{
"epoch": 0.99,
"grad_norm": 0.6008601784706116,
"learning_rate": 0.00010641152779480806,
"loss": 1.3396,
"step": 1450
},
{
"epoch": 1.02,
"grad_norm": 0.6404789686203003,
"learning_rate": 0.00010088494412627968,
"loss": 1.3007,
"step": 1500
},
{
"epoch": 1.06,
"grad_norm": 0.6050863862037659,
"learning_rate": 9.535565395665562e-05,
"loss": 1.2374,
"step": 1550
},
{
"epoch": 1.09,
"grad_norm": 0.5933852195739746,
"learning_rate": 8.984056799188677e-05,
"loss": 1.2135,
"step": 1600
},
{
"epoch": 1.13,
"grad_norm": 0.7690561413764954,
"learning_rate": 8.435655349597689e-05,
"loss": 1.2479,
"step": 1650
},
{
"epoch": 1.16,
"grad_norm": 0.735683023929596,
"learning_rate": 7.892038270437153e-05,
"loss": 1.2115,
"step": 1700
},
{
"epoch": 1.19,
"grad_norm": 0.7068488597869873,
"learning_rate": 7.35486815279806e-05,
"loss": 1.2184,
"step": 1750
},
{
"epoch": 1.23,
"grad_norm": 0.821847677230835,
"learning_rate": 6.825787870471872e-05,
"loss": 1.2054,
"step": 1800
},
{
"epoch": 1.26,
"grad_norm": 0.7907322645187378,
"learning_rate": 6.30641555540761e-05,
"loss": 1.2289,
"step": 1850
},
{
"epoch": 1.3,
"grad_norm": 0.6236514449119568,
"learning_rate": 5.798339648839073e-05,
"loss": 1.2113,
"step": 1900
},
{
"epoch": 1.33,
"grad_norm": 0.6800610423088074,
"learning_rate": 5.303114043217771e-05,
"loss": 1.201,
"step": 1950
},
{
"epoch": 1.37,
"grad_norm": 0.6555848121643066,
"learning_rate": 4.8222533298093295e-05,
"loss": 1.2028,
"step": 2000
}
],
"logging_steps": 50,
"max_steps": 2928,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"total_flos": 4.863950904700109e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}