{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 12.795905310300704,
"eval_steps": 500,
"global_step": 20000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.3198976327575176,
"grad_norm": 4.1601386070251465,
"learning_rate": 5e-06,
"loss": 10.3279,
"step": 500
},
{
"epoch": 0.6397952655150352,
"grad_norm": 4.366061687469482,
"learning_rate": 1e-05,
"loss": 9.3834,
"step": 1000
},
{
"epoch": 0.9596928982725528,
"grad_norm": 4.784337043762207,
"learning_rate": 1.5e-05,
"loss": 8.8888,
"step": 1500
},
{
"epoch": 1.2795905310300704,
"grad_norm": 3.9968652725219727,
"learning_rate": 2e-05,
"loss": 8.6568,
"step": 2000
},
{
"epoch": 1.599488163787588,
"grad_norm": 4.402552127838135,
"learning_rate": 2.5e-05,
"loss": 8.5473,
"step": 2500
},
{
"epoch": 1.9193857965451055,
"grad_norm": 4.639041423797607,
"learning_rate": 3e-05,
"loss": 8.4044,
"step": 3000
},
{
"epoch": 2.239283429302623,
"grad_norm": 5.651747226715088,
"learning_rate": 3.5e-05,
"loss": 8.2868,
"step": 3500
},
{
"epoch": 2.5591810620601407,
"grad_norm": 4.6999359130859375,
"learning_rate": 4e-05,
"loss": 8.1766,
"step": 4000
},
{
"epoch": 2.8790786948176583,
"grad_norm": 4.838181495666504,
"learning_rate": 4.499e-05,
"loss": 8.1118,
"step": 4500
},
{
"epoch": 3.198976327575176,
"grad_norm": 4.238831996917725,
"learning_rate": 4.999e-05,
"loss": 8.0038,
"step": 5000
},
{
"epoch": 3.5188739603326935,
"grad_norm": 4.455530643463135,
"learning_rate": 5.499000000000001e-05,
"loss": 7.9014,
"step": 5500
},
{
"epoch": 3.838771593090211,
"grad_norm": 5.811736583709717,
"learning_rate": 5.999e-05,
"loss": 7.8352,
"step": 6000
},
{
"epoch": 4.158669225847729,
"grad_norm": 4.998301982879639,
"learning_rate": 6.498e-05,
"loss": 7.7613,
"step": 6500
},
{
"epoch": 4.478566858605246,
"grad_norm": 5.011510848999023,
"learning_rate": 6.998e-05,
"loss": 7.6554,
"step": 7000
},
{
"epoch": 4.798464491362764,
"grad_norm": 4.750300884246826,
"learning_rate": 7.498e-05,
"loss": 7.6109,
"step": 7500
},
{
"epoch": 5.1183621241202815,
"grad_norm": 6.24017858505249,
"learning_rate": 7.998e-05,
"loss": 7.5186,
"step": 8000
},
{
"epoch": 5.438259756877799,
"grad_norm": 6.061458587646484,
"learning_rate": 8.497000000000001e-05,
"loss": 7.3966,
"step": 8500
},
{
"epoch": 5.758157389635317,
"grad_norm": 7.151447772979736,
"learning_rate": 8.997000000000001e-05,
"loss": 7.2877,
"step": 9000
},
{
"epoch": 6.078055022392834,
"grad_norm": 7.578985214233398,
"learning_rate": 9.497000000000001e-05,
"loss": 7.1542,
"step": 9500
},
{
"epoch": 6.397952655150352,
"grad_norm": 5.948920726776123,
"learning_rate": 9.997e-05,
"loss": 7.0008,
"step": 10000
},
{
"epoch": 6.717850287907869,
"grad_norm": 8.036959648132324,
"learning_rate": 9.982896551724137e-05,
"loss": 6.8966,
"step": 10500
},
{
"epoch": 7.037747920665387,
"grad_norm": 7.160433292388916,
"learning_rate": 9.965655172413794e-05,
"loss": 6.7509,
"step": 11000
},
{
"epoch": 7.357645553422905,
"grad_norm": 5.934999465942383,
"learning_rate": 9.948413793103449e-05,
"loss": 6.5833,
"step": 11500
},
{
"epoch": 7.677543186180422,
"grad_norm": 7.745622634887695,
"learning_rate": 9.931172413793104e-05,
"loss": 6.4975,
"step": 12000
},
{
"epoch": 7.99744081893794,
"grad_norm": 7.0418477058410645,
"learning_rate": 9.91393103448276e-05,
"loss": 6.4261,
"step": 12500
},
{
"epoch": 8.317338451695457,
"grad_norm": 6.101259708404541,
"learning_rate": 9.896689655172414e-05,
"loss": 6.2092,
"step": 13000
},
{
"epoch": 8.637236084452976,
"grad_norm": 7.289799213409424,
"learning_rate": 9.87944827586207e-05,
"loss": 6.1436,
"step": 13500
},
{
"epoch": 8.957133717210493,
"grad_norm": 8.126811027526855,
"learning_rate": 9.862206896551725e-05,
"loss": 6.0456,
"step": 14000
},
{
"epoch": 9.277031349968011,
"grad_norm": 8.221816062927246,
"learning_rate": 9.845000000000001e-05,
"loss": 5.9141,
"step": 14500
},
{
"epoch": 9.596928982725528,
"grad_norm": 7.361550331115723,
"learning_rate": 9.827793103448277e-05,
"loss": 5.8326,
"step": 15000
},
{
"epoch": 9.916826615483046,
"grad_norm": 7.1737775802612305,
"learning_rate": 9.810551724137932e-05,
"loss": 5.7974,
"step": 15500
},
{
"epoch": 10.236724248240563,
"grad_norm": 9.80185604095459,
"learning_rate": 9.793310344827586e-05,
"loss": 5.6282,
"step": 16000
},
{
"epoch": 10.556621880998081,
"grad_norm": 7.2062153816223145,
"learning_rate": 9.776068965517242e-05,
"loss": 5.5619,
"step": 16500
},
{
"epoch": 10.876519513755598,
"grad_norm": 10.801878929138184,
"learning_rate": 9.758827586206896e-05,
"loss": 5.5155,
"step": 17000
},
{
"epoch": 11.196417146513117,
"grad_norm": 8.48509693145752,
"learning_rate": 9.741586206896553e-05,
"loss": 5.4259,
"step": 17500
},
{
"epoch": 11.516314779270633,
"grad_norm": 8.47572135925293,
"learning_rate": 9.724344827586207e-05,
"loss": 5.3205,
"step": 18000
},
{
"epoch": 11.836212412028152,
"grad_norm": 6.122796535491943,
"learning_rate": 9.707103448275863e-05,
"loss": 5.3025,
"step": 18500
},
{
"epoch": 12.156110044785668,
"grad_norm": 8.210710525512695,
"learning_rate": 9.689896551724139e-05,
"loss": 5.2264,
"step": 19000
},
{
"epoch": 12.476007677543187,
"grad_norm": 7.857537746429443,
"learning_rate": 9.672655172413794e-05,
"loss": 5.1395,
"step": 19500
},
{
"epoch": 12.795905310300704,
"grad_norm": 7.743075370788574,
"learning_rate": 9.655413793103448e-05,
"loss": 5.1109,
"step": 20000
}
],
"logging_steps": 500,
"max_steps": 300000,
"num_input_tokens_seen": 0,
"num_train_epochs": 192,
"save_steps": 20000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.2481858007638016e+16,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}