{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 6688,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014952153110047847,
"grad_norm": 0.11818729341030121,
"learning_rate": 2.9940119760479042e-06,
"loss": 2.3924,
"step": 100
},
{
"epoch": 0.029904306220095694,
"grad_norm": 0.17396187782287598,
"learning_rate": 5.9880239520958085e-06,
"loss": 2.3714,
"step": 200
},
{
"epoch": 0.04485645933014354,
"grad_norm": 0.251809298992157,
"learning_rate": 8.982035928143713e-06,
"loss": 2.3528,
"step": 300
},
{
"epoch": 0.05980861244019139,
"grad_norm": 0.3756244480609894,
"learning_rate": 1.1976047904191617e-05,
"loss": 2.3111,
"step": 400
},
{
"epoch": 0.07476076555023924,
"grad_norm": 0.41418808698654175,
"learning_rate": 1.4970059880239522e-05,
"loss": 2.2514,
"step": 500
},
{
"epoch": 0.08971291866028708,
"grad_norm": 0.46379268169403076,
"learning_rate": 1.7964071856287426e-05,
"loss": 2.2166,
"step": 600
},
{
"epoch": 0.10466507177033493,
"grad_norm": 0.6668035387992859,
"learning_rate": 1.9998605666598577e-05,
"loss": 2.1742,
"step": 700
},
{
"epoch": 0.11961722488038277,
"grad_norm": 0.608900785446167,
"learning_rate": 1.9976283374914574e-05,
"loss": 2.1807,
"step": 800
},
{
"epoch": 0.13456937799043062,
"grad_norm": 0.6352519989013672,
"learning_rate": 1.9926798130661576e-05,
"loss": 2.1485,
"step": 900
},
{
"epoch": 0.14952153110047847,
"grad_norm": 0.7120950222015381,
"learning_rate": 1.9850284669922354e-05,
"loss": 2.1058,
"step": 1000
},
{
"epoch": 0.16447368421052633,
"grad_norm": 0.8403559923171997,
"learning_rate": 1.9746951319929283e-05,
"loss": 2.098,
"step": 1100
},
{
"epoch": 0.17942583732057416,
"grad_norm": 1.0693122148513794,
"learning_rate": 1.961707943184083e-05,
"loss": 2.073,
"step": 1200
},
{
"epoch": 0.194377990430622,
"grad_norm": 0.7334321737289429,
"learning_rate": 1.9461022614691904e-05,
"loss": 2.0591,
"step": 1300
},
{
"epoch": 0.20933014354066987,
"grad_norm": 0.9172684550285339,
"learning_rate": 1.9279205772603905e-05,
"loss": 2.0918,
"step": 1400
},
{
"epoch": 0.2242822966507177,
"grad_norm": 0.8891502618789673,
"learning_rate": 1.9072123947875885e-05,
"loss": 2.0391,
"step": 1500
},
{
"epoch": 0.23923444976076555,
"grad_norm": 0.7827827334403992,
"learning_rate": 1.8840340973106777e-05,
"loss": 2.0423,
"step": 1600
},
{
"epoch": 0.2541866028708134,
"grad_norm": 0.908538818359375,
"learning_rate": 1.8584487936018663e-05,
"loss": 2.0105,
"step": 1700
},
{
"epoch": 0.26913875598086123,
"grad_norm": 1.1278783082962036,
"learning_rate": 1.830526146116098e-05,
"loss": 2.0393,
"step": 1800
},
{
"epoch": 0.2840909090909091,
"grad_norm": 0.9773155450820923,
"learning_rate": 1.800342181317413e-05,
"loss": 2.0248,
"step": 1900
},
{
"epoch": 0.29904306220095694,
"grad_norm": 1.0564335584640503,
"learning_rate": 1.7679790826776865e-05,
"loss": 2.0289,
"step": 2000
},
{
"epoch": 0.31399521531100477,
"grad_norm": 0.9529797434806824,
"learning_rate": 1.7335249669113613e-05,
"loss": 2.014,
"step": 2100
},
{
"epoch": 0.32894736842105265,
"grad_norm": 1.0069955587387085,
"learning_rate": 1.6970736440554218e-05,
"loss": 1.9883,
"step": 2200
},
{
"epoch": 0.3438995215311005,
"grad_norm": 0.8809061646461487,
"learning_rate": 1.6587243620478617e-05,
"loss": 1.9769,
"step": 2300
},
{
"epoch": 0.3588516746411483,
"grad_norm": 0.9701207876205444,
"learning_rate": 1.6185815365000955e-05,
"loss": 1.976,
"step": 2400
},
{
"epoch": 0.3738038277511962,
"grad_norm": 1.3475958108901978,
"learning_rate": 1.5767544663990664e-05,
"loss": 1.999,
"step": 2500
},
{
"epoch": 0.388755980861244,
"grad_norm": 1.2865383625030518,
"learning_rate": 1.5333570365131353e-05,
"loss": 1.9752,
"step": 2600
},
{
"epoch": 0.40370813397129185,
"grad_norm": 1.0800195932388306,
"learning_rate": 1.4885074073120192e-05,
"loss": 1.9928,
"step": 2700
},
{
"epoch": 0.41866028708133973,
"grad_norm": 0.8922355771064758,
"learning_rate": 1.4423276932450512e-05,
"loss": 1.9922,
"step": 2800
},
{
"epoch": 0.43361244019138756,
"grad_norm": 1.251029372215271,
"learning_rate": 1.3949436302537357e-05,
"loss": 1.98,
"step": 2900
},
{
"epoch": 0.4485645933014354,
"grad_norm": 0.8446579575538635,
"learning_rate": 1.3464842334238706e-05,
"loss": 1.9958,
"step": 3000
},
{
"epoch": 0.46351674641148327,
"grad_norm": 1.2263991832733154,
"learning_rate": 1.2970814457093732e-05,
"loss": 1.9642,
"step": 3100
},
{
"epoch": 0.4784688995215311,
"grad_norm": 0.8603528738021851,
"learning_rate": 1.2468697786842481e-05,
"loss": 1.9668,
"step": 3200
},
{
"epoch": 0.4934210526315789,
"grad_norm": 1.107136607170105,
"learning_rate": 1.1959859463008316e-05,
"loss": 1.989,
"step": 3300
},
{
"epoch": 0.5083732057416268,
"grad_norm": 1.3708308935165405,
"learning_rate": 1.1445684926515088e-05,
"loss": 1.958,
"step": 3400
},
{
"epoch": 0.5233253588516746,
"grad_norm": 1.102388620376587,
"learning_rate": 1.0927574147474122e-05,
"loss": 1.9456,
"step": 3500
},
{
"epoch": 0.5382775119617225,
"grad_norm": 1.0549287796020508,
"learning_rate": 1.0406937813411792e-05,
"loss": 1.9293,
"step": 3600
},
{
"epoch": 0.5532296650717703,
"grad_norm": 1.217860221862793,
"learning_rate": 9.885193488316246e-06,
"loss": 1.9067,
"step": 3700
},
{
"epoch": 0.5681818181818182,
"grad_norm": 0.8757530450820923,
"learning_rate": 9.363761752961217e-06,
"loss": 1.9248,
"step": 3800
},
{
"epoch": 0.5831339712918661,
"grad_norm": 0.948020339012146,
"learning_rate": 8.844062337015873e-06,
"loss": 1.9939,
"step": 3900
},
{
"epoch": 0.5980861244019139,
"grad_norm": 1.2569465637207031,
"learning_rate": 8.327510253472023e-06,
"loss": 1.937,
"step": 4000
},
{
"epoch": 0.6130382775119617,
"grad_norm": 1.1405400037765503,
"learning_rate": 7.815511945913656e-06,
"loss": 1.9434,
"step": 4100
},
{
"epoch": 0.6279904306220095,
"grad_norm": 1.1672446727752686,
"learning_rate": 7.309461459118869e-06,
"loss": 1.9708,
"step": 4200
},
{
"epoch": 0.6429425837320574,
"grad_norm": 1.0575659275054932,
"learning_rate": 6.810736643420675e-06,
"loss": 1.9096,
"step": 4300
},
{
"epoch": 0.6578947368421053,
"grad_norm": 1.3636834621429443,
"learning_rate": 6.320695403161265e-06,
"loss": 1.9368,
"step": 4400
},
{
"epoch": 0.6728468899521531,
"grad_norm": 1.0744818449020386,
"learning_rate": 5.840671999454305e-06,
"loss": 1.9416,
"step": 4500
},
{
"epoch": 0.687799043062201,
"grad_norm": 1.1180146932601929,
"learning_rate": 5.371973417321858e-06,
"loss": 1.9412,
"step": 4600
},
{
"epoch": 0.7027511961722488,
"grad_norm": 1.0380100011825562,
"learning_rate": 4.91587580709739e-06,
"loss": 1.9581,
"step": 4700
},
{
"epoch": 0.7177033492822966,
"grad_norm": 1.65896475315094,
"learning_rate": 4.4736210097839876e-06,
"loss": 1.9444,
"step": 4800
},
{
"epoch": 0.7326555023923444,
"grad_norm": 0.9622933864593506,
"learning_rate": 4.0464131758283965e-06,
"loss": 1.9271,
"step": 4900
},
{
"epoch": 0.7476076555023924,
"grad_norm": 1.1379241943359375,
"learning_rate": 3.635415486517151e-06,
"loss": 1.9383,
"step": 5000
},
{
"epoch": 0.7625598086124402,
"grad_norm": 1.2658005952835083,
"learning_rate": 3.24174698692157e-06,
"loss": 1.8978,
"step": 5100
},
{
"epoch": 0.777511961722488,
"grad_norm": 1.0809060335159302,
"learning_rate": 2.866479539014744e-06,
"loss": 1.8513,
"step": 5200
},
{
"epoch": 0.7924641148325359,
"grad_norm": 1.1973536014556885,
"learning_rate": 2.5106349032564683e-06,
"loss": 1.9,
"step": 5300
},
{
"epoch": 0.8074162679425837,
"grad_norm": 1.4524526596069336,
"learning_rate": 2.1751819565921774e-06,
"loss": 1.9109,
"step": 5400
},
{
"epoch": 0.8223684210526315,
"grad_norm": 1.475762963294983,
"learning_rate": 1.861034054440607e-06,
"loss": 1.9121,
"step": 5500
},
{
"epoch": 0.8373205741626795,
"grad_norm": 1.374098777770996,
"learning_rate": 1.5690465438528702e-06,
"loss": 1.8737,
"step": 5600
},
{
"epoch": 0.8522727272727273,
"grad_norm": 1.581127405166626,
"learning_rate": 1.300014434613952e-06,
"loss": 1.9188,
"step": 5700
},
{
"epoch": 0.8672248803827751,
"grad_norm": 1.403342604637146,
"learning_rate": 1.0546702346276671e-06,
"loss": 1.9318,
"step": 5800
},
{
"epoch": 0.882177033492823,
"grad_norm": 1.043062686920166,
"learning_rate": 8.336819554787723e-07,
"loss": 1.9304,
"step": 5900
},
{
"epoch": 0.8971291866028708,
"grad_norm": 1.1141079664230347,
"learning_rate": 6.37651293602628e-07,
"loss": 1.9026,
"step": 6000
},
{
"epoch": 0.9120813397129187,
"grad_norm": 1.0021100044250488,
"learning_rate": 4.6711199201459833e-07,
"loss": 1.926,
"step": 6100
},
{
"epoch": 0.9270334928229665,
"grad_norm": 1.2316174507141113,
"learning_rate": 3.225283870597973e-07,
"loss": 1.9084,
"step": 6200
},
{
"epoch": 0.9419856459330144,
"grad_norm": 1.3726786375045776,
"learning_rate": 2.0429414414006588e-07,
"loss": 1.9412,
"step": 6300
},
{
"epoch": 0.9569377990430622,
"grad_norm": 1.1840542554855347,
"learning_rate": 1.1273118586042298e-07,
"loss": 1.9118,
"step": 6400
},
{
"epoch": 0.97188995215311,
"grad_norm": 1.1613115072250366,
"learning_rate": 4.8088815513424037e-08,
"loss": 1.9591,
"step": 6500
},
{
"epoch": 0.9868421052631579,
"grad_norm": 1.2687839269638062,
"learning_rate": 1.0543038287944562e-08,
"loss": 1.9661,
"step": 6600
},
{
"epoch": 1.0,
"eval_loss": 2.05165696144104,
"eval_runtime": 294.8197,
"eval_samples_per_second": 13.988,
"eval_steps_per_second": 1.75,
"step": 6688
},
{
"epoch": 1.0,
"step": 6688,
"total_flos": 1.2181112832e+17,
"train_loss": 2.005428218385249,
"train_runtime": 2349.5442,
"train_samples_per_second": 5.693,
"train_steps_per_second": 2.847
}
],
"logging_steps": 100,
"max_steps": 6688,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2181112832e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}