8-clusters-balanced-lex-best-7 / trainer_state.json
MHGanainy's picture
MHGanainy/8-clusters-balanced-lex-best-7
e0bddd3 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 7378,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013553808620222282,
"grad_norm": 0.1191592738032341,
"learning_rate": 2.7137042062415198e-06,
"loss": 2.4955,
"step": 100
},
{
"epoch": 0.027107617240444564,
"grad_norm": 0.26774927973747253,
"learning_rate": 5.4274084124830395e-06,
"loss": 2.4596,
"step": 200
},
{
"epoch": 0.040661425860666844,
"grad_norm": 0.22501784563064575,
"learning_rate": 8.14111261872456e-06,
"loss": 2.4267,
"step": 300
},
{
"epoch": 0.05421523448088913,
"grad_norm": 0.3591971695423126,
"learning_rate": 1.0854816824966079e-05,
"loss": 2.4028,
"step": 400
},
{
"epoch": 0.06776904310111141,
"grad_norm": 0.36190351843833923,
"learning_rate": 1.35685210312076e-05,
"loss": 2.3537,
"step": 500
},
{
"epoch": 0.08132285172133369,
"grad_norm": 0.4355062246322632,
"learning_rate": 1.628222523744912e-05,
"loss": 2.2783,
"step": 600
},
{
"epoch": 0.09487666034155598,
"grad_norm": 0.4676553010940552,
"learning_rate": 1.899592944369064e-05,
"loss": 2.2975,
"step": 700
},
{
"epoch": 0.10843046896177826,
"grad_norm": 0.5140531063079834,
"learning_rate": 1.9995559296849784e-05,
"loss": 2.2424,
"step": 800
},
{
"epoch": 0.12198427758200055,
"grad_norm": 0.5148899555206299,
"learning_rate": 1.9970285884793442e-05,
"loss": 2.288,
"step": 900
},
{
"epoch": 0.13553808620222282,
"grad_norm": 0.6413929462432861,
"learning_rate": 1.992270453601682e-05,
"loss": 2.2605,
"step": 1000
},
{
"epoch": 0.14909189482244511,
"grad_norm": 0.615112841129303,
"learning_rate": 1.985292171102966e-05,
"loss": 2.193,
"step": 1100
},
{
"epoch": 0.16264570344266738,
"grad_norm": 0.5734832882881165,
"learning_rate": 1.976109354485778e-05,
"loss": 2.1929,
"step": 1200
},
{
"epoch": 0.17619951206288967,
"grad_norm": 0.9036373496055603,
"learning_rate": 1.9647425497699984e-05,
"loss": 2.1792,
"step": 1300
},
{
"epoch": 0.18975332068311196,
"grad_norm": 0.6254833936691284,
"learning_rate": 1.9512171895222806e-05,
"loss": 2.1922,
"step": 1400
},
{
"epoch": 0.20330712930333425,
"grad_norm": 0.6832364797592163,
"learning_rate": 1.935563535952155e-05,
"loss": 2.1794,
"step": 1500
},
{
"epoch": 0.2168609379235565,
"grad_norm": 0.7471606731414795,
"learning_rate": 1.91781661320209e-05,
"loss": 2.2264,
"step": 1600
},
{
"epoch": 0.2304147465437788,
"grad_norm": 0.7134560346603394,
"learning_rate": 1.898016128983004e-05,
"loss": 2.1893,
"step": 1700
},
{
"epoch": 0.2439685551640011,
"grad_norm": 0.585695743560791,
"learning_rate": 1.8762063857305644e-05,
"loss": 2.1997,
"step": 1800
},
{
"epoch": 0.2575223637842234,
"grad_norm": 1.138268232345581,
"learning_rate": 1.8524361814810542e-05,
"loss": 2.149,
"step": 1900
},
{
"epoch": 0.27107617240444565,
"grad_norm": 0.9364109039306641,
"learning_rate": 1.826758700688596e-05,
"loss": 2.1775,
"step": 2000
},
{
"epoch": 0.2846299810246679,
"grad_norm": 0.849953293800354,
"learning_rate": 1.7992313952280175e-05,
"loss": 2.1315,
"step": 2100
},
{
"epoch": 0.29818378964489023,
"grad_norm": 0.8387102484703064,
"learning_rate": 1.7699158558496127e-05,
"loss": 2.1819,
"step": 2200
},
{
"epoch": 0.3117375982651125,
"grad_norm": 0.7546108365058899,
"learning_rate": 1.73887767437341e-05,
"loss": 2.1215,
"step": 2300
},
{
"epoch": 0.32529140688533476,
"grad_norm": 0.5941405296325684,
"learning_rate": 1.7061862969312734e-05,
"loss": 2.1457,
"step": 2400
},
{
"epoch": 0.3388452155055571,
"grad_norm": 0.8476601243019104,
"learning_rate": 1.6719148685852103e-05,
"loss": 2.1349,
"step": 2500
},
{
"epoch": 0.35239902412577934,
"grad_norm": 0.6843703985214233,
"learning_rate": 1.6361400696695352e-05,
"loss": 2.1708,
"step": 2600
},
{
"epoch": 0.3659528327460016,
"grad_norm": 0.8201411366462708,
"learning_rate": 1.5989419442230672e-05,
"loss": 2.1408,
"step": 2700
},
{
"epoch": 0.3795066413662239,
"grad_norm": 0.8031175136566162,
"learning_rate": 1.5604037208952308e-05,
"loss": 2.1407,
"step": 2800
},
{
"epoch": 0.3930604499864462,
"grad_norm": 0.7488608360290527,
"learning_rate": 1.520611626726779e-05,
"loss": 2.1288,
"step": 2900
},
{
"epoch": 0.4066142586066685,
"grad_norm": 0.8462947607040405,
"learning_rate": 1.4796546942217882e-05,
"loss": 2.0887,
"step": 3000
},
{
"epoch": 0.42016806722689076,
"grad_norm": 0.9811045527458191,
"learning_rate": 1.4376245621425904e-05,
"loss": 2.1211,
"step": 3100
},
{
"epoch": 0.433721875847113,
"grad_norm": 0.7871158719062805,
"learning_rate": 1.3946152704733542e-05,
"loss": 2.0577,
"step": 3200
},
{
"epoch": 0.44727568446733534,
"grad_norm": 0.7200583815574646,
"learning_rate": 1.3507230500110733e-05,
"loss": 2.0763,
"step": 3300
},
{
"epoch": 0.4608294930875576,
"grad_norm": 1.2278062105178833,
"learning_rate": 1.3060461070547336e-05,
"loss": 2.0654,
"step": 3400
},
{
"epoch": 0.47438330170777987,
"grad_norm": 1.0776195526123047,
"learning_rate": 1.2606844036744152e-05,
"loss": 2.1158,
"step": 3500
},
{
"epoch": 0.4879371103280022,
"grad_norm": 0.8359533548355103,
"learning_rate": 1.2147394340519519e-05,
"loss": 2.078,
"step": 3600
},
{
"epoch": 0.5014909189482244,
"grad_norm": 0.7208207249641418,
"learning_rate": 1.1683139973935847e-05,
"loss": 2.0443,
"step": 3700
},
{
"epoch": 0.5150447275684468,
"grad_norm": 0.8674055337905884,
"learning_rate": 1.1215119679226966e-05,
"loss": 2.0315,
"step": 3800
},
{
"epoch": 0.528598536188669,
"grad_norm": 0.8693380951881409,
"learning_rate": 1.074438062467258e-05,
"loss": 2.0985,
"step": 3900
},
{
"epoch": 0.5421523448088913,
"grad_norm": 1.1561861038208008,
"learning_rate": 1.027197606161996e-05,
"loss": 2.092,
"step": 4000
},
{
"epoch": 0.5557061534291136,
"grad_norm": 1.33693265914917,
"learning_rate": 9.798962967895082e-06,
"loss": 2.1118,
"step": 4100
},
{
"epoch": 0.5692599620493358,
"grad_norm": 0.8483671545982361,
"learning_rate": 9.326399682876032e-06,
"loss": 2.0907,
"step": 4200
},
{
"epoch": 0.5828137706695582,
"grad_norm": 0.8882675170898438,
"learning_rate": 8.855343539520006e-06,
"loss": 2.1293,
"step": 4300
},
{
"epoch": 0.5963675792897805,
"grad_norm": 0.7699640989303589,
"learning_rate": 8.386848498642072e-06,
"loss": 2.1274,
"step": 4400
},
{
"epoch": 0.6099213879100027,
"grad_norm": 1.0404322147369385,
"learning_rate": 7.921962790738976e-06,
"loss": 2.0554,
"step": 4500
},
{
"epoch": 0.623475196530225,
"grad_norm": 0.8978158831596375,
"learning_rate": 7.46172657063414e-06,
"loss": 2.1415,
"step": 4600
},
{
"epoch": 0.6370290051504472,
"grad_norm": 1.3695547580718994,
"learning_rate": 7.007169590191574e-06,
"loss": 2.0975,
"step": 4700
},
{
"epoch": 0.6505828137706695,
"grad_norm": 0.80988609790802,
"learning_rate": 6.5593088943057386e-06,
"loss": 2.1267,
"step": 4800
},
{
"epoch": 0.6641366223908919,
"grad_norm": 1.1780813932418823,
"learning_rate": 6.119146545322567e-06,
"loss": 2.0794,
"step": 4900
},
{
"epoch": 0.6776904310111141,
"grad_norm": 0.9567013382911682,
"learning_rate": 5.687667380983037e-06,
"loss": 2.095,
"step": 5000
},
{
"epoch": 0.6912442396313364,
"grad_norm": 1.1569477319717407,
"learning_rate": 5.265836810905844e-06,
"loss": 2.0693,
"step": 5100
},
{
"epoch": 0.7047980482515587,
"grad_norm": 0.9160040020942688,
"learning_rate": 4.854598656539305e-06,
"loss": 2.1312,
"step": 5200
},
{
"epoch": 0.7183518568717809,
"grad_norm": 0.8206405639648438,
"learning_rate": 4.454873039415593e-06,
"loss": 2.0722,
"step": 5300
},
{
"epoch": 0.7319056654920032,
"grad_norm": 1.1364679336547852,
"learning_rate": 4.067554322432159e-06,
"loss": 2.0582,
"step": 5400
},
{
"epoch": 0.7454594741122256,
"grad_norm": 1.1402835845947266,
"learning_rate": 3.6935091087665677e-06,
"loss": 2.1207,
"step": 5500
},
{
"epoch": 0.7590132827324478,
"grad_norm": 1.0124040842056274,
"learning_rate": 3.333574302902145e-06,
"loss": 2.0439,
"step": 5600
},
{
"epoch": 0.7725670913526701,
"grad_norm": 1.3236887454986572,
"learning_rate": 2.9885552381026927e-06,
"loss": 2.1332,
"step": 5700
},
{
"epoch": 0.7861208999728924,
"grad_norm": 0.9097657799720764,
"learning_rate": 2.659223874525996e-06,
"loss": 2.1608,
"step": 5800
},
{
"epoch": 0.7996747085931146,
"grad_norm": 0.8172687292098999,
"learning_rate": 2.34631707200773e-06,
"loss": 2.1053,
"step": 5900
},
{
"epoch": 0.813228517213337,
"grad_norm": 0.9772549271583557,
"learning_rate": 2.050534941380283e-06,
"loss": 2.0312,
"step": 6000
},
{
"epoch": 0.8267823258335593,
"grad_norm": 1.0267835855484009,
"learning_rate": 1.7725392780153484e-06,
"loss": 2.0607,
"step": 6100
},
{
"epoch": 0.8403361344537815,
"grad_norm": 1.166391134262085,
"learning_rate": 1.5129520810951426e-06,
"loss": 2.0938,
"step": 6200
},
{
"epoch": 0.8538899430740038,
"grad_norm": 1.170172095298767,
"learning_rate": 1.2723541619253044e-06,
"loss": 2.0053,
"step": 6300
},
{
"epoch": 0.867443751694226,
"grad_norm": 1.0053555965423584,
"learning_rate": 1.0512838444032515e-06,
"loss": 2.0844,
"step": 6400
},
{
"epoch": 0.8809975603144483,
"grad_norm": 0.8027963638305664,
"learning_rate": 8.502357605496692e-07,
"loss": 2.1124,
"step": 6500
},
{
"epoch": 0.8945513689346707,
"grad_norm": 0.6970746517181396,
"learning_rate": 6.696597437980367e-07,
"loss": 2.0236,
"step": 6600
},
{
"epoch": 0.908105177554893,
"grad_norm": 1.0525530576705933,
"learning_rate": 5.099598225183966e-07,
"loss": 2.0594,
"step": 6700
},
{
"epoch": 0.9216589861751152,
"grad_norm": 1.0885214805603027,
"learning_rate": 3.714933160273004e-07,
"loss": 2.0454,
"step": 6800
},
{
"epoch": 0.9352127947953375,
"grad_norm": 0.9120861291885376,
"learning_rate": 2.5457003510654055e-07,
"loss": 2.069,
"step": 6900
},
{
"epoch": 0.9487666034155597,
"grad_norm": 1.7351340055465698,
"learning_rate": 1.594515888194903e-07,
"loss": 2.0947,
"step": 7000
},
{
"epoch": 0.962320412035782,
"grad_norm": 0.9601479172706604,
"learning_rate": 8.635079917599376e-08,
"loss": 2.1024,
"step": 7100
},
{
"epoch": 0.9758742206560044,
"grad_norm": 0.9935265779495239,
"learning_rate": 3.543122495545004e-08,
"loss": 2.0988,
"step": 7200
},
{
"epoch": 0.9894280292762266,
"grad_norm": 0.6805266737937927,
"learning_rate": 6.806795753524498e-09,
"loss": 2.0827,
"step": 7300
},
{
"epoch": 1.0,
"eval_loss": 1.4551358222961426,
"eval_runtime": 26.4185,
"eval_samples_per_second": 8.857,
"eval_steps_per_second": 1.136,
"step": 7378
},
{
"epoch": 1.0,
"step": 7378,
"total_flos": 1.3438841192448e+17,
"train_loss": 2.142452607086339,
"train_runtime": 3524.2818,
"train_samples_per_second": 4.187,
"train_steps_per_second": 2.093
}
],
"logging_steps": 100,
"max_steps": 7378,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3438841192448e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}