multitask_model / trainer_state.json
yoriis's picture
Uploading model
24ae9b3 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 5243,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009536524890329964,
"grad_norm": 1.3725634813308716,
"learning_rate": 3.7262357414448674e-05,
"loss": 1.5183903503417968,
"step": 50
},
{
"epoch": 0.019073049780659927,
"grad_norm": 1.5630295276641846,
"learning_rate": 7.52851711026616e-05,
"loss": 1.0358538818359375,
"step": 100
},
{
"epoch": 0.02860957467098989,
"grad_norm": 1.0457429885864258,
"learning_rate": 0.00011330798479087452,
"loss": 0.9928044128417969,
"step": 150
},
{
"epoch": 0.038146099561319854,
"grad_norm": 1.1760847568511963,
"learning_rate": 0.00015133079847908746,
"loss": 0.9799901580810547,
"step": 200
},
{
"epoch": 0.04768262445164982,
"grad_norm": 1.0198277235031128,
"learning_rate": 0.0001893536121673004,
"loss": 0.9692832946777343,
"step": 250
},
{
"epoch": 0.05721914934197978,
"grad_norm": 4.2576141357421875,
"learning_rate": 0.00019997421320309795,
"loss": 0.9611747741699219,
"step": 300
},
{
"epoch": 0.06675567423230974,
"grad_norm": 1.0672153234481812,
"learning_rate": 0.00019985286992997873,
"loss": 0.9739149475097656,
"step": 350
},
{
"epoch": 0.07629219912263971,
"grad_norm": 1.015286922454834,
"learning_rate": 0.00019963219089815488,
"loss": 0.95861572265625,
"step": 400
},
{
"epoch": 0.08582872401296968,
"grad_norm": 1.108496069908142,
"learning_rate": 0.00019931239564382073,
"loss": 0.9668975067138672,
"step": 450
},
{
"epoch": 0.09536524890329964,
"grad_norm": 0.855691134929657,
"learning_rate": 0.0001988938023060968,
"loss": 0.9651553344726562,
"step": 500
},
{
"epoch": 0.09536524890329964,
"eval_loss": 0.9762284159660339,
"eval_runtime": 680.8217,
"eval_samples_per_second": 12.536,
"eval_steps_per_second": 0.784,
"step": 500
},
{
"epoch": 0.1049017737936296,
"grad_norm": 0.8507081866264343,
"learning_rate": 0.0001983768273105382,
"loss": 0.9656350708007813,
"step": 550
},
{
"epoch": 0.11443829868395956,
"grad_norm": 0.8325157165527344,
"learning_rate": 0.00019776198495486565,
"loss": 0.9318672180175781,
"step": 600
},
{
"epoch": 0.12397482357428953,
"grad_norm": 0.9234330654144287,
"learning_rate": 0.0001970498868973313,
"loss": 0.9393777465820312,
"step": 650
},
{
"epoch": 0.13351134846461948,
"grad_norm": 0.8355016708374023,
"learning_rate": 0.0001962412415482278,
"loss": 0.9270508575439453,
"step": 700
},
{
"epoch": 0.14304787335494945,
"grad_norm": 0.9328432083129883,
"learning_rate": 0.00019533685336514697,
"loss": 0.9209387969970703,
"step": 750
},
{
"epoch": 0.15258439824527942,
"grad_norm": 1.194154977798462,
"learning_rate": 0.00019433762205268805,
"loss": 0.9753380584716796,
"step": 800
},
{
"epoch": 0.16212092313560938,
"grad_norm": 0.9554775953292847,
"learning_rate": 0.0001932445416674127,
"loss": 0.9683086395263671,
"step": 850
},
{
"epoch": 0.17165744802593935,
"grad_norm": 1.0252937078475952,
"learning_rate": 0.00019205869962893605,
"loss": 0.9329315948486329,
"step": 900
},
{
"epoch": 0.18119397291626932,
"grad_norm": 1.2285805940628052,
"learning_rate": 0.00019078127563813883,
"loss": 0.9511469268798828,
"step": 950
},
{
"epoch": 0.1907304978065993,
"grad_norm": 0.9454661011695862,
"learning_rate": 0.00018941354050357566,
"loss": 0.951951904296875,
"step": 1000
},
{
"epoch": 0.1907304978065993,
"eval_loss": 0.9598119258880615,
"eval_runtime": 680.8627,
"eval_samples_per_second": 12.536,
"eval_steps_per_second": 0.784,
"step": 1000
},
{
"epoch": 0.20026702269692923,
"grad_norm": 1.0210950374603271,
"learning_rate": 0.00018795685487724782,
"loss": 0.9263379669189453,
"step": 1050
},
{
"epoch": 0.2098035475872592,
"grad_norm": 1.0087344646453857,
"learning_rate": 0.0001864126679009975,
"loss": 0.9310942840576172,
"step": 1100
},
{
"epoch": 0.21934007247758916,
"grad_norm": 1.1398996114730835,
"learning_rate": 0.00018478251576487092,
"loss": 0.9465924072265625,
"step": 1150
},
{
"epoch": 0.22887659736791913,
"grad_norm": 0.9276406764984131,
"learning_rate": 0.0001830680201788836,
"loss": 0.9552659606933593,
"step": 1200
},
{
"epoch": 0.2384131222582491,
"grad_norm": 1.0002596378326416,
"learning_rate": 0.00018127088675970888,
"loss": 0.9471788024902343,
"step": 1250
},
{
"epoch": 0.24794964714857906,
"grad_norm": 1.0108773708343506,
"learning_rate": 0.000179392903333894,
"loss": 0.9245248413085938,
"step": 1300
},
{
"epoch": 0.257486172038909,
"grad_norm": 0.9765293002128601,
"learning_rate": 0.0001774359381592925,
"loss": 0.9204165649414062,
"step": 1350
},
{
"epoch": 0.26702269692923897,
"grad_norm": 1.0059659481048584,
"learning_rate": 0.00017540193806648134,
"loss": 0.9452506256103516,
"step": 1400
},
{
"epoch": 0.27655922181956893,
"grad_norm": 1.1858373880386353,
"learning_rate": 0.0001732929265220125,
"loss": 0.9286368560791015,
"step": 1450
},
{
"epoch": 0.2860957467098989,
"grad_norm": 1.0006766319274902,
"learning_rate": 0.00017111100161542545,
"loss": 0.9553109741210938,
"step": 1500
},
{
"epoch": 0.2860957467098989,
"eval_loss": 0.9582533836364746,
"eval_runtime": 680.822,
"eval_samples_per_second": 12.536,
"eval_steps_per_second": 0.784,
"step": 1500
},
{
"epoch": 0.29563227160022887,
"grad_norm": 0.9042516350746155,
"learning_rate": 0.00016885833397202308,
"loss": 0.93341796875,
"step": 1550
},
{
"epoch": 0.30516879649055884,
"grad_norm": 0.9387173652648926,
"learning_rate": 0.00016653716459348735,
"loss": 0.9339485168457031,
"step": 1600
},
{
"epoch": 0.3147053213808888,
"grad_norm": 0.9056106209754944,
"learning_rate": 0.00016414980262848333,
"loss": 0.9324442291259766,
"step": 1650
},
{
"epoch": 0.32424184627121877,
"grad_norm": 0.9407602548599243,
"learning_rate": 0.0001616986230754689,
"loss": 0.9319153594970703,
"step": 1700
},
{
"epoch": 0.33377837116154874,
"grad_norm": 0.9982315897941589,
"learning_rate": 0.0001591860644199957,
"loss": 0.9230084228515625,
"step": 1750
},
{
"epoch": 0.3433148960518787,
"grad_norm": 1.0833745002746582,
"learning_rate": 0.00015661462620885199,
"loss": 0.9161262512207031,
"step": 1800
},
{
"epoch": 0.35285142094220867,
"grad_norm": 0.9810793995857239,
"learning_rate": 0.00015398686656346028,
"loss": 0.9208243560791015,
"step": 1850
},
{
"epoch": 0.36238794583253864,
"grad_norm": 1.094853401184082,
"learning_rate": 0.00015130539963500376,
"loss": 0.9387385559082031,
"step": 1900
},
{
"epoch": 0.3719244707228686,
"grad_norm": 0.8733468055725098,
"learning_rate": 0.000148572893003813,
"loss": 0.9468997955322266,
"step": 1950
},
{
"epoch": 0.3814609956131986,
"grad_norm": 1.0074750185012817,
"learning_rate": 0.0001457920650256004,
"loss": 0.9178851318359375,
"step": 2000
},
{
"epoch": 0.3814609956131986,
"eval_loss": 0.9489485025405884,
"eval_runtime": 681.5678,
"eval_samples_per_second": 12.523,
"eval_steps_per_second": 0.783,
"step": 2000
},
{
"epoch": 0.39099752050352854,
"grad_norm": 1.0041425228118896,
"learning_rate": 0.00014296568212718213,
"loss": 0.9244281005859375,
"step": 2050
},
{
"epoch": 0.40053404539385845,
"grad_norm": 1.0447967052459717,
"learning_rate": 0.0001400965560543778,
"loss": 0.9248933410644531,
"step": 2100
},
{
"epoch": 0.4100705702841884,
"grad_norm": 0.968345046043396,
"learning_rate": 0.00013718754107482596,
"loss": 0.9109151458740234,
"step": 2150
},
{
"epoch": 0.4196070951745184,
"grad_norm": 0.9737382531166077,
"learning_rate": 0.0001342415311384981,
"loss": 0.9231878662109375,
"step": 2200
},
{
"epoch": 0.42914362006484835,
"grad_norm": 1.0494582653045654,
"learning_rate": 0.00013126145699873532,
"loss": 0.9193107604980468,
"step": 2250
},
{
"epoch": 0.4386801449551783,
"grad_norm": 0.925656259059906,
"learning_rate": 0.000128250283296673,
"loss": 0.9101874542236328,
"step": 2300
},
{
"epoch": 0.4482166698455083,
"grad_norm": 1.0119520425796509,
"learning_rate": 0.00012521100561195233,
"loss": 0.8976884460449219,
"step": 2350
},
{
"epoch": 0.45775319473583825,
"grad_norm": 0.9749938249588013,
"learning_rate": 0.0001221466474826543,
"loss": 0.9223648834228516,
"step": 2400
},
{
"epoch": 0.4672897196261682,
"grad_norm": 1.071219563484192,
"learning_rate": 0.00011906025739741956,
"loss": 0.9097858428955078,
"step": 2450
},
{
"epoch": 0.4768262445164982,
"grad_norm": 1.0023361444473267,
"learning_rate": 0.00011595490576274704,
"loss": 0.9021361541748046,
"step": 2500
},
{
"epoch": 0.4768262445164982,
"eval_loss": 0.9404354095458984,
"eval_runtime": 681.2848,
"eval_samples_per_second": 12.528,
"eval_steps_per_second": 0.784,
"step": 2500
},
{
"epoch": 0.48636276940682815,
"grad_norm": 0.7936119437217712,
"learning_rate": 0.00011283368184848842,
"loss": 0.9291069030761718,
"step": 2550
},
{
"epoch": 0.4958992942971581,
"grad_norm": 0.8363732695579529,
"learning_rate": 0.00010969969071457669,
"loss": 0.8912797546386719,
"step": 2600
},
{
"epoch": 0.505435819187488,
"grad_norm": 0.9047644734382629,
"learning_rate": 0.0001065560501220464,
"loss": 0.8831972503662109,
"step": 2650
},
{
"epoch": 0.514972344077818,
"grad_norm": 0.8407217264175415,
"learning_rate": 0.00010340588743141879,
"loss": 0.9096057891845704,
"step": 2700
},
{
"epoch": 0.524508868968148,
"grad_norm": 0.9929122924804688,
"learning_rate": 0.00010025233649153707,
"loss": 0.9299073028564453,
"step": 2750
},
{
"epoch": 0.5340453938584779,
"grad_norm": 1.0364736318588257,
"learning_rate": 9.70985345219468e-05,
"loss": 0.9169498443603515,
"step": 2800
},
{
"epoch": 0.5435819187488079,
"grad_norm": 0.9246217012405396,
"learning_rate": 9.394761899192327e-05,
"loss": 0.8965087127685547,
"step": 2850
},
{
"epoch": 0.5531184436391379,
"grad_norm": 0.9324344992637634,
"learning_rate": 9.08027244992503e-05,
"loss": 0.9102237701416016,
"step": 2900
},
{
"epoch": 0.5626549685294678,
"grad_norm": 0.998298704624176,
"learning_rate": 8.766697965185565e-05,
"loss": 0.9192097473144532,
"step": 2950
},
{
"epoch": 0.5721914934197978,
"grad_norm": 0.9278152585029602,
"learning_rate": 8.45435039554054e-05,
"loss": 0.9202249908447265,
"step": 3000
},
{
"epoch": 0.5721914934197978,
"eval_loss": 0.9327928423881531,
"eval_runtime": 680.7868,
"eval_samples_per_second": 12.537,
"eval_steps_per_second": 0.784,
"step": 3000
},
{
"epoch": 0.5817280183101278,
"grad_norm": 1.0032947063446045,
"learning_rate": 8.14354047099533e-05,
"loss": 0.9145193481445313,
"step": 3050
},
{
"epoch": 0.5912645432004577,
"grad_norm": 0.834137499332428,
"learning_rate": 7.834577391873266e-05,
"loss": 0.9149667358398438,
"step": 3100
},
{
"epoch": 0.6008010680907877,
"grad_norm": 0.9321780204772949,
"learning_rate": 7.527768521216568e-05,
"loss": 0.892784652709961,
"step": 3150
},
{
"epoch": 0.6103375929811177,
"grad_norm": 1.0074846744537354,
"learning_rate": 7.223419079015062e-05,
"loss": 0.9072589111328125,
"step": 3200
},
{
"epoch": 0.6198741178714476,
"grad_norm": 1.2590699195861816,
"learning_rate": 6.921831838566842e-05,
"loss": 0.9023927307128906,
"step": 3250
},
{
"epoch": 0.6294106427617776,
"grad_norm": 0.7596737742424011,
"learning_rate": 6.623306825272937e-05,
"loss": 0.896091079711914,
"step": 3300
},
{
"epoch": 0.6389471676521076,
"grad_norm": 0.9938393831253052,
"learning_rate": 6.328141018165693e-05,
"loss": 0.9024559783935547,
"step": 3350
},
{
"epoch": 0.6484836925424375,
"grad_norm": 0.8195408582687378,
"learning_rate": 6.036628054467682e-05,
"loss": 0.8873242950439453,
"step": 3400
},
{
"epoch": 0.6580202174327675,
"grad_norm": 0.9234633445739746,
"learning_rate": 5.7490579374751686e-05,
"loss": 0.8969253540039063,
"step": 3450
},
{
"epoch": 0.6675567423230975,
"grad_norm": 0.9521735906600952,
"learning_rate": 5.4657167480566594e-05,
"loss": 0.8936698913574219,
"step": 3500
},
{
"epoch": 0.6675567423230975,
"eval_loss": 0.9270365238189697,
"eval_runtime": 680.6766,
"eval_samples_per_second": 12.539,
"eval_steps_per_second": 0.785,
"step": 3500
},
{
"epoch": 0.6770932672134274,
"grad_norm": 0.9278208613395691,
"learning_rate": 5.1868863600535646e-05,
"loss": 0.8819551849365235,
"step": 3550
},
{
"epoch": 0.6866297921037574,
"grad_norm": 0.8966768383979797,
"learning_rate": 4.912844159866112e-05,
"loss": 0.905844955444336,
"step": 3600
},
{
"epoch": 0.6961663169940874,
"grad_norm": 0.886394202709198,
"learning_rate": 4.6438627705034535e-05,
"loss": 0.8963019561767578,
"step": 3650
},
{
"epoch": 0.7057028418844173,
"grad_norm": 1.0538233518600464,
"learning_rate": 4.380209780372496e-05,
"loss": 0.8904299926757813,
"step": 3700
},
{
"epoch": 0.7152393667747473,
"grad_norm": 1.0238487720489502,
"learning_rate": 4.12214747707527e-05,
"loss": 0.9049517822265625,
"step": 3750
},
{
"epoch": 0.7247758916650773,
"grad_norm": 0.913487434387207,
"learning_rate": 3.869932586479628e-05,
"loss": 0.8899057006835938,
"step": 3800
},
{
"epoch": 0.7343124165554072,
"grad_norm": 0.8777801990509033,
"learning_rate": 3.623816017322917e-05,
"loss": 0.8895880889892578,
"step": 3850
},
{
"epoch": 0.7438489414457372,
"grad_norm": 1.0374138355255127,
"learning_rate": 3.3840426116026044e-05,
"loss": 0.9089337921142578,
"step": 3900
},
{
"epoch": 0.7533854663360672,
"grad_norm": 1.0942944288253784,
"learning_rate": 3.150850901002268e-05,
"loss": 0.8874909210205079,
"step": 3950
},
{
"epoch": 0.7629219912263971,
"grad_norm": 0.8818588256835938,
"learning_rate": 2.9244728695951995e-05,
"loss": 0.8881364440917969,
"step": 4000
},
{
"epoch": 0.7629219912263971,
"eval_loss": 0.921961784362793,
"eval_runtime": 680.8471,
"eval_samples_per_second": 12.536,
"eval_steps_per_second": 0.784,
"step": 4000
},
{
"epoch": 0.7724585161167271,
"grad_norm": 0.8658433556556702,
"learning_rate": 2.7051337230617125e-05,
"loss": 0.887118148803711,
"step": 4050
},
{
"epoch": 0.7819950410070571,
"grad_norm": 1.0360819101333618,
"learning_rate": 2.4930516646497448e-05,
"loss": 0.9029306030273437,
"step": 4100
},
{
"epoch": 0.791531565897387,
"grad_norm": 0.9453035593032837,
"learning_rate": 2.2884376781016258e-05,
"loss": 0.874930419921875,
"step": 4150
},
{
"epoch": 0.8010680907877169,
"grad_norm": 0.9671445488929749,
"learning_rate": 2.0914953177629548e-05,
"loss": 0.8965608978271484,
"step": 4200
},
{
"epoch": 0.8106046156780469,
"grad_norm": 0.9346293210983276,
"learning_rate": 1.902420506082424e-05,
"loss": 0.8860881042480468,
"step": 4250
},
{
"epoch": 0.8201411405683768,
"grad_norm": 1.0210144519805908,
"learning_rate": 1.7214013387039884e-05,
"loss": 0.9090773773193359,
"step": 4300
},
{
"epoch": 0.8296776654587068,
"grad_norm": 0.7916406393051147,
"learning_rate": 1.54861789734532e-05,
"loss": 0.9077362060546875,
"step": 4350
},
{
"epoch": 0.8392141903490368,
"grad_norm": 0.849738597869873,
"learning_rate": 1.3842420706486903e-05,
"loss": 0.8971955871582031,
"step": 4400
},
{
"epoch": 0.8487507152393667,
"grad_norm": 0.8733137845993042,
"learning_rate": 1.2284373831824847e-05,
"loss": 0.8827657318115234,
"step": 4450
},
{
"epoch": 0.8582872401296967,
"grad_norm": 0.9427047967910767,
"learning_rate": 1.0813588327634961e-05,
"loss": 0.8771236419677735,
"step": 4500
},
{
"epoch": 0.8582872401296967,
"eval_loss": 0.9178871512413025,
"eval_runtime": 680.8349,
"eval_samples_per_second": 12.536,
"eval_steps_per_second": 0.784,
"step": 4500
},
{
"epoch": 0.8678237650200267,
"grad_norm": 0.8574934005737305,
"learning_rate": 9.431527362617832e-06,
"loss": 0.8978910827636719,
"step": 4550
},
{
"epoch": 0.8773602899103566,
"grad_norm": 0.986375629901886,
"learning_rate": 8.139565840415553e-06,
"loss": 0.8829045867919922,
"step": 4600
},
{
"epoch": 0.8868968148006866,
"grad_norm": 1.026666283607483,
"learning_rate": 6.938989031828158e-06,
"loss": 0.8786582183837891,
"step": 4650
},
{
"epoch": 0.8964333396910166,
"grad_norm": 0.9898380041122437,
"learning_rate": 5.8309912961990506e-06,
"loss": 0.8815351867675781,
"step": 4700
},
{
"epoch": 0.9059698645813465,
"grad_norm": 0.8167511224746704,
"learning_rate": 4.8166748932408355e-06,
"loss": 0.8737747192382812,
"step": 4750
},
{
"epoch": 0.9155063894716765,
"grad_norm": 0.9592494368553162,
"learning_rate": 3.8970488864839334e-06,
"loss": 0.886360092163086,
"step": 4800
},
{
"epoch": 0.9250429143620065,
"grad_norm": 1.1282484531402588,
"learning_rate": 3.0730281394387382e-06,
"loss": 0.9033355712890625,
"step": 4850
},
{
"epoch": 0.9345794392523364,
"grad_norm": 1.0762990713119507,
"learning_rate": 2.345432405469894e-06,
"loss": 0.9018070983886719,
"step": 4900
},
{
"epoch": 0.9441159641426664,
"grad_norm": 1.0391168594360352,
"learning_rate": 1.7149855122882697e-06,
"loss": 0.8773787689208984,
"step": 4950
},
{
"epoch": 0.9536524890329964,
"grad_norm": 1.160536527633667,
"learning_rate": 1.1823146418717068e-06,
"loss": 0.8903836822509765,
"step": 5000
},
{
"epoch": 0.9536524890329964,
"eval_loss": 0.9169394373893738,
"eval_runtime": 681.071,
"eval_samples_per_second": 12.532,
"eval_steps_per_second": 0.784,
"step": 5000
},
{
"epoch": 0.9631890139233263,
"grad_norm": 0.823947548866272,
"learning_rate": 7.479497065310925e-07,
"loss": 0.90203857421875,
"step": 5050
},
{
"epoch": 0.9727255388136563,
"grad_norm": 0.7839481830596924,
"learning_rate": 4.123228217422948e-07,
"loss": 0.8997705078125,
"step": 5100
},
{
"epoch": 0.9822620637039863,
"grad_norm": 0.9920501708984375,
"learning_rate": 1.7576787626851777e-07,
"loss": 0.8869709777832031,
"step": 5150
},
{
"epoch": 0.9917985885943162,
"grad_norm": 0.9276648759841919,
"learning_rate": 3.8520200000624615e-08,
"loss": 0.8911021423339843,
"step": 5200
},
{
"epoch": 1.0,
"eval_loss": 0.9169082045555115,
"eval_runtime": 681.3749,
"eval_samples_per_second": 12.526,
"eval_steps_per_second": 0.784,
"step": 5243
}
],
"logging_steps": 50,
"max_steps": 5243,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.345861112836915e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}