llambo323 / trainer_state.json
Uri-ka's picture
Upload folder using huggingface_hub
103c851 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 300,
"global_step": 21160,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004725897920604915,
"grad_norm": 2.6531940365245514,
"learning_rate": 0.0001999889787603642,
"loss": 4.0021,
"step": 100
},
{
"epoch": 0.00945179584120983,
"grad_norm": 1.8468496440092184,
"learning_rate": 0.00019995591747081122,
"loss": 2.3469,
"step": 200
},
{
"epoch": 0.014177693761814745,
"grad_norm": 2.020813349289251,
"learning_rate": 0.00019990082341886893,
"loss": 2.0452,
"step": 300
},
{
"epoch": 0.014177693761814745,
"eval_loss": 1.8925005197525024,
"eval_runtime": 5.481,
"eval_samples_per_second": 75.717,
"eval_steps_per_second": 3.831,
"step": 300
},
{
"epoch": 0.01890359168241966,
"grad_norm": 1.2563921162910447,
"learning_rate": 0.00019982370874863236,
"loss": 1.8059,
"step": 400
},
{
"epoch": 0.023629489603024575,
"grad_norm": 0.8312915105679309,
"learning_rate": 0.00019972459045808672,
"loss": 1.6709,
"step": 500
},
{
"epoch": 0.02835538752362949,
"grad_norm": 0.8129560611164286,
"learning_rate": 0.00019960349039536062,
"loss": 1.5706,
"step": 600
},
{
"epoch": 0.02835538752362949,
"eval_loss": 1.5036791563034058,
"eval_runtime": 5.202,
"eval_samples_per_second": 79.777,
"eval_steps_per_second": 4.037,
"step": 600
},
{
"epoch": 0.0330812854442344,
"grad_norm": 0.8612422660201489,
"learning_rate": 0.00019946043525391027,
"loss": 1.4998,
"step": 700
},
{
"epoch": 0.03780718336483932,
"grad_norm": 0.695233402859304,
"learning_rate": 0.00019929545656663562,
"loss": 1.4536,
"step": 800
},
{
"epoch": 0.04253308128544423,
"grad_norm": 0.5635887483204054,
"learning_rate": 0.0001991085906989296,
"loss": 1.4189,
"step": 900
},
{
"epoch": 0.04253308128544423,
"eval_loss": 1.3953526020050049,
"eval_runtime": 5.1848,
"eval_samples_per_second": 80.042,
"eval_steps_per_second": 4.05,
"step": 900
},
{
"epoch": 0.04725897920604915,
"grad_norm": 0.6884010711536315,
"learning_rate": 0.00019889987884066237,
"loss": 1.409,
"step": 1000
},
{
"epoch": 0.05198487712665406,
"grad_norm": 0.5763429947147058,
"learning_rate": 0.00019866936699710198,
"loss": 1.382,
"step": 1100
},
{
"epoch": 0.05671077504725898,
"grad_norm": 0.6029165262178462,
"learning_rate": 0.00019841710597877382,
"loss": 1.3618,
"step": 1200
},
{
"epoch": 0.05671077504725898,
"eval_loss": 1.2691556215286255,
"eval_runtime": 5.1873,
"eval_samples_per_second": 80.002,
"eval_steps_per_second": 4.048,
"step": 1200
},
{
"epoch": 0.06143667296786389,
"grad_norm": 0.6708186101812629,
"learning_rate": 0.00019814315139026053,
"loss": 1.3234,
"step": 1300
},
{
"epoch": 0.0661625708884688,
"grad_norm": 0.5192883759035518,
"learning_rate": 0.00019784756361794555,
"loss": 1.3112,
"step": 1400
},
{
"epoch": 0.07088846880907372,
"grad_norm": 0.4576161281424316,
"learning_rate": 0.00019753040781670224,
"loss": 1.2915,
"step": 1500
},
{
"epoch": 0.07088846880907372,
"eval_loss": 1.1815497875213623,
"eval_runtime": 5.1859,
"eval_samples_per_second": 80.025,
"eval_steps_per_second": 4.049,
"step": 1500
},
{
"epoch": 0.07561436672967864,
"grad_norm": 0.5510812189896114,
"learning_rate": 0.00019719175389553242,
"loss": 1.2739,
"step": 1600
},
{
"epoch": 0.08034026465028356,
"grad_norm": 0.5899918319123949,
"learning_rate": 0.00019683167650215642,
"loss": 1.2617,
"step": 1700
},
{
"epoch": 0.08506616257088846,
"grad_norm": 0.42939528650931935,
"learning_rate": 0.00019645025500655906,
"loss": 1.2607,
"step": 1800
},
{
"epoch": 0.08506616257088846,
"eval_loss": 1.10903000831604,
"eval_runtime": 5.2115,
"eval_samples_per_second": 79.631,
"eval_steps_per_second": 4.03,
"step": 1800
},
{
"epoch": 0.08979206049149338,
"grad_norm": 0.508789955425928,
"learning_rate": 0.00019604757348349447,
"loss": 1.2276,
"step": 1900
},
{
"epoch": 0.0945179584120983,
"grad_norm": 0.4257274370286851,
"learning_rate": 0.00019562372069395384,
"loss": 1.2189,
"step": 2000
},
{
"epoch": 0.09924385633270322,
"grad_norm": 0.6417008198357919,
"learning_rate": 0.0001951787900656005,
"loss": 1.2192,
"step": 2100
},
{
"epoch": 0.09924385633270322,
"eval_loss": 1.0954564809799194,
"eval_runtime": 5.2097,
"eval_samples_per_second": 79.66,
"eval_steps_per_second": 4.031,
"step": 2100
},
{
"epoch": 0.10396975425330812,
"grad_norm": 0.3759012332224868,
"learning_rate": 0.00019471287967217594,
"loss": 1.2272,
"step": 2200
},
{
"epoch": 0.10869565217391304,
"grad_norm": 0.5260238110195213,
"learning_rate": 0.00019422609221188207,
"loss": 1.2101,
"step": 2300
},
{
"epoch": 0.11342155009451796,
"grad_norm": 0.4348649359952213,
"learning_rate": 0.0001937185349847439,
"loss": 1.1893,
"step": 2400
},
{
"epoch": 0.11342155009451796,
"eval_loss": 1.0621371269226074,
"eval_runtime": 5.2508,
"eval_samples_per_second": 79.036,
"eval_steps_per_second": 3.999,
"step": 2400
},
{
"epoch": 0.11814744801512288,
"grad_norm": 0.38742811944800437,
"learning_rate": 0.00019319031986895807,
"loss": 1.1921,
"step": 2500
},
{
"epoch": 0.12287334593572778,
"grad_norm": 0.5195602808144861,
"learning_rate": 0.00019264156329623197,
"loss": 1.1806,
"step": 2600
},
{
"epoch": 0.1275992438563327,
"grad_norm": 0.45641434721553403,
"learning_rate": 0.00019207238622611936,
"loss": 1.1902,
"step": 2700
},
{
"epoch": 0.1275992438563327,
"eval_loss": 1.0587011575698853,
"eval_runtime": 7.8145,
"eval_samples_per_second": 53.107,
"eval_steps_per_second": 2.687,
"step": 2700
},
{
"epoch": 0.1323251417769376,
"grad_norm": 0.4428344921300073,
"learning_rate": 0.00019148291411935796,
"loss": 1.1764,
"step": 2800
},
{
"epoch": 0.13705103969754254,
"grad_norm": 0.7533101325947344,
"learning_rate": 0.00019087327691021472,
"loss": 1.169,
"step": 2900
},
{
"epoch": 0.14177693761814744,
"grad_norm": 0.39224873221985657,
"learning_rate": 0.00019024360897784508,
"loss": 1.1692,
"step": 3000
},
{
"epoch": 0.14177693761814744,
"eval_loss": 1.027273416519165,
"eval_runtime": 5.2079,
"eval_samples_per_second": 79.686,
"eval_steps_per_second": 4.032,
"step": 3000
},
{
"epoch": 0.14650283553875237,
"grad_norm": 0.4338112970906164,
"learning_rate": 0.00018959404911667252,
"loss": 1.1689,
"step": 3100
},
{
"epoch": 0.15122873345935728,
"grad_norm": 0.39383682502704437,
"learning_rate": 0.0001889247405057948,
"loss": 1.1631,
"step": 3200
},
{
"epoch": 0.15595463137996218,
"grad_norm": 0.38979347330040026,
"learning_rate": 0.0001882358306774237,
"loss": 1.1625,
"step": 3300
},
{
"epoch": 0.15595463137996218,
"eval_loss": 1.0129389762878418,
"eval_runtime": 5.2231,
"eval_samples_per_second": 79.454,
"eval_steps_per_second": 4.021,
"step": 3300
},
{
"epoch": 0.16068052930056712,
"grad_norm": 0.35777900887774455,
"learning_rate": 0.00018752747148436543,
"loss": 1.1589,
"step": 3400
},
{
"epoch": 0.16540642722117202,
"grad_norm": 0.45954014358662065,
"learning_rate": 0.00018679981906654823,
"loss": 1.1411,
"step": 3500
},
{
"epoch": 0.17013232514177692,
"grad_norm": 0.34494886514352496,
"learning_rate": 0.00018605303381660543,
"loss": 1.1401,
"step": 3600
},
{
"epoch": 0.17013232514177692,
"eval_loss": 1.0022536516189575,
"eval_runtime": 5.251,
"eval_samples_per_second": 79.032,
"eval_steps_per_second": 3.999,
"step": 3600
},
{
"epoch": 0.17485822306238186,
"grad_norm": 0.36070737242493944,
"learning_rate": 0.00018528728034452102,
"loss": 1.1328,
"step": 3700
},
{
"epoch": 0.17958412098298676,
"grad_norm": 0.38075438227938496,
"learning_rate": 0.00018450272744134532,
"loss": 1.1322,
"step": 3800
},
{
"epoch": 0.1843100189035917,
"grad_norm": 0.3045755124209396,
"learning_rate": 0.0001836995480419894,
"loss": 1.1225,
"step": 3900
},
{
"epoch": 0.1843100189035917,
"eval_loss": 0.9917121529579163,
"eval_runtime": 5.2189,
"eval_samples_per_second": 79.519,
"eval_steps_per_second": 4.024,
"step": 3900
},
{
"epoch": 0.1890359168241966,
"grad_norm": 0.3355857365601763,
"learning_rate": 0.00018287791918710587,
"loss": 1.1436,
"step": 4000
},
{
"epoch": 0.1937618147448015,
"grad_norm": 0.3291517054836814,
"learning_rate": 0.00018203802198406478,
"loss": 1.1356,
"step": 4100
},
{
"epoch": 0.19848771266540643,
"grad_norm": 0.34097720017987254,
"learning_rate": 0.00018118004156703296,
"loss": 1.1178,
"step": 4200
},
{
"epoch": 0.19848771266540643,
"eval_loss": 0.975567102432251,
"eval_runtime": 5.26,
"eval_samples_per_second": 78.897,
"eval_steps_per_second": 3.992,
"step": 4200
},
{
"epoch": 0.20321361058601134,
"grad_norm": 0.3931138858913563,
"learning_rate": 0.00018030416705616602,
"loss": 1.1106,
"step": 4300
},
{
"epoch": 0.20793950850661624,
"grad_norm": 0.31930492232829666,
"learning_rate": 0.00017941059151592147,
"loss": 1.1148,
"step": 4400
},
{
"epoch": 0.21266540642722118,
"grad_norm": 0.35451850597759177,
"learning_rate": 0.00017849951191250255,
"loss": 1.1119,
"step": 4500
},
{
"epoch": 0.21266540642722118,
"eval_loss": 1.0053483247756958,
"eval_runtime": 5.2606,
"eval_samples_per_second": 78.888,
"eval_steps_per_second": 3.992,
"step": 4500
},
{
"epoch": 0.21739130434782608,
"grad_norm": 1.729695480261338,
"learning_rate": 0.000177571129070442,
"loss": 2.8502,
"step": 4600
},
{
"epoch": 0.222117202268431,
"grad_norm": 0.3889359878955159,
"learning_rate": 0.0001766256476283353,
"loss": 1.432,
"step": 4700
},
{
"epoch": 0.22684310018903592,
"grad_norm": 0.33225367698457886,
"learning_rate": 0.00017566327599373338,
"loss": 1.1015,
"step": 4800
},
{
"epoch": 0.22684310018903592,
"eval_loss": 0.9650746583938599,
"eval_runtime": 5.2328,
"eval_samples_per_second": 79.307,
"eval_steps_per_second": 4.013,
"step": 4800
},
{
"epoch": 0.23156899810964082,
"grad_norm": 0.30130743666113563,
"learning_rate": 0.0001746842262972043,
"loss": 1.0999,
"step": 4900
},
{
"epoch": 0.23629489603024575,
"grad_norm": 0.30899211203535526,
"learning_rate": 0.00017368871434557447,
"loss": 1.103,
"step": 5000
},
{
"epoch": 0.24102079395085066,
"grad_norm": 0.29216937102712714,
"learning_rate": 0.00017267695957435945,
"loss": 1.1142,
"step": 5100
},
{
"epoch": 0.24102079395085066,
"eval_loss": 0.9577646851539612,
"eval_runtime": 5.2388,
"eval_samples_per_second": 79.216,
"eval_steps_per_second": 4.009,
"step": 5100
},
{
"epoch": 0.24574669187145556,
"grad_norm": 0.3238385499792752,
"learning_rate": 0.00017164918499939504,
"loss": 1.0945,
"step": 5200
},
{
"epoch": 0.2504725897920605,
"grad_norm": 0.3001667683763501,
"learning_rate": 0.00017060561716767883,
"loss": 1.0998,
"step": 5300
},
{
"epoch": 0.2551984877126654,
"grad_norm": 0.3172907389612034,
"learning_rate": 0.00016954648610743384,
"loss": 1.0985,
"step": 5400
},
{
"epoch": 0.2551984877126654,
"eval_loss": 0.9488331079483032,
"eval_runtime": 5.2053,
"eval_samples_per_second": 79.726,
"eval_steps_per_second": 4.034,
"step": 5400
},
{
"epoch": 0.2599243856332703,
"grad_norm": 0.3126246763797197,
"learning_rate": 0.00016847202527740443,
"loss": 1.0862,
"step": 5500
},
{
"epoch": 0.2646502835538752,
"grad_norm": 0.3656784929029524,
"learning_rate": 0.00016738247151539643,
"loss": 1.1074,
"step": 5600
},
{
"epoch": 0.26937618147448017,
"grad_norm": 0.28837893106504947,
"learning_rate": 0.0001662780649860719,
"loss": 1.0832,
"step": 5700
},
{
"epoch": 0.26937618147448017,
"eval_loss": 0.9372844099998474,
"eval_runtime": 5.2075,
"eval_samples_per_second": 79.693,
"eval_steps_per_second": 4.033,
"step": 5700
},
{
"epoch": 0.2741020793950851,
"grad_norm": 0.30748002033313665,
"learning_rate": 0.00016515904912801118,
"loss": 1.0999,
"step": 5800
},
{
"epoch": 0.27882797731569,
"grad_norm": 0.2874899164304016,
"learning_rate": 0.00016402567060005283,
"loss": 1.0828,
"step": 5900
},
{
"epoch": 0.2835538752362949,
"grad_norm": 0.33110349066422323,
"learning_rate": 0.00016287817922692395,
"loss": 1.0779,
"step": 6000
},
{
"epoch": 0.2835538752362949,
"eval_loss": 0.932012140750885,
"eval_runtime": 5.2381,
"eval_samples_per_second": 79.227,
"eval_steps_per_second": 4.009,
"step": 6000
},
{
"epoch": 0.2882797731568998,
"grad_norm": 0.294275463342433,
"learning_rate": 0.00016171682794417257,
"loss": 1.0903,
"step": 6100
},
{
"epoch": 0.29300567107750475,
"grad_norm": 0.36996690057802206,
"learning_rate": 0.0001605418727424145,
"loss": 1.0842,
"step": 6200
},
{
"epoch": 0.29773156899810965,
"grad_norm": 0.2724699437979514,
"learning_rate": 0.00015935357261090652,
"loss": 1.079,
"step": 6300
},
{
"epoch": 0.29773156899810965,
"eval_loss": 0.9178703427314758,
"eval_runtime": 5.2652,
"eval_samples_per_second": 78.819,
"eval_steps_per_second": 3.988,
"step": 6300
},
{
"epoch": 0.30245746691871456,
"grad_norm": 0.2891830821110885,
"learning_rate": 0.00015815218948045878,
"loss": 1.0734,
"step": 6400
},
{
"epoch": 0.30718336483931946,
"grad_norm": 0.24982988659992944,
"learning_rate": 0.00015693798816569885,
"loss": 1.0649,
"step": 6500
},
{
"epoch": 0.31190926275992437,
"grad_norm": 0.2568972740115008,
"learning_rate": 0.0001557112363066998,
"loss": 1.0573,
"step": 6600
},
{
"epoch": 0.31190926275992437,
"eval_loss": 0.9198368191719055,
"eval_runtime": 5.2276,
"eval_samples_per_second": 79.386,
"eval_steps_per_second": 4.017,
"step": 6600
},
{
"epoch": 0.3166351606805293,
"grad_norm": 0.29091825253619064,
"learning_rate": 0.00015447220430998582,
"loss": 1.085,
"step": 6700
},
{
"epoch": 0.32136105860113423,
"grad_norm": 0.29225096960229624,
"learning_rate": 0.00015322116528892807,
"loss": 1.0609,
"step": 6800
},
{
"epoch": 0.32608695652173914,
"grad_norm": 0.311594279061454,
"learning_rate": 0.00015195839500354335,
"loss": 1.0594,
"step": 6900
},
{
"epoch": 0.32608695652173914,
"eval_loss": 0.9203236699104309,
"eval_runtime": 5.2426,
"eval_samples_per_second": 79.16,
"eval_steps_per_second": 4.006,
"step": 6900
},
{
"epoch": 0.33081285444234404,
"grad_norm": 0.5176311453672888,
"learning_rate": 0.00015068417179971014,
"loss": 1.0729,
"step": 7000
},
{
"epoch": 0.33553875236294894,
"grad_norm": 0.28252873243710874,
"learning_rate": 0.00014939877654781395,
"loss": 1.081,
"step": 7100
},
{
"epoch": 0.34026465028355385,
"grad_norm": 0.31787595640980193,
"learning_rate": 0.00014810249258083677,
"loss": 1.0717,
"step": 7200
},
{
"epoch": 0.34026465028355385,
"eval_loss": 0.9039013981819153,
"eval_runtime": 5.2593,
"eval_samples_per_second": 78.908,
"eval_steps_per_second": 3.993,
"step": 7200
},
{
"epoch": 0.3449905482041588,
"grad_norm": 0.2748908881405949,
"learning_rate": 0.00014679560563190332,
"loss": 1.0609,
"step": 7300
},
{
"epoch": 0.3497164461247637,
"grad_norm": 0.3125203929787377,
"learning_rate": 0.00014547840377129842,
"loss": 1.0552,
"step": 7400
},
{
"epoch": 0.3544423440453686,
"grad_norm": 0.4133883061855867,
"learning_rate": 0.00014415117734296916,
"loss": 1.061,
"step": 7500
},
{
"epoch": 0.3544423440453686,
"eval_loss": 0.8995205163955688,
"eval_runtime": 5.2251,
"eval_samples_per_second": 79.424,
"eval_steps_per_second": 4.019,
"step": 7500
},
{
"epoch": 0.3591682419659735,
"grad_norm": 0.2645834917560888,
"learning_rate": 0.0001428142189005259,
"loss": 1.0523,
"step": 7600
},
{
"epoch": 0.3638941398865784,
"grad_norm": 0.43674521926929427,
"learning_rate": 0.0001414678231427562,
"loss": 1.051,
"step": 7700
},
{
"epoch": 0.3686200378071834,
"grad_norm": 0.2975803374511188,
"learning_rate": 0.00014011228684866582,
"loss": 1.0516,
"step": 7800
},
{
"epoch": 0.3686200378071834,
"eval_loss": 0.8893873691558838,
"eval_runtime": 5.2209,
"eval_samples_per_second": 79.488,
"eval_steps_per_second": 4.022,
"step": 7800
},
{
"epoch": 0.3733459357277883,
"grad_norm": 0.25801366812079984,
"learning_rate": 0.00013874790881206146,
"loss": 1.0332,
"step": 7900
},
{
"epoch": 0.3780718336483932,
"grad_norm": 0.27388985128284893,
"learning_rate": 0.000137374989775689,
"loss": 1.0409,
"step": 8000
},
{
"epoch": 0.3827977315689981,
"grad_norm": 0.2977365916589529,
"learning_rate": 0.00013599383236494248,
"loss": 1.0455,
"step": 8100
},
{
"epoch": 0.3827977315689981,
"eval_loss": 0.884984016418457,
"eval_runtime": 5.1923,
"eval_samples_per_second": 79.926,
"eval_steps_per_second": 4.044,
"step": 8100
},
{
"epoch": 0.387523629489603,
"grad_norm": 0.30299461347087553,
"learning_rate": 0.00013460474102115785,
"loss": 1.0338,
"step": 8200
},
{
"epoch": 0.39224952741020797,
"grad_norm": 0.3249509218094752,
"learning_rate": 0.00013320802193450662,
"loss": 1.0535,
"step": 8300
},
{
"epoch": 0.39697542533081287,
"grad_norm": 0.43141764888355383,
"learning_rate": 0.00013180398297650393,
"loss": 1.0481,
"step": 8400
},
{
"epoch": 0.39697542533081287,
"eval_loss": 0.8851591944694519,
"eval_runtime": 5.1825,
"eval_samples_per_second": 80.077,
"eval_steps_per_second": 4.052,
"step": 8400
},
{
"epoch": 0.4017013232514178,
"grad_norm": 0.3383716797214537,
"learning_rate": 0.0001303929336321461,
"loss": 1.0456,
"step": 8500
},
{
"epoch": 0.4064272211720227,
"grad_norm": 0.3313965305574248,
"learning_rate": 0.0001289751849316924,
"loss": 1.0454,
"step": 8600
},
{
"epoch": 0.4111531190926276,
"grad_norm": 0.2873865099421956,
"learning_rate": 0.0001275510493821062,
"loss": 1.0325,
"step": 8700
},
{
"epoch": 0.4111531190926276,
"eval_loss": 0.8757073283195496,
"eval_runtime": 5.2011,
"eval_samples_per_second": 79.791,
"eval_steps_per_second": 4.038,
"step": 8700
},
{
"epoch": 0.4158790170132325,
"grad_norm": 0.25123513069157694,
"learning_rate": 0.0001261208408981708,
"loss": 1.0381,
"step": 8800
},
{
"epoch": 0.42060491493383745,
"grad_norm": 0.3890343278493704,
"learning_rate": 0.00012468487473329485,
"loss": 1.0406,
"step": 8900
},
{
"epoch": 0.42533081285444235,
"grad_norm": 0.2722692609230761,
"learning_rate": 0.0001232434674100226,
"loss": 1.0353,
"step": 9000
},
{
"epoch": 0.42533081285444235,
"eval_loss": 0.8738681077957153,
"eval_runtime": 5.2524,
"eval_samples_per_second": 79.011,
"eval_steps_per_second": 3.998,
"step": 9000
},
{
"epoch": 0.43005671077504726,
"grad_norm": 0.32764958069862743,
"learning_rate": 0.00012179693665026448,
"loss": 1.0282,
"step": 9100
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.3152329877955543,
"learning_rate": 0.0001203456013052634,
"loss": 1.0297,
"step": 9200
},
{
"epoch": 0.43950850661625707,
"grad_norm": 0.2822447390885084,
"learning_rate": 0.0001188897812853119,
"loss": 1.0241,
"step": 9300
},
{
"epoch": 0.43950850661625707,
"eval_loss": 0.8668489456176758,
"eval_runtime": 5.2178,
"eval_samples_per_second": 79.536,
"eval_steps_per_second": 4.025,
"step": 9300
},
{
"epoch": 0.444234404536862,
"grad_norm": 0.2342101057101333,
"learning_rate": 0.00011742979748923611,
"loss": 1.0214,
"step": 9400
},
{
"epoch": 0.44896030245746693,
"grad_norm": 0.2793266628696714,
"learning_rate": 0.00011596597173366168,
"loss": 1.0301,
"step": 9500
},
{
"epoch": 0.45368620037807184,
"grad_norm": 0.24486092989303898,
"learning_rate": 0.00011449862668207734,
"loss": 1.0222,
"step": 9600
},
{
"epoch": 0.45368620037807184,
"eval_loss": 0.8624320030212402,
"eval_runtime": 5.2399,
"eval_samples_per_second": 79.199,
"eval_steps_per_second": 4.008,
"step": 9600
},
{
"epoch": 0.45841209829867674,
"grad_norm": 0.36863921402437216,
"learning_rate": 0.00011302808577371196,
"loss": 1.0166,
"step": 9700
},
{
"epoch": 0.46313799621928164,
"grad_norm": 0.2718322585187314,
"learning_rate": 0.00011155467315224038,
"loss": 1.02,
"step": 9800
},
{
"epoch": 0.4678638941398866,
"grad_norm": 0.32504794923171715,
"learning_rate": 0.00011007871359433431,
"loss": 1.0168,
"step": 9900
},
{
"epoch": 0.4678638941398866,
"eval_loss": 0.8551745414733887,
"eval_runtime": 5.2268,
"eval_samples_per_second": 79.399,
"eval_steps_per_second": 4.018,
"step": 9900
},
{
"epoch": 0.4725897920604915,
"grad_norm": 0.23334743308841835,
"learning_rate": 0.00010860053243807338,
"loss": 1.0165,
"step": 10000
},
{
"epoch": 0.4773156899810964,
"grad_norm": 0.3063392284541564,
"learning_rate": 0.00010712045551123254,
"loss": 1.0201,
"step": 10100
},
{
"epoch": 0.4820415879017013,
"grad_norm": 0.23591008170417674,
"learning_rate": 0.00010563880905946159,
"loss": 1.0089,
"step": 10200
},
{
"epoch": 0.4820415879017013,
"eval_loss": 0.8523257970809937,
"eval_runtime": 5.236,
"eval_samples_per_second": 79.258,
"eval_steps_per_second": 4.011,
"step": 10200
},
{
"epoch": 0.4867674858223062,
"grad_norm": 0.2458952740027801,
"learning_rate": 0.00010415591967437253,
"loss": 1.0238,
"step": 10300
},
{
"epoch": 0.4914933837429111,
"grad_norm": 0.24927222616875758,
"learning_rate": 0.00010267211422155072,
"loss": 1.0082,
"step": 10400
},
{
"epoch": 0.4962192816635161,
"grad_norm": 0.2984843122066568,
"learning_rate": 0.00010118771976850548,
"loss": 1.0095,
"step": 10500
},
{
"epoch": 0.4962192816635161,
"eval_loss": 0.8471891283988953,
"eval_runtime": 5.2504,
"eval_samples_per_second": 79.041,
"eval_steps_per_second": 4.0,
"step": 10500
},
{
"epoch": 0.500945179584121,
"grad_norm": 0.3068228178596565,
"learning_rate": 9.970306351257647e-05,
"loss": 1.0082,
"step": 10600
},
{
"epoch": 0.505671077504726,
"grad_norm": 0.26307198455267633,
"learning_rate": 9.821847270881115e-05,
"loss": 1.0144,
"step": 10700
},
{
"epoch": 0.5103969754253308,
"grad_norm": 4.7606417583467415,
"learning_rate": 9.673427459782974e-05,
"loss": 1.0153,
"step": 10800
},
{
"epoch": 0.5103969754253308,
"eval_loss": 0.8897992372512817,
"eval_runtime": 5.2165,
"eval_samples_per_second": 79.556,
"eval_steps_per_second": 4.026,
"step": 10800
},
{
"epoch": 0.5151228733459358,
"grad_norm": 0.30927024833433553,
"learning_rate": 9.525079633369313e-05,
"loss": 1.0046,
"step": 10900
},
{
"epoch": 0.5198487712665406,
"grad_norm": 0.26903362222547117,
"learning_rate": 9.376836491179028e-05,
"loss": 1.0069,
"step": 11000
},
{
"epoch": 0.5245746691871456,
"grad_norm": 0.3420715319504347,
"learning_rate": 9.228730709676e-05,
"loss": 1.0033,
"step": 11100
},
{
"epoch": 0.5245746691871456,
"eval_loss": 0.8409531116485596,
"eval_runtime": 5.2724,
"eval_samples_per_second": 78.711,
"eval_steps_per_second": 3.983,
"step": 11100
},
{
"epoch": 0.5293005671077504,
"grad_norm": 0.3191753149107475,
"learning_rate": 9.080794935046421e-05,
"loss": 1.001,
"step": 11200
},
{
"epoch": 0.5340264650283554,
"grad_norm": 0.2622416950887366,
"learning_rate": 8.933061776002749e-05,
"loss": 1.0056,
"step": 11300
},
{
"epoch": 0.5387523629489603,
"grad_norm": 0.3084365561058697,
"learning_rate": 8.785563796595938e-05,
"loss": 0.9929,
"step": 11400
},
{
"epoch": 0.5387523629489603,
"eval_loss": 0.8378592729568481,
"eval_runtime": 5.2613,
"eval_samples_per_second": 78.878,
"eval_steps_per_second": 3.991,
"step": 11400
},
{
"epoch": 0.5434782608695652,
"grad_norm": 0.27673606677077633,
"learning_rate": 8.638333509037536e-05,
"loss": 1.0103,
"step": 11500
},
{
"epoch": 0.5482041587901701,
"grad_norm": 0.2572779074674892,
"learning_rate": 8.49140336653315e-05,
"loss": 1.0045,
"step": 11600
},
{
"epoch": 0.552930056710775,
"grad_norm": 0.29450261640985925,
"learning_rate": 8.34480575612899e-05,
"loss": 0.9926,
"step": 11700
},
{
"epoch": 0.552930056710775,
"eval_loss": 0.8326123356819153,
"eval_runtime": 5.2465,
"eval_samples_per_second": 79.1,
"eval_steps_per_second": 4.003,
"step": 11700
},
{
"epoch": 0.55765595463138,
"grad_norm": 0.2963246115701643,
"learning_rate": 8.198572991572939e-05,
"loss": 0.9941,
"step": 11800
},
{
"epoch": 0.5623818525519849,
"grad_norm": 0.2873036150856026,
"learning_rate": 8.052737306191812e-05,
"loss": 0.997,
"step": 11900
},
{
"epoch": 0.5671077504725898,
"grad_norm": 0.29300952943209607,
"learning_rate": 7.907330845786337e-05,
"loss": 0.9907,
"step": 12000
},
{
"epoch": 0.5671077504725898,
"eval_loss": 0.826417863368988,
"eval_runtime": 5.2391,
"eval_samples_per_second": 79.212,
"eval_steps_per_second": 4.008,
"step": 12000
},
{
"epoch": 0.5718336483931947,
"grad_norm": 0.2839260705447411,
"learning_rate": 7.762385661545401e-05,
"loss": 0.999,
"step": 12100
},
{
"epoch": 0.5765595463137996,
"grad_norm": 0.28962628105155874,
"learning_rate": 7.617933702981198e-05,
"loss": 0.9898,
"step": 12200
},
{
"epoch": 0.5812854442344045,
"grad_norm": 0.34326419629513394,
"learning_rate": 7.474006810886752e-05,
"loss": 0.993,
"step": 12300
},
{
"epoch": 0.5812854442344045,
"eval_loss": 0.8264899253845215,
"eval_runtime": 5.2541,
"eval_samples_per_second": 78.986,
"eval_steps_per_second": 3.997,
"step": 12300
},
{
"epoch": 0.5860113421550095,
"grad_norm": 0.326407125727706,
"learning_rate": 7.330636710317417e-05,
"loss": 0.9859,
"step": 12400
},
{
"epoch": 0.5907372400756143,
"grad_norm": 0.2643534737693366,
"learning_rate": 7.1878550035979e-05,
"loss": 0.9834,
"step": 12500
},
{
"epoch": 0.5954631379962193,
"grad_norm": 0.31549347548377993,
"learning_rate": 7.0456931633563e-05,
"loss": 0.992,
"step": 12600
},
{
"epoch": 0.5954631379962193,
"eval_loss": 0.8210363984107971,
"eval_runtime": 5.2489,
"eval_samples_per_second": 79.063,
"eval_steps_per_second": 4.001,
"step": 12600
},
{
"epoch": 0.6001890359168242,
"grad_norm": 0.35140381090659567,
"learning_rate": 6.90418252558679e-05,
"loss": 0.9905,
"step": 12700
},
{
"epoch": 0.6049149338374291,
"grad_norm": 0.27929423819613103,
"learning_rate": 6.763354282742363e-05,
"loss": 0.9762,
"step": 12800
},
{
"epoch": 0.6096408317580341,
"grad_norm": 0.38668336812116355,
"learning_rate": 6.623239476859256e-05,
"loss": 0.9992,
"step": 12900
},
{
"epoch": 0.6096408317580341,
"eval_loss": 0.8181740641593933,
"eval_runtime": 5.2401,
"eval_samples_per_second": 79.197,
"eval_steps_per_second": 4.008,
"step": 12900
},
{
"epoch": 0.6143667296786389,
"grad_norm": 0.30268233702072767,
"learning_rate": 6.48386899271452e-05,
"loss": 0.9896,
"step": 13000
},
{
"epoch": 0.6190926275992439,
"grad_norm": 0.28617992631244854,
"learning_rate": 6.345273551018227e-05,
"loss": 0.9817,
"step": 13100
},
{
"epoch": 0.6238185255198487,
"grad_norm": 0.2768338973235925,
"learning_rate": 6.207483701641888e-05,
"loss": 0.9762,
"step": 13200
},
{
"epoch": 0.6238185255198487,
"eval_loss": 0.8148543834686279,
"eval_runtime": 5.2467,
"eval_samples_per_second": 79.098,
"eval_steps_per_second": 4.003,
"step": 13200
},
{
"epoch": 0.6285444234404537,
"grad_norm": 0.2781071212339617,
"learning_rate": 6.070529816884483e-05,
"loss": 0.9793,
"step": 13300
},
{
"epoch": 0.6332703213610587,
"grad_norm": 0.3070510003852717,
"learning_rate": 5.934442084777676e-05,
"loss": 0.9702,
"step": 13400
},
{
"epoch": 0.6379962192816635,
"grad_norm": 0.3276450250586141,
"learning_rate": 5.7992505024316125e-05,
"loss": 0.9822,
"step": 13500
},
{
"epoch": 0.6379962192816635,
"eval_loss": 0.8103421330451965,
"eval_runtime": 5.2589,
"eval_samples_per_second": 78.914,
"eval_steps_per_second": 3.993,
"step": 13500
},
{
"epoch": 0.6427221172022685,
"grad_norm": 0.2577870720634291,
"learning_rate": 5.6649848694228026e-05,
"loss": 0.9882,
"step": 13600
},
{
"epoch": 0.6474480151228733,
"grad_norm": 0.24382392994716376,
"learning_rate": 5.531674781225573e-05,
"loss": 0.9799,
"step": 13700
},
{
"epoch": 0.6521739130434783,
"grad_norm": 0.3123278759569431,
"learning_rate": 5.399349622688479e-05,
"loss": 0.9793,
"step": 13800
},
{
"epoch": 0.6521739130434783,
"eval_loss": 0.808434009552002,
"eval_runtime": 5.2459,
"eval_samples_per_second": 79.11,
"eval_steps_per_second": 4.003,
"step": 13800
},
{
"epoch": 0.6568998109640832,
"grad_norm": 0.3128533610620476,
"learning_rate": 5.268038561557166e-05,
"loss": 0.9737,
"step": 13900
},
{
"epoch": 0.6616257088846881,
"grad_norm": 0.36584898333751154,
"learning_rate": 5.137770542045063e-05,
"loss": 0.976,
"step": 14000
},
{
"epoch": 0.666351606805293,
"grad_norm": 0.2481106679231261,
"learning_rate": 5.008574278453368e-05,
"loss": 0.9657,
"step": 14100
},
{
"epoch": 0.666351606805293,
"eval_loss": 0.8048512935638428,
"eval_runtime": 5.262,
"eval_samples_per_second": 78.867,
"eval_steps_per_second": 3.991,
"step": 14100
},
{
"epoch": 0.6710775047258979,
"grad_norm": 0.28339151903023796,
"learning_rate": 4.8804782488417054e-05,
"loss": 0.9685,
"step": 14200
},
{
"epoch": 0.6758034026465028,
"grad_norm": 0.2932163196742264,
"learning_rate": 4.7535106887508486e-05,
"loss": 0.9696,
"step": 14300
},
{
"epoch": 0.6805293005671077,
"grad_norm": 0.28023874700097434,
"learning_rate": 4.6276995849789115e-05,
"loss": 0.9683,
"step": 14400
},
{
"epoch": 0.6805293005671077,
"eval_loss": 0.7999623417854309,
"eval_runtime": 5.2541,
"eval_samples_per_second": 78.987,
"eval_steps_per_second": 3.997,
"step": 14400
},
{
"epoch": 0.6852551984877127,
"grad_norm": 0.27912970915295715,
"learning_rate": 4.503072669412367e-05,
"loss": 0.9702,
"step": 14500
},
{
"epoch": 0.6899810964083176,
"grad_norm": 0.31856080149184574,
"learning_rate": 4.379657412913243e-05,
"loss": 0.9777,
"step": 14600
},
{
"epoch": 0.6947069943289225,
"grad_norm": 0.31680804579667116,
"learning_rate": 4.257481019263872e-05,
"loss": 0.9721,
"step": 14700
},
{
"epoch": 0.6947069943289225,
"eval_loss": 0.7991150617599487,
"eval_runtime": 5.2513,
"eval_samples_per_second": 79.028,
"eval_steps_per_second": 3.999,
"step": 14700
},
{
"epoch": 0.6994328922495274,
"grad_norm": 0.2865103676621986,
"learning_rate": 4.136570419170501e-05,
"loss": 0.9603,
"step": 14800
},
{
"epoch": 0.7041587901701323,
"grad_norm": 0.2739501065198027,
"learning_rate": 4.016952264327091e-05,
"loss": 0.9698,
"step": 14900
},
{
"epoch": 0.7088846880907372,
"grad_norm": 0.27242959783154574,
"learning_rate": 3.8986529215406275e-05,
"loss": 0.9692,
"step": 15000
},
{
"epoch": 0.7088846880907372,
"eval_loss": 0.7943203449249268,
"eval_runtime": 5.2535,
"eval_samples_per_second": 78.996,
"eval_steps_per_second": 3.997,
"step": 15000
},
{
"epoch": 0.7136105860113422,
"grad_norm": 0.2803746380131459,
"learning_rate": 3.7816984669192244e-05,
"loss": 0.9682,
"step": 15100
},
{
"epoch": 0.718336483931947,
"grad_norm": 0.32480320465371537,
"learning_rate": 3.666114680124298e-05,
"loss": 0.9605,
"step": 15200
},
{
"epoch": 0.723062381852552,
"grad_norm": 0.2733118881436892,
"learning_rate": 3.551927038688095e-05,
"loss": 0.9565,
"step": 15300
},
{
"epoch": 0.723062381852552,
"eval_loss": 0.7929270267486572,
"eval_runtime": 5.2621,
"eval_samples_per_second": 78.866,
"eval_steps_per_second": 3.991,
"step": 15300
},
{
"epoch": 0.7277882797731569,
"grad_norm": 0.2906152755954255,
"learning_rate": 3.4391607123978095e-05,
"loss": 0.9595,
"step": 15400
},
{
"epoch": 0.7325141776937618,
"grad_norm": 0.29851876373805336,
"learning_rate": 3.327840557747539e-05,
"loss": 0.9631,
"step": 15500
},
{
"epoch": 0.7372400756143668,
"grad_norm": 0.2878910172272555,
"learning_rate": 3.2179911124592966e-05,
"loss": 0.966,
"step": 15600
},
{
"epoch": 0.7372400756143668,
"eval_loss": 0.7899049520492554,
"eval_runtime": 5.2608,
"eval_samples_per_second": 78.885,
"eval_steps_per_second": 3.992,
"step": 15600
},
{
"epoch": 0.7419659735349716,
"grad_norm": 0.28857592875042104,
"learning_rate": 3.109636590074292e-05,
"loss": 0.9654,
"step": 15700
},
{
"epoch": 0.7466918714555766,
"grad_norm": 0.30328231354583435,
"learning_rate": 3.0028008746156588e-05,
"loss": 0.9625,
"step": 15800
},
{
"epoch": 0.7514177693761814,
"grad_norm": 0.2600916390740946,
"learning_rate": 2.897507515323835e-05,
"loss": 0.9565,
"step": 15900
},
{
"epoch": 0.7514177693761814,
"eval_loss": 0.7877747416496277,
"eval_runtime": 5.2562,
"eval_samples_per_second": 78.954,
"eval_steps_per_second": 3.995,
"step": 15900
},
{
"epoch": 0.7561436672967864,
"grad_norm": 0.29782376770057994,
"learning_rate": 2.7937797214657147e-05,
"loss": 0.9614,
"step": 16000
},
{
"epoch": 0.7608695652173914,
"grad_norm": 0.265940470272052,
"learning_rate": 2.691640357218759e-05,
"loss": 0.9529,
"step": 16100
},
{
"epoch": 0.7655954631379962,
"grad_norm": 0.31542391746331455,
"learning_rate": 2.5911119366311597e-05,
"loss": 0.9596,
"step": 16200
},
{
"epoch": 0.7655954631379962,
"eval_loss": 0.7863346934318542,
"eval_runtime": 5.2772,
"eval_samples_per_second": 78.641,
"eval_steps_per_second": 3.979,
"step": 16200
},
{
"epoch": 0.7703213610586012,
"grad_norm": 0.2551803401373606,
"learning_rate": 2.492216618659188e-05,
"loss": 0.9581,
"step": 16300
},
{
"epoch": 0.775047258979206,
"grad_norm": 0.30687065848754075,
"learning_rate": 2.3949762022828092e-05,
"loss": 0.9483,
"step": 16400
},
{
"epoch": 0.779773156899811,
"grad_norm": 0.24549272522905088,
"learning_rate": 2.2994121217006404e-05,
"loss": 0.954,
"step": 16500
},
{
"epoch": 0.779773156899811,
"eval_loss": 0.7838764786720276,
"eval_runtime": 5.259,
"eval_samples_per_second": 78.913,
"eval_steps_per_second": 3.993,
"step": 16500
},
{
"epoch": 0.7844990548204159,
"grad_norm": 0.3111216806210738,
"learning_rate": 2.2055454416053422e-05,
"loss": 0.9616,
"step": 16600
},
{
"epoch": 0.7892249527410208,
"grad_norm": 0.3111244620482844,
"learning_rate": 2.1133968525404146e-05,
"loss": 0.9631,
"step": 16700
},
{
"epoch": 0.7939508506616257,
"grad_norm": 0.28892056294761154,
"learning_rate": 2.0229866663395026e-05,
"loss": 0.9625,
"step": 16800
},
{
"epoch": 0.7939508506616257,
"eval_loss": 0.7827126383781433,
"eval_runtime": 5.258,
"eval_samples_per_second": 78.927,
"eval_steps_per_second": 3.994,
"step": 16800
},
{
"epoch": 0.7986767485822306,
"grad_norm": 0.32345991572241156,
"learning_rate": 1.934334811649161e-05,
"loss": 0.9556,
"step": 16900
},
{
"epoch": 0.8034026465028355,
"grad_norm": 0.3526105964303374,
"learning_rate": 1.847460829536075e-05,
"loss": 0.9546,
"step": 17000
},
{
"epoch": 0.8081285444234405,
"grad_norm": 0.30985153753926015,
"learning_rate": 1.7623838691797544e-05,
"loss": 0.9612,
"step": 17100
},
{
"epoch": 0.8081285444234405,
"eval_loss": 0.7805792093276978,
"eval_runtime": 5.2626,
"eval_samples_per_second": 78.858,
"eval_steps_per_second": 3.99,
"step": 17100
},
{
"epoch": 0.8128544423440454,
"grad_norm": 0.2840692418516341,
"learning_rate": 1.679122683651546e-05,
"loss": 0.9557,
"step": 17200
},
{
"epoch": 0.8175803402646503,
"grad_norm": 0.3393141443988905,
"learning_rate": 1.5976956257810127e-05,
"loss": 0.9471,
"step": 17300
},
{
"epoch": 0.8223062381852552,
"grad_norm": 0.2537148482900762,
"learning_rate": 1.5181206441105078e-05,
"loss": 0.9405,
"step": 17400
},
{
"epoch": 0.8223062381852552,
"eval_loss": 0.7792423367500305,
"eval_runtime": 5.2508,
"eval_samples_per_second": 79.035,
"eval_steps_per_second": 3.999,
"step": 17400
},
{
"epoch": 0.8270321361058601,
"grad_norm": 0.290376596540058,
"learning_rate": 1.4404152789388647e-05,
"loss": 0.9548,
"step": 17500
},
{
"epoch": 0.831758034026465,
"grad_norm": 0.30405340852350765,
"learning_rate": 1.364596658455105e-05,
"loss": 0.9429,
"step": 17600
},
{
"epoch": 0.8364839319470699,
"grad_norm": 0.3035861524046109,
"learning_rate": 1.2906814949629232e-05,
"loss": 0.9576,
"step": 17700
},
{
"epoch": 0.8364839319470699,
"eval_loss": 0.7785135507583618,
"eval_runtime": 5.269,
"eval_samples_per_second": 78.762,
"eval_steps_per_second": 3.986,
"step": 17700
},
{
"epoch": 0.8412098298676749,
"grad_norm": 0.3852006759427171,
"learning_rate": 1.218686081196917e-05,
"loss": 0.9462,
"step": 17800
},
{
"epoch": 0.8459357277882797,
"grad_norm": 0.34392822048362665,
"learning_rate": 1.1486262867312413e-05,
"loss": 0.9478,
"step": 17900
},
{
"epoch": 0.8506616257088847,
"grad_norm": 0.26878458999553395,
"learning_rate": 1.0805175544815648e-05,
"loss": 0.943,
"step": 18000
},
{
"epoch": 0.8506616257088847,
"eval_loss": 0.7768391370773315,
"eval_runtime": 5.2545,
"eval_samples_per_second": 78.979,
"eval_steps_per_second": 3.997,
"step": 18000
},
{
"epoch": 0.8553875236294896,
"grad_norm": 0.2737274242530303,
"learning_rate": 1.0143748973010825e-05,
"loss": 0.9543,
"step": 18100
},
{
"epoch": 0.8601134215500945,
"grad_norm": 0.35051258094572235,
"learning_rate": 9.502128946712862e-06,
"loss": 0.9519,
"step": 18200
},
{
"epoch": 0.8648393194706995,
"grad_norm": 0.2597588197411309,
"learning_rate": 8.880456894883104e-06,
"loss": 0.949,
"step": 18300
},
{
"epoch": 0.8648393194706995,
"eval_loss": 0.7756012082099915,
"eval_runtime": 5.2553,
"eval_samples_per_second": 78.968,
"eval_steps_per_second": 3.996,
"step": 18300
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.28694454712067513,
"learning_rate": 8.278869849454718e-06,
"loss": 0.9499,
"step": 18400
},
{
"epoch": 0.8742911153119093,
"grad_norm": 0.28699237456135096,
"learning_rate": 7.697500415127434e-06,
"loss": 0.9457,
"step": 18500
},
{
"epoch": 0.8790170132325141,
"grad_norm": 0.24934899142631728,
"learning_rate": 7.136476740138387e-06,
"loss": 0.9409,
"step": 18600
},
{
"epoch": 0.8790170132325141,
"eval_loss": 0.77437824010849,
"eval_runtime": 5.2548,
"eval_samples_per_second": 78.975,
"eval_steps_per_second": 3.996,
"step": 18600
},
{
"epoch": 0.8837429111531191,
"grad_norm": 0.3691485047317885,
"learning_rate": 6.5959224880147715e-06,
"loss": 0.9393,
"step": 18700
},
{
"epoch": 0.888468809073724,
"grad_norm": 0.26090043398323587,
"learning_rate": 6.0759568103156195e-06,
"loss": 0.9493,
"step": 18800
},
{
"epoch": 0.8931947069943289,
"grad_norm": 0.27012221812253706,
"learning_rate": 5.576694320367648e-06,
"loss": 0.9413,
"step": 18900
},
{
"epoch": 0.8931947069943289,
"eval_loss": 0.773705005645752,
"eval_runtime": 5.2531,
"eval_samples_per_second": 79.001,
"eval_steps_per_second": 3.998,
"step": 18900
},
{
"epoch": 0.8979206049149339,
"grad_norm": 0.22647954450095426,
"learning_rate": 5.098245068001661e-06,
"loss": 0.9418,
"step": 19000
},
{
"epoch": 0.9026465028355387,
"grad_norm": 0.26426406932660085,
"learning_rate": 4.64071451529502e-06,
"loss": 0.9539,
"step": 19100
},
{
"epoch": 0.9073724007561437,
"grad_norm": 0.33611022842390764,
"learning_rate": 4.2042035133248895e-06,
"loss": 0.9451,
"step": 19200
},
{
"epoch": 0.9073724007561437,
"eval_loss": 0.773216724395752,
"eval_runtime": 5.2599,
"eval_samples_per_second": 78.899,
"eval_steps_per_second": 3.992,
"step": 19200
},
{
"epoch": 0.9120982986767486,
"grad_norm": 0.28197774639046563,
"learning_rate": 3.7888082799384495e-06,
"loss": 0.9515,
"step": 19300
},
{
"epoch": 0.9168241965973535,
"grad_norm": 0.30675458683133056,
"learning_rate": 3.3946203785439113e-06,
"loss": 0.9523,
"step": 19400
},
{
"epoch": 0.9215500945179584,
"grad_norm": 0.30602215415286255,
"learning_rate": 3.021726697927696e-06,
"loss": 0.9448,
"step": 19500
},
{
"epoch": 0.9215500945179584,
"eval_loss": 0.7725175023078918,
"eval_runtime": 5.2584,
"eval_samples_per_second": 78.921,
"eval_steps_per_second": 3.994,
"step": 19500
},
{
"epoch": 0.9262759924385633,
"grad_norm": 0.2630236723779736,
"learning_rate": 2.6702094331020887e-06,
"loss": 0.9397,
"step": 19600
},
{
"epoch": 0.9310018903591682,
"grad_norm": 0.29540927584526117,
"learning_rate": 2.34014606718731e-06,
"loss": 0.9483,
"step": 19700
},
{
"epoch": 0.9357277882797732,
"grad_norm": 0.27625323870327295,
"learning_rate": 2.0316093543323757e-06,
"loss": 0.9511,
"step": 19800
},
{
"epoch": 0.9357277882797732,
"eval_loss": 0.7723566889762878,
"eval_runtime": 5.269,
"eval_samples_per_second": 78.763,
"eval_steps_per_second": 3.986,
"step": 19800
},
{
"epoch": 0.9404536862003781,
"grad_norm": 0.5607467253913079,
"learning_rate": 1.7446673036782935e-06,
"loss": 0.9498,
"step": 19900
},
{
"epoch": 0.945179584120983,
"grad_norm": 0.29002936924030825,
"learning_rate": 1.479383164367043e-06,
"loss": 0.9451,
"step": 20000
},
{
"epoch": 0.9499054820415879,
"grad_norm": 0.2881707243707424,
"learning_rate": 1.2358154116000942e-06,
"loss": 0.948,
"step": 20100
},
{
"epoch": 0.9499054820415879,
"eval_loss": 0.7720000147819519,
"eval_runtime": 5.2571,
"eval_samples_per_second": 78.941,
"eval_steps_per_second": 3.995,
"step": 20100
},
{
"epoch": 0.9546313799621928,
"grad_norm": 0.2632217737595484,
"learning_rate": 1.0140177337488288e-06,
"loss": 0.9523,
"step": 20200
},
{
"epoch": 0.9593572778827977,
"grad_norm": 0.2690755865675043,
"learning_rate": 8.140390205204407e-07,
"loss": 0.9437,
"step": 20300
},
{
"epoch": 0.9640831758034026,
"grad_norm": 0.22581336971248372,
"learning_rate": 6.359233521813224e-07,
"loss": 0.9462,
"step": 20400
},
{
"epoch": 0.9640831758034026,
"eval_loss": 0.7718393206596375,
"eval_runtime": 5.2675,
"eval_samples_per_second": 78.785,
"eval_steps_per_second": 3.987,
"step": 20400
},
{
"epoch": 0.9688090737240076,
"grad_norm": 0.262694103108468,
"learning_rate": 4.797099898407375e-07,
"loss": 0.9487,
"step": 20500
},
{
"epoch": 0.9735349716446124,
"grad_norm": 0.25669910143562646,
"learning_rate": 3.4543336679673245e-07,
"loss": 0.9475,
"step": 20600
},
{
"epoch": 0.9782608695652174,
"grad_norm": 0.2423276211409861,
"learning_rate": 2.3312308094607382e-07,
"loss": 0.9435,
"step": 20700
},
{
"epoch": 0.9782608695652174,
"eval_loss": 0.7716944217681885,
"eval_runtime": 5.2605,
"eval_samples_per_second": 78.89,
"eval_steps_per_second": 3.992,
"step": 20700
},
{
"epoch": 0.9829867674858223,
"grad_norm": 0.24829427961914935,
"learning_rate": 1.4280388826026782e-07,
"loss": 0.9409,
"step": 20800
},
{
"epoch": 0.9877126654064272,
"grad_norm": 0.2521311024320304,
"learning_rate": 7.449569732862482e-08,
"loss": 0.9471,
"step": 20900
},
{
"epoch": 0.9924385633270322,
"grad_norm": 0.28557979970838715,
"learning_rate": 2.8213564969969963e-08,
"loss": 0.945,
"step": 21000
},
{
"epoch": 0.9924385633270322,
"eval_loss": 0.7717165946960449,
"eval_runtime": 5.2747,
"eval_samples_per_second": 78.678,
"eval_steps_per_second": 3.981,
"step": 21000
},
{
"epoch": 0.997164461247637,
"grad_norm": 0.28331781617919166,
"learning_rate": 3.967692913753318e-09,
"loss": 0.9499,
"step": 21100
}
],
"logging_steps": 100,
"max_steps": 21160,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 4000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 914125500973056.0,
"train_batch_size": 5,
"trial_name": null,
"trial_params": null
}