{ "best_metric": 0.035182420164346695, "best_model_checkpoint": "saves/psy-course/Llama-3.1-8B-Instruct/train/fold3/checkpoint-1300", "epoch": 4.9976479443033215, "eval_steps": 50, "global_step": 3320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015053156458744943, "grad_norm": 4.746142387390137, "learning_rate": 3.0120481927710846e-06, "loss": 1.5819, "step": 10 }, { "epoch": 0.030106312917489886, "grad_norm": 5.74769926071167, "learning_rate": 6.024096385542169e-06, "loss": 1.5237, "step": 20 }, { "epoch": 0.04515946937623483, "grad_norm": 7.372040748596191, "learning_rate": 9.036144578313253e-06, "loss": 1.319, "step": 30 }, { "epoch": 0.06021262583497977, "grad_norm": 2.2783114910125732, "learning_rate": 1.2048192771084338e-05, "loss": 1.0249, "step": 40 }, { "epoch": 0.07526578229372471, "grad_norm": 1.3017759323120117, "learning_rate": 1.5060240963855424e-05, "loss": 0.5742, "step": 50 }, { "epoch": 0.07526578229372471, "eval_loss": 0.4036926031112671, "eval_runtime": 184.4794, "eval_samples_per_second": 6.402, "eval_steps_per_second": 6.402, "step": 50 }, { "epoch": 0.09031893875246966, "grad_norm": 0.937243640422821, "learning_rate": 1.8072289156626505e-05, "loss": 0.3259, "step": 60 }, { "epoch": 0.1053720952112146, "grad_norm": 1.6518957614898682, "learning_rate": 2.1084337349397593e-05, "loss": 0.2599, "step": 70 }, { "epoch": 0.12042525166995954, "grad_norm": 0.7597194314002991, "learning_rate": 2.4096385542168677e-05, "loss": 0.2071, "step": 80 }, { "epoch": 0.1354784081287045, "grad_norm": 1.245460033416748, "learning_rate": 2.7108433734939758e-05, "loss": 0.1062, "step": 90 }, { "epoch": 0.15053156458744943, "grad_norm": 0.9548414349555969, "learning_rate": 3.012048192771085e-05, "loss": 0.0981, "step": 100 }, { "epoch": 0.15053156458744943, "eval_loss": 0.08899969607591629, "eval_runtime": 183.6887, "eval_samples_per_second": 6.429, "eval_steps_per_second": 6.429, "step": 100 }, { "epoch": 0.16558472104619437, "grad_norm": 1.5413150787353516, "learning_rate": 3.313253012048193e-05, "loss": 0.0727, "step": 110 }, { "epoch": 0.18063787750493931, "grad_norm": 0.6956924200057983, "learning_rate": 3.614457831325301e-05, "loss": 0.0929, "step": 120 }, { "epoch": 0.19569103396368426, "grad_norm": 0.8089670538902283, "learning_rate": 3.91566265060241e-05, "loss": 0.0795, "step": 130 }, { "epoch": 0.2107441904224292, "grad_norm": 0.828559160232544, "learning_rate": 4.2168674698795186e-05, "loss": 0.0718, "step": 140 }, { "epoch": 0.22579734688117414, "grad_norm": 1.1623843908309937, "learning_rate": 4.5180722891566266e-05, "loss": 0.0775, "step": 150 }, { "epoch": 0.22579734688117414, "eval_loss": 0.06625137478113174, "eval_runtime": 183.633, "eval_samples_per_second": 6.431, "eval_steps_per_second": 6.431, "step": 150 }, { "epoch": 0.2408505033399191, "grad_norm": 0.929506242275238, "learning_rate": 4.8192771084337354e-05, "loss": 0.0775, "step": 160 }, { "epoch": 0.25590365979866403, "grad_norm": 0.7432942390441895, "learning_rate": 5.120481927710844e-05, "loss": 0.0812, "step": 170 }, { "epoch": 0.270956816257409, "grad_norm": 1.0123765468597412, "learning_rate": 5.4216867469879516e-05, "loss": 0.0757, "step": 180 }, { "epoch": 0.2860099727161539, "grad_norm": 1.2548093795776367, "learning_rate": 5.72289156626506e-05, "loss": 0.0634, "step": 190 }, { "epoch": 0.30106312917489886, "grad_norm": 0.8022633194923401, "learning_rate": 6.02409638554217e-05, "loss": 0.075, "step": 200 }, { "epoch": 0.30106312917489886, "eval_loss": 0.05735846608877182, "eval_runtime": 183.4515, "eval_samples_per_second": 6.438, "eval_steps_per_second": 6.438, "step": 200 }, { "epoch": 0.3161162856336438, "grad_norm": 1.0062459707260132, "learning_rate": 6.325301204819278e-05, "loss": 0.0665, "step": 210 }, { "epoch": 0.33116944209238874, "grad_norm": 0.7612916231155396, "learning_rate": 6.626506024096386e-05, "loss": 0.0623, "step": 220 }, { "epoch": 0.3462225985511337, "grad_norm": 0.7383196949958801, "learning_rate": 6.927710843373494e-05, "loss": 0.0675, "step": 230 }, { "epoch": 0.36127575500987863, "grad_norm": 1.260231375694275, "learning_rate": 7.228915662650602e-05, "loss": 0.0609, "step": 240 }, { "epoch": 0.3763289114686236, "grad_norm": 0.9885674118995667, "learning_rate": 7.530120481927712e-05, "loss": 0.0587, "step": 250 }, { "epoch": 0.3763289114686236, "eval_loss": 0.05333821102976799, "eval_runtime": 183.3222, "eval_samples_per_second": 6.442, "eval_steps_per_second": 6.442, "step": 250 }, { "epoch": 0.3913820679273685, "grad_norm": 0.5603907108306885, "learning_rate": 7.83132530120482e-05, "loss": 0.0397, "step": 260 }, { "epoch": 0.40643522438611346, "grad_norm": 0.407864511013031, "learning_rate": 8.132530120481928e-05, "loss": 0.0537, "step": 270 }, { "epoch": 0.4214883808448584, "grad_norm": 0.4329761862754822, "learning_rate": 8.433734939759037e-05, "loss": 0.0592, "step": 280 }, { "epoch": 0.43654153730360334, "grad_norm": 0.419300377368927, "learning_rate": 8.734939759036145e-05, "loss": 0.0489, "step": 290 }, { "epoch": 0.4515946937623483, "grad_norm": 0.8530102372169495, "learning_rate": 9.036144578313253e-05, "loss": 0.0617, "step": 300 }, { "epoch": 0.4515946937623483, "eval_loss": 0.05468267574906349, "eval_runtime": 183.3947, "eval_samples_per_second": 6.44, "eval_steps_per_second": 6.44, "step": 300 }, { "epoch": 0.46664785022109323, "grad_norm": 0.2520005404949188, "learning_rate": 9.337349397590361e-05, "loss": 0.0446, "step": 310 }, { "epoch": 0.4817010066798382, "grad_norm": 0.8285573720932007, "learning_rate": 9.638554216867471e-05, "loss": 0.0736, "step": 320 }, { "epoch": 0.4967541631385831, "grad_norm": 0.8984416127204895, "learning_rate": 9.939759036144579e-05, "loss": 0.0531, "step": 330 }, { "epoch": 0.5118073195973281, "grad_norm": 0.5475632548332214, "learning_rate": 9.999823129264712e-05, "loss": 0.0375, "step": 340 }, { "epoch": 0.526860476056073, "grad_norm": 0.45024311542510986, "learning_rate": 9.999104613348688e-05, "loss": 0.0431, "step": 350 }, { "epoch": 0.526860476056073, "eval_loss": 0.05186503753066063, "eval_runtime": 183.5524, "eval_samples_per_second": 6.434, "eval_steps_per_second": 6.434, "step": 350 }, { "epoch": 0.541913632514818, "grad_norm": 0.331438273191452, "learning_rate": 9.997833477197385e-05, "loss": 0.0547, "step": 360 }, { "epoch": 0.5569667889735629, "grad_norm": 0.6287385821342468, "learning_rate": 9.996009861327077e-05, "loss": 0.0479, "step": 370 }, { "epoch": 0.5720199454323078, "grad_norm": 0.335729718208313, "learning_rate": 9.993633967327269e-05, "loss": 0.0447, "step": 380 }, { "epoch": 0.5870731018910528, "grad_norm": 0.8053644895553589, "learning_rate": 9.990706057838416e-05, "loss": 0.0455, "step": 390 }, { "epoch": 0.6021262583497977, "grad_norm": 0.4682386517524719, "learning_rate": 9.987226456522884e-05, "loss": 0.0573, "step": 400 }, { "epoch": 0.6021262583497977, "eval_loss": 0.04791492968797684, "eval_runtime": 183.6734, "eval_samples_per_second": 6.43, "eval_steps_per_second": 6.43, "step": 400 }, { "epoch": 0.6171794148085427, "grad_norm": 0.1765960454940796, "learning_rate": 9.983195548029173e-05, "loss": 0.0365, "step": 410 }, { "epoch": 0.6322325712672876, "grad_norm": 0.7186487317085266, "learning_rate": 9.9786137779494e-05, "loss": 0.0581, "step": 420 }, { "epoch": 0.6472857277260325, "grad_norm": 0.3791099786758423, "learning_rate": 9.973481652770038e-05, "loss": 0.044, "step": 430 }, { "epoch": 0.6623388841847775, "grad_norm": 0.18522125482559204, "learning_rate": 9.967799739815925e-05, "loss": 0.0397, "step": 440 }, { "epoch": 0.6773920406435224, "grad_norm": 0.9369704127311707, "learning_rate": 9.961568667187556e-05, "loss": 0.0504, "step": 450 }, { "epoch": 0.6773920406435224, "eval_loss": 0.043754514306783676, "eval_runtime": 183.6481, "eval_samples_per_second": 6.431, "eval_steps_per_second": 6.431, "step": 450 }, { "epoch": 0.6924451971022674, "grad_norm": 0.6878030300140381, "learning_rate": 9.954789123691642e-05, "loss": 0.0445, "step": 460 }, { "epoch": 0.7074983535610123, "grad_norm": 0.4177227318286896, "learning_rate": 9.947461858764978e-05, "loss": 0.0387, "step": 470 }, { "epoch": 0.7225515100197573, "grad_norm": 0.2587774097919464, "learning_rate": 9.939587682391586e-05, "loss": 0.0357, "step": 480 }, { "epoch": 0.7376046664785022, "grad_norm": 0.8120297789573669, "learning_rate": 9.931167465013182e-05, "loss": 0.0588, "step": 490 }, { "epoch": 0.7526578229372471, "grad_norm": 0.4611870050430298, "learning_rate": 9.922202137432955e-05, "loss": 0.0341, "step": 500 }, { "epoch": 0.7526578229372471, "eval_loss": 0.04279050976037979, "eval_runtime": 183.5554, "eval_samples_per_second": 6.434, "eval_steps_per_second": 6.434, "step": 500 }, { "epoch": 0.7677109793959921, "grad_norm": 0.2154083251953125, "learning_rate": 9.912692690712665e-05, "loss": 0.0414, "step": 510 }, { "epoch": 0.782764135854737, "grad_norm": 0.40784987807273865, "learning_rate": 9.902640176063103e-05, "loss": 0.0491, "step": 520 }, { "epoch": 0.797817292313482, "grad_norm": 0.23167699575424194, "learning_rate": 9.892045704727864e-05, "loss": 0.0544, "step": 530 }, { "epoch": 0.8128704487722269, "grad_norm": 0.3374333083629608, "learning_rate": 9.880910447860527e-05, "loss": 0.0587, "step": 540 }, { "epoch": 0.8279236052309719, "grad_norm": 0.41512349247932434, "learning_rate": 9.869235636395177e-05, "loss": 0.0448, "step": 550 }, { "epoch": 0.8279236052309719, "eval_loss": 0.04395035281777382, "eval_runtime": 182.9764, "eval_samples_per_second": 6.454, "eval_steps_per_second": 6.454, "step": 550 }, { "epoch": 0.8429767616897168, "grad_norm": 0.6826766729354858, "learning_rate": 9.857022560910338e-05, "loss": 0.0586, "step": 560 }, { "epoch": 0.8580299181484617, "grad_norm": 0.7344224452972412, "learning_rate": 9.844272571486311e-05, "loss": 0.043, "step": 570 }, { "epoch": 0.8730830746072067, "grad_norm": 0.24045772850513458, "learning_rate": 9.830987077555924e-05, "loss": 0.0538, "step": 580 }, { "epoch": 0.8881362310659516, "grad_norm": 0.863741934299469, "learning_rate": 9.817167547748729e-05, "loss": 0.0615, "step": 590 }, { "epoch": 0.9031893875246966, "grad_norm": 0.2820413112640381, "learning_rate": 9.802815509728662e-05, "loss": 0.0373, "step": 600 }, { "epoch": 0.9031893875246966, "eval_loss": 0.04143202304840088, "eval_runtime": 182.4396, "eval_samples_per_second": 6.473, "eval_steps_per_second": 6.473, "step": 600 }, { "epoch": 0.9182425439834415, "grad_norm": 0.3734269440174103, "learning_rate": 9.787932550025158e-05, "loss": 0.0383, "step": 610 }, { "epoch": 0.9332957004421865, "grad_norm": 0.2547641694545746, "learning_rate": 9.772520313857775e-05, "loss": 0.0429, "step": 620 }, { "epoch": 0.9483488569009314, "grad_norm": 0.318452924489975, "learning_rate": 9.756580504954334e-05, "loss": 0.0475, "step": 630 }, { "epoch": 0.9634020133596763, "grad_norm": 0.2922682762145996, "learning_rate": 9.740114885362562e-05, "loss": 0.0443, "step": 640 }, { "epoch": 0.9784551698184213, "grad_norm": 0.2600071430206299, "learning_rate": 9.723125275255325e-05, "loss": 0.0369, "step": 650 }, { "epoch": 0.9784551698184213, "eval_loss": 0.04143212363123894, "eval_runtime": 182.2253, "eval_samples_per_second": 6.481, "eval_steps_per_second": 6.481, "step": 650 }, { "epoch": 0.9935083262771662, "grad_norm": 0.24337530136108398, "learning_rate": 9.705613552729415e-05, "loss": 0.0298, "step": 660 }, { "epoch": 1.0085614827359113, "grad_norm": 0.2102644145488739, "learning_rate": 9.68758165359794e-05, "loss": 0.0412, "step": 670 }, { "epoch": 1.0236146391946561, "grad_norm": 0.3315404951572418, "learning_rate": 9.669031571176322e-05, "loss": 0.0423, "step": 680 }, { "epoch": 1.0386677956534012, "grad_norm": 0.41248032450675964, "learning_rate": 9.64996535606196e-05, "loss": 0.0382, "step": 690 }, { "epoch": 1.053720952112146, "grad_norm": 0.21960702538490295, "learning_rate": 9.630385115907545e-05, "loss": 0.0266, "step": 700 }, { "epoch": 1.053720952112146, "eval_loss": 0.042183760553598404, "eval_runtime": 182.3786, "eval_samples_per_second": 6.476, "eval_steps_per_second": 6.476, "step": 700 }, { "epoch": 1.068774108570891, "grad_norm": 0.1838514655828476, "learning_rate": 9.610293015188067e-05, "loss": 0.036, "step": 710 }, { "epoch": 1.083827265029636, "grad_norm": 0.23699496686458588, "learning_rate": 9.589691274961556e-05, "loss": 0.0252, "step": 720 }, { "epoch": 1.098880421488381, "grad_norm": 0.1834336817264557, "learning_rate": 9.568582172623544e-05, "loss": 0.0432, "step": 730 }, { "epoch": 1.1139335779471258, "grad_norm": 0.3436281681060791, "learning_rate": 9.546968041655326e-05, "loss": 0.0366, "step": 740 }, { "epoch": 1.1289867344058706, "grad_norm": 0.18323691189289093, "learning_rate": 9.524851271366001e-05, "loss": 0.0337, "step": 750 }, { "epoch": 1.1289867344058706, "eval_loss": 0.03804696723818779, "eval_runtime": 182.4717, "eval_samples_per_second": 6.472, "eval_steps_per_second": 6.472, "step": 750 }, { "epoch": 1.1440398908646157, "grad_norm": 0.2865748703479767, "learning_rate": 9.502234306628355e-05, "loss": 0.0291, "step": 760 }, { "epoch": 1.1590930473233607, "grad_norm": 0.4548961818218231, "learning_rate": 9.47911964760858e-05, "loss": 0.0295, "step": 770 }, { "epoch": 1.1741462037821055, "grad_norm": 0.46943390369415283, "learning_rate": 9.455509849489915e-05, "loss": 0.0287, "step": 780 }, { "epoch": 1.1891993602408506, "grad_norm": 0.35242822766304016, "learning_rate": 9.431407522190175e-05, "loss": 0.0368, "step": 790 }, { "epoch": 1.2042525166995954, "grad_norm": 0.2523718774318695, "learning_rate": 9.406815330073244e-05, "loss": 0.0379, "step": 800 }, { "epoch": 1.2042525166995954, "eval_loss": 0.04243133217096329, "eval_runtime": 182.4906, "eval_samples_per_second": 6.472, "eval_steps_per_second": 6.472, "step": 800 }, { "epoch": 1.2193056731583405, "grad_norm": 0.2634824216365814, "learning_rate": 9.381735991654546e-05, "loss": 0.0373, "step": 810 }, { "epoch": 1.2343588296170853, "grad_norm": 0.3448045551776886, "learning_rate": 9.356172279300528e-05, "loss": 0.0408, "step": 820 }, { "epoch": 1.2494119860758304, "grad_norm": 0.11834454536437988, "learning_rate": 9.330127018922194e-05, "loss": 0.0338, "step": 830 }, { "epoch": 1.2644651425345752, "grad_norm": 0.2064102292060852, "learning_rate": 9.303603089662716e-05, "loss": 0.0396, "step": 840 }, { "epoch": 1.2795182989933203, "grad_norm": 0.30329787731170654, "learning_rate": 9.276603423579164e-05, "loss": 0.0297, "step": 850 }, { "epoch": 1.2795182989933203, "eval_loss": 0.04129231348633766, "eval_runtime": 182.4118, "eval_samples_per_second": 6.474, "eval_steps_per_second": 6.474, "step": 850 }, { "epoch": 1.294571455452065, "grad_norm": 0.29361701011657715, "learning_rate": 9.249131005318387e-05, "loss": 0.0434, "step": 860 }, { "epoch": 1.30962461191081, "grad_norm": 0.1374683827161789, "learning_rate": 9.221188871787075e-05, "loss": 0.0346, "step": 870 }, { "epoch": 1.324677768369555, "grad_norm": 0.22801139950752258, "learning_rate": 9.192780111816047e-05, "loss": 0.0367, "step": 880 }, { "epoch": 1.3397309248283, "grad_norm": 0.3111538290977478, "learning_rate": 9.163907865818806e-05, "loss": 0.0346, "step": 890 }, { "epoch": 1.3547840812870449, "grad_norm": 0.31622806191444397, "learning_rate": 9.134575325444376e-05, "loss": 0.0417, "step": 900 }, { "epoch": 1.3547840812870449, "eval_loss": 0.03892921656370163, "eval_runtime": 182.2721, "eval_samples_per_second": 6.479, "eval_steps_per_second": 6.479, "step": 900 }, { "epoch": 1.3698372377457897, "grad_norm": 0.23705615103244781, "learning_rate": 9.104785733224496e-05, "loss": 0.0294, "step": 910 }, { "epoch": 1.3848903942045347, "grad_norm": 0.2639251947402954, "learning_rate": 9.07454238221517e-05, "loss": 0.0361, "step": 920 }, { "epoch": 1.3999435506632798, "grad_norm": 0.25921890139579773, "learning_rate": 9.043848615632642e-05, "loss": 0.0294, "step": 930 }, { "epoch": 1.4149967071220246, "grad_norm": 0.3407827317714691, "learning_rate": 9.012707826483823e-05, "loss": 0.0406, "step": 940 }, { "epoch": 1.4300498635807695, "grad_norm": 0.33196377754211426, "learning_rate": 8.98112345719122e-05, "loss": 0.0342, "step": 950 }, { "epoch": 1.4300498635807695, "eval_loss": 0.0392959862947464, "eval_runtime": 182.2448, "eval_samples_per_second": 6.48, "eval_steps_per_second": 6.48, "step": 950 }, { "epoch": 1.4451030200395145, "grad_norm": 0.5743414163589478, "learning_rate": 8.949098999212391e-05, "loss": 0.0361, "step": 960 }, { "epoch": 1.4601561764982596, "grad_norm": 0.325915664434433, "learning_rate": 8.916637992653991e-05, "loss": 0.0281, "step": 970 }, { "epoch": 1.4752093329570044, "grad_norm": 0.45795273780822754, "learning_rate": 8.883744025880428e-05, "loss": 0.0321, "step": 980 }, { "epoch": 1.4902624894157492, "grad_norm": 0.46391385793685913, "learning_rate": 8.850420735117202e-05, "loss": 0.0332, "step": 990 }, { "epoch": 1.5053156458744943, "grad_norm": 0.23828794062137604, "learning_rate": 8.816671804048933e-05, "loss": 0.033, "step": 1000 }, { "epoch": 1.5053156458744943, "eval_loss": 0.038652509450912476, "eval_runtime": 182.2825, "eval_samples_per_second": 6.479, "eval_steps_per_second": 6.479, "step": 1000 }, { "epoch": 1.5203688023332393, "grad_norm": 0.24454714357852936, "learning_rate": 8.782500963412156e-05, "loss": 0.0371, "step": 1010 }, { "epoch": 1.5354219587919842, "grad_norm": 0.21508722007274628, "learning_rate": 8.747911990582912e-05, "loss": 0.0298, "step": 1020 }, { "epoch": 1.550475115250729, "grad_norm": 0.2557254433631897, "learning_rate": 8.712908709159183e-05, "loss": 0.0298, "step": 1030 }, { "epoch": 1.565528271709474, "grad_norm": 0.8136358857154846, "learning_rate": 8.677494988538211e-05, "loss": 0.0414, "step": 1040 }, { "epoch": 1.5805814281682191, "grad_norm": 0.28363868594169617, "learning_rate": 8.641674743488769e-05, "loss": 0.0304, "step": 1050 }, { "epoch": 1.5805814281682191, "eval_loss": 0.041224390268325806, "eval_runtime": 183.0383, "eval_samples_per_second": 6.452, "eval_steps_per_second": 6.452, "step": 1050 }, { "epoch": 1.595634584626964, "grad_norm": 0.4219471514225006, "learning_rate": 8.605451933718397e-05, "loss": 0.0437, "step": 1060 }, { "epoch": 1.6106877410857088, "grad_norm": 0.47602376341819763, "learning_rate": 8.568830563435694e-05, "loss": 0.0396, "step": 1070 }, { "epoch": 1.6257408975444538, "grad_norm": 0.21332669258117676, "learning_rate": 8.531814680907664e-05, "loss": 0.0247, "step": 1080 }, { "epoch": 1.6407940540031989, "grad_norm": 0.15635231137275696, "learning_rate": 8.494408378012209e-05, "loss": 0.0246, "step": 1090 }, { "epoch": 1.6558472104619437, "grad_norm": 0.149508997797966, "learning_rate": 8.456615789785804e-05, "loss": 0.0225, "step": 1100 }, { "epoch": 1.6558472104619437, "eval_loss": 0.0380428284406662, "eval_runtime": 183.1081, "eval_samples_per_second": 6.45, "eval_steps_per_second": 6.45, "step": 1100 }, { "epoch": 1.6709003669206886, "grad_norm": 0.22358861565589905, "learning_rate": 8.418441093966385e-05, "loss": 0.0453, "step": 1110 }, { "epoch": 1.6859535233794336, "grad_norm": 0.15282148122787476, "learning_rate": 8.379888510531535e-05, "loss": 0.031, "step": 1120 }, { "epoch": 1.7010066798381787, "grad_norm": 0.2559264302253723, "learning_rate": 8.340962301231981e-05, "loss": 0.0241, "step": 1130 }, { "epoch": 1.7160598362969235, "grad_norm": 0.2203015387058258, "learning_rate": 8.301666769120488e-05, "loss": 0.0316, "step": 1140 }, { "epoch": 1.7311129927556683, "grad_norm": 0.2738555669784546, "learning_rate": 8.262006258076187e-05, "loss": 0.0406, "step": 1150 }, { "epoch": 1.7311129927556683, "eval_loss": 0.03589543700218201, "eval_runtime": 183.3536, "eval_samples_per_second": 6.441, "eval_steps_per_second": 6.441, "step": 1150 }, { "epoch": 1.7461661492144134, "grad_norm": 0.46012789011001587, "learning_rate": 8.221985152324385e-05, "loss": 0.0328, "step": 1160 }, { "epoch": 1.7612193056731584, "grad_norm": 0.1931673288345337, "learning_rate": 8.18160787595191e-05, "loss": 0.0399, "step": 1170 }, { "epoch": 1.7762724621319033, "grad_norm": 0.32602667808532715, "learning_rate": 8.14087889241806e-05, "loss": 0.0294, "step": 1180 }, { "epoch": 1.791325618590648, "grad_norm": 0.20906108617782593, "learning_rate": 8.099802704061195e-05, "loss": 0.0372, "step": 1190 }, { "epoch": 1.8063787750493931, "grad_norm": 0.5570924282073975, "learning_rate": 8.058383851601027e-05, "loss": 0.0314, "step": 1200 }, { "epoch": 1.8063787750493931, "eval_loss": 0.037830330431461334, "eval_runtime": 183.2865, "eval_samples_per_second": 6.443, "eval_steps_per_second": 6.443, "step": 1200 }, { "epoch": 1.8214319315081382, "grad_norm": 0.5903065800666809, "learning_rate": 8.01662691363668e-05, "loss": 0.0322, "step": 1210 }, { "epoch": 1.836485087966883, "grad_norm": 0.3619183301925659, "learning_rate": 7.974536506140547e-05, "loss": 0.0319, "step": 1220 }, { "epoch": 1.8515382444256279, "grad_norm": 0.26466983556747437, "learning_rate": 7.932117281948021e-05, "loss": 0.0351, "step": 1230 }, { "epoch": 1.866591400884373, "grad_norm": 0.196661576628685, "learning_rate": 7.889373930243164e-05, "loss": 0.0364, "step": 1240 }, { "epoch": 1.881644557343118, "grad_norm": 0.21144545078277588, "learning_rate": 7.846311176040331e-05, "loss": 0.0345, "step": 1250 }, { "epoch": 1.881644557343118, "eval_loss": 0.03522011637687683, "eval_runtime": 183.0871, "eval_samples_per_second": 6.45, "eval_steps_per_second": 6.45, "step": 1250 }, { "epoch": 1.8966977138018628, "grad_norm": 0.13879913091659546, "learning_rate": 7.802933779661859e-05, "loss": 0.0414, "step": 1260 }, { "epoch": 1.9117508702606076, "grad_norm": 0.20287376642227173, "learning_rate": 7.759246536211844e-05, "loss": 0.0298, "step": 1270 }, { "epoch": 1.9268040267193527, "grad_norm": 0.1346718668937683, "learning_rate": 7.715254275046062e-05, "loss": 0.0343, "step": 1280 }, { "epoch": 1.9418571831780977, "grad_norm": 0.6451114416122437, "learning_rate": 7.670961859238124e-05, "loss": 0.032, "step": 1290 }, { "epoch": 1.9569103396368426, "grad_norm": 0.2284402847290039, "learning_rate": 7.626374185041886e-05, "loss": 0.0314, "step": 1300 }, { "epoch": 1.9569103396368426, "eval_loss": 0.035182420164346695, "eval_runtime": 183.0374, "eval_samples_per_second": 6.452, "eval_steps_per_second": 6.452, "step": 1300 }, { "epoch": 1.9719634960955874, "grad_norm": 0.29817232489585876, "learning_rate": 7.581496181350203e-05, "loss": 0.0279, "step": 1310 }, { "epoch": 1.9870166525543325, "grad_norm": 0.1967862844467163, "learning_rate": 7.536332809150067e-05, "loss": 0.0296, "step": 1320 }, { "epoch": 2.0020698090130775, "grad_norm": 0.3098978102207184, "learning_rate": 7.490889060974201e-05, "loss": 0.0365, "step": 1330 }, { "epoch": 2.0171229654718226, "grad_norm": 0.09154434502124786, "learning_rate": 7.445169960349167e-05, "loss": 0.0195, "step": 1340 }, { "epoch": 2.032176121930567, "grad_norm": 0.20876505970954895, "learning_rate": 7.399180561240044e-05, "loss": 0.0232, "step": 1350 }, { "epoch": 2.032176121930567, "eval_loss": 0.03698495402932167, "eval_runtime": 183.2243, "eval_samples_per_second": 6.446, "eval_steps_per_second": 6.446, "step": 1350 }, { "epoch": 2.0472292783893122, "grad_norm": 0.11014339327812195, "learning_rate": 7.352925947491746e-05, "loss": 0.014, "step": 1360 }, { "epoch": 2.0622824348480573, "grad_norm": 0.34100142121315, "learning_rate": 7.306411232267029e-05, "loss": 0.0218, "step": 1370 }, { "epoch": 2.0773355913068023, "grad_norm": 0.17889666557312012, "learning_rate": 7.259641557481269e-05, "loss": 0.0207, "step": 1380 }, { "epoch": 2.092388747765547, "grad_norm": 0.1581099033355713, "learning_rate": 7.212622093234049e-05, "loss": 0.023, "step": 1390 }, { "epoch": 2.107441904224292, "grad_norm": 0.5647837519645691, "learning_rate": 7.165358037237643e-05, "loss": 0.0298, "step": 1400 }, { "epoch": 2.107441904224292, "eval_loss": 0.03580976277589798, "eval_runtime": 183.5421, "eval_samples_per_second": 6.434, "eval_steps_per_second": 6.434, "step": 1400 }, { "epoch": 2.122495060683037, "grad_norm": 0.20781104266643524, "learning_rate": 7.117854614242434e-05, "loss": 0.0266, "step": 1410 }, { "epoch": 2.137548217141782, "grad_norm": 0.4122653901576996, "learning_rate": 7.070117075459352e-05, "loss": 0.0236, "step": 1420 }, { "epoch": 2.1526013736005267, "grad_norm": 0.179127499461174, "learning_rate": 7.022150697979384e-05, "loss": 0.0244, "step": 1430 }, { "epoch": 2.167654530059272, "grad_norm": 0.34931114315986633, "learning_rate": 6.973960784190237e-05, "loss": 0.028, "step": 1440 }, { "epoch": 2.182707686518017, "grad_norm": 0.2368355244398117, "learning_rate": 6.925552661190166e-05, "loss": 0.0224, "step": 1450 }, { "epoch": 2.182707686518017, "eval_loss": 0.037619560956954956, "eval_runtime": 183.5055, "eval_samples_per_second": 6.436, "eval_steps_per_second": 6.436, "step": 1450 }, { "epoch": 2.197760842976762, "grad_norm": 0.33395224809646606, "learning_rate": 6.876931680199121e-05, "loss": 0.021, "step": 1460 }, { "epoch": 2.2128139994355065, "grad_norm": 0.16753950715065002, "learning_rate": 6.828103215967186e-05, "loss": 0.0262, "step": 1470 }, { "epoch": 2.2278671558942515, "grad_norm": 0.25596651434898376, "learning_rate": 6.779072666180446e-05, "loss": 0.0271, "step": 1480 }, { "epoch": 2.2429203123529966, "grad_norm": 0.11663230508565903, "learning_rate": 6.729845450864294e-05, "loss": 0.0177, "step": 1490 }, { "epoch": 2.257973468811741, "grad_norm": 0.2654312551021576, "learning_rate": 6.680427011784292e-05, "loss": 0.0251, "step": 1500 }, { "epoch": 2.257973468811741, "eval_loss": 0.040253281593322754, "eval_runtime": 183.1944, "eval_samples_per_second": 6.447, "eval_steps_per_second": 6.447, "step": 1500 }, { "epoch": 2.2730266252704863, "grad_norm": 0.2926699221134186, "learning_rate": 6.630822811844604e-05, "loss": 0.0191, "step": 1510 }, { "epoch": 2.2880797817292313, "grad_norm": 0.14185629785060883, "learning_rate": 6.58103833448412e-05, "loss": 0.0318, "step": 1520 }, { "epoch": 2.3031329381879764, "grad_norm": 0.3577151298522949, "learning_rate": 6.531079083070288e-05, "loss": 0.0264, "step": 1530 }, { "epoch": 2.3181860946467214, "grad_norm": 0.1893313229084015, "learning_rate": 6.480950580290752e-05, "loss": 0.0205, "step": 1540 }, { "epoch": 2.333239251105466, "grad_norm": 0.21659129858016968, "learning_rate": 6.430658367542843e-05, "loss": 0.0303, "step": 1550 }, { "epoch": 2.333239251105466, "eval_loss": 0.037741564214229584, "eval_runtime": 183.2043, "eval_samples_per_second": 6.446, "eval_steps_per_second": 6.446, "step": 1550 }, { "epoch": 2.348292407564211, "grad_norm": 0.17712511122226715, "learning_rate": 6.380208004321036e-05, "loss": 0.0283, "step": 1560 }, { "epoch": 2.363345564022956, "grad_norm": 0.302386611700058, "learning_rate": 6.32960506760236e-05, "loss": 0.0205, "step": 1570 }, { "epoch": 2.378398720481701, "grad_norm": 0.1805274933576584, "learning_rate": 6.278855151229901e-05, "loss": 0.0196, "step": 1580 }, { "epoch": 2.393451876940446, "grad_norm": 0.0922689437866211, "learning_rate": 6.227963865294444e-05, "loss": 0.0142, "step": 1590 }, { "epoch": 2.408505033399191, "grad_norm": 0.2465469241142273, "learning_rate": 6.176936835514312e-05, "loss": 0.0174, "step": 1600 }, { "epoch": 2.408505033399191, "eval_loss": 0.0399341955780983, "eval_runtime": 182.9994, "eval_samples_per_second": 6.454, "eval_steps_per_second": 6.454, "step": 1600 }, { "epoch": 2.423558189857936, "grad_norm": 0.3466060161590576, "learning_rate": 6.125779702613471e-05, "loss": 0.0236, "step": 1610 }, { "epoch": 2.438611346316681, "grad_norm": 0.42869460582733154, "learning_rate": 6.074498121697983e-05, "loss": 0.0233, "step": 1620 }, { "epoch": 2.4536645027754256, "grad_norm": 0.287655770778656, "learning_rate": 6.023097761630879e-05, "loss": 0.0268, "step": 1630 }, { "epoch": 2.4687176592341706, "grad_norm": 0.11640474200248718, "learning_rate": 5.971584304405489e-05, "loss": 0.0184, "step": 1640 }, { "epoch": 2.4837708156929157, "grad_norm": 0.2504878640174866, "learning_rate": 5.919963444517338e-05, "loss": 0.02, "step": 1650 }, { "epoch": 2.4837708156929157, "eval_loss": 0.03929644078016281, "eval_runtime": 182.7524, "eval_samples_per_second": 6.462, "eval_steps_per_second": 6.462, "step": 1650 }, { "epoch": 2.4988239721516607, "grad_norm": 0.21977509558200836, "learning_rate": 5.868240888334653e-05, "loss": 0.022, "step": 1660 }, { "epoch": 2.5138771286104054, "grad_norm": 0.13794435560703278, "learning_rate": 5.816422353467562e-05, "loss": 0.0174, "step": 1670 }, { "epoch": 2.5289302850691504, "grad_norm": 0.0966905802488327, "learning_rate": 5.7645135681360496e-05, "loss": 0.0196, "step": 1680 }, { "epoch": 2.5439834415278955, "grad_norm": 0.24332837760448456, "learning_rate": 5.7125202705367234e-05, "loss": 0.0228, "step": 1690 }, { "epoch": 2.5590365979866405, "grad_norm": 0.28298506140708923, "learning_rate": 5.660448208208513e-05, "loss": 0.0239, "step": 1700 }, { "epoch": 2.5590365979866405, "eval_loss": 0.03864952549338341, "eval_runtime": 183.0276, "eval_samples_per_second": 6.453, "eval_steps_per_second": 6.453, "step": 1700 }, { "epoch": 2.574089754445385, "grad_norm": 0.12762555480003357, "learning_rate": 5.608303137397294e-05, "loss": 0.0173, "step": 1710 }, { "epoch": 2.58914291090413, "grad_norm": 0.4499613642692566, "learning_rate": 5.5560908224195886e-05, "loss": 0.0241, "step": 1720 }, { "epoch": 2.6041960673628752, "grad_norm": 0.2668222486972809, "learning_rate": 5.503817035025342e-05, "loss": 0.0238, "step": 1730 }, { "epoch": 2.61924922382162, "grad_norm": 0.33222636580467224, "learning_rate": 5.4514875537598985e-05, "loss": 0.0189, "step": 1740 }, { "epoch": 2.634302380280365, "grad_norm": 0.3312423527240753, "learning_rate": 5.399108163325217e-05, "loss": 0.0377, "step": 1750 }, { "epoch": 2.634302380280365, "eval_loss": 0.03773489594459534, "eval_runtime": 182.6394, "eval_samples_per_second": 6.466, "eval_steps_per_second": 6.466, "step": 1750 }, { "epoch": 2.64935553673911, "grad_norm": 0.18128706514835358, "learning_rate": 5.346684653940408e-05, "loss": 0.0217, "step": 1760 }, { "epoch": 2.664408693197855, "grad_norm": 0.26159965991973877, "learning_rate": 5.294222820701661e-05, "loss": 0.0214, "step": 1770 }, { "epoch": 2.6794618496566, "grad_norm": 0.2960847318172455, "learning_rate": 5.24172846294163e-05, "loss": 0.0192, "step": 1780 }, { "epoch": 2.6945150061153447, "grad_norm": 0.2534957826137543, "learning_rate": 5.1892073835883524e-05, "loss": 0.0236, "step": 1790 }, { "epoch": 2.7095681625740897, "grad_norm": 0.34921959042549133, "learning_rate": 5.136665388523778e-05, "loss": 0.0266, "step": 1800 }, { "epoch": 2.7095681625740897, "eval_loss": 0.037273816764354706, "eval_runtime": 182.2953, "eval_samples_per_second": 6.479, "eval_steps_per_second": 6.479, "step": 1800 }, { "epoch": 2.7246213190328348, "grad_norm": 0.17289181053638458, "learning_rate": 5.0841082859419585e-05, "loss": 0.0265, "step": 1810 }, { "epoch": 2.7396744754915794, "grad_norm": 0.380982905626297, "learning_rate": 5.031541885706987e-05, "loss": 0.0257, "step": 1820 }, { "epoch": 2.7547276319503244, "grad_norm": 0.26447662711143494, "learning_rate": 4.9789719987107545e-05, "loss": 0.0257, "step": 1830 }, { "epoch": 2.7697807884090695, "grad_norm": 0.15229015052318573, "learning_rate": 4.926404436230596e-05, "loss": 0.0238, "step": 1840 }, { "epoch": 2.7848339448678145, "grad_norm": 0.4182538390159607, "learning_rate": 4.8738450092868785e-05, "loss": 0.0229, "step": 1850 }, { "epoch": 2.7848339448678145, "eval_loss": 0.03564143180847168, "eval_runtime": 181.9285, "eval_samples_per_second": 6.492, "eval_steps_per_second": 6.492, "step": 1850 }, { "epoch": 2.7998871013265596, "grad_norm": 0.24404701590538025, "learning_rate": 4.8212995280006426e-05, "loss": 0.0179, "step": 1860 }, { "epoch": 2.814940257785304, "grad_norm": 0.3688347637653351, "learning_rate": 4.76877380095132e-05, "loss": 0.02, "step": 1870 }, { "epoch": 2.8299934142440493, "grad_norm": 0.25834599137306213, "learning_rate": 4.7162736345346303e-05, "loss": 0.0167, "step": 1880 }, { "epoch": 2.8450465707027943, "grad_norm": 0.614407479763031, "learning_rate": 4.663804832320726e-05, "loss": 0.0185, "step": 1890 }, { "epoch": 2.860099727161539, "grad_norm": 0.3433961868286133, "learning_rate": 4.6113731944126406e-05, "loss": 0.0257, "step": 1900 }, { "epoch": 2.860099727161539, "eval_loss": 0.040932826697826385, "eval_runtime": 181.7455, "eval_samples_per_second": 6.498, "eval_steps_per_second": 6.498, "step": 1900 }, { "epoch": 2.875152883620284, "grad_norm": 0.24760842323303223, "learning_rate": 4.558984516805118e-05, "loss": 0.0233, "step": 1910 }, { "epoch": 2.890206040079029, "grad_norm": 0.3494224548339844, "learning_rate": 4.5066445907439104e-05, "loss": 0.0173, "step": 1920 }, { "epoch": 2.905259196537774, "grad_norm": 0.4293796420097351, "learning_rate": 4.454359202085582e-05, "loss": 0.0259, "step": 1930 }, { "epoch": 2.920312352996519, "grad_norm": 0.40314778685569763, "learning_rate": 4.402134130657925e-05, "loss": 0.0177, "step": 1940 }, { "epoch": 2.9353655094552638, "grad_norm": 0.25267696380615234, "learning_rate": 4.349975149621039e-05, "loss": 0.021, "step": 1950 }, { "epoch": 2.9353655094552638, "eval_loss": 0.036483317613601685, "eval_runtime": 181.4852, "eval_samples_per_second": 6.507, "eval_steps_per_second": 6.507, "step": 1950 }, { "epoch": 2.950418665914009, "grad_norm": 0.36842381954193115, "learning_rate": 4.297888024829126e-05, "loss": 0.0183, "step": 1960 }, { "epoch": 2.965471822372754, "grad_norm": 0.1686345487833023, "learning_rate": 4.2458785141931314e-05, "loss": 0.023, "step": 1970 }, { "epoch": 2.9805249788314985, "grad_norm": 0.2095889300107956, "learning_rate": 4.1939523670442316e-05, "loss": 0.0202, "step": 1980 }, { "epoch": 2.9955781352902435, "grad_norm": 0.2536243796348572, "learning_rate": 4.14211532349828e-05, "loss": 0.0199, "step": 1990 }, { "epoch": 3.0106312917489886, "grad_norm": 0.037917930632829666, "learning_rate": 4.090373113821281e-05, "loss": 0.0137, "step": 2000 }, { "epoch": 3.0106312917489886, "eval_loss": 0.03815687075257301, "eval_runtime": 181.5827, "eval_samples_per_second": 6.504, "eval_steps_per_second": 6.504, "step": 2000 }, { "epoch": 3.0256844482077336, "grad_norm": 0.1631132960319519, "learning_rate": 4.0387314577959315e-05, "loss": 0.0119, "step": 2010 }, { "epoch": 3.0407376046664787, "grad_norm": 0.06639809161424637, "learning_rate": 3.987196064089346e-05, "loss": 0.0111, "step": 2020 }, { "epoch": 3.0557907611252233, "grad_norm": 0.26162204146385193, "learning_rate": 3.935772629621995e-05, "loss": 0.0077, "step": 2030 }, { "epoch": 3.0708439175839684, "grad_norm": 0.7657532691955566, "learning_rate": 3.8844668389379396e-05, "loss": 0.0125, "step": 2040 }, { "epoch": 3.0858970740427134, "grad_norm": 0.38687437772750854, "learning_rate": 3.833284363576447e-05, "loss": 0.0119, "step": 2050 }, { "epoch": 3.0858970740427134, "eval_loss": 0.043912529945373535, "eval_runtime": 181.7111, "eval_samples_per_second": 6.499, "eval_steps_per_second": 6.499, "step": 2050 }, { "epoch": 3.1009502305014585, "grad_norm": 0.31369754672050476, "learning_rate": 3.7822308614450406e-05, "loss": 0.0092, "step": 2060 }, { "epoch": 3.116003386960203, "grad_norm": 0.19304712116718292, "learning_rate": 3.7313119761940375e-05, "loss": 0.0208, "step": 2070 }, { "epoch": 3.131056543418948, "grad_norm": 0.44090160727500916, "learning_rate": 3.680533336592694e-05, "loss": 0.0101, "step": 2080 }, { "epoch": 3.146109699877693, "grad_norm": 0.1936006397008896, "learning_rate": 3.62990055590697e-05, "loss": 0.0131, "step": 2090 }, { "epoch": 3.1611628563364382, "grad_norm": 0.1822337806224823, "learning_rate": 3.579419231279023e-05, "loss": 0.0116, "step": 2100 }, { "epoch": 3.1611628563364382, "eval_loss": 0.04266900196671486, "eval_runtime": 181.6064, "eval_samples_per_second": 6.503, "eval_steps_per_second": 6.503, "step": 2100 }, { "epoch": 3.176216012795183, "grad_norm": 0.04466831311583519, "learning_rate": 3.529094943108475e-05, "loss": 0.0084, "step": 2110 }, { "epoch": 3.191269169253928, "grad_norm": 0.15677054226398468, "learning_rate": 3.478933254435534e-05, "loss": 0.0161, "step": 2120 }, { "epoch": 3.206322325712673, "grad_norm": 0.31596970558166504, "learning_rate": 3.4289397103260346e-05, "loss": 0.0144, "step": 2130 }, { "epoch": 3.221375482171418, "grad_norm": 1.040712833404541, "learning_rate": 3.3791198372584664e-05, "loss": 0.0164, "step": 2140 }, { "epoch": 3.2364286386301626, "grad_norm": 0.16840790212154388, "learning_rate": 3.329479142513051e-05, "loss": 0.0131, "step": 2150 }, { "epoch": 3.2364286386301626, "eval_loss": 0.043540649116039276, "eval_runtime": 181.7685, "eval_samples_per_second": 6.497, "eval_steps_per_second": 6.497, "step": 2150 }, { "epoch": 3.2514817950889077, "grad_norm": 0.19833751022815704, "learning_rate": 3.280023113562957e-05, "loss": 0.0138, "step": 2160 }, { "epoch": 3.2665349515476527, "grad_norm": 0.3191230297088623, "learning_rate": 3.230757217467677e-05, "loss": 0.0208, "step": 2170 }, { "epoch": 3.2815881080063978, "grad_norm": 0.28639087080955505, "learning_rate": 3.1816869002686936e-05, "loss": 0.0138, "step": 2180 }, { "epoch": 3.2966412644651424, "grad_norm": 0.14610445499420166, "learning_rate": 3.1328175863874464e-05, "loss": 0.0111, "step": 2190 }, { "epoch": 3.3116944209238874, "grad_norm": 0.3217936158180237, "learning_rate": 3.084154678025692e-05, "loss": 0.0132, "step": 2200 }, { "epoch": 3.3116944209238874, "eval_loss": 0.04361730068922043, "eval_runtime": 178.1306, "eval_samples_per_second": 6.63, "eval_steps_per_second": 6.63, "step": 2200 }, { "epoch": 3.3267475773826325, "grad_norm": 0.43495962023735046, "learning_rate": 3.035703554568331e-05, "loss": 0.0171, "step": 2210 }, { "epoch": 3.3418007338413775, "grad_norm": 0.16047963500022888, "learning_rate": 2.9874695719887464e-05, "loss": 0.0094, "step": 2220 }, { "epoch": 3.356853890300122, "grad_norm": 0.8236820101737976, "learning_rate": 2.9394580622567312e-05, "loss": 0.0065, "step": 2230 }, { "epoch": 3.371907046758867, "grad_norm": 0.26033273339271545, "learning_rate": 2.8916743327490803e-05, "loss": 0.0118, "step": 2240 }, { "epoch": 3.3869602032176123, "grad_norm": 0.6144323945045471, "learning_rate": 2.8441236656628828e-05, "loss": 0.0095, "step": 2250 }, { "epoch": 3.3869602032176123, "eval_loss": 0.04476158320903778, "eval_runtime": 179.5871, "eval_samples_per_second": 6.576, "eval_steps_per_second": 6.576, "step": 2250 }, { "epoch": 3.4020133596763573, "grad_norm": 0.1782694309949875, "learning_rate": 2.79681131743161e-05, "loss": 0.0052, "step": 2260 }, { "epoch": 3.417066516135102, "grad_norm": 0.3929290771484375, "learning_rate": 2.7497425181440607e-05, "loss": 0.0106, "step": 2270 }, { "epoch": 3.432119672593847, "grad_norm": 0.1011100634932518, "learning_rate": 2.702922470966187e-05, "loss": 0.0087, "step": 2280 }, { "epoch": 3.447172829052592, "grad_norm": 0.2634778618812561, "learning_rate": 2.6563563515659306e-05, "loss": 0.0091, "step": 2290 }, { "epoch": 3.462225985511337, "grad_norm": 0.2693592309951782, "learning_rate": 2.6100493075410848e-05, "loss": 0.0101, "step": 2300 }, { "epoch": 3.462225985511337, "eval_loss": 0.04863074794411659, "eval_runtime": 162.7367, "eval_samples_per_second": 7.257, "eval_steps_per_second": 7.257, "step": 2300 }, { "epoch": 3.4772791419700817, "grad_norm": 0.4828035235404968, "learning_rate": 2.5640064578502497e-05, "loss": 0.0121, "step": 2310 }, { "epoch": 3.4923322984288268, "grad_norm": 0.5196333527565002, "learning_rate": 2.5182328922469723e-05, "loss": 0.0126, "step": 2320 }, { "epoch": 3.507385454887572, "grad_norm": 0.650068998336792, "learning_rate": 2.4727336707170973e-05, "loss": 0.012, "step": 2330 }, { "epoch": 3.522438611346317, "grad_norm": 0.12833315134048462, "learning_rate": 2.427513822919424e-05, "loss": 0.0145, "step": 2340 }, { "epoch": 3.5374917678050615, "grad_norm": 0.40547439455986023, "learning_rate": 2.3825783476297087e-05, "loss": 0.0068, "step": 2350 }, { "epoch": 3.5374917678050615, "eval_loss": 0.047212984412908554, "eval_runtime": 161.1488, "eval_samples_per_second": 7.329, "eval_steps_per_second": 7.329, "step": 2350 }, { "epoch": 3.5525449242638065, "grad_norm": 0.3044688105583191, "learning_rate": 2.337932212188073e-05, "loss": 0.0109, "step": 2360 }, { "epoch": 3.5675980807225516, "grad_norm": 0.2789610028266907, "learning_rate": 2.2935803519499e-05, "loss": 0.0148, "step": 2370 }, { "epoch": 3.582651237181296, "grad_norm": 0.4912359416484833, "learning_rate": 2.2495276697402662e-05, "loss": 0.0096, "step": 2380 }, { "epoch": 3.5977043936400412, "grad_norm": 0.4292566180229187, "learning_rate": 2.2057790353119535e-05, "loss": 0.0138, "step": 2390 }, { "epoch": 3.6127575500987863, "grad_norm": 0.5193042159080505, "learning_rate": 2.1623392848071354e-05, "loss": 0.0133, "step": 2400 }, { "epoch": 3.6127575500987863, "eval_loss": 0.044749874621629715, "eval_runtime": 161.0257, "eval_samples_per_second": 7.334, "eval_steps_per_second": 7.334, "step": 2400 }, { "epoch": 3.6278107065575314, "grad_norm": 0.05012566223740578, "learning_rate": 2.1192132202227677e-05, "loss": 0.0122, "step": 2410 }, { "epoch": 3.6428638630162764, "grad_norm": 0.5422964692115784, "learning_rate": 2.0764056088797645e-05, "loss": 0.0093, "step": 2420 }, { "epoch": 3.657917019475021, "grad_norm": 0.3080587089061737, "learning_rate": 2.0339211828959904e-05, "loss": 0.0102, "step": 2430 }, { "epoch": 3.672970175933766, "grad_norm": 0.4595882296562195, "learning_rate": 1.9917646386631577e-05, "loss": 0.0129, "step": 2440 }, { "epoch": 3.688023332392511, "grad_norm": 0.07800110429525375, "learning_rate": 1.949940636327671e-05, "loss": 0.0155, "step": 2450 }, { "epoch": 3.688023332392511, "eval_loss": 0.042347878217697144, "eval_runtime": 160.9841, "eval_samples_per_second": 7.336, "eval_steps_per_second": 7.336, "step": 2450 }, { "epoch": 3.7030764888512557, "grad_norm": 0.3958413600921631, "learning_rate": 1.9084537992754792e-05, "loss": 0.0103, "step": 2460 }, { "epoch": 3.718129645310001, "grad_norm": 0.11023591458797455, "learning_rate": 1.8673087136209803e-05, "loss": 0.0097, "step": 2470 }, { "epoch": 3.733182801768746, "grad_norm": 0.08786564320325851, "learning_rate": 1.8265099277000614e-05, "loss": 0.0077, "step": 2480 }, { "epoch": 3.748235958227491, "grad_norm": 0.43999993801116943, "learning_rate": 1.7860619515673033e-05, "loss": 0.0112, "step": 2490 }, { "epoch": 3.763289114686236, "grad_norm": 0.07914853096008301, "learning_rate": 1.7459692564974316e-05, "loss": 0.0118, "step": 2500 }, { "epoch": 3.763289114686236, "eval_loss": 0.04464948922395706, "eval_runtime": 161.0183, "eval_samples_per_second": 7.335, "eval_steps_per_second": 7.335, "step": 2500 }, { "epoch": 3.7783422711449806, "grad_norm": 0.4151097536087036, "learning_rate": 1.7062362744910322e-05, "loss": 0.0077, "step": 2510 }, { "epoch": 3.7933954276037256, "grad_norm": 0.5213687419891357, "learning_rate": 1.6668673977846254e-05, "loss": 0.0088, "step": 2520 }, { "epoch": 3.8084485840624707, "grad_norm": 0.25051119923591614, "learning_rate": 1.6278669783651395e-05, "loss": 0.0098, "step": 2530 }, { "epoch": 3.8235017405212153, "grad_norm": 0.19400495290756226, "learning_rate": 1.589239327488812e-05, "loss": 0.0082, "step": 2540 }, { "epoch": 3.8385548969799603, "grad_norm": 0.21994425356388092, "learning_rate": 1.5509887152046137e-05, "loss": 0.0104, "step": 2550 }, { "epoch": 3.8385548969799603, "eval_loss": 0.046441059559583664, "eval_runtime": 160.9418, "eval_samples_per_second": 7.338, "eval_steps_per_second": 7.338, "step": 2550 }, { "epoch": 3.8536080534387054, "grad_norm": 0.3098808825016022, "learning_rate": 1.5131193698822232e-05, "loss": 0.0119, "step": 2560 }, { "epoch": 3.8686612098974504, "grad_norm": 0.4345768392086029, "learning_rate": 1.4756354777446001e-05, "loss": 0.0108, "step": 2570 }, { "epoch": 3.8837143663561955, "grad_norm": 0.6836276650428772, "learning_rate": 1.4385411824052342e-05, "loss": 0.0111, "step": 2580 }, { "epoch": 3.89876752281494, "grad_norm": 0.4671355187892914, "learning_rate": 1.4018405844100812e-05, "loss": 0.0138, "step": 2590 }, { "epoch": 3.913820679273685, "grad_norm": 0.4520271420478821, "learning_rate": 1.3655377407842812e-05, "loss": 0.0149, "step": 2600 }, { "epoch": 3.913820679273685, "eval_loss": 0.04335136339068413, "eval_runtime": 161.0846, "eval_samples_per_second": 7.332, "eval_steps_per_second": 7.332, "step": 2600 }, { "epoch": 3.92887383573243, "grad_norm": 0.12534154951572418, "learning_rate": 1.3296366645836822e-05, "loss": 0.0083, "step": 2610 }, { "epoch": 3.943926992191175, "grad_norm": 0.4017331898212433, "learning_rate": 1.2941413244512113e-05, "loss": 0.0147, "step": 2620 }, { "epoch": 3.95898014864992, "grad_norm": 0.34028685092926025, "learning_rate": 1.2590556441781725e-05, "loss": 0.0113, "step": 2630 }, { "epoch": 3.974033305108665, "grad_norm": 0.23689797520637512, "learning_rate": 1.2243835022705003e-05, "loss": 0.0104, "step": 2640 }, { "epoch": 3.98908646156741, "grad_norm": 0.22768300771713257, "learning_rate": 1.1901287315199977e-05, "loss": 0.0126, "step": 2650 }, { "epoch": 3.98908646156741, "eval_loss": 0.04391155019402504, "eval_runtime": 160.9313, "eval_samples_per_second": 7.339, "eval_steps_per_second": 7.339, "step": 2650 }, { "epoch": 4.004139618026155, "grad_norm": 0.3352128565311432, "learning_rate": 1.1562951185806676e-05, "loss": 0.0117, "step": 2660 }, { "epoch": 4.0191927744849, "grad_norm": 0.062144506722688675, "learning_rate": 1.1228864035501069e-05, "loss": 0.0065, "step": 2670 }, { "epoch": 4.034245930943645, "grad_norm": 0.21621422469615936, "learning_rate": 1.0899062795560573e-05, "loss": 0.0055, "step": 2680 }, { "epoch": 4.04929908740239, "grad_norm": 0.3631681799888611, "learning_rate": 1.0573583923481711e-05, "loss": 0.0077, "step": 2690 }, { "epoch": 4.064352243861134, "grad_norm": 0.23000024259090424, "learning_rate": 1.0252463398949792e-05, "loss": 0.0066, "step": 2700 }, { "epoch": 4.064352243861134, "eval_loss": 0.04638359695672989, "eval_runtime": 160.9872, "eval_samples_per_second": 7.336, "eval_steps_per_second": 7.336, "step": 2700 }, { "epoch": 4.07940540031988, "grad_norm": 0.2550221085548401, "learning_rate": 9.935736719861622e-06, "loss": 0.0052, "step": 2710 }, { "epoch": 4.0944585567786245, "grad_norm": 0.051744118332862854, "learning_rate": 9.62343889840151e-06, "loss": 0.0055, "step": 2720 }, { "epoch": 4.109511713237369, "grad_norm": 0.04730711504817009, "learning_rate": 9.315604457170768e-06, "loss": 0.0036, "step": 2730 }, { "epoch": 4.124564869696115, "grad_norm": 0.15228353440761566, "learning_rate": 9.012267425371513e-06, "loss": 0.0031, "step": 2740 }, { "epoch": 4.139618026154859, "grad_norm": 0.15392455458641052, "learning_rate": 8.71346133504498e-06, "loss": 0.0048, "step": 2750 }, { "epoch": 4.139618026154859, "eval_loss": 0.05023922026157379, "eval_runtime": 160.8832, "eval_samples_per_second": 7.341, "eval_steps_per_second": 7.341, "step": 2750 }, { "epoch": 4.154671182613605, "grad_norm": 0.708788275718689, "learning_rate": 8.419219217364654e-06, "loss": 0.0037, "step": 2760 }, { "epoch": 4.169724339072349, "grad_norm": 0.02116093598306179, "learning_rate": 8.129573598984997e-06, "loss": 0.0031, "step": 2770 }, { "epoch": 4.184777495531094, "grad_norm": 0.15363575518131256, "learning_rate": 7.844556498445788e-06, "loss": 0.0028, "step": 2780 }, { "epoch": 4.199830651989839, "grad_norm": 0.4273289442062378, "learning_rate": 7.564199422632579e-06, "loss": 0.0068, "step": 2790 }, { "epoch": 4.214883808448584, "grad_norm": 0.041491761803627014, "learning_rate": 7.288533363293959e-06, "loss": 0.0052, "step": 2800 }, { "epoch": 4.214883808448584, "eval_loss": 0.05428801849484444, "eval_runtime": 160.8825, "eval_samples_per_second": 7.341, "eval_steps_per_second": 7.341, "step": 2800 }, { "epoch": 4.229936964907329, "grad_norm": 0.7560365200042725, "learning_rate": 7.017588793615498e-06, "loss": 0.0073, "step": 2810 }, { "epoch": 4.244990121366074, "grad_norm": 0.0603238008916378, "learning_rate": 6.751395664851135e-06, "loss": 0.004, "step": 2820 }, { "epoch": 4.260043277824819, "grad_norm": 0.07926664501428604, "learning_rate": 6.489983403012312e-06, "loss": 0.0043, "step": 2830 }, { "epoch": 4.275096434283564, "grad_norm": 0.7852065563201904, "learning_rate": 6.233380905615049e-06, "loss": 0.006, "step": 2840 }, { "epoch": 4.290149590742309, "grad_norm": 0.34134069085121155, "learning_rate": 5.981616538485496e-06, "loss": 0.0051, "step": 2850 }, { "epoch": 4.290149590742309, "eval_loss": 0.05365157872438431, "eval_runtime": 160.8734, "eval_samples_per_second": 7.341, "eval_steps_per_second": 7.341, "step": 2850 }, { "epoch": 4.3052027472010534, "grad_norm": 0.7026229500770569, "learning_rate": 5.73471813262435e-06, "loss": 0.0041, "step": 2860 }, { "epoch": 4.320255903659799, "grad_norm": 1.0271748304367065, "learning_rate": 5.4927129811301715e-06, "loss": 0.0025, "step": 2870 }, { "epoch": 4.335309060118544, "grad_norm": 0.05058298259973526, "learning_rate": 5.255627836182453e-06, "loss": 0.0041, "step": 2880 }, { "epoch": 4.350362216577288, "grad_norm": 0.3640020787715912, "learning_rate": 5.0234889060842176e-06, "loss": 0.0067, "step": 2890 }, { "epoch": 4.365415373036034, "grad_norm": 0.2726612389087677, "learning_rate": 4.796321852364877e-06, "loss": 0.0102, "step": 2900 }, { "epoch": 4.365415373036034, "eval_loss": 0.054679907858371735, "eval_runtime": 160.874, "eval_samples_per_second": 7.341, "eval_steps_per_second": 7.341, "step": 2900 }, { "epoch": 4.380468529494778, "grad_norm": 0.10512948036193848, "learning_rate": 4.5741517869435706e-06, "loss": 0.0038, "step": 2910 }, { "epoch": 4.395521685953524, "grad_norm": 0.033506445586681366, "learning_rate": 4.357003269353105e-06, "loss": 0.0051, "step": 2920 }, { "epoch": 4.410574842412268, "grad_norm": 0.14954346418380737, "learning_rate": 4.144900304025101e-06, "loss": 0.0066, "step": 2930 }, { "epoch": 4.425627998871013, "grad_norm": 0.18633145093917847, "learning_rate": 3.937866337636459e-06, "loss": 0.0071, "step": 2940 }, { "epoch": 4.4406811553297585, "grad_norm": 0.054286763072013855, "learning_rate": 3.7359242565174423e-06, "loss": 0.0052, "step": 2950 }, { "epoch": 4.4406811553297585, "eval_loss": 0.05462853983044624, "eval_runtime": 160.8836, "eval_samples_per_second": 7.341, "eval_steps_per_second": 7.341, "step": 2950 }, { "epoch": 4.455734311788503, "grad_norm": 0.06919770687818527, "learning_rate": 3.539096384121743e-06, "loss": 0.0063, "step": 2960 }, { "epoch": 4.470787468247248, "grad_norm": 0.10654816031455994, "learning_rate": 3.34740447855878e-06, "loss": 0.0025, "step": 2970 }, { "epoch": 4.485840624705993, "grad_norm": 0.3785998523235321, "learning_rate": 3.160869730188465e-06, "loss": 0.0057, "step": 2980 }, { "epoch": 4.500893781164738, "grad_norm": 0.03692477196455002, "learning_rate": 2.9795127592787186e-06, "loss": 0.0055, "step": 2990 }, { "epoch": 4.515946937623482, "grad_norm": 0.44500401616096497, "learning_rate": 2.803353613726056e-06, "loss": 0.0029, "step": 3000 }, { "epoch": 4.515946937623482, "eval_loss": 0.054797928780317307, "eval_runtime": 160.8222, "eval_samples_per_second": 7.344, "eval_steps_per_second": 7.344, "step": 3000 }, { "epoch": 4.531000094082228, "grad_norm": 0.06466939300298691, "learning_rate": 2.6324117668393877e-06, "loss": 0.0054, "step": 3010 }, { "epoch": 4.5460532505409725, "grad_norm": 0.4691304862499237, "learning_rate": 2.466706115187406e-06, "loss": 0.0044, "step": 3020 }, { "epoch": 4.561106406999718, "grad_norm": 0.19666248559951782, "learning_rate": 2.3062549765096364e-06, "loss": 0.0039, "step": 3030 }, { "epoch": 4.576159563458463, "grad_norm": 0.2812332808971405, "learning_rate": 2.1510760876915505e-06, "loss": 0.0086, "step": 3040 }, { "epoch": 4.591212719917207, "grad_norm": 0.46060940623283386, "learning_rate": 2.0011866028038617e-06, "loss": 0.0085, "step": 3050 }, { "epoch": 4.591212719917207, "eval_loss": 0.0551898255944252, "eval_runtime": 160.8674, "eval_samples_per_second": 7.341, "eval_steps_per_second": 7.341, "step": 3050 }, { "epoch": 4.606265876375953, "grad_norm": 0.05337996780872345, "learning_rate": 1.8566030912062549e-06, "loss": 0.0037, "step": 3060 }, { "epoch": 4.621319032834697, "grad_norm": 0.28329890966415405, "learning_rate": 1.717341535715733e-06, "loss": 0.0046, "step": 3070 }, { "epoch": 4.636372189293443, "grad_norm": 0.26113247871398926, "learning_rate": 1.5834173308397982e-06, "loss": 0.0051, "step": 3080 }, { "epoch": 4.6514253457521875, "grad_norm": 0.042031049728393555, "learning_rate": 1.4548452810747403e-06, "loss": 0.0073, "step": 3090 }, { "epoch": 4.666478502210932, "grad_norm": 0.08665376156568527, "learning_rate": 1.33163959926903e-06, "loss": 0.0049, "step": 3100 }, { "epoch": 4.666478502210932, "eval_loss": 0.05508030205965042, "eval_runtime": 160.8918, "eval_samples_per_second": 7.34, "eval_steps_per_second": 7.34, "step": 3100 }, { "epoch": 4.681531658669678, "grad_norm": 0.09392786026000977, "learning_rate": 1.2138139050522023e-06, "loss": 0.0078, "step": 3110 }, { "epoch": 4.696584815128422, "grad_norm": 0.24732336401939392, "learning_rate": 1.101381223329301e-06, "loss": 0.007, "step": 3120 }, { "epoch": 4.711637971587168, "grad_norm": 0.09796655178070068, "learning_rate": 9.943539828410342e-07, "loss": 0.0044, "step": 3130 }, { "epoch": 4.726691128045912, "grad_norm": 0.006699330639094114, "learning_rate": 8.927440147898702e-07, "loss": 0.0034, "step": 3140 }, { "epoch": 4.741744284504657, "grad_norm": 0.12186097353696823, "learning_rate": 7.96562551532154e-07, "loss": 0.0054, "step": 3150 }, { "epoch": 4.741744284504657, "eval_loss": 0.05528326332569122, "eval_runtime": 160.9058, "eval_samples_per_second": 7.34, "eval_steps_per_second": 7.34, "step": 3150 }, { "epoch": 4.756797440963402, "grad_norm": 0.08800538629293442, "learning_rate": 7.05820225336451e-07, "loss": 0.0048, "step": 3160 }, { "epoch": 4.771850597422147, "grad_norm": 0.0998738557100296, "learning_rate": 6.20527067208232e-07, "loss": 0.004, "step": 3170 }, { "epoch": 4.786903753880892, "grad_norm": 0.02175210416316986, "learning_rate": 5.406925057809653e-07, "loss": 0.0037, "step": 3180 }, { "epoch": 4.801956910339637, "grad_norm": 0.17426030337810516, "learning_rate": 4.6632536627386756e-07, "loss": 0.0055, "step": 3190 }, { "epoch": 4.817010066798382, "grad_norm": 0.23882503807544708, "learning_rate": 3.974338695163393e-07, "loss": 0.0035, "step": 3200 }, { "epoch": 4.817010066798382, "eval_loss": 0.055309001356363297, "eval_runtime": 160.8733, "eval_samples_per_second": 7.341, "eval_steps_per_second": 7.341, "step": 3200 }, { "epoch": 4.832063223257126, "grad_norm": 0.2771469056606293, "learning_rate": 3.3402563103916984e-07, "loss": 0.0025, "step": 3210 }, { "epoch": 4.847116379715872, "grad_norm": 0.04239178076386452, "learning_rate": 2.7610766023271615e-07, "loss": 0.0017, "step": 3220 }, { "epoch": 4.8621695361746164, "grad_norm": 0.1505410075187683, "learning_rate": 2.2368635957205618e-07, "loss": 0.0046, "step": 3230 }, { "epoch": 4.877222692633362, "grad_norm": 0.29227346181869507, "learning_rate": 1.7676752390920482e-07, "loss": 0.0072, "step": 3240 }, { "epoch": 4.8922758490921066, "grad_norm": 0.3839699625968933, "learning_rate": 1.3535633983257078e-07, "loss": 0.0041, "step": 3250 }, { "epoch": 4.8922758490921066, "eval_loss": 0.05536264553666115, "eval_runtime": 160.8279, "eval_samples_per_second": 7.343, "eval_steps_per_second": 7.343, "step": 3250 }, { "epoch": 4.907329005550851, "grad_norm": 0.05278844013810158, "learning_rate": 9.945738509358205e-08, "loss": 0.0051, "step": 3260 }, { "epoch": 4.922382162009597, "grad_norm": 0.11306164413690567, "learning_rate": 6.907462810065158e-08, "loss": 0.0045, "step": 3270 }, { "epoch": 4.937435318468341, "grad_norm": 0.18797364830970764, "learning_rate": 4.4211427480500554e-08, "loss": 0.0081, "step": 3280 }, { "epoch": 4.952488474927087, "grad_norm": 0.046247877180576324, "learning_rate": 2.4870531706872034e-08, "loss": 0.0048, "step": 3290 }, { "epoch": 4.967541631385831, "grad_norm": 0.5178457498550415, "learning_rate": 1.105407879670728e-08, "loss": 0.0045, "step": 3300 }, { "epoch": 4.967541631385831, "eval_loss": 0.055298663675785065, "eval_runtime": 160.8556, "eval_samples_per_second": 7.342, "eval_steps_per_second": 7.342, "step": 3300 }, { "epoch": 4.982594787844576, "grad_norm": 0.0552547313272953, "learning_rate": 2.763596073807051e-09, "loss": 0.0062, "step": 3310 }, { "epoch": 4.9976479443033215, "grad_norm": 0.1541270911693573, "learning_rate": 0.0, "loss": 0.0034, "step": 3320 }, { "epoch": 4.9976479443033215, "step": 3320, "total_flos": 8.339771893214085e+17, "train_loss": 0.04500684966052692, "train_runtime": 39650.0346, "train_samples_per_second": 1.34, "train_steps_per_second": 0.084 } ], "logging_steps": 10, "max_steps": 3320, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.339771893214085e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }