{ "best_metric": 1.2152043581008911, "best_model_checkpoint": "./output/checkpoint-4650", "epoch": 0.3073567321039064, "eval_steps": 150, "global_step": 4650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006609822195782933, "grad_norm": 7.413546562194824, "learning_rate": 2.2360679774997904e-06, "loss": 1.2392, "step": 10 }, { "epoch": 0.0013219644391565867, "grad_norm": 7.08538818359375, "learning_rate": 4.472135954999581e-06, "loss": 1.2951, "step": 20 }, { "epoch": 0.00198294665873488, "grad_norm": 15.173999786376953, "learning_rate": 6.70820393249937e-06, "loss": 1.3208, "step": 30 }, { "epoch": 0.0026439288783131733, "grad_norm": 7.055360317230225, "learning_rate": 8.944271909999161e-06, "loss": 1.2641, "step": 40 }, { "epoch": 0.003304911097891467, "grad_norm": 8.638155937194824, "learning_rate": 1.118033988749895e-05, "loss": 1.2835, "step": 50 }, { "epoch": 0.00396589331746976, "grad_norm": 7.482174396514893, "learning_rate": 1.341640786499874e-05, "loss": 1.1253, "step": 60 }, { "epoch": 0.0046268755370480535, "grad_norm": 11.88020133972168, "learning_rate": 1.565247584249853e-05, "loss": 1.1857, "step": 70 }, { "epoch": 0.005287857756626347, "grad_norm": 13.985732078552246, "learning_rate": 1.7888543819998323e-05, "loss": 1.3447, "step": 80 }, { "epoch": 0.00594883997620464, "grad_norm": 7.102285861968994, "learning_rate": 2.0124611797498112e-05, "loss": 1.3336, "step": 90 }, { "epoch": 0.006609822195782934, "grad_norm": 7.646865367889404, "learning_rate": 2.23606797749979e-05, "loss": 1.176, "step": 100 }, { "epoch": 0.007270804415361227, "grad_norm": 6.750139236450195, "learning_rate": 2.236044998500671e-05, "loss": 1.2429, "step": 110 }, { "epoch": 0.00793178663493952, "grad_norm": 10.000078201293945, "learning_rate": 2.235976062447891e-05, "loss": 1.3139, "step": 120 }, { "epoch": 0.008592768854517813, "grad_norm": 12.12943172454834, "learning_rate": 2.2358611721751407e-05, "loss": 1.3145, "step": 130 }, { "epoch": 0.009253751074096107, "grad_norm": 7.1956071853637695, "learning_rate": 2.2357003324051093e-05, "loss": 1.3055, "step": 140 }, { "epoch": 0.009914733293674401, "grad_norm": 6.159770965576172, "learning_rate": 2.23549354974929e-05, "loss": 1.3298, "step": 150 }, { "epoch": 0.009914733293674401, "eval_loss": 1.3606581687927246, "eval_runtime": 45.5267, "eval_samples_per_second": 11.005, "eval_steps_per_second": 11.005, "step": 150 }, { "epoch": 0.010575715513252693, "grad_norm": 15.24757194519043, "learning_rate": 2.2352408327077078e-05, "loss": 1.303, "step": 160 }, { "epoch": 0.011236697732830987, "grad_norm": 10.154984474182129, "learning_rate": 2.2349421916685704e-05, "loss": 1.2568, "step": 170 }, { "epoch": 0.01189767995240928, "grad_norm": 7.64827299118042, "learning_rate": 2.234597638907841e-05, "loss": 1.27, "step": 180 }, { "epoch": 0.012558662171987573, "grad_norm": 10.21170711517334, "learning_rate": 2.2342071885887346e-05, "loss": 1.2995, "step": 190 }, { "epoch": 0.013219644391565867, "grad_norm": 10.44480037689209, "learning_rate": 2.2337708567611343e-05, "loss": 1.3509, "step": 200 }, { "epoch": 0.01388062661114416, "grad_norm": 7.435905456542969, "learning_rate": 2.233288661360932e-05, "loss": 1.1597, "step": 210 }, { "epoch": 0.014541608830722454, "grad_norm": 16.616416931152344, "learning_rate": 2.232760622209293e-05, "loss": 1.2589, "step": 220 }, { "epoch": 0.015202591050300748, "grad_norm": 13.498307228088379, "learning_rate": 2.2321867610118378e-05, "loss": 1.3307, "step": 230 }, { "epoch": 0.01586357326987904, "grad_norm": 7.282419681549072, "learning_rate": 2.231567101357753e-05, "loss": 1.3213, "step": 240 }, { "epoch": 0.016524555489457332, "grad_norm": 12.302486419677734, "learning_rate": 2.2309016687188194e-05, "loss": 1.3124, "step": 250 }, { "epoch": 0.017185537709035626, "grad_norm": 8.877416610717773, "learning_rate": 2.230190490448367e-05, "loss": 1.1267, "step": 260 }, { "epoch": 0.01784651992861392, "grad_norm": 10.397753715515137, "learning_rate": 2.229433595780149e-05, "loss": 1.3197, "step": 270 }, { "epoch": 0.018507502148192214, "grad_norm": 9.187607765197754, "learning_rate": 2.2286310158271407e-05, "loss": 1.1703, "step": 280 }, { "epoch": 0.019168484367770508, "grad_norm": 7.458565711975098, "learning_rate": 2.22778278358026e-05, "loss": 1.2126, "step": 290 }, { "epoch": 0.019829466587348802, "grad_norm": 11.090981483459473, "learning_rate": 2.2268889339070124e-05, "loss": 1.1683, "step": 300 }, { "epoch": 0.019829466587348802, "eval_loss": 1.3488467931747437, "eval_runtime": 55.8106, "eval_samples_per_second": 8.977, "eval_steps_per_second": 8.977, "step": 300 }, { "epoch": 0.020490448806927093, "grad_norm": 10.89608383178711, "learning_rate": 2.2259495035500576e-05, "loss": 1.4133, "step": 310 }, { "epoch": 0.021151431026505386, "grad_norm": 7.514070510864258, "learning_rate": 2.2249645311256972e-05, "loss": 1.2241, "step": 320 }, { "epoch": 0.02181241324608368, "grad_norm": 12.841883659362793, "learning_rate": 2.2239340571222904e-05, "loss": 1.2928, "step": 330 }, { "epoch": 0.022473395465661974, "grad_norm": 13.028974533081055, "learning_rate": 2.2228581238985868e-05, "loss": 1.2704, "step": 340 }, { "epoch": 0.02313437768524027, "grad_norm": 11.415493965148926, "learning_rate": 2.2217367756819878e-05, "loss": 1.2951, "step": 350 }, { "epoch": 0.02379535990481856, "grad_norm": 14.492388725280762, "learning_rate": 2.2205700585667257e-05, "loss": 1.2643, "step": 360 }, { "epoch": 0.024456342124396853, "grad_norm": 10.009002685546875, "learning_rate": 2.2193580205119724e-05, "loss": 1.2515, "step": 370 }, { "epoch": 0.025117324343975147, "grad_norm": 8.66943073272705, "learning_rate": 2.2181007113398642e-05, "loss": 1.1653, "step": 380 }, { "epoch": 0.02577830656355344, "grad_norm": 13.82745361328125, "learning_rate": 2.216798182733457e-05, "loss": 1.3251, "step": 390 }, { "epoch": 0.026439288783131735, "grad_norm": 9.831866264343262, "learning_rate": 2.2154504882346002e-05, "loss": 1.3099, "step": 400 }, { "epoch": 0.02710027100271003, "grad_norm": 6.000834941864014, "learning_rate": 2.214057683241736e-05, "loss": 1.2919, "step": 410 }, { "epoch": 0.02776125322228832, "grad_norm": 5.438742160797119, "learning_rate": 2.2126198250076225e-05, "loss": 1.1859, "step": 420 }, { "epoch": 0.028422235441866613, "grad_norm": 11.776556968688965, "learning_rate": 2.2111369726369802e-05, "loss": 1.339, "step": 430 }, { "epoch": 0.029083217661444907, "grad_norm": 7.697872638702393, "learning_rate": 2.2096091870840613e-05, "loss": 1.2235, "step": 440 }, { "epoch": 0.0297441998810232, "grad_norm": 12.47408676147461, "learning_rate": 2.2080365311501466e-05, "loss": 1.0851, "step": 450 }, { "epoch": 0.0297441998810232, "eval_loss": 1.3441540002822876, "eval_runtime": 45.42, "eval_samples_per_second": 11.03, "eval_steps_per_second": 11.03, "step": 450 }, { "epoch": 0.030405182100601495, "grad_norm": 5.456786155700684, "learning_rate": 2.206419069480962e-05, "loss": 1.2224, "step": 460 }, { "epoch": 0.031066164320179786, "grad_norm": 17.571989059448242, "learning_rate": 2.2047568685640212e-05, "loss": 1.355, "step": 470 }, { "epoch": 0.03172714653975808, "grad_norm": 10.6810302734375, "learning_rate": 2.203049996725894e-05, "loss": 1.3274, "step": 480 }, { "epoch": 0.032388128759336374, "grad_norm": 7.424011707305908, "learning_rate": 2.2012985241293954e-05, "loss": 1.1497, "step": 490 }, { "epoch": 0.033049110978914664, "grad_norm": 12.73671817779541, "learning_rate": 2.1995025227707044e-05, "loss": 1.3728, "step": 500 }, { "epoch": 0.03371009319849296, "grad_norm": 8.181777000427246, "learning_rate": 2.1976620664764027e-05, "loss": 1.2332, "step": 510 }, { "epoch": 0.03437107541807125, "grad_norm": 13.738442420959473, "learning_rate": 2.1957772309004394e-05, "loss": 1.2833, "step": 520 }, { "epoch": 0.03503205763764955, "grad_norm": 13.703083992004395, "learning_rate": 2.1938480935210228e-05, "loss": 1.4239, "step": 530 }, { "epoch": 0.03569303985722784, "grad_norm": 7.870193004608154, "learning_rate": 2.1918747336374347e-05, "loss": 1.4103, "step": 540 }, { "epoch": 0.03635402207680613, "grad_norm": 8.396446228027344, "learning_rate": 2.189857232366771e-05, "loss": 1.2522, "step": 550 }, { "epoch": 0.03701500429638443, "grad_norm": 12.225940704345703, "learning_rate": 2.1877956726406063e-05, "loss": 1.3464, "step": 560 }, { "epoch": 0.03767598651596272, "grad_norm": 11.3760347366333, "learning_rate": 2.1856901392015874e-05, "loss": 1.2843, "step": 570 }, { "epoch": 0.038336968735541016, "grad_norm": 11.334436416625977, "learning_rate": 2.183540718599946e-05, "loss": 1.2579, "step": 580 }, { "epoch": 0.03899795095511931, "grad_norm": 10.890923500061035, "learning_rate": 2.1813474991899453e-05, "loss": 1.1799, "step": 590 }, { "epoch": 0.039658933174697604, "grad_norm": 9.872835159301758, "learning_rate": 2.1791105711262442e-05, "loss": 1.1629, "step": 600 }, { "epoch": 0.039658933174697604, "eval_loss": 1.3372266292572021, "eval_runtime": 56.8438, "eval_samples_per_second": 8.814, "eval_steps_per_second": 8.814, "step": 600 }, { "epoch": 0.040319915394275894, "grad_norm": 11.447709083557129, "learning_rate": 2.1768300263601945e-05, "loss": 1.2011, "step": 610 }, { "epoch": 0.040980897613854185, "grad_norm": 12.056636810302734, "learning_rate": 2.174505958636059e-05, "loss": 1.2068, "step": 620 }, { "epoch": 0.04164187983343248, "grad_norm": 8.074010848999023, "learning_rate": 2.1721384634871592e-05, "loss": 1.1598, "step": 630 }, { "epoch": 0.04230286205301077, "grad_norm": 11.10396957397461, "learning_rate": 2.169727638231948e-05, "loss": 1.0609, "step": 640 }, { "epoch": 0.04296384427258907, "grad_norm": 7.929290771484375, "learning_rate": 2.1672735819700084e-05, "loss": 1.1761, "step": 650 }, { "epoch": 0.04362482649216736, "grad_norm": 12.149751663208008, "learning_rate": 2.1647763955779823e-05, "loss": 1.35, "step": 660 }, { "epoch": 0.04428580871174565, "grad_norm": 12.335487365722656, "learning_rate": 2.1622361817054213e-05, "loss": 1.2615, "step": 670 }, { "epoch": 0.04494679093132395, "grad_norm": 10.838406562805176, "learning_rate": 2.1596530447705676e-05, "loss": 1.1423, "step": 680 }, { "epoch": 0.04560777315090224, "grad_norm": 11.29602336883545, "learning_rate": 2.157027090956064e-05, "loss": 1.2088, "step": 690 }, { "epoch": 0.04626875537048054, "grad_norm": 6.865326881408691, "learning_rate": 2.1543584282045862e-05, "loss": 1.2449, "step": 700 }, { "epoch": 0.04692973759005883, "grad_norm": 11.23728084564209, "learning_rate": 2.1516471662144077e-05, "loss": 1.3072, "step": 710 }, { "epoch": 0.04759071980963712, "grad_norm": 9.809483528137207, "learning_rate": 2.1488934164348898e-05, "loss": 1.2592, "step": 720 }, { "epoch": 0.048251702029215415, "grad_norm": 12.237908363342285, "learning_rate": 2.1460972920619e-05, "loss": 1.2014, "step": 730 }, { "epoch": 0.048912684248793706, "grad_norm": 12.795587539672852, "learning_rate": 2.143258908033159e-05, "loss": 1.2433, "step": 740 }, { "epoch": 0.049573666468372, "grad_norm": 13.611194610595703, "learning_rate": 2.140378381023518e-05, "loss": 1.2548, "step": 750 }, { "epoch": 0.049573666468372, "eval_loss": 1.3183883428573608, "eval_runtime": 55.6542, "eval_samples_per_second": 9.002, "eval_steps_per_second": 9.002, "step": 750 }, { "epoch": 0.050234648687950294, "grad_norm": 4.964775085449219, "learning_rate": 2.1374558294401597e-05, "loss": 1.2587, "step": 760 }, { "epoch": 0.050895630907528584, "grad_norm": 13.402926445007324, "learning_rate": 2.134491373417733e-05, "loss": 1.1855, "step": 770 }, { "epoch": 0.05155661312710688, "grad_norm": 8.38901138305664, "learning_rate": 2.1314851348134134e-05, "loss": 1.3289, "step": 780 }, { "epoch": 0.05221759534668517, "grad_norm": 6.840709686279297, "learning_rate": 2.1284372372018963e-05, "loss": 1.1234, "step": 790 }, { "epoch": 0.05287857756626347, "grad_norm": 6.543496608734131, "learning_rate": 2.125347805870314e-05, "loss": 1.2149, "step": 800 }, { "epoch": 0.05353955978584176, "grad_norm": 7.223635196685791, "learning_rate": 2.122216967813088e-05, "loss": 1.0977, "step": 810 }, { "epoch": 0.05420054200542006, "grad_norm": 10.436606407165527, "learning_rate": 2.1190448517267087e-05, "loss": 1.1564, "step": 820 }, { "epoch": 0.05486152422499835, "grad_norm": 17.590259552001953, "learning_rate": 2.115831588004444e-05, "loss": 1.3229, "step": 830 }, { "epoch": 0.05552250644457664, "grad_norm": 11.749155044555664, "learning_rate": 2.1125773087309798e-05, "loss": 1.2345, "step": 840 }, { "epoch": 0.056183488664154936, "grad_norm": 11.912696838378906, "learning_rate": 2.1092821476769906e-05, "loss": 1.1779, "step": 850 }, { "epoch": 0.05684447088373323, "grad_norm": 5.420770168304443, "learning_rate": 2.1059462402936416e-05, "loss": 1.2414, "step": 860 }, { "epoch": 0.057505453103311524, "grad_norm": 4.887539863586426, "learning_rate": 2.102569723707019e-05, "loss": 1.1046, "step": 870 }, { "epoch": 0.058166435322889815, "grad_norm": 9.325897216796875, "learning_rate": 2.0991527367124955e-05, "loss": 1.3145, "step": 880 }, { "epoch": 0.058827417542468105, "grad_norm": 14.635684967041016, "learning_rate": 2.095695419769022e-05, "loss": 1.3592, "step": 890 }, { "epoch": 0.0594883997620464, "grad_norm": 8.91545295715332, "learning_rate": 2.0921979149933576e-05, "loss": 1.3035, "step": 900 }, { "epoch": 0.0594883997620464, "eval_loss": 1.3120555877685547, "eval_runtime": 52.1726, "eval_samples_per_second": 9.603, "eval_steps_per_second": 9.603, "step": 900 }, { "epoch": 0.06014938198162469, "grad_norm": 6.539499759674072, "learning_rate": 2.0886603661542245e-05, "loss": 1.2819, "step": 910 }, { "epoch": 0.06081036420120299, "grad_norm": 5.03954553604126, "learning_rate": 2.0850829186663994e-05, "loss": 1.2467, "step": 920 }, { "epoch": 0.06147134642078128, "grad_norm": 12.52458381652832, "learning_rate": 2.0814657195847375e-05, "loss": 1.1568, "step": 930 }, { "epoch": 0.06213232864035957, "grad_norm": 11.251747131347656, "learning_rate": 2.077808917598125e-05, "loss": 1.1703, "step": 940 }, { "epoch": 0.06279331085993786, "grad_norm": 10.658408164978027, "learning_rate": 2.0741126630233687e-05, "loss": 1.1074, "step": 950 }, { "epoch": 0.06345429307951617, "grad_norm": 6.95957612991333, "learning_rate": 2.070377107799017e-05, "loss": 1.1635, "step": 960 }, { "epoch": 0.06411527529909446, "grad_norm": 10.898233413696289, "learning_rate": 2.0666024054791137e-05, "loss": 1.2801, "step": 970 }, { "epoch": 0.06477625751867275, "grad_norm": 12.640921592712402, "learning_rate": 2.0627887112268875e-05, "loss": 1.2982, "step": 980 }, { "epoch": 0.06543723973825104, "grad_norm": 6.845248699188232, "learning_rate": 2.0589361818083712e-05, "loss": 1.0552, "step": 990 }, { "epoch": 0.06609822195782933, "grad_norm": 12.774737358093262, "learning_rate": 2.0550449755859598e-05, "loss": 1.149, "step": 1000 }, { "epoch": 0.06675920417740763, "grad_norm": 12.460762977600098, "learning_rate": 2.0511152525119014e-05, "loss": 1.0864, "step": 1010 }, { "epoch": 0.06742018639698592, "grad_norm": 12.369227409362793, "learning_rate": 2.0471471741217183e-05, "loss": 1.2691, "step": 1020 }, { "epoch": 0.06808116861656421, "grad_norm": 15.577491760253906, "learning_rate": 2.0431409035275724e-05, "loss": 1.3091, "step": 1030 }, { "epoch": 0.0687421508361425, "grad_norm": 8.849650382995605, "learning_rate": 2.0390966054115558e-05, "loss": 1.2703, "step": 1040 }, { "epoch": 0.0694031330557208, "grad_norm": 13.82666015625, "learning_rate": 2.035014446018924e-05, "loss": 1.388, "step": 1050 }, { "epoch": 0.0694031330557208, "eval_loss": 1.303145170211792, "eval_runtime": 53.8965, "eval_samples_per_second": 9.296, "eval_steps_per_second": 9.296, "step": 1050 }, { "epoch": 0.0700641152752991, "grad_norm": 11.953422546386719, "learning_rate": 2.0308945931512606e-05, "loss": 1.1849, "step": 1060 }, { "epoch": 0.07072509749487739, "grad_norm": 6.583851337432861, "learning_rate": 2.0267372161595806e-05, "loss": 1.2334, "step": 1070 }, { "epoch": 0.07138607971445568, "grad_norm": 10.967381477355957, "learning_rate": 2.022542485937369e-05, "loss": 1.146, "step": 1080 }, { "epoch": 0.07204706193403397, "grad_norm": 11.6732177734375, "learning_rate": 2.0183105749135553e-05, "loss": 1.1601, "step": 1090 }, { "epoch": 0.07270804415361226, "grad_norm": 11.63559341430664, "learning_rate": 2.0140416570454266e-05, "loss": 1.2845, "step": 1100 }, { "epoch": 0.07336902637319057, "grad_norm": 8.482784271240234, "learning_rate": 2.0097359078114767e-05, "loss": 1.1344, "step": 1110 }, { "epoch": 0.07403000859276886, "grad_norm": 11.602831840515137, "learning_rate": 2.0053935042041915e-05, "loss": 1.2167, "step": 1120 }, { "epoch": 0.07469099081234715, "grad_norm": 6.016249179840088, "learning_rate": 2.001014624722775e-05, "loss": 1.2611, "step": 1130 }, { "epoch": 0.07535197303192544, "grad_norm": 6.9794020652771, "learning_rate": 1.996599449365813e-05, "loss": 1.0101, "step": 1140 }, { "epoch": 0.07601295525150373, "grad_norm": 10.84961986541748, "learning_rate": 1.9921481596238703e-05, "loss": 1.1906, "step": 1150 }, { "epoch": 0.07667393747108203, "grad_norm": 13.637924194335938, "learning_rate": 1.9876609384720335e-05, "loss": 1.2617, "step": 1160 }, { "epoch": 0.07733491969066032, "grad_norm": 11.967713356018066, "learning_rate": 1.9831379703623903e-05, "loss": 1.1903, "step": 1170 }, { "epoch": 0.07799590191023861, "grad_norm": 12.296497344970703, "learning_rate": 1.978579441216443e-05, "loss": 0.9757, "step": 1180 }, { "epoch": 0.0786568841298169, "grad_norm": 12.823221206665039, "learning_rate": 1.9739855384174708e-05, "loss": 1.2341, "step": 1190 }, { "epoch": 0.07931786634939521, "grad_norm": 9.349319458007812, "learning_rate": 1.969356450802825e-05, "loss": 1.1929, "step": 1200 }, { "epoch": 0.07931786634939521, "eval_loss": 1.3002644777297974, "eval_runtime": 46.8524, "eval_samples_per_second": 10.693, "eval_steps_per_second": 10.693, "step": 1200 }, { "epoch": 0.0799788485689735, "grad_norm": 6.869687080383301, "learning_rate": 1.964692368656166e-05, "loss": 0.9831, "step": 1210 }, { "epoch": 0.08063983078855179, "grad_norm": 12.35352897644043, "learning_rate": 1.9599934836996435e-05, "loss": 1.1827, "step": 1220 }, { "epoch": 0.08130081300813008, "grad_norm": 14.163335800170898, "learning_rate": 1.9552599890860126e-05, "loss": 1.2183, "step": 1230 }, { "epoch": 0.08196179522770837, "grad_norm": 14.357596397399902, "learning_rate": 1.9504920793906985e-05, "loss": 1.1122, "step": 1240 }, { "epoch": 0.08262277744728667, "grad_norm": 12.211373329162598, "learning_rate": 1.945689950603793e-05, "loss": 1.1785, "step": 1250 }, { "epoch": 0.08328375966686496, "grad_norm": 9.271207809448242, "learning_rate": 1.9408538001220032e-05, "loss": 1.3458, "step": 1260 }, { "epoch": 0.08394474188644326, "grad_norm": 8.985238075256348, "learning_rate": 1.9359838267405318e-05, "loss": 1.2764, "step": 1270 }, { "epoch": 0.08460572410602155, "grad_norm": 6.032650947570801, "learning_rate": 1.931080230644911e-05, "loss": 1.1252, "step": 1280 }, { "epoch": 0.08526670632559984, "grad_norm": 8.561097145080566, "learning_rate": 1.926143213402771e-05, "loss": 1.1761, "step": 1290 }, { "epoch": 0.08592768854517814, "grad_norm": 11.316914558410645, "learning_rate": 1.921172977955552e-05, "loss": 1.2844, "step": 1300 }, { "epoch": 0.08658867076475643, "grad_norm": 11.52777099609375, "learning_rate": 1.9161697286101677e-05, "loss": 1.3252, "step": 1310 }, { "epoch": 0.08724965298433472, "grad_norm": 7.112990379333496, "learning_rate": 1.9111336710306013e-05, "loss": 1.2886, "step": 1320 }, { "epoch": 0.08791063520391301, "grad_norm": 11.982434272766113, "learning_rate": 1.9060650122294554e-05, "loss": 1.2249, "step": 1330 }, { "epoch": 0.0885716174234913, "grad_norm": 5.956284046173096, "learning_rate": 1.9009639605594407e-05, "loss": 1.1993, "step": 1340 }, { "epoch": 0.08923259964306961, "grad_norm": 6.896420955657959, "learning_rate": 1.8958307257048116e-05, "loss": 1.2083, "step": 1350 }, { "epoch": 0.08923259964306961, "eval_loss": 1.2925916910171509, "eval_runtime": 53.3979, "eval_samples_per_second": 9.382, "eval_steps_per_second": 9.382, "step": 1350 }, { "epoch": 0.0898935818626479, "grad_norm": 11.231532096862793, "learning_rate": 1.890665518672748e-05, "loss": 1.3071, "step": 1360 }, { "epoch": 0.09055456408222619, "grad_norm": 8.269697189331055, "learning_rate": 1.88546855178468e-05, "loss": 1.3681, "step": 1370 }, { "epoch": 0.09121554630180448, "grad_norm": 9.768874168395996, "learning_rate": 1.880240038667561e-05, "loss": 1.1444, "step": 1380 }, { "epoch": 0.09187652852138277, "grad_norm": 12.701289176940918, "learning_rate": 1.874980194245087e-05, "loss": 1.2358, "step": 1390 }, { "epoch": 0.09253751074096107, "grad_norm": 7.481356620788574, "learning_rate": 1.8696892347288606e-05, "loss": 1.2474, "step": 1400 }, { "epoch": 0.09319849296053936, "grad_norm": 5.565570831298828, "learning_rate": 1.864367377609504e-05, "loss": 1.3041, "step": 1410 }, { "epoch": 0.09385947518011765, "grad_norm": 11.658685684204102, "learning_rate": 1.8590148416477198e-05, "loss": 1.2475, "step": 1420 }, { "epoch": 0.09452045739969595, "grad_norm": 7.721464157104492, "learning_rate": 1.8536318468652962e-05, "loss": 1.2889, "step": 1430 }, { "epoch": 0.09518143961927424, "grad_norm": 13.417887687683105, "learning_rate": 1.8482186145360648e-05, "loss": 1.0137, "step": 1440 }, { "epoch": 0.09584242183885254, "grad_norm": 12.11631965637207, "learning_rate": 1.8427753671768056e-05, "loss": 1.1422, "step": 1450 }, { "epoch": 0.09650340405843083, "grad_norm": 10.596673965454102, "learning_rate": 1.8373023285380966e-05, "loss": 1.3137, "step": 1460 }, { "epoch": 0.09716438627800912, "grad_norm": 7.0566558837890625, "learning_rate": 1.8317997235951204e-05, "loss": 1.1111, "step": 1470 }, { "epoch": 0.09782536849758741, "grad_norm": 11.534781455993652, "learning_rate": 1.8262677785384142e-05, "loss": 1.207, "step": 1480 }, { "epoch": 0.0984863507171657, "grad_norm": 10.579961776733398, "learning_rate": 1.8207067207645716e-05, "loss": 1.0107, "step": 1490 }, { "epoch": 0.099147332936744, "grad_norm": 11.584352493286133, "learning_rate": 1.815116778866897e-05, "loss": 1.3272, "step": 1500 }, { "epoch": 0.099147332936744, "eval_loss": 1.2920811176300049, "eval_runtime": 56.3843, "eval_samples_per_second": 8.885, "eval_steps_per_second": 8.885, "step": 1500 }, { "epoch": 0.0998083151563223, "grad_norm": 12.167766571044922, "learning_rate": 1.8094981826260064e-05, "loss": 1.1052, "step": 1510 }, { "epoch": 0.10046929737590059, "grad_norm": 6.422857284545898, "learning_rate": 1.8038511630003865e-05, "loss": 1.2341, "step": 1520 }, { "epoch": 0.10113027959547888, "grad_norm": 11.502632141113281, "learning_rate": 1.798175952116895e-05, "loss": 1.2251, "step": 1530 }, { "epoch": 0.10179126181505717, "grad_norm": 13.205157279968262, "learning_rate": 1.7924727832612227e-05, "loss": 1.2488, "step": 1540 }, { "epoch": 0.10245224403463547, "grad_norm": 7.521269798278809, "learning_rate": 1.786741890868305e-05, "loss": 1.2128, "step": 1550 }, { "epoch": 0.10311322625421376, "grad_norm": 7.006454944610596, "learning_rate": 1.7809835105126807e-05, "loss": 1.1772, "step": 1560 }, { "epoch": 0.10377420847379205, "grad_norm": 10.070454597473145, "learning_rate": 1.7751978788988123e-05, "loss": 1.2622, "step": 1570 }, { "epoch": 0.10443519069337034, "grad_norm": 5.716686248779297, "learning_rate": 1.7693852338513545e-05, "loss": 1.2284, "step": 1580 }, { "epoch": 0.10509617291294863, "grad_norm": 9.35854721069336, "learning_rate": 1.7635458143053794e-05, "loss": 1.1278, "step": 1590 }, { "epoch": 0.10575715513252694, "grad_norm": 8.222880363464355, "learning_rate": 1.7576798602965525e-05, "loss": 1.2629, "step": 1600 }, { "epoch": 0.10641813735210523, "grad_norm": 7.391974925994873, "learning_rate": 1.7517876129512677e-05, "loss": 1.1084, "step": 1610 }, { "epoch": 0.10707911957168352, "grad_norm": 9.882158279418945, "learning_rate": 1.7458693144767353e-05, "loss": 1.1754, "step": 1620 }, { "epoch": 0.10774010179126181, "grad_norm": 6.603885173797607, "learning_rate": 1.7399252081510248e-05, "loss": 1.2642, "step": 1630 }, { "epoch": 0.10840108401084012, "grad_norm": 9.928793907165527, "learning_rate": 1.733955538313066e-05, "loss": 1.2299, "step": 1640 }, { "epoch": 0.1090620662304184, "grad_norm": 13.607159614562988, "learning_rate": 1.7279605503526047e-05, "loss": 1.3297, "step": 1650 }, { "epoch": 0.1090620662304184, "eval_loss": 1.2833280563354492, "eval_runtime": 56.0628, "eval_samples_per_second": 8.936, "eval_steps_per_second": 8.936, "step": 1650 }, { "epoch": 0.1097230484499967, "grad_norm": 12.829073905944824, "learning_rate": 1.721940490700115e-05, "loss": 1.1734, "step": 1660 }, { "epoch": 0.11038403066957499, "grad_norm": 5.9544548988342285, "learning_rate": 1.7158956068166697e-05, "loss": 1.0935, "step": 1670 }, { "epoch": 0.11104501288915328, "grad_norm": 7.440855503082275, "learning_rate": 1.7098261471837696e-05, "loss": 1.22, "step": 1680 }, { "epoch": 0.11170599510873158, "grad_norm": 5.567168235778809, "learning_rate": 1.7037323612931272e-05, "loss": 1.1423, "step": 1690 }, { "epoch": 0.11236697732830987, "grad_norm": 5.937944412231445, "learning_rate": 1.697614499636414e-05, "loss": 1.148, "step": 1700 }, { "epoch": 0.11302795954788816, "grad_norm": 6.795397758483887, "learning_rate": 1.6914728136949594e-05, "loss": 1.2881, "step": 1710 }, { "epoch": 0.11368894176746645, "grad_norm": 8.981378555297852, "learning_rate": 1.6853075559294172e-05, "loss": 1.1772, "step": 1720 }, { "epoch": 0.11434992398704474, "grad_norm": 9.995403289794922, "learning_rate": 1.6791189797693877e-05, "loss": 1.1541, "step": 1730 }, { "epoch": 0.11501090620662305, "grad_norm": 12.851771354675293, "learning_rate": 1.6729073396029965e-05, "loss": 1.2167, "step": 1740 }, { "epoch": 0.11567188842620134, "grad_norm": 12.812955856323242, "learning_rate": 1.666672890766442e-05, "loss": 1.1763, "step": 1750 }, { "epoch": 0.11633287064577963, "grad_norm": 8.584874153137207, "learning_rate": 1.660415889533497e-05, "loss": 1.2797, "step": 1760 }, { "epoch": 0.11699385286535792, "grad_norm": 8.92071533203125, "learning_rate": 1.6541365931049757e-05, "loss": 1.23, "step": 1770 }, { "epoch": 0.11765483508493621, "grad_norm": 5.1022210121154785, "learning_rate": 1.6478352595981594e-05, "loss": 1.0536, "step": 1780 }, { "epoch": 0.11831581730451451, "grad_norm": 8.801514625549316, "learning_rate": 1.6415121480361884e-05, "loss": 1.0129, "step": 1790 }, { "epoch": 0.1189767995240928, "grad_norm": 11.475573539733887, "learning_rate": 1.635167518337413e-05, "loss": 1.2538, "step": 1800 }, { "epoch": 0.1189767995240928, "eval_loss": 1.278364896774292, "eval_runtime": 47.0777, "eval_samples_per_second": 10.642, "eval_steps_per_second": 10.642, "step": 1800 }, { "epoch": 0.1196377817436711, "grad_norm": 10.728155136108398, "learning_rate": 1.6288016313047095e-05, "loss": 1.2208, "step": 1810 }, { "epoch": 0.12029876396324939, "grad_norm": 12.165102005004883, "learning_rate": 1.6224147486147602e-05, "loss": 1.3179, "step": 1820 }, { "epoch": 0.12095974618282768, "grad_norm": 10.370355606079102, "learning_rate": 1.616007132807298e-05, "loss": 1.226, "step": 1830 }, { "epoch": 0.12162072840240598, "grad_norm": 13.64041519165039, "learning_rate": 1.6095790472743107e-05, "loss": 1.287, "step": 1840 }, { "epoch": 0.12228171062198427, "grad_norm": 9.342700958251953, "learning_rate": 1.6031307562492174e-05, "loss": 1.2169, "step": 1850 }, { "epoch": 0.12294269284156256, "grad_norm": 5.222902297973633, "learning_rate": 1.5966625247960068e-05, "loss": 1.2688, "step": 1860 }, { "epoch": 0.12360367506114085, "grad_norm": 6.980830669403076, "learning_rate": 1.5901746187983387e-05, "loss": 1.1797, "step": 1870 }, { "epoch": 0.12426465728071914, "grad_norm": 10.581820487976074, "learning_rate": 1.5836673049486175e-05, "loss": 1.1752, "step": 1880 }, { "epoch": 0.12492563950029745, "grad_norm": 10.523150444030762, "learning_rate": 1.577140850737029e-05, "loss": 1.2042, "step": 1890 }, { "epoch": 0.12558662171987572, "grad_norm": 6.221709251403809, "learning_rate": 1.5705955244405423e-05, "loss": 1.1912, "step": 1900 }, { "epoch": 0.12624760393945403, "grad_norm": 10.54680347442627, "learning_rate": 1.564031595111886e-05, "loss": 1.2476, "step": 1910 }, { "epoch": 0.12690858615903233, "grad_norm": 5.043491840362549, "learning_rate": 1.557449332568485e-05, "loss": 1.2221, "step": 1920 }, { "epoch": 0.1275695683786106, "grad_norm": 10.203733444213867, "learning_rate": 1.5508490073813722e-05, "loss": 1.1716, "step": 1930 }, { "epoch": 0.1282305505981889, "grad_norm": 7.249475955963135, "learning_rate": 1.5442308908640636e-05, "loss": 1.1548, "step": 1940 }, { "epoch": 0.1288915328177672, "grad_norm": 11.740514755249023, "learning_rate": 1.537595255061408e-05, "loss": 1.1863, "step": 1950 }, { "epoch": 0.1288915328177672, "eval_loss": 1.2681256532669067, "eval_runtime": 53.9387, "eval_samples_per_second": 9.288, "eval_steps_per_second": 9.288, "step": 1950 }, { "epoch": 0.1295525150373455, "grad_norm": 9.638320922851562, "learning_rate": 1.5309423727384037e-05, "loss": 1.2506, "step": 1960 }, { "epoch": 0.1302134972569238, "grad_norm": 7.702147483825684, "learning_rate": 1.5242725173689851e-05, "loss": 1.1908, "step": 1970 }, { "epoch": 0.13087447947650208, "grad_norm": 15.315128326416016, "learning_rate": 1.5175859631247827e-05, "loss": 1.1775, "step": 1980 }, { "epoch": 0.13153546169608038, "grad_norm": 6.902062892913818, "learning_rate": 1.5108829848638515e-05, "loss": 1.1696, "step": 1990 }, { "epoch": 0.13219644391565866, "grad_norm": 10.421862602233887, "learning_rate": 1.5041638581193741e-05, "loss": 1.1456, "step": 2000 }, { "epoch": 0.13285742613523696, "grad_norm": 12.304083824157715, "learning_rate": 1.4974288590883346e-05, "loss": 1.0899, "step": 2010 }, { "epoch": 0.13351840835481527, "grad_norm": 6.598790645599365, "learning_rate": 1.4906782646201634e-05, "loss": 1.1023, "step": 2020 }, { "epoch": 0.13417939057439354, "grad_norm": 10.214670181274414, "learning_rate": 1.4839123522053591e-05, "loss": 1.1551, "step": 2030 }, { "epoch": 0.13484037279397185, "grad_norm": 9.92830753326416, "learning_rate": 1.4771313999640806e-05, "loss": 1.1611, "step": 2040 }, { "epoch": 0.13550135501355012, "grad_norm": 11.352734565734863, "learning_rate": 1.4703356866347155e-05, "loss": 1.1261, "step": 2050 }, { "epoch": 0.13616233723312843, "grad_norm": 9.193647384643555, "learning_rate": 1.4635254915624214e-05, "loss": 1.1497, "step": 2060 }, { "epoch": 0.13682331945270673, "grad_norm": 8.309967994689941, "learning_rate": 1.4567010946876445e-05, "loss": 1.2163, "step": 2070 }, { "epoch": 0.137484301672285, "grad_norm": 9.005535125732422, "learning_rate": 1.4498627765346109e-05, "loss": 1.1769, "step": 2080 }, { "epoch": 0.1381452838918633, "grad_norm": 6.557043552398682, "learning_rate": 1.4430108181997962e-05, "loss": 1.093, "step": 2090 }, { "epoch": 0.1388062661114416, "grad_norm": 7.859200954437256, "learning_rate": 1.4361455013403695e-05, "loss": 1.2585, "step": 2100 }, { "epoch": 0.1388062661114416, "eval_loss": 1.2679221630096436, "eval_runtime": 46.9201, "eval_samples_per_second": 10.678, "eval_steps_per_second": 10.678, "step": 2100 }, { "epoch": 0.1394672483310199, "grad_norm": 12.011978149414062, "learning_rate": 1.4292671081626183e-05, "loss": 1.2173, "step": 2110 }, { "epoch": 0.1401282305505982, "grad_norm": 9.485074996948242, "learning_rate": 1.4223759214103443e-05, "loss": 1.2501, "step": 2120 }, { "epoch": 0.14078921277017648, "grad_norm": 11.757882118225098, "learning_rate": 1.4154722243532445e-05, "loss": 1.1974, "step": 2130 }, { "epoch": 0.14145019498975478, "grad_norm": 13.57962703704834, "learning_rate": 1.4085563007752654e-05, "loss": 1.1892, "step": 2140 }, { "epoch": 0.14211117720933306, "grad_norm": 9.708785057067871, "learning_rate": 1.4016284349629364e-05, "loss": 1.225, "step": 2150 }, { "epoch": 0.14277215942891136, "grad_norm": 10.492091178894043, "learning_rate": 1.3946889116936874e-05, "loss": 1.208, "step": 2160 }, { "epoch": 0.14343314164848966, "grad_norm": 7.376300811767578, "learning_rate": 1.3877380162241394e-05, "loss": 1.1689, "step": 2170 }, { "epoch": 0.14409412386806794, "grad_norm": 6.636634349822998, "learning_rate": 1.3807760342783804e-05, "loss": 1.1393, "step": 2180 }, { "epoch": 0.14475510608764625, "grad_norm": 12.17708969116211, "learning_rate": 1.37380325203622e-05, "loss": 1.2818, "step": 2190 }, { "epoch": 0.14541608830722452, "grad_norm": 12.49779987335205, "learning_rate": 1.3668199561214252e-05, "loss": 1.133, "step": 2200 }, { "epoch": 0.14607707052680283, "grad_norm": 6.741744518280029, "learning_rate": 1.35982643358994e-05, "loss": 1.1637, "step": 2210 }, { "epoch": 0.14673805274638113, "grad_norm": 9.643292427062988, "learning_rate": 1.3528229719180835e-05, "loss": 1.2758, "step": 2220 }, { "epoch": 0.1473990349659594, "grad_norm": 10.941937446594238, "learning_rate": 1.3458098589907348e-05, "loss": 1.268, "step": 2230 }, { "epoch": 0.1480600171855377, "grad_norm": 11.461699485778809, "learning_rate": 1.3387873830894973e-05, "loss": 1.0558, "step": 2240 }, { "epoch": 0.148720999405116, "grad_norm": 6.023902893066406, "learning_rate": 1.3317558328808506e-05, "loss": 1.1131, "step": 2250 }, { "epoch": 0.148720999405116, "eval_loss": 1.259637475013733, "eval_runtime": 52.7273, "eval_samples_per_second": 9.502, "eval_steps_per_second": 9.502, "step": 2250 }, { "epoch": 0.1493819816246943, "grad_norm": 11.362767219543457, "learning_rate": 1.3247154974042827e-05, "loss": 1.2487, "step": 2260 }, { "epoch": 0.1500429638442726, "grad_norm": 12.16934585571289, "learning_rate": 1.3176666660604102e-05, "loss": 1.3317, "step": 2270 }, { "epoch": 0.15070394606385087, "grad_norm": 7.8326849937438965, "learning_rate": 1.3106096285990812e-05, "loss": 1.1973, "step": 2280 }, { "epoch": 0.15136492828342918, "grad_norm": 7.108518600463867, "learning_rate": 1.3035446751074653e-05, "loss": 1.1605, "step": 2290 }, { "epoch": 0.15202591050300746, "grad_norm": 11.288322448730469, "learning_rate": 1.2964720959981287e-05, "loss": 1.1857, "step": 2300 }, { "epoch": 0.15268689272258576, "grad_norm": 5.468815803527832, "learning_rate": 1.2893921819970972e-05, "loss": 1.2428, "step": 2310 }, { "epoch": 0.15334787494216406, "grad_norm": 11.970479011535645, "learning_rate": 1.2823052241319061e-05, "loss": 1.2249, "step": 2320 }, { "epoch": 0.15400885716174234, "grad_norm": 9.788006782531738, "learning_rate": 1.2752115137196341e-05, "loss": 1.1832, "step": 2330 }, { "epoch": 0.15466983938132065, "grad_norm": 5.940231800079346, "learning_rate": 1.2681113423549334e-05, "loss": 1.0796, "step": 2340 }, { "epoch": 0.15533082160089895, "grad_norm": 5.606922149658203, "learning_rate": 1.2610050018980385e-05, "loss": 0.9388, "step": 2350 }, { "epoch": 0.15599180382047723, "grad_norm": 6.812578201293945, "learning_rate": 1.2538927844627726e-05, "loss": 1.12, "step": 2360 }, { "epoch": 0.15665278604005553, "grad_norm": 10.468450546264648, "learning_rate": 1.2467749824045373e-05, "loss": 1.1143, "step": 2370 }, { "epoch": 0.1573137682596338, "grad_norm": 6.699043273925781, "learning_rate": 1.2396518883082966e-05, "loss": 1.1317, "step": 2380 }, { "epoch": 0.1579747504792121, "grad_norm": 11.339058876037598, "learning_rate": 1.2325237949765496e-05, "loss": 1.1824, "step": 2390 }, { "epoch": 0.15863573269879042, "grad_norm": 6.434577941894531, "learning_rate": 1.225390995417295e-05, "loss": 1.0624, "step": 2400 }, { "epoch": 0.15863573269879042, "eval_loss": 1.253835678100586, "eval_runtime": 47.0266, "eval_samples_per_second": 10.654, "eval_steps_per_second": 10.654, "step": 2400 }, { "epoch": 0.1592967149183687, "grad_norm": 10.957035064697266, "learning_rate": 1.2182537828319848e-05, "loss": 1.265, "step": 2410 }, { "epoch": 0.159957697137947, "grad_norm": 12.669862747192383, "learning_rate": 1.2111124506034739e-05, "loss": 1.1453, "step": 2420 }, { "epoch": 0.16061867935752527, "grad_norm": 12.645952224731445, "learning_rate": 1.2039672922839598e-05, "loss": 1.1506, "step": 2430 }, { "epoch": 0.16127966157710358, "grad_norm": 12.920147895812988, "learning_rate": 1.196818601582915e-05, "loss": 1.0976, "step": 2440 }, { "epoch": 0.16194064379668188, "grad_norm": 13.062854766845703, "learning_rate": 1.189666672355015e-05, "loss": 1.3518, "step": 2450 }, { "epoch": 0.16260162601626016, "grad_norm": 5.583253860473633, "learning_rate": 1.1825117985880576e-05, "loss": 1.0854, "step": 2460 }, { "epoch": 0.16326260823583846, "grad_norm": 12.410826683044434, "learning_rate": 1.1753542743908802e-05, "loss": 1.1561, "step": 2470 }, { "epoch": 0.16392359045541674, "grad_norm": 11.445279121398926, "learning_rate": 1.1681943939812688e-05, "loss": 1.3584, "step": 2480 }, { "epoch": 0.16458457267499504, "grad_norm": 6.8058342933654785, "learning_rate": 1.1610324516738626e-05, "loss": 1.2373, "step": 2490 }, { "epoch": 0.16524555489457335, "grad_norm": 10.376558303833008, "learning_rate": 1.1538687418680596e-05, "loss": 1.0921, "step": 2500 }, { "epoch": 0.16590653711415163, "grad_norm": 6.7869791984558105, "learning_rate": 1.1467035590359106e-05, "loss": 1.2743, "step": 2510 }, { "epoch": 0.16656751933372993, "grad_norm": 12.313713073730469, "learning_rate": 1.139537197710018e-05, "loss": 1.1243, "step": 2520 }, { "epoch": 0.1672285015533082, "grad_norm": 11.535476684570312, "learning_rate": 1.1323699524714278e-05, "loss": 1.2232, "step": 2530 }, { "epoch": 0.1678894837728865, "grad_norm": 9.248635292053223, "learning_rate": 1.1252021179375192e-05, "loss": 1.0689, "step": 2540 }, { "epoch": 0.16855046599246482, "grad_norm": 10.689653396606445, "learning_rate": 1.118033988749895e-05, "loss": 1.2617, "step": 2550 }, { "epoch": 0.16855046599246482, "eval_loss": 1.2488397359848022, "eval_runtime": 52.0382, "eval_samples_per_second": 9.628, "eval_steps_per_second": 9.628, "step": 2550 }, { "epoch": 0.1692114482120431, "grad_norm": 12.502510070800781, "learning_rate": 1.1108658595622709e-05, "loss": 1.2023, "step": 2560 }, { "epoch": 0.1698724304316214, "grad_norm": 11.087409973144531, "learning_rate": 1.1036980250283621e-05, "loss": 1.2207, "step": 2570 }, { "epoch": 0.17053341265119967, "grad_norm": 9.92039680480957, "learning_rate": 1.096530779789772e-05, "loss": 1.1602, "step": 2580 }, { "epoch": 0.17119439487077798, "grad_norm": 5.836206912994385, "learning_rate": 1.0893644184638797e-05, "loss": 1.0523, "step": 2590 }, { "epoch": 0.17185537709035628, "grad_norm": 12.243383407592773, "learning_rate": 1.0821992356317307e-05, "loss": 1.2196, "step": 2600 }, { "epoch": 0.17251635930993456, "grad_norm": 6.7921366691589355, "learning_rate": 1.0750355258259273e-05, "loss": 1.2333, "step": 2610 }, { "epoch": 0.17317734152951286, "grad_norm": 11.758354187011719, "learning_rate": 1.0678735835185219e-05, "loss": 1.1695, "step": 2620 }, { "epoch": 0.17383832374909114, "grad_norm": 12.446253776550293, "learning_rate": 1.06071370310891e-05, "loss": 1.1428, "step": 2630 }, { "epoch": 0.17449930596866944, "grad_norm": 7.370149612426758, "learning_rate": 1.0535561789117327e-05, "loss": 1.262, "step": 2640 }, { "epoch": 0.17516028818824775, "grad_norm": 10.489151954650879, "learning_rate": 1.0464013051447755e-05, "loss": 1.0921, "step": 2650 }, { "epoch": 0.17582127040782602, "grad_norm": 10.34467887878418, "learning_rate": 1.0392493759168751e-05, "loss": 1.1942, "step": 2660 }, { "epoch": 0.17648225262740433, "grad_norm": 11.04796314239502, "learning_rate": 1.0321006852158306e-05, "loss": 1.0937, "step": 2670 }, { "epoch": 0.1771432348469826, "grad_norm": 12.193102836608887, "learning_rate": 1.0249555268963164e-05, "loss": 1.1015, "step": 2680 }, { "epoch": 0.1778042170665609, "grad_norm": 11.928840637207031, "learning_rate": 1.0178141946678054e-05, "loss": 1.2069, "step": 2690 }, { "epoch": 0.17846519928613921, "grad_norm": 6.055873870849609, "learning_rate": 1.0106769820824951e-05, "loss": 1.0915, "step": 2700 }, { "epoch": 0.17846519928613921, "eval_loss": 1.246018409729004, "eval_runtime": 47.997, "eval_samples_per_second": 10.438, "eval_steps_per_second": 10.438, "step": 2700 }, { "epoch": 0.1791261815057175, "grad_norm": 7.3669586181640625, "learning_rate": 1.0035441825232406e-05, "loss": 1.0824, "step": 2710 }, { "epoch": 0.1797871637252958, "grad_norm": 12.520928382873535, "learning_rate": 9.964160891914937e-06, "loss": 1.1395, "step": 2720 }, { "epoch": 0.18044814594487407, "grad_norm": 6.952485084533691, "learning_rate": 9.892929950952532e-06, "loss": 1.1727, "step": 2730 }, { "epoch": 0.18110912816445238, "grad_norm": 10.507661819458008, "learning_rate": 9.821751930370177e-06, "loss": 1.184, "step": 2740 }, { "epoch": 0.18177011038403068, "grad_norm": 12.77137279510498, "learning_rate": 9.750629756017514e-06, "loss": 1.228, "step": 2750 }, { "epoch": 0.18243109260360896, "grad_norm": 7.609248161315918, "learning_rate": 9.679566351448571e-06, "loss": 1.1315, "step": 2760 }, { "epoch": 0.18309207482318726, "grad_norm": 11.428009986877441, "learning_rate": 9.608564637801562e-06, "loss": 1.041, "step": 2770 }, { "epoch": 0.18375305704276554, "grad_norm": 12.582087516784668, "learning_rate": 9.537627533678842e-06, "loss": 1.1608, "step": 2780 }, { "epoch": 0.18441403926234384, "grad_norm": 10.488136291503906, "learning_rate": 9.466757955026925e-06, "loss": 1.0935, "step": 2790 }, { "epoch": 0.18507502148192215, "grad_norm": 12.54319953918457, "learning_rate": 9.395958815016618e-06, "loss": 1.1654, "step": 2800 }, { "epoch": 0.18573600370150042, "grad_norm": 10.314374923706055, "learning_rate": 9.325233023923252e-06, "loss": 1.2293, "step": 2810 }, { "epoch": 0.18639698592107873, "grad_norm": 7.015604496002197, "learning_rate": 9.25458348900709e-06, "loss": 1.0994, "step": 2820 }, { "epoch": 0.187057968140657, "grad_norm": 6.349636554718018, "learning_rate": 9.1840131143938e-06, "loss": 1.2272, "step": 2830 }, { "epoch": 0.1877189503602353, "grad_norm": 9.584831237792969, "learning_rate": 9.113524800955074e-06, "loss": 1.1187, "step": 2840 }, { "epoch": 0.1883799325798136, "grad_norm": 4.967813491821289, "learning_rate": 9.043121446189398e-06, "loss": 1.0012, "step": 2850 }, { "epoch": 0.1883799325798136, "eval_loss": 1.2398909330368042, "eval_runtime": 53.5377, "eval_samples_per_second": 9.358, "eval_steps_per_second": 9.358, "step": 2850 }, { "epoch": 0.1890409147993919, "grad_norm": 11.762967109680176, "learning_rate": 8.972805944102928e-06, "loss": 1.1628, "step": 2860 }, { "epoch": 0.1897018970189702, "grad_norm": 9.806082725524902, "learning_rate": 8.902581185090555e-06, "loss": 1.0982, "step": 2870 }, { "epoch": 0.19036287923854847, "grad_norm": 5.619679927825928, "learning_rate": 8.832450055817064e-06, "loss": 1.1545, "step": 2880 }, { "epoch": 0.19102386145812678, "grad_norm": 12.290181159973145, "learning_rate": 8.7624154390985e-06, "loss": 1.1625, "step": 2890 }, { "epoch": 0.19168484367770508, "grad_norm": 12.353217124938965, "learning_rate": 8.692480213783649e-06, "loss": 1.159, "step": 2900 }, { "epoch": 0.19234582589728336, "grad_norm": 9.661192893981934, "learning_rate": 8.622647254635703e-06, "loss": 1.2334, "step": 2910 }, { "epoch": 0.19300680811686166, "grad_norm": 10.236005783081055, "learning_rate": 8.552919432214097e-06, "loss": 1.1434, "step": 2920 }, { "epoch": 0.19366779033643994, "grad_norm": 11.429096221923828, "learning_rate": 8.483299612756505e-06, "loss": 1.2204, "step": 2930 }, { "epoch": 0.19432877255601824, "grad_norm": 7.723197937011719, "learning_rate": 8.413790658061028e-06, "loss": 1.2049, "step": 2940 }, { "epoch": 0.19498975477559655, "grad_norm": 9.042826652526855, "learning_rate": 8.344395425368537e-06, "loss": 1.1231, "step": 2950 }, { "epoch": 0.19565073699517482, "grad_norm": 11.260157585144043, "learning_rate": 8.275116767245251e-06, "loss": 1.1543, "step": 2960 }, { "epoch": 0.19631171921475313, "grad_norm": 5.6008830070495605, "learning_rate": 8.205957531465456e-06, "loss": 1.0243, "step": 2970 }, { "epoch": 0.1969727014343314, "grad_norm": 5.492390155792236, "learning_rate": 8.136920560894458e-06, "loss": 1.2962, "step": 2980 }, { "epoch": 0.1976336836539097, "grad_norm": 10.791748046875, "learning_rate": 8.068008693371723e-06, "loss": 1.0384, "step": 2990 }, { "epoch": 0.198294665873488, "grad_norm": 6.472116470336914, "learning_rate": 7.999224761594206e-06, "loss": 1.0479, "step": 3000 }, { "epoch": 0.198294665873488, "eval_loss": 1.2349213361740112, "eval_runtime": 53.0521, "eval_samples_per_second": 9.444, "eval_steps_per_second": 9.444, "step": 3000 }, { "epoch": 0.1989556480930663, "grad_norm": 7.443964958190918, "learning_rate": 7.930571592999942e-06, "loss": 1.1367, "step": 3010 }, { "epoch": 0.1996166303126446, "grad_norm": 7.271074295043945, "learning_rate": 7.86205200965179e-06, "loss": 1.1435, "step": 3020 }, { "epoch": 0.20027761253222287, "grad_norm": 12.19694995880127, "learning_rate": 7.793668828121457e-06, "loss": 1.274, "step": 3030 }, { "epoch": 0.20093859475180118, "grad_norm": 6.130085468292236, "learning_rate": 7.725424859373688e-06, "loss": 1.1887, "step": 3040 }, { "epoch": 0.20159957697137948, "grad_norm": 8.441886901855469, "learning_rate": 7.65732290865075e-06, "loss": 1.1228, "step": 3050 }, { "epoch": 0.20226055919095776, "grad_norm": 10.298881530761719, "learning_rate": 7.589365775357096e-06, "loss": 1.1681, "step": 3060 }, { "epoch": 0.20292154141053606, "grad_norm": 5.6892218589782715, "learning_rate": 7.52155625294431e-06, "loss": 1.1967, "step": 3070 }, { "epoch": 0.20358252363011434, "grad_norm": 4.733664035797119, "learning_rate": 7.453897128796269e-06, "loss": 0.9874, "step": 3080 }, { "epoch": 0.20424350584969264, "grad_norm": 6.695845603942871, "learning_rate": 7.386391184114558e-06, "loss": 1.2284, "step": 3090 }, { "epoch": 0.20490448806927095, "grad_norm": 11.191842079162598, "learning_rate": 7.319041193804161e-06, "loss": 1.2232, "step": 3100 }, { "epoch": 0.20556547028884922, "grad_norm": 6.132591724395752, "learning_rate": 7.2518499263593866e-06, "loss": 1.12, "step": 3110 }, { "epoch": 0.20622645250842753, "grad_norm": 11.867471694946289, "learning_rate": 7.184820143750079e-06, "loss": 1.1889, "step": 3120 }, { "epoch": 0.2068874347280058, "grad_norm": 10.931007385253906, "learning_rate": 7.117954601308052e-06, "loss": 1.2347, "step": 3130 }, { "epoch": 0.2075484169475841, "grad_norm": 12.895480155944824, "learning_rate": 7.051256047613866e-06, "loss": 1.216, "step": 3140 }, { "epoch": 0.2082093991671624, "grad_norm": 10.634278297424316, "learning_rate": 6.984727224383822e-06, "loss": 1.1687, "step": 3150 }, { "epoch": 0.2082093991671624, "eval_loss": 1.2307320833206177, "eval_runtime": 58.1752, "eval_samples_per_second": 8.612, "eval_steps_per_second": 8.612, "step": 3150 }, { "epoch": 0.2088703813867407, "grad_norm": 11.298223495483398, "learning_rate": 6.918370866357266e-06, "loss": 1.1429, "step": 3160 }, { "epoch": 0.209531363606319, "grad_norm": 5.801537036895752, "learning_rate": 6.852189701184183e-06, "loss": 1.1809, "step": 3170 }, { "epoch": 0.21019234582589727, "grad_norm": 11.565352439880371, "learning_rate": 6.786186449313051e-06, "loss": 1.1068, "step": 3180 }, { "epoch": 0.21085332804547557, "grad_norm": 9.563201904296875, "learning_rate": 6.720363823879042e-06, "loss": 1.1438, "step": 3190 }, { "epoch": 0.21151431026505388, "grad_norm": 3.7967348098754883, "learning_rate": 6.6547245305924765e-06, "loss": 1.1022, "step": 3200 }, { "epoch": 0.21217529248463216, "grad_norm": 9.867331504821777, "learning_rate": 6.589271267627615e-06, "loss": 1.0329, "step": 3210 }, { "epoch": 0.21283627470421046, "grad_norm": 10.908332824707031, "learning_rate": 6.524006725511727e-06, "loss": 1.0811, "step": 3220 }, { "epoch": 0.21349725692378874, "grad_norm": 11.866363525390625, "learning_rate": 6.4589335870145165e-06, "loss": 1.1611, "step": 3230 }, { "epoch": 0.21415823914336704, "grad_norm": 12.108943939208984, "learning_rate": 6.394054527037837e-06, "loss": 1.1558, "step": 3240 }, { "epoch": 0.21481922136294535, "grad_norm": 11.09125804901123, "learning_rate": 6.329372212505727e-06, "loss": 1.1853, "step": 3250 }, { "epoch": 0.21548020358252362, "grad_norm": 12.74525260925293, "learning_rate": 6.264889302254797e-06, "loss": 1.1862, "step": 3260 }, { "epoch": 0.21614118580210193, "grad_norm": 9.876714706420898, "learning_rate": 6.200608446924922e-06, "loss": 1.1651, "step": 3270 }, { "epoch": 0.21680216802168023, "grad_norm": 9.700896263122559, "learning_rate": 6.136532288850295e-06, "loss": 1.2345, "step": 3280 }, { "epoch": 0.2174631502412585, "grad_norm": 10.941569328308105, "learning_rate": 6.072663461950806e-06, "loss": 1.0379, "step": 3290 }, { "epoch": 0.2181241324608368, "grad_norm": 13.29504108428955, "learning_rate": 6.009004591623776e-06, "loss": 1.1251, "step": 3300 }, { "epoch": 0.2181241324608368, "eval_loss": 1.2260839939117432, "eval_runtime": 47.8562, "eval_samples_per_second": 10.469, "eval_steps_per_second": 10.469, "step": 3300 }, { "epoch": 0.2187851146804151, "grad_norm": 8.1751708984375, "learning_rate": 5.945558294636019e-06, "loss": 1.1452, "step": 3310 }, { "epoch": 0.2194460968999934, "grad_norm": 12.451173782348633, "learning_rate": 5.882327179016307e-06, "loss": 1.217, "step": 3320 }, { "epoch": 0.2201070791195717, "grad_norm": 11.116937637329102, "learning_rate": 5.819313843948146e-06, "loss": 1.1602, "step": 3330 }, { "epoch": 0.22076806133914997, "grad_norm": 10.272557258605957, "learning_rate": 5.756520879662929e-06, "loss": 1.2616, "step": 3340 }, { "epoch": 0.22142904355872828, "grad_norm": 10.73164176940918, "learning_rate": 5.693950867333488e-06, "loss": 1.2448, "step": 3350 }, { "epoch": 0.22209002577830655, "grad_norm": 11.405309677124023, "learning_rate": 5.6316063789679415e-06, "loss": 1.2419, "step": 3360 }, { "epoch": 0.22275100799788486, "grad_norm": 6.117231369018555, "learning_rate": 5.569489977304029e-06, "loss": 1.2027, "step": 3370 }, { "epoch": 0.22341199021746316, "grad_norm": 12.008468627929688, "learning_rate": 5.507604215703729e-06, "loss": 1.1525, "step": 3380 }, { "epoch": 0.22407297243704144, "grad_norm": 6.268473148345947, "learning_rate": 5.44595163804831e-06, "loss": 1.1422, "step": 3390 }, { "epoch": 0.22473395465661974, "grad_norm": 14.515848159790039, "learning_rate": 5.384534778633763e-06, "loss": 1.0998, "step": 3400 }, { "epoch": 0.22539493687619802, "grad_norm": 10.610064506530762, "learning_rate": 5.323356162066626e-06, "loss": 1.2074, "step": 3410 }, { "epoch": 0.22605591909577633, "grad_norm": 11.648080825805664, "learning_rate": 5.262418303160206e-06, "loss": 1.0755, "step": 3420 }, { "epoch": 0.22671690131535463, "grad_norm": 6.210646629333496, "learning_rate": 5.201723706831204e-06, "loss": 1.1203, "step": 3430 }, { "epoch": 0.2273778835349329, "grad_norm": 4.218708038330078, "learning_rate": 5.141274867996755e-06, "loss": 0.9939, "step": 3440 }, { "epoch": 0.2280388657545112, "grad_norm": 8.179903030395508, "learning_rate": 5.081074271471855e-06, "loss": 1.0597, "step": 3450 }, { "epoch": 0.2280388657545112, "eval_loss": 1.2263822555541992, "eval_runtime": 52.855, "eval_samples_per_second": 9.479, "eval_steps_per_second": 9.479, "step": 3450 }, { "epoch": 0.2286998479740895, "grad_norm": 13.975303649902344, "learning_rate": 5.021124391867241e-06, "loss": 1.1898, "step": 3460 }, { "epoch": 0.2293608301936678, "grad_norm": 11.902430534362793, "learning_rate": 4.961427693487654e-06, "loss": 1.2382, "step": 3470 }, { "epoch": 0.2300218124132461, "grad_norm": 7.363813877105713, "learning_rate": 4.901986630230549e-06, "loss": 1.1337, "step": 3480 }, { "epoch": 0.23068279463282437, "grad_norm": 14.231773376464844, "learning_rate": 4.842803645485228e-06, "loss": 1.2631, "step": 3490 }, { "epoch": 0.23134377685240268, "grad_norm": 13.055315971374512, "learning_rate": 4.7838811720323795e-06, "loss": 1.2307, "step": 3500 }, { "epoch": 0.23200475907198095, "grad_norm": 11.109673500061035, "learning_rate": 4.725221631944109e-06, "loss": 1.0673, "step": 3510 }, { "epoch": 0.23266574129155926, "grad_norm": 9.12000560760498, "learning_rate": 4.666827436484355e-06, "loss": 1.2818, "step": 3520 }, { "epoch": 0.23332672351113756, "grad_norm": 11.266242980957031, "learning_rate": 4.60870098600978e-06, "loss": 0.9892, "step": 3530 }, { "epoch": 0.23398770573071584, "grad_norm": 13.089488983154297, "learning_rate": 4.550844669871095e-06, "loss": 1.1585, "step": 3540 }, { "epoch": 0.23464868795029414, "grad_norm": 9.938103675842285, "learning_rate": 4.493260866314851e-06, "loss": 1.1734, "step": 3550 }, { "epoch": 0.23530967016987242, "grad_norm": 10.093935012817383, "learning_rate": 4.435951942385671e-06, "loss": 1.1185, "step": 3560 }, { "epoch": 0.23597065238945072, "grad_norm": 4.782352924346924, "learning_rate": 4.378920253828953e-06, "loss": 1.1413, "step": 3570 }, { "epoch": 0.23663163460902903, "grad_norm": 11.091765403747559, "learning_rate": 4.322168144994041e-06, "loss": 1.2909, "step": 3580 }, { "epoch": 0.2372926168286073, "grad_norm": 10.81592845916748, "learning_rate": 4.265697948737836e-06, "loss": 1.2501, "step": 3590 }, { "epoch": 0.2379535990481856, "grad_norm": 11.043889045715332, "learning_rate": 4.209511986328935e-06, "loss": 1.1757, "step": 3600 }, { "epoch": 0.2379535990481856, "eval_loss": 1.223681092262268, "eval_runtime": 54.0238, "eval_samples_per_second": 9.274, "eval_steps_per_second": 9.274, "step": 3600 }, { "epoch": 0.2386145812677639, "grad_norm": 6.890323638916016, "learning_rate": 4.153612567352186e-06, "loss": 1.0562, "step": 3610 }, { "epoch": 0.2392755634873422, "grad_norm": 8.741559028625488, "learning_rate": 4.098001989613763e-06, "loss": 1.1737, "step": 3620 }, { "epoch": 0.2399365457069205, "grad_norm": 12.617691993713379, "learning_rate": 4.042682539046698e-06, "loss": 1.2365, "step": 3630 }, { "epoch": 0.24059752792649877, "grad_norm": 6.839216232299805, "learning_rate": 3.987656489616937e-06, "loss": 1.1941, "step": 3640 }, { "epoch": 0.24125851014607708, "grad_norm": 10.760446548461914, "learning_rate": 3.932926103229849e-06, "loss": 1.1187, "step": 3650 }, { "epoch": 0.24191949236565535, "grad_norm": 7.493879795074463, "learning_rate": 3.878493629637249e-06, "loss": 1.1193, "step": 3660 }, { "epoch": 0.24258047458523366, "grad_norm": 8.233012199401855, "learning_rate": 3.824361306344942e-06, "loss": 1.1905, "step": 3670 }, { "epoch": 0.24324145680481196, "grad_norm": 8.992157936096191, "learning_rate": 3.7705313585207056e-06, "loss": 1.0877, "step": 3680 }, { "epoch": 0.24390243902439024, "grad_norm": 13.892884254455566, "learning_rate": 3.717005998902859e-06, "loss": 1.1345, "step": 3690 }, { "epoch": 0.24456342124396854, "grad_norm": 10.53703784942627, "learning_rate": 3.6637874277092946e-06, "loss": 1.1473, "step": 3700 }, { "epoch": 0.24522440346354682, "grad_norm": 5.2873406410217285, "learning_rate": 3.610877832547034e-06, "loss": 1.0317, "step": 3710 }, { "epoch": 0.24588538568312512, "grad_norm": 8.536104202270508, "learning_rate": 3.5582793883222923e-06, "loss": 1.0296, "step": 3720 }, { "epoch": 0.24654636790270343, "grad_norm": 7.4764227867126465, "learning_rate": 3.5059942571511037e-06, "loss": 1.0728, "step": 3730 }, { "epoch": 0.2472073501222817, "grad_norm": 9.194038391113281, "learning_rate": 3.4540245882704213e-06, "loss": 1.1157, "step": 3740 }, { "epoch": 0.24786833234186, "grad_norm": 10.502184867858887, "learning_rate": 3.4023725179497848e-06, "loss": 1.1923, "step": 3750 }, { "epoch": 0.24786833234186, "eval_loss": 1.2212793827056885, "eval_runtime": 53.4315, "eval_samples_per_second": 9.376, "eval_steps_per_second": 9.376, "step": 3750 }, { "epoch": 0.24852931456143829, "grad_norm": 7.8659234046936035, "learning_rate": 3.351040169403499e-06, "loss": 1.0991, "step": 3760 }, { "epoch": 0.2491902967810166, "grad_norm": 8.55827808380127, "learning_rate": 3.30002965270335e-06, "loss": 1.0168, "step": 3770 }, { "epoch": 0.2498512790005949, "grad_norm": 10.08139705657959, "learning_rate": 3.2493430646918865e-06, "loss": 1.188, "step": 3780 }, { "epoch": 0.25051226122017317, "grad_norm": 7.772961139678955, "learning_rate": 3.1989824888962225e-06, "loss": 1.1373, "step": 3790 }, { "epoch": 0.25117324343975145, "grad_norm": 7.485221862792969, "learning_rate": 3.1489499954423797e-06, "loss": 1.2637, "step": 3800 }, { "epoch": 0.2518342256593298, "grad_norm": 14.595245361328125, "learning_rate": 3.0992476409701936e-06, "loss": 1.1433, "step": 3810 }, { "epoch": 0.25249520787890806, "grad_norm": 11.104635238647461, "learning_rate": 3.0498774685487882e-06, "loss": 1.1773, "step": 3820 }, { "epoch": 0.25315619009848633, "grad_norm": 6.462589263916016, "learning_rate": 3.000841507592583e-06, "loss": 1.0087, "step": 3830 }, { "epoch": 0.25381717231806467, "grad_norm": 12.072765350341797, "learning_rate": 2.9521417737778717e-06, "loss": 1.0804, "step": 3840 }, { "epoch": 0.25447815453764294, "grad_norm": 11.500109672546387, "learning_rate": 2.9037802689599704e-06, "loss": 1.1597, "step": 3850 }, { "epoch": 0.2551391367572212, "grad_norm": 8.149591445922852, "learning_rate": 2.855758981090918e-06, "loss": 1.2028, "step": 3860 }, { "epoch": 0.25580011897679955, "grad_norm": 11.354681015014648, "learning_rate": 2.8080798841377743e-06, "loss": 1.1725, "step": 3870 }, { "epoch": 0.2564611011963778, "grad_norm": 9.085524559020996, "learning_rate": 2.7607449380014703e-06, "loss": 1.2511, "step": 3880 }, { "epoch": 0.2571220834159561, "grad_norm": 10.283825874328613, "learning_rate": 2.713756088436244e-06, "loss": 1.1444, "step": 3890 }, { "epoch": 0.2577830656355344, "grad_norm": 11.607617378234863, "learning_rate": 2.6671152669696515e-06, "loss": 1.1419, "step": 3900 }, { "epoch": 0.2577830656355344, "eval_loss": 1.2201364040374756, "eval_runtime": 55.3983, "eval_samples_per_second": 9.044, "eval_steps_per_second": 9.044, "step": 3900 }, { "epoch": 0.2584440478551127, "grad_norm": 7.006284713745117, "learning_rate": 2.6208243908231916e-06, "loss": 1.0414, "step": 3910 }, { "epoch": 0.259105030074691, "grad_norm": 10.41873550415039, "learning_rate": 2.57488536283347e-06, "loss": 1.1597, "step": 3920 }, { "epoch": 0.25976601229426927, "grad_norm": 9.293778419494629, "learning_rate": 2.5293000713739977e-06, "loss": 1.182, "step": 3930 }, { "epoch": 0.2604269945138476, "grad_norm": 11.898356437683105, "learning_rate": 2.4840703902775642e-06, "loss": 1.2502, "step": 3940 }, { "epoch": 0.2610879767334259, "grad_norm": 9.323407173156738, "learning_rate": 2.4391981787592005e-06, "loss": 1.0892, "step": 3950 }, { "epoch": 0.26174895895300415, "grad_norm": 11.664414405822754, "learning_rate": 2.3946852813397737e-06, "loss": 1.1837, "step": 3960 }, { "epoch": 0.2624099411725825, "grad_norm": 11.392061233520508, "learning_rate": 2.3505335277701494e-06, "loss": 1.0029, "step": 3970 }, { "epoch": 0.26307092339216076, "grad_norm": 10.388303756713867, "learning_rate": 2.306744732955991e-06, "loss": 1.172, "step": 3980 }, { "epoch": 0.26373190561173904, "grad_norm": 11.332767486572266, "learning_rate": 2.2633206968831374e-06, "loss": 1.1951, "step": 3990 }, { "epoch": 0.2643928878313173, "grad_norm": 4.8323259353637695, "learning_rate": 2.220263204543635e-06, "loss": 1.0181, "step": 4000 }, { "epoch": 0.26505387005089565, "grad_norm": 11.138567924499512, "learning_rate": 2.1775740258623492e-06, "loss": 1.1295, "step": 4010 }, { "epoch": 0.2657148522704739, "grad_norm": 7.644820690155029, "learning_rate": 2.1352549156242126e-06, "loss": 1.1392, "step": 4020 }, { "epoch": 0.2663758344900522, "grad_norm": 11.998611450195312, "learning_rate": 2.0933076134020958e-06, "loss": 1.1516, "step": 4030 }, { "epoch": 0.26703681670963053, "grad_norm": 9.40128231048584, "learning_rate": 2.0517338434852946e-06, "loss": 1.1157, "step": 4040 }, { "epoch": 0.2676977989292088, "grad_norm": 7.291782379150391, "learning_rate": 2.010535314808659e-06, "loss": 1.1069, "step": 4050 }, { "epoch": 0.2676977989292088, "eval_loss": 1.2179657220840454, "eval_runtime": 53.213, "eval_samples_per_second": 9.415, "eval_steps_per_second": 9.415, "step": 4050 }, { "epoch": 0.2683587811487871, "grad_norm": 11.658596992492676, "learning_rate": 1.9697137208823396e-06, "loss": 1.172, "step": 4060 }, { "epoch": 0.2690197633683654, "grad_norm": 5.082404613494873, "learning_rate": 1.9292707397221775e-06, "loss": 1.1331, "step": 4070 }, { "epoch": 0.2696807455879437, "grad_norm": 13.126559257507324, "learning_rate": 1.8892080337807171e-06, "loss": 1.1899, "step": 4080 }, { "epoch": 0.27034172780752197, "grad_norm": 11.264731407165527, "learning_rate": 1.8495272498788887e-06, "loss": 1.0929, "step": 4090 }, { "epoch": 0.27100271002710025, "grad_norm": 12.232498168945312, "learning_rate": 1.8102300191383008e-06, "loss": 1.1517, "step": 4100 }, { "epoch": 0.2716636922466786, "grad_norm": 6.517210483551025, "learning_rate": 1.7713179569141897e-06, "loss": 1.1451, "step": 4110 }, { "epoch": 0.27232467446625686, "grad_norm": 10.073516845703125, "learning_rate": 1.7327926627290298e-06, "loss": 1.1757, "step": 4120 }, { "epoch": 0.27298565668583513, "grad_norm": 10.904183387756348, "learning_rate": 1.6946557202067662e-06, "loss": 1.201, "step": 4130 }, { "epoch": 0.27364663890541346, "grad_norm": 9.502151489257812, "learning_rate": 1.6569086970077352e-06, "loss": 1.1649, "step": 4140 }, { "epoch": 0.27430762112499174, "grad_norm": 12.71923542022705, "learning_rate": 1.6195531447642177e-06, "loss": 1.2048, "step": 4150 }, { "epoch": 0.27496860334457, "grad_norm": 13.27767562866211, "learning_rate": 1.582590599016653e-06, "loss": 1.0894, "step": 4160 }, { "epoch": 0.27562958556414835, "grad_norm": 12.859643936157227, "learning_rate": 1.5460225791505258e-06, "loss": 1.1565, "step": 4170 }, { "epoch": 0.2762905677837266, "grad_norm": 6.589792728424072, "learning_rate": 1.509850588333905e-06, "loss": 1.0296, "step": 4180 }, { "epoch": 0.2769515500033049, "grad_norm": 13.752243995666504, "learning_rate": 1.4740761134556557e-06, "loss": 1.312, "step": 4190 }, { "epoch": 0.2776125322228832, "grad_norm": 12.691303253173828, "learning_rate": 1.4387006250643236e-06, "loss": 1.1494, "step": 4200 }, { "epoch": 0.2776125322228832, "eval_loss": 1.2168010473251343, "eval_runtime": 51.4283, "eval_samples_per_second": 9.742, "eval_steps_per_second": 9.742, "step": 4200 }, { "epoch": 0.2782735144424615, "grad_norm": 11.23477840423584, "learning_rate": 1.4037255773076804e-06, "loss": 1.0421, "step": 4210 }, { "epoch": 0.2789344966620398, "grad_norm": 10.921051979064941, "learning_rate": 1.3691524078729481e-06, "loss": 1.055, "step": 4220 }, { "epoch": 0.27959547888161806, "grad_norm": 7.342863082885742, "learning_rate": 1.3349825379277099e-06, "loss": 1.2973, "step": 4230 }, { "epoch": 0.2802564611011964, "grad_norm": 11.837105751037598, "learning_rate": 1.3012173720614862e-06, "loss": 1.2177, "step": 4240 }, { "epoch": 0.2809174433207747, "grad_norm": 13.415239334106445, "learning_rate": 1.267858298227995e-06, "loss": 1.1455, "step": 4250 }, { "epoch": 0.28157842554035295, "grad_norm": 11.301210403442383, "learning_rate": 1.2349066876881063e-06, "loss": 1.1602, "step": 4260 }, { "epoch": 0.2822394077599313, "grad_norm": 5.907723903656006, "learning_rate": 1.202363894953462e-06, "loss": 1.1053, "step": 4270 }, { "epoch": 0.28290038997950956, "grad_norm": 12.926289558410645, "learning_rate": 1.1702312577308133e-06, "loss": 1.2056, "step": 4280 }, { "epoch": 0.28356137219908784, "grad_norm": 10.026867866516113, "learning_rate": 1.1385100968670189e-06, "loss": 1.1685, "step": 4290 }, { "epoch": 0.2842223544186661, "grad_norm": 12.193798065185547, "learning_rate": 1.107201716294762e-06, "loss": 1.1253, "step": 4300 }, { "epoch": 0.28488333663824444, "grad_norm": 6.5807294845581055, "learning_rate": 1.076307402978938e-06, "loss": 1.1252, "step": 4310 }, { "epoch": 0.2855443188578227, "grad_norm": 11.568461418151855, "learning_rate": 1.0458284268637652e-06, "loss": 1.2131, "step": 4320 }, { "epoch": 0.286205301077401, "grad_norm": 5.46840238571167, "learning_rate": 1.0157660408205728e-06, "loss": 1.0678, "step": 4330 }, { "epoch": 0.28686628329697933, "grad_norm": 13.20085334777832, "learning_rate": 9.861214805963042e-07, "loss": 1.1974, "step": 4340 }, { "epoch": 0.2875272655165576, "grad_norm": 13.585931777954102, "learning_rate": 9.568959647627223e-07, "loss": 1.1664, "step": 4350 }, { "epoch": 0.2875272655165576, "eval_loss": 1.21638822555542, "eval_runtime": 51.7738, "eval_samples_per_second": 9.677, "eval_steps_per_second": 9.677, "step": 4350 }, { "epoch": 0.2881882477361359, "grad_norm": 7.628300189971924, "learning_rate": 9.280906946663111e-07, "loss": 1.0584, "step": 4360 }, { "epoch": 0.2888492299557142, "grad_norm": 8.380716323852539, "learning_rate": 8.997068543789051e-07, "loss": 1.1137, "step": 4370 }, { "epoch": 0.2895102121752925, "grad_norm": 12.071667671203613, "learning_rate": 8.717456106490042e-07, "loss": 1.0887, "step": 4380 }, { "epoch": 0.29017119439487077, "grad_norm": 6.33940315246582, "learning_rate": 8.442081128538243e-07, "loss": 1.0145, "step": 4390 }, { "epoch": 0.29083217661444905, "grad_norm": 9.972112655639648, "learning_rate": 8.170954929520389e-07, "loss": 1.1362, "step": 4400 }, { "epoch": 0.2914931588340274, "grad_norm": 12.998346328735352, "learning_rate": 7.904088654372622e-07, "loss": 1.148, "step": 4410 }, { "epoch": 0.29215414105360565, "grad_norm": 5.646799087524414, "learning_rate": 7.641493272922243e-07, "loss": 1.1281, "step": 4420 }, { "epoch": 0.29281512327318393, "grad_norm": 10.702962875366211, "learning_rate": 7.383179579436903e-07, "loss": 1.1785, "step": 4430 }, { "epoch": 0.29347610549276226, "grad_norm": 5.956870079040527, "learning_rate": 7.129158192180766e-07, "loss": 1.1568, "step": 4440 }, { "epoch": 0.29413708771234054, "grad_norm": 11.048665046691895, "learning_rate": 6.879439552978142e-07, "loss": 1.0652, "step": 4450 }, { "epoch": 0.2947980699319188, "grad_norm": 5.649775505065918, "learning_rate": 6.634033926784221e-07, "loss": 1.1235, "step": 4460 }, { "epoch": 0.29545905215149715, "grad_norm": 11.055773735046387, "learning_rate": 6.392951401263069e-07, "loss": 1.285, "step": 4470 }, { "epoch": 0.2961200343710754, "grad_norm": 7.027043342590332, "learning_rate": 6.156201886373113e-07, "loss": 1.209, "step": 4480 }, { "epoch": 0.2967810165906537, "grad_norm": 11.43958854675293, "learning_rate": 5.923795113959569e-07, "loss": 1.2139, "step": 4490 }, { "epoch": 0.297441998810232, "grad_norm": 11.668280601501465, "learning_rate": 5.695740637354591e-07, "loss": 1.2407, "step": 4500 }, { "epoch": 0.297441998810232, "eval_loss": 1.2155283689498901, "eval_runtime": 48.0067, "eval_samples_per_second": 10.436, "eval_steps_per_second": 10.436, "step": 4500 }, { "epoch": 0.2981029810298103, "grad_norm": 10.411969184875488, "learning_rate": 5.472047830984499e-07, "loss": 1.1499, "step": 4510 }, { "epoch": 0.2987639632493886, "grad_norm": 6.937885761260986, "learning_rate": 5.252725889984403e-07, "loss": 1.0297, "step": 4520 }, { "epoch": 0.29942494546896686, "grad_norm": 10.743237495422363, "learning_rate": 5.037783829820298e-07, "loss": 1.1198, "step": 4530 }, { "epoch": 0.3000859276885452, "grad_norm": 5.665622234344482, "learning_rate": 4.827230485918372e-07, "loss": 1.0459, "step": 4540 }, { "epoch": 0.30074690990812347, "grad_norm": 9.720799446105957, "learning_rate": 4.6210745133019236e-07, "loss": 1.1943, "step": 4550 }, { "epoch": 0.30140789212770175, "grad_norm": 11.57904052734375, "learning_rate": 4.419324386235529e-07, "loss": 1.2007, "step": 4560 }, { "epoch": 0.3020688743472801, "grad_norm": 10.47191333770752, "learning_rate": 4.2219883978767386e-07, "loss": 1.1754, "step": 4570 }, { "epoch": 0.30272985656685836, "grad_norm": 8.371639251708984, "learning_rate": 4.029074659935082e-07, "loss": 1.0829, "step": 4580 }, { "epoch": 0.30339083878643663, "grad_norm": 11.640840530395508, "learning_rate": 3.8405911023387444e-07, "loss": 1.0573, "step": 4590 }, { "epoch": 0.3040518210060149, "grad_norm": 14.082575798034668, "learning_rate": 3.6565454729085526e-07, "loss": 1.2711, "step": 4600 }, { "epoch": 0.30471280322559324, "grad_norm": 8.940695762634277, "learning_rate": 3.4769453370394753e-07, "loss": 1.1595, "step": 4610 }, { "epoch": 0.3053737854451715, "grad_norm": 7.7234954833984375, "learning_rate": 3.301798077389637e-07, "loss": 1.2151, "step": 4620 }, { "epoch": 0.3060347676647498, "grad_norm": 4.756081581115723, "learning_rate": 3.1311108935768926e-07, "loss": 1.173, "step": 4630 }, { "epoch": 0.30669574988432813, "grad_norm": 10.524628639221191, "learning_rate": 2.964890801882817e-07, "loss": 1.0992, "step": 4640 }, { "epoch": 0.3073567321039064, "grad_norm": 6.618716716766357, "learning_rate": 2.8031446349643393e-07, "loss": 1.1152, "step": 4650 }, { "epoch": 0.3073567321039064, "eval_loss": 1.2152043581008911, "eval_runtime": 53.4713, "eval_samples_per_second": 9.37, "eval_steps_per_second": 9.37, "step": 4650 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.2041141329494016e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }