{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.997163925127623, "eval_steps": 1000, "global_step": 4405, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011344299489506523, "grad_norm": 2.3206300735473633, "learning_rate": 4.535147392290249e-06, "loss": 1.5929, "step": 10 }, { "epoch": 0.022688598979013045, "grad_norm": 1.2386493682861328, "learning_rate": 9.070294784580499e-06, "loss": 1.6159, "step": 20 }, { "epoch": 0.03403289846851957, "grad_norm": 1.1790252923965454, "learning_rate": 1.360544217687075e-05, "loss": 1.538, "step": 30 }, { "epoch": 0.04537719795802609, "grad_norm": 1.021796703338623, "learning_rate": 1.8140589569160997e-05, "loss": 1.4132, "step": 40 }, { "epoch": 0.05672149744753262, "grad_norm": 1.3392266035079956, "learning_rate": 2.267573696145125e-05, "loss": 1.2604, "step": 50 }, { "epoch": 0.06806579693703914, "grad_norm": 0.9446895122528076, "learning_rate": 2.72108843537415e-05, "loss": 1.1644, "step": 60 }, { "epoch": 0.07941009642654566, "grad_norm": 1.3290923833847046, "learning_rate": 3.1746031746031745e-05, "loss": 1.1082, "step": 70 }, { "epoch": 0.09075439591605218, "grad_norm": 1.5161434412002563, "learning_rate": 3.6281179138321995e-05, "loss": 1.0389, "step": 80 }, { "epoch": 0.1020986954055587, "grad_norm": 0.6483525633811951, "learning_rate": 4.0816326530612245e-05, "loss": 1.0542, "step": 90 }, { "epoch": 0.11344299489506524, "grad_norm": 0.8814989924430847, "learning_rate": 4.53514739229025e-05, "loss": 0.9847, "step": 100 }, { "epoch": 0.12478729438457176, "grad_norm": 0.7316718101501465, "learning_rate": 4.9886621315192745e-05, "loss": 1.0585, "step": 110 }, { "epoch": 0.13613159387407828, "grad_norm": 0.7645348310470581, "learning_rate": 5.4421768707483e-05, "loss": 0.9713, "step": 120 }, { "epoch": 0.1474758933635848, "grad_norm": 0.6830883622169495, "learning_rate": 5.895691609977324e-05, "loss": 0.9823, "step": 130 }, { "epoch": 0.15882019285309132, "grad_norm": 1.3199207782745361, "learning_rate": 6.349206349206349e-05, "loss": 0.9992, "step": 140 }, { "epoch": 0.17016449234259784, "grad_norm": 0.7770159840583801, "learning_rate": 6.802721088435374e-05, "loss": 1.0085, "step": 150 }, { "epoch": 0.18150879183210436, "grad_norm": 1.623410940170288, "learning_rate": 7.256235827664399e-05, "loss": 1.0491, "step": 160 }, { "epoch": 0.19285309132161088, "grad_norm": 2.8830106258392334, "learning_rate": 7.709750566893424e-05, "loss": 1.0686, "step": 170 }, { "epoch": 0.2041973908111174, "grad_norm": 1.3428577184677124, "learning_rate": 8.163265306122449e-05, "loss": 1.0359, "step": 180 }, { "epoch": 0.21554169030062392, "grad_norm": 0.8043076395988464, "learning_rate": 8.616780045351474e-05, "loss": 1.0496, "step": 190 }, { "epoch": 0.22688598979013047, "grad_norm": 1.8799352645874023, "learning_rate": 9.0702947845805e-05, "loss": 1.0284, "step": 200 }, { "epoch": 0.238230289279637, "grad_norm": 0.6667978167533875, "learning_rate": 9.523809523809524e-05, "loss": 1.0162, "step": 210 }, { "epoch": 0.2495745887691435, "grad_norm": 0.815127968788147, "learning_rate": 9.977324263038549e-05, "loss": 1.0009, "step": 220 }, { "epoch": 0.26091888825865, "grad_norm": 0.6558067798614502, "learning_rate": 0.00010430839002267574, "loss": 1.004, "step": 230 }, { "epoch": 0.27226318774815655, "grad_norm": 0.6002511382102966, "learning_rate": 0.000108843537414966, "loss": 0.9702, "step": 240 }, { "epoch": 0.28360748723766305, "grad_norm": 0.7007895708084106, "learning_rate": 0.00011337868480725624, "loss": 1.0266, "step": 250 }, { "epoch": 0.2949517867271696, "grad_norm": 0.7985921502113342, "learning_rate": 0.00011791383219954648, "loss": 0.9753, "step": 260 }, { "epoch": 0.30629608621667614, "grad_norm": 0.5343239903450012, "learning_rate": 0.00012244897959183676, "loss": 1.036, "step": 270 }, { "epoch": 0.31764038570618264, "grad_norm": 0.7095124125480652, "learning_rate": 0.00012698412698412698, "loss": 1.0061, "step": 280 }, { "epoch": 0.3289846851956892, "grad_norm": 0.8570685386657715, "learning_rate": 0.00013151927437641726, "loss": 0.9458, "step": 290 }, { "epoch": 0.3403289846851957, "grad_norm": 0.6379779577255249, "learning_rate": 0.00013605442176870748, "loss": 0.9965, "step": 300 }, { "epoch": 0.3516732841747022, "grad_norm": 0.9263567328453064, "learning_rate": 0.00014058956916099776, "loss": 0.9601, "step": 310 }, { "epoch": 0.3630175836642087, "grad_norm": 0.7343761920928955, "learning_rate": 0.00014512471655328798, "loss": 1.0182, "step": 320 }, { "epoch": 0.37436188315371527, "grad_norm": 0.588762640953064, "learning_rate": 0.00014965986394557826, "loss": 0.9762, "step": 330 }, { "epoch": 0.38570618264322176, "grad_norm": 0.6719630360603333, "learning_rate": 0.00015419501133786848, "loss": 0.989, "step": 340 }, { "epoch": 0.3970504821327283, "grad_norm": 1.641836166381836, "learning_rate": 0.00015873015873015873, "loss": 0.9611, "step": 350 }, { "epoch": 0.4083947816222348, "grad_norm": 0.9340532422065735, "learning_rate": 0.00016326530612244898, "loss": 0.9861, "step": 360 }, { "epoch": 0.41973908111174135, "grad_norm": 0.737554669380188, "learning_rate": 0.00016780045351473923, "loss": 1.0, "step": 370 }, { "epoch": 0.43108338060124785, "grad_norm": 1.1190237998962402, "learning_rate": 0.00017233560090702948, "loss": 1.016, "step": 380 }, { "epoch": 0.4424276800907544, "grad_norm": 0.7501509785652161, "learning_rate": 0.00017687074829931973, "loss": 0.9743, "step": 390 }, { "epoch": 0.45377197958026094, "grad_norm": 0.5105754733085632, "learning_rate": 0.00018140589569161, "loss": 1.0182, "step": 400 }, { "epoch": 0.46511627906976744, "grad_norm": 0.7148075699806213, "learning_rate": 0.00018594104308390023, "loss": 0.9673, "step": 410 }, { "epoch": 0.476460578559274, "grad_norm": 0.49944302439689636, "learning_rate": 0.00019047619047619048, "loss": 1.0083, "step": 420 }, { "epoch": 0.4878048780487805, "grad_norm": 0.5624661445617676, "learning_rate": 0.00019501133786848073, "loss": 1.0201, "step": 430 }, { "epoch": 0.499149177538287, "grad_norm": 0.5779452919960022, "learning_rate": 0.00019954648526077098, "loss": 1.0165, "step": 440 }, { "epoch": 0.5104934770277936, "grad_norm": 0.8505494594573975, "learning_rate": 0.0001999974561843451, "loss": 0.9527, "step": 450 }, { "epoch": 0.5218377765173, "grad_norm": 0.7141993641853333, "learning_rate": 0.00019998866291366877, "loss": 0.9927, "step": 460 }, { "epoch": 0.5331820760068066, "grad_norm": 0.5913094282150269, "learning_rate": 0.0001999735893350151, "loss": 1.0054, "step": 470 }, { "epoch": 0.5445263754963131, "grad_norm": 0.5813531279563904, "learning_rate": 0.00019995223639515864, "loss": 0.9511, "step": 480 }, { "epoch": 0.5558706749858197, "grad_norm": 0.9083317518234253, "learning_rate": 0.0001999246054352818, "loss": 0.9596, "step": 490 }, { "epoch": 0.5672149744753261, "grad_norm": 0.8444753885269165, "learning_rate": 0.00019989069819089067, "loss": 1.0163, "step": 500 }, { "epoch": 0.5785592739648326, "grad_norm": 0.6896610856056213, "learning_rate": 0.0001998505167917061, "loss": 0.9606, "step": 510 }, { "epoch": 0.5899035734543392, "grad_norm": 0.7446523308753967, "learning_rate": 0.00019980406376152984, "loss": 0.9748, "step": 520 }, { "epoch": 0.6012478729438457, "grad_norm": 0.5111407041549683, "learning_rate": 0.00019975134201808605, "loss": 0.9364, "step": 530 }, { "epoch": 0.6125921724333523, "grad_norm": 0.6797256469726562, "learning_rate": 0.000199692354872838, "loss": 0.9766, "step": 540 }, { "epoch": 0.6239364719228587, "grad_norm": 0.9774245619773865, "learning_rate": 0.00019962710603078007, "loss": 0.9669, "step": 550 }, { "epoch": 0.6352807714123653, "grad_norm": 0.7039481997489929, "learning_rate": 0.0001995555995902052, "loss": 0.9371, "step": 560 }, { "epoch": 0.6466250709018718, "grad_norm": 0.7363829016685486, "learning_rate": 0.0001994778400424472, "loss": 0.9809, "step": 570 }, { "epoch": 0.6579693703913784, "grad_norm": 0.7072857022285461, "learning_rate": 0.0001993938322715989, "loss": 0.9825, "step": 580 }, { "epoch": 0.6693136698808848, "grad_norm": 0.5628974437713623, "learning_rate": 0.00019930358155420525, "loss": 0.9101, "step": 590 }, { "epoch": 0.6806579693703914, "grad_norm": 0.6564317345619202, "learning_rate": 0.0001992070935589319, "loss": 1.0374, "step": 600 }, { "epoch": 0.6920022688598979, "grad_norm": 0.5805884599685669, "learning_rate": 0.0001991043743462092, "loss": 0.9695, "step": 610 }, { "epoch": 0.7033465683494045, "grad_norm": 0.5671830773353577, "learning_rate": 0.00019899543036785145, "loss": 0.9598, "step": 620 }, { "epoch": 0.7146908678389109, "grad_norm": 0.54367595911026, "learning_rate": 0.0001988802684666519, "loss": 0.962, "step": 630 }, { "epoch": 0.7260351673284174, "grad_norm": 0.6982467770576477, "learning_rate": 0.00019875889587595252, "loss": 0.9633, "step": 640 }, { "epoch": 0.737379466817924, "grad_norm": 0.6268488764762878, "learning_rate": 0.00019863132021919025, "loss": 0.9684, "step": 650 }, { "epoch": 0.7487237663074305, "grad_norm": 1.2111632823944092, "learning_rate": 0.00019849754950941758, "loss": 1.0044, "step": 660 }, { "epoch": 0.7600680657969371, "grad_norm": 0.6442829370498657, "learning_rate": 0.00019835759214879964, "loss": 0.9533, "step": 670 }, { "epoch": 0.7714123652864435, "grad_norm": 0.5263229608535767, "learning_rate": 0.00019821145692808633, "loss": 0.959, "step": 680 }, { "epoch": 0.7827566647759501, "grad_norm": 0.572928786277771, "learning_rate": 0.00019805915302606016, "loss": 0.9473, "step": 690 }, { "epoch": 0.7941009642654566, "grad_norm": 0.6176092624664307, "learning_rate": 0.00019790069000895987, "loss": 0.9164, "step": 700 }, { "epoch": 0.8054452637549632, "grad_norm": 0.5628384351730347, "learning_rate": 0.00019773607782987924, "loss": 0.9705, "step": 710 }, { "epoch": 0.8167895632444696, "grad_norm": 0.8331648111343384, "learning_rate": 0.00019756532682814232, "loss": 0.9497, "step": 720 }, { "epoch": 0.8281338627339762, "grad_norm": 0.5843848586082458, "learning_rate": 0.00019738844772865377, "loss": 0.9828, "step": 730 }, { "epoch": 0.8394781622234827, "grad_norm": 0.6603434681892395, "learning_rate": 0.0001972054516412253, "loss": 0.9717, "step": 740 }, { "epoch": 0.8508224617129893, "grad_norm": 0.5622076988220215, "learning_rate": 0.00019701635005987792, "loss": 0.9392, "step": 750 }, { "epoch": 0.8621667612024957, "grad_norm": 0.8947564959526062, "learning_rate": 0.00019682115486211984, "loss": 0.9917, "step": 760 }, { "epoch": 0.8735110606920022, "grad_norm": 0.5935038328170776, "learning_rate": 0.00019661987830820065, "loss": 0.9749, "step": 770 }, { "epoch": 0.8848553601815088, "grad_norm": 0.8751797676086426, "learning_rate": 0.000196412533040341, "loss": 0.9828, "step": 780 }, { "epoch": 0.8961996596710153, "grad_norm": 0.5279515981674194, "learning_rate": 0.00019619913208193882, "loss": 0.9685, "step": 790 }, { "epoch": 0.9075439591605219, "grad_norm": 0.643695056438446, "learning_rate": 0.00019597968883675116, "loss": 0.9547, "step": 800 }, { "epoch": 0.9188882586500283, "grad_norm": 0.7370747923851013, "learning_rate": 0.00019575421708805215, "loss": 0.9129, "step": 810 }, { "epoch": 0.9302325581395349, "grad_norm": 0.7514728307723999, "learning_rate": 0.0001955227309977677, "loss": 0.9929, "step": 820 }, { "epoch": 0.9415768576290414, "grad_norm": 0.6589088439941406, "learning_rate": 0.00019528524510558547, "loss": 0.9627, "step": 830 }, { "epoch": 0.952921157118548, "grad_norm": 0.548102617263794, "learning_rate": 0.00019504177432804203, "loss": 0.9307, "step": 840 }, { "epoch": 0.9642654566080544, "grad_norm": 0.458879679441452, "learning_rate": 0.00019479233395758576, "loss": 0.9838, "step": 850 }, { "epoch": 0.975609756097561, "grad_norm": 0.9955594539642334, "learning_rate": 0.0001945369396616164, "loss": 0.9246, "step": 860 }, { "epoch": 0.9869540555870675, "grad_norm": 0.5781052708625793, "learning_rate": 0.0001942756074815009, "loss": 1.0076, "step": 870 }, { "epoch": 0.998298355076574, "grad_norm": 0.7370733022689819, "learning_rate": 0.00019400835383156592, "loss": 0.9618, "step": 880 }, { "epoch": 1.0096426545660806, "grad_norm": 0.6173350214958191, "learning_rate": 0.00019373519549806682, "loss": 0.872, "step": 890 }, { "epoch": 1.0209869540555871, "grad_norm": 0.6110262274742126, "learning_rate": 0.00019345614963813334, "loss": 0.8953, "step": 900 }, { "epoch": 1.0323312535450937, "grad_norm": 0.8880902528762817, "learning_rate": 0.00019317123377869192, "loss": 0.8847, "step": 910 }, { "epoch": 1.0436755530346, "grad_norm": 0.6907595992088318, "learning_rate": 0.00019288046581536486, "loss": 0.8878, "step": 920 }, { "epoch": 1.0550198525241066, "grad_norm": 0.7469139695167542, "learning_rate": 0.00019258386401134624, "loss": 0.9018, "step": 930 }, { "epoch": 1.0663641520136131, "grad_norm": 0.8650104403495789, "learning_rate": 0.0001922814469962549, "loss": 0.8825, "step": 940 }, { "epoch": 1.0777084515031197, "grad_norm": 1.1437135934829712, "learning_rate": 0.00019197323376496427, "loss": 0.8977, "step": 950 }, { "epoch": 1.0890527509926262, "grad_norm": 0.6191611289978027, "learning_rate": 0.00019165924367640916, "loss": 0.9059, "step": 960 }, { "epoch": 1.1003970504821328, "grad_norm": 0.7402692437171936, "learning_rate": 0.00019133949645237005, "loss": 0.8778, "step": 970 }, { "epoch": 1.1117413499716393, "grad_norm": 0.7002813220024109, "learning_rate": 0.00019101401217623426, "loss": 0.9281, "step": 980 }, { "epoch": 1.1230856494611459, "grad_norm": 0.9000174403190613, "learning_rate": 0.00019068281129173444, "loss": 0.8795, "step": 990 }, { "epoch": 1.1344299489506522, "grad_norm": 0.6749204993247986, "learning_rate": 0.00019034591460166463, "loss": 0.9091, "step": 1000 }, { "epoch": 1.1344299489506522, "eval_loss": 0.8940885663032532, "eval_runtime": 15.7869, "eval_samples_per_second": 94.065, "eval_steps_per_second": 11.782, "step": 1000 }, { "epoch": 1.1457742484401587, "grad_norm": 0.7294667959213257, "learning_rate": 0.00019000334326657345, "loss": 0.879, "step": 1010 }, { "epoch": 1.1571185479296653, "grad_norm": 0.9591787457466125, "learning_rate": 0.00018965511880343527, "loss": 0.9264, "step": 1020 }, { "epoch": 1.1684628474191718, "grad_norm": 0.9575808644294739, "learning_rate": 0.00018930126308429844, "loss": 0.8825, "step": 1030 }, { "epoch": 1.1798071469086784, "grad_norm": 0.49267736077308655, "learning_rate": 0.00018894179833491164, "loss": 0.9321, "step": 1040 }, { "epoch": 1.191151446398185, "grad_norm": 0.848102867603302, "learning_rate": 0.00018857674713332795, "loss": 0.8543, "step": 1050 }, { "epoch": 1.2024957458876915, "grad_norm": 0.7710912227630615, "learning_rate": 0.00018820613240848655, "loss": 0.9468, "step": 1060 }, { "epoch": 1.213840045377198, "grad_norm": 0.6399308443069458, "learning_rate": 0.00018782997743877264, "loss": 0.9081, "step": 1070 }, { "epoch": 1.2251843448667046, "grad_norm": 0.9124737977981567, "learning_rate": 0.00018744830585055538, "loss": 0.9288, "step": 1080 }, { "epoch": 1.236528644356211, "grad_norm": 0.6313666105270386, "learning_rate": 0.00018706114161670377, "loss": 0.8197, "step": 1090 }, { "epoch": 1.2478729438457175, "grad_norm": 0.7220073938369751, "learning_rate": 0.000186668509055081, "loss": 0.8576, "step": 1100 }, { "epoch": 1.259217243335224, "grad_norm": 1.1808422803878784, "learning_rate": 0.00018627043282701703, "loss": 0.9044, "step": 1110 }, { "epoch": 1.2705615428247305, "grad_norm": 0.6578934788703918, "learning_rate": 0.00018586693793575966, "loss": 0.9015, "step": 1120 }, { "epoch": 1.281905842314237, "grad_norm": 0.9080325961112976, "learning_rate": 0.0001854580497249039, "loss": 0.8919, "step": 1130 }, { "epoch": 1.2932501418037436, "grad_norm": 0.6446923017501831, "learning_rate": 0.00018504379387680034, "loss": 0.9033, "step": 1140 }, { "epoch": 1.3045944412932502, "grad_norm": 0.6877492070198059, "learning_rate": 0.00018462419641094189, "loss": 0.8843, "step": 1150 }, { "epoch": 1.3159387407827567, "grad_norm": 0.6565636396408081, "learning_rate": 0.00018419928368232957, "loss": 0.8925, "step": 1160 }, { "epoch": 1.3272830402722633, "grad_norm": 0.8198230862617493, "learning_rate": 0.0001837690823798171, "loss": 0.8495, "step": 1170 }, { "epoch": 1.3386273397617696, "grad_norm": 0.7579399347305298, "learning_rate": 0.00018333361952443462, "loss": 0.9051, "step": 1180 }, { "epoch": 1.3499716392512762, "grad_norm": 0.8067922592163086, "learning_rate": 0.0001828929224676914, "loss": 0.8677, "step": 1190 }, { "epoch": 1.3613159387407827, "grad_norm": 0.7077610492706299, "learning_rate": 0.00018244701888985802, "loss": 0.942, "step": 1200 }, { "epoch": 1.3726602382302893, "grad_norm": 1.2009291648864746, "learning_rate": 0.00018199593679822765, "loss": 0.9034, "step": 1210 }, { "epoch": 1.3840045377197958, "grad_norm": 0.8162534832954407, "learning_rate": 0.00018153970452535698, "loss": 0.8904, "step": 1220 }, { "epoch": 1.3953488372093024, "grad_norm": 0.6332406401634216, "learning_rate": 0.00018107835072728656, "loss": 0.8637, "step": 1230 }, { "epoch": 1.406693136698809, "grad_norm": 0.6449089050292969, "learning_rate": 0.00018061190438174105, "loss": 0.9463, "step": 1240 }, { "epoch": 1.4180374361883152, "grad_norm": 0.6543394327163696, "learning_rate": 0.00018014039478630894, "loss": 0.8497, "step": 1250 }, { "epoch": 1.429381735677822, "grad_norm": 0.7993437647819519, "learning_rate": 0.0001796638515566025, "loss": 0.9415, "step": 1260 }, { "epoch": 1.4407260351673283, "grad_norm": 0.878514289855957, "learning_rate": 0.0001791823046243977, "loss": 0.9143, "step": 1270 }, { "epoch": 1.4520703346568349, "grad_norm": 0.6794580817222595, "learning_rate": 0.00017869578423575387, "loss": 0.9041, "step": 1280 }, { "epoch": 1.4634146341463414, "grad_norm": 0.9009565711021423, "learning_rate": 0.00017820432094911427, "loss": 0.8773, "step": 1290 }, { "epoch": 1.474758933635848, "grad_norm": 0.6419825553894043, "learning_rate": 0.00017770794563338647, "loss": 0.9027, "step": 1300 }, { "epoch": 1.4861032331253545, "grad_norm": 0.7277469635009766, "learning_rate": 0.0001772066894660037, "loss": 0.9123, "step": 1310 }, { "epoch": 1.497447532614861, "grad_norm": 0.7514845132827759, "learning_rate": 0.00017670058393096634, "loss": 0.9095, "step": 1320 }, { "epoch": 1.5087918321043676, "grad_norm": 0.5530194044113159, "learning_rate": 0.0001761896608168646, "loss": 0.855, "step": 1330 }, { "epoch": 1.520136131593874, "grad_norm": 0.6379088759422302, "learning_rate": 0.0001756739522148818, "loss": 0.9485, "step": 1340 }, { "epoch": 1.5314804310833807, "grad_norm": 0.5411556959152222, "learning_rate": 0.0001751534905167787, "loss": 0.951, "step": 1350 }, { "epoch": 1.542824730572887, "grad_norm": 0.9241764545440674, "learning_rate": 0.00017462830841285894, "loss": 0.8459, "step": 1360 }, { "epoch": 1.5541690300623936, "grad_norm": 0.9029989242553711, "learning_rate": 0.00017409843888991584, "loss": 0.9045, "step": 1370 }, { "epoch": 1.5655133295519001, "grad_norm": 0.9002951979637146, "learning_rate": 0.00017356391522916042, "loss": 0.8388, "step": 1380 }, { "epoch": 1.5768576290414067, "grad_norm": 0.6322818994522095, "learning_rate": 0.0001730247710041311, "loss": 0.8937, "step": 1390 }, { "epoch": 1.5882019285309132, "grad_norm": 0.9197801351547241, "learning_rate": 0.00017248104007858476, "loss": 0.8656, "step": 1400 }, { "epoch": 1.5995462280204198, "grad_norm": 0.7498595714569092, "learning_rate": 0.00017193275660436997, "loss": 0.8848, "step": 1410 }, { "epoch": 1.6108905275099263, "grad_norm": 1.0003221035003662, "learning_rate": 0.00017137995501928166, "loss": 0.8494, "step": 1420 }, { "epoch": 1.6222348269994327, "grad_norm": 0.6622512340545654, "learning_rate": 0.00017082267004489842, "loss": 0.9158, "step": 1430 }, { "epoch": 1.6335791264889394, "grad_norm": 1.2562657594680786, "learning_rate": 0.00017026093668440114, "loss": 0.8899, "step": 1440 }, { "epoch": 1.6449234259784458, "grad_norm": 0.5380372405052185, "learning_rate": 0.00016969479022037502, "loss": 0.9082, "step": 1450 }, { "epoch": 1.6562677254679523, "grad_norm": 0.7120011448860168, "learning_rate": 0.00016912426621259297, "loss": 0.8456, "step": 1460 }, { "epoch": 1.6676120249574589, "grad_norm": 0.580111026763916, "learning_rate": 0.0001685494004957824, "loss": 0.9272, "step": 1470 }, { "epoch": 1.6789563244469654, "grad_norm": 0.9516561627388, "learning_rate": 0.0001679702291773743, "loss": 0.906, "step": 1480 }, { "epoch": 1.690300623936472, "grad_norm": 0.5973901152610779, "learning_rate": 0.0001673867886352354, "loss": 0.931, "step": 1490 }, { "epoch": 1.7016449234259783, "grad_norm": 0.7292883992195129, "learning_rate": 0.00016679911551538317, "loss": 0.8848, "step": 1500 }, { "epoch": 1.712989222915485, "grad_norm": 0.6363751888275146, "learning_rate": 0.0001662072467296842, "loss": 0.9059, "step": 1510 }, { "epoch": 1.7243335224049914, "grad_norm": 0.9236806631088257, "learning_rate": 0.00016561121945353566, "loss": 0.8557, "step": 1520 }, { "epoch": 1.7356778218944982, "grad_norm": 0.6865366697311401, "learning_rate": 0.00016501107112353028, "loss": 0.9264, "step": 1530 }, { "epoch": 1.7470221213840045, "grad_norm": 0.6749486923217773, "learning_rate": 0.00016440683943510516, "loss": 0.9224, "step": 1540 }, { "epoch": 1.758366420873511, "grad_norm": 0.7539329528808594, "learning_rate": 0.00016379856234017382, "loss": 0.8594, "step": 1550 }, { "epoch": 1.7697107203630176, "grad_norm": 0.6702885031700134, "learning_rate": 0.0001631862780447426, "loss": 0.8896, "step": 1560 }, { "epoch": 1.7810550198525241, "grad_norm": 0.6152791976928711, "learning_rate": 0.00016257002500651098, "loss": 0.8738, "step": 1570 }, { "epoch": 1.7923993193420307, "grad_norm": 0.5736550688743591, "learning_rate": 0.00016194984193245587, "loss": 0.9018, "step": 1580 }, { "epoch": 1.803743618831537, "grad_norm": 0.751157820224762, "learning_rate": 0.00016132576777640067, "loss": 0.8605, "step": 1590 }, { "epoch": 1.8150879183210438, "grad_norm": 0.6626732349395752, "learning_rate": 0.0001606978417365682, "loss": 0.8857, "step": 1600 }, { "epoch": 1.82643221781055, "grad_norm": 0.584065318107605, "learning_rate": 0.00016006610325311908, "loss": 0.9104, "step": 1610 }, { "epoch": 1.8377765173000569, "grad_norm": 0.5933496356010437, "learning_rate": 0.0001594305920056742, "loss": 0.8167, "step": 1620 }, { "epoch": 1.8491208167895632, "grad_norm": 0.5618401765823364, "learning_rate": 0.00015879134791082247, "loss": 0.8907, "step": 1630 }, { "epoch": 1.8604651162790697, "grad_norm": 0.9804329872131348, "learning_rate": 0.00015814841111961374, "loss": 0.9494, "step": 1640 }, { "epoch": 1.8718094157685763, "grad_norm": 0.937347412109375, "learning_rate": 0.00015750182201503682, "loss": 0.9045, "step": 1650 }, { "epoch": 1.8831537152580828, "grad_norm": 0.8898664712905884, "learning_rate": 0.00015685162120948317, "loss": 0.9346, "step": 1660 }, { "epoch": 1.8944980147475894, "grad_norm": 0.8580901622772217, "learning_rate": 0.00015619784954219577, "loss": 0.9412, "step": 1670 }, { "epoch": 1.9058423142370957, "grad_norm": 0.6913225054740906, "learning_rate": 0.00015554054807670418, "loss": 0.9006, "step": 1680 }, { "epoch": 1.9171866137266025, "grad_norm": 0.7101637125015259, "learning_rate": 0.00015487975809824539, "loss": 0.8857, "step": 1690 }, { "epoch": 1.9285309132161088, "grad_norm": 0.8228437900543213, "learning_rate": 0.00015421552111117044, "loss": 0.8607, "step": 1700 }, { "epoch": 1.9398752127056156, "grad_norm": 0.5591906905174255, "learning_rate": 0.00015354787883633782, "loss": 0.8674, "step": 1710 }, { "epoch": 1.951219512195122, "grad_norm": 0.6841379404067993, "learning_rate": 0.00015287687320849271, "loss": 0.8387, "step": 1720 }, { "epoch": 1.9625638116846285, "grad_norm": 0.8344857096672058, "learning_rate": 0.00015220254637363318, "loss": 0.9227, "step": 1730 }, { "epoch": 1.973908111174135, "grad_norm": 0.8986241221427917, "learning_rate": 0.00015152494068636308, "loss": 0.8917, "step": 1740 }, { "epoch": 1.9852524106636416, "grad_norm": 0.5783970952033997, "learning_rate": 0.00015084409870723154, "loss": 0.872, "step": 1750 }, { "epoch": 1.996596710153148, "grad_norm": 0.6369901895523071, "learning_rate": 0.00015016006320005986, "loss": 0.9132, "step": 1760 }, { "epoch": 2.0079410096426544, "grad_norm": 0.5906355381011963, "learning_rate": 0.00014947287712925545, "loss": 0.8074, "step": 1770 }, { "epoch": 2.019285309132161, "grad_norm": 0.6774492263793945, "learning_rate": 0.00014878258365711334, "loss": 0.759, "step": 1780 }, { "epoch": 2.0306296086216675, "grad_norm": 0.8353272676467896, "learning_rate": 0.00014808922614110493, "loss": 0.8028, "step": 1790 }, { "epoch": 2.0419739081111743, "grad_norm": 0.8876771926879883, "learning_rate": 0.00014739284813115498, "loss": 0.7302, "step": 1800 }, { "epoch": 2.0533182076006806, "grad_norm": 0.6215524673461914, "learning_rate": 0.00014669349336690594, "loss": 0.7759, "step": 1810 }, { "epoch": 2.0646625070901874, "grad_norm": 0.5663015246391296, "learning_rate": 0.00014599120577497087, "loss": 0.7834, "step": 1820 }, { "epoch": 2.0760068065796937, "grad_norm": 0.6096060872077942, "learning_rate": 0.00014528602946617432, "loss": 0.8364, "step": 1830 }, { "epoch": 2.0873511060692, "grad_norm": 0.7625316977500916, "learning_rate": 0.00014457800873278172, "loss": 0.7558, "step": 1840 }, { "epoch": 2.098695405558707, "grad_norm": 0.6301640272140503, "learning_rate": 0.0001438671880457174, "loss": 0.8297, "step": 1850 }, { "epoch": 2.110039705048213, "grad_norm": 0.6493074297904968, "learning_rate": 0.00014315361205177127, "loss": 0.7764, "step": 1860 }, { "epoch": 2.12138400453772, "grad_norm": 0.8326807618141174, "learning_rate": 0.0001424373255707947, "loss": 0.7895, "step": 1870 }, { "epoch": 2.1327283040272262, "grad_norm": 1.0578484535217285, "learning_rate": 0.00014171837359288524, "loss": 0.7889, "step": 1880 }, { "epoch": 2.144072603516733, "grad_norm": 0.6812543272972107, "learning_rate": 0.0001409968012755609, "loss": 0.7643, "step": 1890 }, { "epoch": 2.1554169030062393, "grad_norm": 0.8412303924560547, "learning_rate": 0.00014027265394092364, "loss": 0.7402, "step": 1900 }, { "epoch": 2.1667612024957457, "grad_norm": 0.947846531867981, "learning_rate": 0.00013954597707281288, "loss": 0.7763, "step": 1910 }, { "epoch": 2.1781055019852524, "grad_norm": 0.7577157616615295, "learning_rate": 0.00013881681631394842, "loss": 0.8334, "step": 1920 }, { "epoch": 2.1894498014747588, "grad_norm": 0.6362768411636353, "learning_rate": 0.0001380852174630639, "loss": 0.7484, "step": 1930 }, { "epoch": 2.2007941009642655, "grad_norm": 0.7967275381088257, "learning_rate": 0.00013735122647202984, "loss": 0.7302, "step": 1940 }, { "epoch": 2.212138400453772, "grad_norm": 0.7726805210113525, "learning_rate": 0.0001366148894429677, "loss": 0.7836, "step": 1950 }, { "epoch": 2.2234826999432786, "grad_norm": 0.7741623520851135, "learning_rate": 0.00013587625262535396, "loss": 0.7925, "step": 1960 }, { "epoch": 2.234826999432785, "grad_norm": 0.7582458257675171, "learning_rate": 0.0001351353624131153, "loss": 0.7765, "step": 1970 }, { "epoch": 2.2461712989222917, "grad_norm": 0.8276723027229309, "learning_rate": 0.00013439226534171463, "loss": 0.81, "step": 1980 }, { "epoch": 2.257515598411798, "grad_norm": 0.8419069051742554, "learning_rate": 0.00013364700808522807, "loss": 0.7464, "step": 1990 }, { "epoch": 2.2688598979013044, "grad_norm": 0.7446946501731873, "learning_rate": 0.00013289963745341345, "loss": 0.7524, "step": 2000 }, { "epoch": 2.2688598979013044, "eval_loss": 0.9066722989082336, "eval_runtime": 15.6396, "eval_samples_per_second": 94.951, "eval_steps_per_second": 11.893, "step": 2000 }, { "epoch": 2.280204197390811, "grad_norm": 0.7091513872146606, "learning_rate": 0.00013215020038877002, "loss": 0.7806, "step": 2010 }, { "epoch": 2.2915484968803175, "grad_norm": 0.5853792428970337, "learning_rate": 0.0001313987439635902, "loss": 0.7625, "step": 2020 }, { "epoch": 2.3028927963698242, "grad_norm": 0.7464004158973694, "learning_rate": 0.00013064531537700284, "loss": 0.7313, "step": 2030 }, { "epoch": 2.3142370958593306, "grad_norm": 0.6370956301689148, "learning_rate": 0.00012988996195200858, "loss": 0.7903, "step": 2040 }, { "epoch": 2.3255813953488373, "grad_norm": 0.8973234295845032, "learning_rate": 0.0001291327311325076, "loss": 0.7537, "step": 2050 }, { "epoch": 2.3369256948383437, "grad_norm": 1.206678032875061, "learning_rate": 0.00012837367048031955, "loss": 0.8081, "step": 2060 }, { "epoch": 2.3482699943278504, "grad_norm": 0.9258993864059448, "learning_rate": 0.0001276128276721963, "loss": 0.7754, "step": 2070 }, { "epoch": 2.3596142938173568, "grad_norm": 0.8008835315704346, "learning_rate": 0.00012685025049682732, "loss": 0.8119, "step": 2080 }, { "epoch": 2.370958593306863, "grad_norm": 0.8094901442527771, "learning_rate": 0.0001260859868518379, "loss": 0.7889, "step": 2090 }, { "epoch": 2.38230289279637, "grad_norm": 0.7824433445930481, "learning_rate": 0.00012532008474078093, "loss": 0.8443, "step": 2100 }, { "epoch": 2.393647192285876, "grad_norm": 0.8314623236656189, "learning_rate": 0.00012455259227012172, "loss": 0.8009, "step": 2110 }, { "epoch": 2.404991491775383, "grad_norm": 0.993483304977417, "learning_rate": 0.0001237835576462163, "loss": 0.803, "step": 2120 }, { "epoch": 2.4163357912648893, "grad_norm": 0.7922090291976929, "learning_rate": 0.00012301302917228364, "loss": 0.7785, "step": 2130 }, { "epoch": 2.427680090754396, "grad_norm": 0.8681336045265198, "learning_rate": 0.00012224105524537176, "loss": 0.7427, "step": 2140 }, { "epoch": 2.4390243902439024, "grad_norm": 0.868011474609375, "learning_rate": 0.00012146768435331797, "loss": 0.7841, "step": 2150 }, { "epoch": 2.450368689733409, "grad_norm": 0.8300703763961792, "learning_rate": 0.00012069296507170307, "loss": 0.7113, "step": 2160 }, { "epoch": 2.4617129892229155, "grad_norm": 1.0211178064346313, "learning_rate": 0.00011991694606080062, "loss": 0.7927, "step": 2170 }, { "epoch": 2.473057288712422, "grad_norm": 1.1126124858856201, "learning_rate": 0.00011913967606252035, "loss": 0.798, "step": 2180 }, { "epoch": 2.4844015882019286, "grad_norm": 1.331468939781189, "learning_rate": 0.00011836120389734677, "loss": 0.7868, "step": 2190 }, { "epoch": 2.495745887691435, "grad_norm": 0.7289639115333557, "learning_rate": 0.00011758157846127278, "loss": 0.7501, "step": 2200 }, { "epoch": 2.5070901871809417, "grad_norm": 0.6862948536872864, "learning_rate": 0.00011680084872272843, "loss": 0.8113, "step": 2210 }, { "epoch": 2.518434486670448, "grad_norm": 0.6838523745536804, "learning_rate": 0.00011601906371950523, "loss": 0.7794, "step": 2220 }, { "epoch": 2.5297787861599548, "grad_norm": 0.8923412561416626, "learning_rate": 0.00011523627255567606, "loss": 0.7532, "step": 2230 }, { "epoch": 2.541123085649461, "grad_norm": 0.7864569425582886, "learning_rate": 0.00011445252439851092, "loss": 0.8044, "step": 2240 }, { "epoch": 2.552467385138968, "grad_norm": 0.9186776280403137, "learning_rate": 0.0001136678684753889, "loss": 0.7861, "step": 2250 }, { "epoch": 2.563811684628474, "grad_norm": 0.9502933025360107, "learning_rate": 0.00011288235407070588, "loss": 0.7441, "step": 2260 }, { "epoch": 2.5751559841179805, "grad_norm": 0.9764688014984131, "learning_rate": 0.00011209603052277924, "loss": 0.7519, "step": 2270 }, { "epoch": 2.5865002836074873, "grad_norm": 0.8480959534645081, "learning_rate": 0.00011130894722074874, "loss": 0.7743, "step": 2280 }, { "epoch": 2.5978445830969936, "grad_norm": 0.8660979866981506, "learning_rate": 0.00011052115360147448, "loss": 0.7989, "step": 2290 }, { "epoch": 2.6091888825865004, "grad_norm": 0.6586043238639832, "learning_rate": 0.0001097326991464318, "loss": 0.7676, "step": 2300 }, { "epoch": 2.6205331820760067, "grad_norm": 0.7315343618392944, "learning_rate": 0.00010894363337860314, "loss": 0.7699, "step": 2310 }, { "epoch": 2.6318774815655135, "grad_norm": 0.7257770895957947, "learning_rate": 0.0001081540058593677, "loss": 0.7773, "step": 2320 }, { "epoch": 2.64322178105502, "grad_norm": 0.6760928630828857, "learning_rate": 0.00010736386618538838, "loss": 0.7902, "step": 2330 }, { "epoch": 2.6545660805445266, "grad_norm": 0.6824659705162048, "learning_rate": 0.00010657326398549661, "loss": 0.7759, "step": 2340 }, { "epoch": 2.665910380034033, "grad_norm": 0.972321629524231, "learning_rate": 0.0001057822489175752, "loss": 0.7926, "step": 2350 }, { "epoch": 2.6772546795235392, "grad_norm": 0.9526649713516235, "learning_rate": 0.00010499087066543922, "loss": 0.7648, "step": 2360 }, { "epoch": 2.688598979013046, "grad_norm": 0.7266947031021118, "learning_rate": 0.0001041991789357155, "loss": 0.776, "step": 2370 }, { "epoch": 2.6999432785025523, "grad_norm": 0.808121383190155, "learning_rate": 0.00010340722345472037, "loss": 0.7852, "step": 2380 }, { "epoch": 2.711287577992059, "grad_norm": 1.1124972105026245, "learning_rate": 0.00010261505396533648, "loss": 0.717, "step": 2390 }, { "epoch": 2.7226318774815654, "grad_norm": 0.7241740226745605, "learning_rate": 0.00010182272022388841, "loss": 0.8335, "step": 2400 }, { "epoch": 2.733976176971072, "grad_norm": 1.0944820642471313, "learning_rate": 0.0001010302719970174, "loss": 0.7874, "step": 2410 }, { "epoch": 2.7453204764605785, "grad_norm": 0.735615611076355, "learning_rate": 0.00010023775905855559, "loss": 0.7198, "step": 2420 }, { "epoch": 2.7566647759500853, "grad_norm": 0.8080368041992188, "learning_rate": 9.944523118639958e-05, "loss": 0.8275, "step": 2430 }, { "epoch": 2.7680090754395916, "grad_norm": 1.0709086656570435, "learning_rate": 9.865273815938403e-05, "loss": 0.841, "step": 2440 }, { "epoch": 2.779353374929098, "grad_norm": 0.8561082482337952, "learning_rate": 9.786032975415503e-05, "loss": 0.7393, "step": 2450 }, { "epoch": 2.7906976744186047, "grad_norm": 0.6831649541854858, "learning_rate": 9.706805574204341e-05, "loss": 0.7904, "step": 2460 }, { "epoch": 2.802041973908111, "grad_norm": 0.9404779672622681, "learning_rate": 9.627596588593884e-05, "loss": 0.7651, "step": 2470 }, { "epoch": 2.813386273397618, "grad_norm": 1.1059134006500244, "learning_rate": 9.54841099371641e-05, "loss": 0.7792, "step": 2480 }, { "epoch": 2.824730572887124, "grad_norm": 0.8339388966560364, "learning_rate": 9.469253763235015e-05, "loss": 0.8037, "step": 2490 }, { "epoch": 2.8360748723766305, "grad_norm": 0.691879153251648, "learning_rate": 9.390129869031232e-05, "loss": 0.7882, "step": 2500 }, { "epoch": 2.8474191718661372, "grad_norm": 0.8173119425773621, "learning_rate": 9.311044280892728e-05, "loss": 0.7723, "step": 2510 }, { "epoch": 2.858763471355644, "grad_norm": 1.2163662910461426, "learning_rate": 9.232001966201159e-05, "loss": 0.8332, "step": 2520 }, { "epoch": 2.8701077708451503, "grad_norm": 0.7762579917907715, "learning_rate": 9.153007889620169e-05, "loss": 0.8017, "step": 2530 }, { "epoch": 2.8814520703346567, "grad_norm": 0.7560020089149475, "learning_rate": 9.074067012783551e-05, "loss": 0.7645, "step": 2540 }, { "epoch": 2.8927963698241634, "grad_norm": 0.7039526104927063, "learning_rate": 8.995184293983627e-05, "loss": 0.7496, "step": 2550 }, { "epoch": 2.9041406693136698, "grad_norm": 0.8188515305519104, "learning_rate": 8.916364687859782e-05, "loss": 0.7941, "step": 2560 }, { "epoch": 2.9154849688031765, "grad_norm": 0.8847174048423767, "learning_rate": 8.837613145087289e-05, "loss": 0.7462, "step": 2570 }, { "epoch": 2.926829268292683, "grad_norm": 1.4302834272384644, "learning_rate": 8.758934612066353e-05, "loss": 0.7659, "step": 2580 }, { "epoch": 2.938173567782189, "grad_norm": 0.8293200135231018, "learning_rate": 8.680334030611414e-05, "loss": 0.7464, "step": 2590 }, { "epoch": 2.949517867271696, "grad_norm": 0.9347418546676636, "learning_rate": 8.601816337640767e-05, "loss": 0.7907, "step": 2600 }, { "epoch": 2.9608621667612027, "grad_norm": 0.8685625195503235, "learning_rate": 8.523386464866452e-05, "loss": 0.7881, "step": 2610 }, { "epoch": 2.972206466250709, "grad_norm": 1.0375618934631348, "learning_rate": 8.44504933848452e-05, "loss": 0.7415, "step": 2620 }, { "epoch": 2.9835507657402154, "grad_norm": 1.1286613941192627, "learning_rate": 8.366809878865594e-05, "loss": 0.759, "step": 2630 }, { "epoch": 2.994895065229722, "grad_norm": 0.9496249556541443, "learning_rate": 8.28867300024582e-05, "loss": 0.8122, "step": 2640 }, { "epoch": 3.0062393647192285, "grad_norm": 0.6161667108535767, "learning_rate": 8.210643610418232e-05, "loss": 0.7363, "step": 2650 }, { "epoch": 3.0175836642087353, "grad_norm": 1.1362223625183105, "learning_rate": 8.132726610424453e-05, "loss": 0.6957, "step": 2660 }, { "epoch": 3.0289279636982416, "grad_norm": 0.9549693465232849, "learning_rate": 8.054926894246887e-05, "loss": 0.6598, "step": 2670 }, { "epoch": 3.0402722631877483, "grad_norm": 0.7844473719596863, "learning_rate": 7.977249348501314e-05, "loss": 0.7104, "step": 2680 }, { "epoch": 3.0516165626772547, "grad_norm": 0.9754497408866882, "learning_rate": 7.899698852129962e-05, "loss": 0.7109, "step": 2690 }, { "epoch": 3.062960862166761, "grad_norm": 0.8465747237205505, "learning_rate": 7.822280276095073e-05, "loss": 0.6208, "step": 2700 }, { "epoch": 3.0743051616562678, "grad_norm": 0.7896714806556702, "learning_rate": 7.744998483072936e-05, "loss": 0.6417, "step": 2710 }, { "epoch": 3.085649461145774, "grad_norm": 0.8668105006217957, "learning_rate": 7.667858327148475e-05, "loss": 0.6525, "step": 2720 }, { "epoch": 3.096993760635281, "grad_norm": 1.0019567012786865, "learning_rate": 7.590864653510359e-05, "loss": 0.6604, "step": 2730 }, { "epoch": 3.108338060124787, "grad_norm": 0.7561362981796265, "learning_rate": 7.514022298146679e-05, "loss": 0.6912, "step": 2740 }, { "epoch": 3.119682359614294, "grad_norm": 0.9435575604438782, "learning_rate": 7.437336087541187e-05, "loss": 0.6993, "step": 2750 }, { "epoch": 3.1310266591038003, "grad_norm": 1.041034460067749, "learning_rate": 7.360810838370161e-05, "loss": 0.6562, "step": 2760 }, { "epoch": 3.142370958593307, "grad_norm": 0.8745769262313843, "learning_rate": 7.284451357199851e-05, "loss": 0.6035, "step": 2770 }, { "epoch": 3.1537152580828134, "grad_norm": 0.9436658620834351, "learning_rate": 7.208262440184584e-05, "loss": 0.6591, "step": 2780 }, { "epoch": 3.1650595575723197, "grad_norm": 0.9558268785476685, "learning_rate": 7.13224887276553e-05, "loss": 0.7548, "step": 2790 }, { "epoch": 3.1764038570618265, "grad_norm": 1.3072495460510254, "learning_rate": 7.056415429370106e-05, "loss": 0.648, "step": 2800 }, { "epoch": 3.187748156551333, "grad_norm": 1.0742169618606567, "learning_rate": 6.980766873112106e-05, "loss": 0.6646, "step": 2810 }, { "epoch": 3.1990924560408396, "grad_norm": 0.8391577005386353, "learning_rate": 6.905307955492523e-05, "loss": 0.6844, "step": 2820 }, { "epoch": 3.210436755530346, "grad_norm": 0.9172285795211792, "learning_rate": 6.83004341610111e-05, "loss": 0.6671, "step": 2830 }, { "epoch": 3.2217810550198527, "grad_norm": 1.0791727304458618, "learning_rate": 6.754977982318693e-05, "loss": 0.6619, "step": 2840 }, { "epoch": 3.233125354509359, "grad_norm": 0.8881738781929016, "learning_rate": 6.68011636902022e-05, "loss": 0.678, "step": 2850 }, { "epoch": 3.2444696539988658, "grad_norm": 0.8353477120399475, "learning_rate": 6.605463278278646e-05, "loss": 0.7061, "step": 2860 }, { "epoch": 3.255813953488372, "grad_norm": 0.9251864552497864, "learning_rate": 6.531023399069574e-05, "loss": 0.6658, "step": 2870 }, { "epoch": 3.2671582529778784, "grad_norm": 0.7780378460884094, "learning_rate": 6.45680140697675e-05, "loss": 0.6327, "step": 2880 }, { "epoch": 3.278502552467385, "grad_norm": 1.3496202230453491, "learning_rate": 6.38280196389839e-05, "loss": 0.6658, "step": 2890 }, { "epoch": 3.2898468519568915, "grad_norm": 1.0429950952529907, "learning_rate": 6.309029717754362e-05, "loss": 0.7013, "step": 2900 }, { "epoch": 3.3011911514463983, "grad_norm": 0.7141017317771912, "learning_rate": 6.235489302194247e-05, "loss": 0.6969, "step": 2910 }, { "epoch": 3.3125354509359046, "grad_norm": 1.2669309377670288, "learning_rate": 6.162185336306294e-05, "loss": 0.6468, "step": 2920 }, { "epoch": 3.3238797504254114, "grad_norm": 0.8476207852363586, "learning_rate": 6.089122424327307e-05, "loss": 0.6501, "step": 2930 }, { "epoch": 3.3352240499149177, "grad_norm": 0.9521162509918213, "learning_rate": 6.01630515535345e-05, "loss": 0.6546, "step": 2940 }, { "epoch": 3.346568349404424, "grad_norm": 0.7817677855491638, "learning_rate": 5.943738103051997e-05, "loss": 0.6919, "step": 2950 }, { "epoch": 3.357912648893931, "grad_norm": 0.776945948600769, "learning_rate": 5.8714258253740564e-05, "loss": 0.6897, "step": 2960 }, { "epoch": 3.369256948383437, "grad_norm": 0.9761963486671448, "learning_rate": 5.7993728642683e-05, "loss": 0.6299, "step": 2970 }, { "epoch": 3.380601247872944, "grad_norm": 0.7887254953384399, "learning_rate": 5.7275837453956614e-05, "loss": 0.6773, "step": 2980 }, { "epoch": 3.3919455473624502, "grad_norm": 0.860835611820221, "learning_rate": 5.656062977845116e-05, "loss": 0.6239, "step": 2990 }, { "epoch": 3.403289846851957, "grad_norm": 0.9700385928153992, "learning_rate": 5.584815053850407e-05, "loss": 0.7148, "step": 3000 }, { "epoch": 3.403289846851957, "eval_loss": 0.9692808389663696, "eval_runtime": 15.7325, "eval_samples_per_second": 94.39, "eval_steps_per_second": 11.823, "step": 3000 }, { "epoch": 3.4146341463414633, "grad_norm": 1.335462212562561, "learning_rate": 5.51384444850794e-05, "loss": 0.6387, "step": 3010 }, { "epoch": 3.42597844583097, "grad_norm": 0.8788994550704956, "learning_rate": 5.443155619495679e-05, "loss": 0.6809, "step": 3020 }, { "epoch": 3.4373227453204764, "grad_norm": 0.9188012480735779, "learning_rate": 5.372753006793143e-05, "loss": 0.6724, "step": 3030 }, { "epoch": 3.4486670448099828, "grad_norm": 0.9619457125663757, "learning_rate": 5.302641032402578e-05, "loss": 0.6789, "step": 3040 }, { "epoch": 3.4600113442994895, "grad_norm": 0.9403857588768005, "learning_rate": 5.2328241000711464e-05, "loss": 0.6274, "step": 3050 }, { "epoch": 3.471355643788996, "grad_norm": 0.9259539246559143, "learning_rate": 5.16330659501438e-05, "loss": 0.6551, "step": 3060 }, { "epoch": 3.4826999432785026, "grad_norm": 1.07770574092865, "learning_rate": 5.094092883640718e-05, "loss": 0.6593, "step": 3070 }, { "epoch": 3.494044242768009, "grad_norm": 0.7347473502159119, "learning_rate": 5.0251873132772576e-05, "loss": 0.6847, "step": 3080 }, { "epoch": 3.5053885422575157, "grad_norm": 0.9838495254516602, "learning_rate": 4.956594211896701e-05, "loss": 0.6667, "step": 3090 }, { "epoch": 3.516732841747022, "grad_norm": 1.1671929359436035, "learning_rate": 4.8883178878454996e-05, "loss": 0.683, "step": 3100 }, { "epoch": 3.528077141236529, "grad_norm": 0.6510323882102966, "learning_rate": 4.8203626295732675e-05, "loss": 0.6946, "step": 3110 }, { "epoch": 3.539421440726035, "grad_norm": 0.7871556282043457, "learning_rate": 4.7527327053634094e-05, "loss": 0.6652, "step": 3120 }, { "epoch": 3.5507657402155415, "grad_norm": 0.8053673505783081, "learning_rate": 4.685432363065036e-05, "loss": 0.6431, "step": 3130 }, { "epoch": 3.5621100397050482, "grad_norm": 0.8162011504173279, "learning_rate": 4.618465829826145e-05, "loss": 0.6089, "step": 3140 }, { "epoch": 3.5734543391945546, "grad_norm": 1.0298821926116943, "learning_rate": 4.551837311828131e-05, "loss": 0.6645, "step": 3150 }, { "epoch": 3.5847986386840613, "grad_norm": 1.0996955633163452, "learning_rate": 4.485550994021567e-05, "loss": 0.6872, "step": 3160 }, { "epoch": 3.5961429381735677, "grad_norm": 0.9979953765869141, "learning_rate": 4.419611039863377e-05, "loss": 0.628, "step": 3170 }, { "epoch": 3.6074872376630744, "grad_norm": 1.0593342781066895, "learning_rate": 4.354021591055311e-05, "loss": 0.6864, "step": 3180 }, { "epoch": 3.6188315371525808, "grad_norm": 1.6677913665771484, "learning_rate": 4.2887867672838056e-05, "loss": 0.6232, "step": 3190 }, { "epoch": 3.6301758366420875, "grad_norm": 0.8164204359054565, "learning_rate": 4.223910665961235e-05, "loss": 0.6786, "step": 3200 }, { "epoch": 3.641520136131594, "grad_norm": 0.8163765072822571, "learning_rate": 4.15939736196853e-05, "loss": 0.6763, "step": 3210 }, { "epoch": 3.6528644356211, "grad_norm": 0.9765521883964539, "learning_rate": 4.095250907399262e-05, "loss": 0.6719, "step": 3220 }, { "epoch": 3.664208735110607, "grad_norm": 0.9238688349723816, "learning_rate": 4.03147533130511e-05, "loss": 0.68, "step": 3230 }, { "epoch": 3.6755530346001133, "grad_norm": 0.9760640859603882, "learning_rate": 3.968074639442805e-05, "loss": 0.6542, "step": 3240 }, { "epoch": 3.68689733408962, "grad_norm": 0.9406284689903259, "learning_rate": 3.905052814022523e-05, "loss": 0.653, "step": 3250 }, { "epoch": 3.6982416335791264, "grad_norm": 0.9423522353172302, "learning_rate": 3.842413813457758e-05, "loss": 0.706, "step": 3260 }, { "epoch": 3.709585933068633, "grad_norm": 0.8088165521621704, "learning_rate": 3.780161572116704e-05, "loss": 0.7161, "step": 3270 }, { "epoch": 3.7209302325581395, "grad_norm": 0.9071544408798218, "learning_rate": 3.718300000075129e-05, "loss": 0.7193, "step": 3280 }, { "epoch": 3.7322745320476463, "grad_norm": 0.8792480230331421, "learning_rate": 3.6568329828707836e-05, "loss": 0.6381, "step": 3290 }, { "epoch": 3.7436188315371526, "grad_norm": 1.0307759046554565, "learning_rate": 3.5957643812593543e-05, "loss": 0.6668, "step": 3300 }, { "epoch": 3.754963131026659, "grad_norm": 1.0883175134658813, "learning_rate": 3.5350980309719514e-05, "loss": 0.6978, "step": 3310 }, { "epoch": 3.7663074305161657, "grad_norm": 1.0448516607284546, "learning_rate": 3.4748377424742115e-05, "loss": 0.6756, "step": 3320 }, { "epoch": 3.777651730005672, "grad_norm": 0.8772532939910889, "learning_rate": 3.414987300726945e-05, "loss": 0.6714, "step": 3330 }, { "epoch": 3.7889960294951788, "grad_norm": 1.0115753412246704, "learning_rate": 3.3555504649484046e-05, "loss": 0.6773, "step": 3340 }, { "epoch": 3.800340328984685, "grad_norm": 1.1093175411224365, "learning_rate": 3.296530968378173e-05, "loss": 0.6916, "step": 3350 }, { "epoch": 3.811684628474192, "grad_norm": 0.8998281359672546, "learning_rate": 3.237932518042664e-05, "loss": 0.6801, "step": 3360 }, { "epoch": 3.823028927963698, "grad_norm": 1.0179048776626587, "learning_rate": 3.1797587945223026e-05, "loss": 0.6702, "step": 3370 }, { "epoch": 3.834373227453205, "grad_norm": 0.9240026473999023, "learning_rate": 3.1220134517203335e-05, "loss": 0.671, "step": 3380 }, { "epoch": 3.8457175269427113, "grad_norm": 0.7641962766647339, "learning_rate": 3.0647001166333245e-05, "loss": 0.7147, "step": 3390 }, { "epoch": 3.8570618264322176, "grad_norm": 0.9078419804573059, "learning_rate": 3.0078223891233514e-05, "loss": 0.7155, "step": 3400 }, { "epoch": 3.8684061259217244, "grad_norm": 0.962393045425415, "learning_rate": 2.9513838416918815e-05, "loss": 0.6866, "step": 3410 }, { "epoch": 3.8797504254112307, "grad_norm": 1.5198420286178589, "learning_rate": 2.8953880192554105e-05, "loss": 0.6741, "step": 3420 }, { "epoch": 3.8910947249007375, "grad_norm": 1.1129947900772095, "learning_rate": 2.8398384389227816e-05, "loss": 0.6542, "step": 3430 }, { "epoch": 3.902439024390244, "grad_norm": 0.8633179664611816, "learning_rate": 2.7847385897742705e-05, "loss": 0.6768, "step": 3440 }, { "epoch": 3.9137833238797506, "grad_norm": 1.062277913093567, "learning_rate": 2.7300919326424658e-05, "loss": 0.6709, "step": 3450 }, { "epoch": 3.925127623369257, "grad_norm": 0.7949813604354858, "learning_rate": 2.675901899894854e-05, "loss": 0.6166, "step": 3460 }, { "epoch": 3.9364719228587637, "grad_norm": 0.9200356006622314, "learning_rate": 2.622171895218273e-05, "loss": 0.6718, "step": 3470 }, { "epoch": 3.94781622234827, "grad_norm": 0.9637920260429382, "learning_rate": 2.568905293405095e-05, "loss": 0.619, "step": 3480 }, { "epoch": 3.9591605218377763, "grad_norm": 1.157073974609375, "learning_rate": 2.516105440141262e-05, "loss": 0.6961, "step": 3490 }, { "epoch": 3.970504821327283, "grad_norm": 0.8323079347610474, "learning_rate": 2.4637756517961517e-05, "loss": 0.677, "step": 3500 }, { "epoch": 3.9818491208167894, "grad_norm": 0.9369989037513733, "learning_rate": 2.41191921521427e-05, "loss": 0.6619, "step": 3510 }, { "epoch": 3.993193420306296, "grad_norm": 0.8290889263153076, "learning_rate": 2.360539387508801e-05, "loss": 0.6534, "step": 3520 }, { "epoch": 4.0045377197958025, "grad_norm": 0.8619610071182251, "learning_rate": 2.309639395857033e-05, "loss": 0.6531, "step": 3530 }, { "epoch": 4.015882019285309, "grad_norm": 0.7406215071678162, "learning_rate": 2.259222437297649e-05, "loss": 0.5811, "step": 3540 }, { "epoch": 4.027226318774816, "grad_norm": 1.3408113718032837, "learning_rate": 2.2092916785299323e-05, "loss": 0.6163, "step": 3550 }, { "epoch": 4.038570618264322, "grad_norm": 0.9652060866355896, "learning_rate": 2.159850255714859e-05, "loss": 0.6345, "step": 3560 }, { "epoch": 4.049914917753829, "grad_norm": 1.2307026386260986, "learning_rate": 2.1109012742781142e-05, "loss": 0.5568, "step": 3570 }, { "epoch": 4.061259217243335, "grad_norm": 1.101637363433838, "learning_rate": 2.0624478087150456e-05, "loss": 0.608, "step": 3580 }, { "epoch": 4.072603516732841, "grad_norm": 2.5598561763763428, "learning_rate": 2.0144929023975413e-05, "loss": 0.5294, "step": 3590 }, { "epoch": 4.083947816222349, "grad_norm": 0.9463273286819458, "learning_rate": 1.967039567382888e-05, "loss": 0.5482, "step": 3600 }, { "epoch": 4.095292115711855, "grad_norm": 0.9838125109672546, "learning_rate": 1.920090784224581e-05, "loss": 0.6254, "step": 3610 }, { "epoch": 4.106636415201361, "grad_norm": 0.85828697681427, "learning_rate": 1.8736495017851062e-05, "loss": 0.5443, "step": 3620 }, { "epoch": 4.117980714690868, "grad_norm": 0.8922297954559326, "learning_rate": 1.827718637050736e-05, "loss": 0.6068, "step": 3630 }, { "epoch": 4.129325014180375, "grad_norm": 0.7973962426185608, "learning_rate": 1.7823010749482927e-05, "loss": 0.6179, "step": 3640 }, { "epoch": 4.140669313669881, "grad_norm": 0.8686882257461548, "learning_rate": 1.737399668163966e-05, "loss": 0.6186, "step": 3650 }, { "epoch": 4.152013613159387, "grad_norm": 1.4338245391845703, "learning_rate": 1.693017236964125e-05, "loss": 0.5784, "step": 3660 }, { "epoch": 4.163357912648894, "grad_norm": 0.9958694577217102, "learning_rate": 1.6491565690181765e-05, "loss": 0.6388, "step": 3670 }, { "epoch": 4.1747022121384, "grad_norm": 0.9962863922119141, "learning_rate": 1.605820419223476e-05, "loss": 0.6541, "step": 3680 }, { "epoch": 4.186046511627907, "grad_norm": 1.1754194498062134, "learning_rate": 1.5630115095322827e-05, "loss": 0.6037, "step": 3690 }, { "epoch": 4.197390811117414, "grad_norm": 1.1034218072891235, "learning_rate": 1.5207325287808027e-05, "loss": 0.5844, "step": 3700 }, { "epoch": 4.20873511060692, "grad_norm": 1.0171332359313965, "learning_rate": 1.4789861325203013e-05, "loss": 0.6724, "step": 3710 }, { "epoch": 4.220079410096426, "grad_norm": 0.9791539907455444, "learning_rate": 1.4377749428503006e-05, "loss": 0.5989, "step": 3720 }, { "epoch": 4.231423709585933, "grad_norm": 0.9501050710678101, "learning_rate": 1.3971015482538963e-05, "loss": 0.5911, "step": 3730 }, { "epoch": 4.24276800907544, "grad_norm": 1.2614890336990356, "learning_rate": 1.3569685034351554e-05, "loss": 0.5849, "step": 3740 }, { "epoch": 4.254112308564946, "grad_norm": 1.0194411277770996, "learning_rate": 1.3173783291586772e-05, "loss": 0.5976, "step": 3750 }, { "epoch": 4.2654566080544525, "grad_norm": 1.0711522102355957, "learning_rate": 1.2783335120912565e-05, "loss": 0.5931, "step": 3760 }, { "epoch": 4.276800907543959, "grad_norm": 0.8650385141372681, "learning_rate": 1.2398365046456783e-05, "loss": 0.6078, "step": 3770 }, { "epoch": 4.288145207033466, "grad_norm": 0.823208749294281, "learning_rate": 1.2018897248267103e-05, "loss": 0.5961, "step": 3780 }, { "epoch": 4.299489506522972, "grad_norm": 0.9447870850563049, "learning_rate": 1.1644955560791993e-05, "loss": 0.6468, "step": 3790 }, { "epoch": 4.310833806012479, "grad_norm": 1.102318525314331, "learning_rate": 1.1276563471383883e-05, "loss": 0.588, "step": 3800 }, { "epoch": 4.322178105501985, "grad_norm": 0.9916651248931885, "learning_rate": 1.0913744118823866e-05, "loss": 0.6188, "step": 3810 }, { "epoch": 4.333522404991491, "grad_norm": 1.1987171173095703, "learning_rate": 1.05565202918682e-05, "loss": 0.5841, "step": 3820 }, { "epoch": 4.3448667044809985, "grad_norm": 0.9708378911018372, "learning_rate": 1.0204914427817158e-05, "loss": 0.6023, "step": 3830 }, { "epoch": 4.356211003970505, "grad_norm": 1.0048896074295044, "learning_rate": 9.8589486111056e-06, "loss": 0.5705, "step": 3840 }, { "epoch": 4.367555303460011, "grad_norm": 0.8364105820655823, "learning_rate": 9.518644571915847e-06, "loss": 0.5872, "step": 3850 }, { "epoch": 4.3788996029495175, "grad_norm": 1.5254448652267456, "learning_rate": 9.184023684812926e-06, "loss": 0.6063, "step": 3860 }, { "epoch": 4.390243902439025, "grad_norm": 0.993635356426239, "learning_rate": 8.855106967401839e-06, "loss": 0.5311, "step": 3870 }, { "epoch": 4.401588201928531, "grad_norm": 0.8678284883499146, "learning_rate": 8.531915079007625e-06, "loss": 0.5894, "step": 3880 }, { "epoch": 4.412932501418037, "grad_norm": 1.081127643585205, "learning_rate": 8.214468319377633e-06, "loss": 0.5906, "step": 3890 }, { "epoch": 4.424276800907544, "grad_norm": 0.9130728840827942, "learning_rate": 7.902786627406477e-06, "loss": 0.5764, "step": 3900 }, { "epoch": 4.43562110039705, "grad_norm": 0.9263814091682434, "learning_rate": 7.596889579883826e-06, "loss": 0.5812, "step": 3910 }, { "epoch": 4.446965399886557, "grad_norm": 1.095747947692871, "learning_rate": 7.296796390264549e-06, "loss": 0.5721, "step": 3920 }, { "epoch": 4.458309699376064, "grad_norm": 0.8003553152084351, "learning_rate": 7.002525907462121e-06, "loss": 0.5882, "step": 3930 }, { "epoch": 4.46965399886557, "grad_norm": 0.8841357231140137, "learning_rate": 6.7140966146646e-06, "loss": 0.5543, "step": 3940 }, { "epoch": 4.480998298355076, "grad_norm": 0.8580918312072754, "learning_rate": 6.431526628173701e-06, "loss": 0.6549, "step": 3950 }, { "epoch": 4.4923425978445835, "grad_norm": 0.9447335004806519, "learning_rate": 6.154833696267015e-06, "loss": 0.6516, "step": 3960 }, { "epoch": 4.50368689733409, "grad_norm": 1.0485211610794067, "learning_rate": 5.884035198083071e-06, "loss": 0.579, "step": 3970 }, { "epoch": 4.515031196823596, "grad_norm": 0.9394044876098633, "learning_rate": 5.619148142529873e-06, "loss": 0.6396, "step": 3980 }, { "epoch": 4.526375496313102, "grad_norm": 0.93062824010849, "learning_rate": 5.360189167216545e-06, "loss": 0.6005, "step": 3990 }, { "epoch": 4.537719795802609, "grad_norm": 0.9513915777206421, "learning_rate": 5.107174537408233e-06, "loss": 0.5743, "step": 4000 }, { "epoch": 4.537719795802609, "eval_loss": 1.0443100929260254, "eval_runtime": 15.6805, "eval_samples_per_second": 94.704, "eval_steps_per_second": 11.862, "step": 4000 }, { "epoch": 4.549064095292116, "grad_norm": 0.9627020359039307, "learning_rate": 4.8601201450046316e-06, "loss": 0.6077, "step": 4010 }, { "epoch": 4.560408394781622, "grad_norm": 0.8539467453956604, "learning_rate": 4.619041507541688e-06, "loss": 0.5812, "step": 4020 }, { "epoch": 4.571752694271129, "grad_norm": 0.9446848630905151, "learning_rate": 4.383953767216964e-06, "loss": 0.624, "step": 4030 }, { "epoch": 4.583096993760635, "grad_norm": 1.188366174697876, "learning_rate": 4.154871689938633e-06, "loss": 0.6437, "step": 4040 }, { "epoch": 4.594441293250142, "grad_norm": 1.0908474922180176, "learning_rate": 3.931809664397867e-06, "loss": 0.6323, "step": 4050 }, { "epoch": 4.6057855927396485, "grad_norm": 0.9742168188095093, "learning_rate": 3.714781701165304e-06, "loss": 0.6132, "step": 4060 }, { "epoch": 4.617129892229155, "grad_norm": 0.8761405348777771, "learning_rate": 3.503801431810816e-06, "loss": 0.624, "step": 4070 }, { "epoch": 4.628474191718661, "grad_norm": 0.996088445186615, "learning_rate": 3.298882108047463e-06, "loss": 0.6009, "step": 4080 }, { "epoch": 4.6398184912081675, "grad_norm": 0.9667827486991882, "learning_rate": 3.10003660089907e-06, "loss": 0.5988, "step": 4090 }, { "epoch": 4.651162790697675, "grad_norm": 0.9298661351203918, "learning_rate": 2.9072773998918503e-06, "loss": 0.6453, "step": 4100 }, { "epoch": 4.662507090187181, "grad_norm": 0.9182038307189941, "learning_rate": 2.7206166122698774e-06, "loss": 0.5915, "step": 4110 }, { "epoch": 4.673851389676687, "grad_norm": 0.835645318031311, "learning_rate": 2.540065962234683e-06, "loss": 0.6515, "step": 4120 }, { "epoch": 4.685195689166194, "grad_norm": 0.8575255274772644, "learning_rate": 2.3656367902088026e-06, "loss": 0.6169, "step": 4130 }, { "epoch": 4.696539988655701, "grad_norm": 0.9075832962989807, "learning_rate": 2.19734005212352e-06, "loss": 0.6166, "step": 4140 }, { "epoch": 4.707884288145207, "grad_norm": 2.0740888118743896, "learning_rate": 2.035186318730742e-06, "loss": 0.5779, "step": 4150 }, { "epoch": 4.7192285876347135, "grad_norm": 1.0293558835983276, "learning_rate": 1.8791857749389741e-06, "loss": 0.6414, "step": 4160 }, { "epoch": 4.73057288712422, "grad_norm": 0.9525774121284485, "learning_rate": 1.7293482191736877e-06, "loss": 0.5802, "step": 4170 }, { "epoch": 4.741917186613726, "grad_norm": 0.9085150957107544, "learning_rate": 1.5856830627618001e-06, "loss": 0.6331, "step": 4180 }, { "epoch": 4.753261486103233, "grad_norm": 0.9908912777900696, "learning_rate": 1.4481993293406048e-06, "loss": 0.5844, "step": 4190 }, { "epoch": 4.76460578559274, "grad_norm": 0.7421241998672485, "learning_rate": 1.316905654291012e-06, "loss": 0.6653, "step": 4200 }, { "epoch": 4.775950085082246, "grad_norm": 0.857502281665802, "learning_rate": 1.1918102841950607e-06, "loss": 0.5693, "step": 4210 }, { "epoch": 4.787294384571752, "grad_norm": 0.9300210475921631, "learning_rate": 1.0729210763180564e-06, "loss": 0.5755, "step": 4220 }, { "epoch": 4.79863868406126, "grad_norm": 1.2351378202438354, "learning_rate": 9.602454981149977e-07, "loss": 0.618, "step": 4230 }, { "epoch": 4.809982983550766, "grad_norm": 1.24778151512146, "learning_rate": 8.537906267615415e-07, "loss": 0.5896, "step": 4240 }, { "epoch": 4.821327283040272, "grad_norm": 1.3560271263122559, "learning_rate": 7.535631487095352e-07, "loss": 0.5879, "step": 4250 }, { "epoch": 4.832671582529779, "grad_norm": 1.8108911514282227, "learning_rate": 6.59569359266976e-07, "loss": 0.5943, "step": 4260 }, { "epoch": 4.844015882019285, "grad_norm": 0.9743121862411499, "learning_rate": 5.718151622026379e-07, "loss": 0.6104, "step": 4270 }, { "epoch": 4.855360181508792, "grad_norm": 1.2035831212997437, "learning_rate": 4.903060693752348e-07, "loss": 0.608, "step": 4280 }, { "epoch": 4.866704480998298, "grad_norm": 0.9681785106658936, "learning_rate": 4.1504720038724187e-07, "loss": 0.5773, "step": 4290 }, { "epoch": 4.878048780487805, "grad_norm": 1.0151753425598145, "learning_rate": 3.4604328226333083e-07, "loss": 0.5609, "step": 4300 }, { "epoch": 4.889393079977311, "grad_norm": 1.0577515363693237, "learning_rate": 2.832986491534295e-07, "loss": 0.6435, "step": 4310 }, { "epoch": 4.900737379466818, "grad_norm": 0.8938112854957581, "learning_rate": 2.2681724206052857e-07, "loss": 0.6398, "step": 4320 }, { "epoch": 4.912081678956325, "grad_norm": 0.997191846370697, "learning_rate": 1.7660260859315713e-07, "loss": 0.628, "step": 4330 }, { "epoch": 4.923425978445831, "grad_norm": 0.8382704257965088, "learning_rate": 1.3265790274249456e-07, "loss": 0.6105, "step": 4340 }, { "epoch": 4.934770277935337, "grad_norm": 0.8330470323562622, "learning_rate": 9.498588468433989e-08, "loss": 0.5982, "step": 4350 }, { "epoch": 4.946114577424844, "grad_norm": 1.2183622121810913, "learning_rate": 6.35889206057172e-08, "loss": 0.5876, "step": 4360 }, { "epoch": 4.957458876914351, "grad_norm": 1.131373405456543, "learning_rate": 3.846898255622788e-08, "loss": 0.6113, "step": 4370 }, { "epoch": 4.968803176403857, "grad_norm": 1.1781286001205444, "learning_rate": 1.9627648324227476e-08, "loss": 0.5522, "step": 4380 }, { "epoch": 4.9801474758933635, "grad_norm": 1.2726503610610962, "learning_rate": 7.066101337682707e-09, "loss": 0.6312, "step": 4390 }, { "epoch": 4.99149177538287, "grad_norm": 1.1971274614334106, "learning_rate": 7.85130589897598e-10, "loss": 0.6052, "step": 4400 }, { "epoch": 4.997163925127623, "step": 4405, "total_flos": 9.40234358432727e+17, "train_loss": 0.7921485962039632, "train_runtime": 4193.8899, "train_samples_per_second": 33.618, "train_steps_per_second": 1.05 } ], "logging_steps": 10, "max_steps": 4405, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.40234358432727e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }