{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 2094, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009551098376313277, "grad_norm": 2.2569775581359863, "learning_rate": 1.9914040114613182e-05, "loss": 1.0939, "step": 10 }, { "epoch": 0.019102196752626553, "grad_norm": 2.2780814170837402, "learning_rate": 1.981852913085005e-05, "loss": 1.0778, "step": 20 }, { "epoch": 0.02865329512893983, "grad_norm": 6.2301554679870605, "learning_rate": 1.9723018147086915e-05, "loss": 0.9625, "step": 30 }, { "epoch": 0.038204393505253106, "grad_norm": 24.95392417907715, "learning_rate": 1.9627507163323785e-05, "loss": 0.8937, "step": 40 }, { "epoch": 0.04775549188156638, "grad_norm": 3.5438709259033203, "learning_rate": 1.9531996179560652e-05, "loss": 0.7843, "step": 50 }, { "epoch": 0.05730659025787966, "grad_norm": 3.102586269378662, "learning_rate": 1.943648519579752e-05, "loss": 0.6911, "step": 60 }, { "epoch": 0.06685768863419293, "grad_norm": 1.7375000715255737, "learning_rate": 1.9340974212034385e-05, "loss": 0.5772, "step": 70 }, { "epoch": 0.07640878701050621, "grad_norm": 4.719320774078369, "learning_rate": 1.9245463228271252e-05, "loss": 0.5894, "step": 80 }, { "epoch": 0.08595988538681948, "grad_norm": 1.8283956050872803, "learning_rate": 1.9149952244508122e-05, "loss": 0.405, "step": 90 }, { "epoch": 0.09551098376313276, "grad_norm": 27.354843139648438, "learning_rate": 1.905444126074499e-05, "loss": 0.4705, "step": 100 }, { "epoch": 0.10506208213944604, "grad_norm": 4.766273021697998, "learning_rate": 1.8958930276981855e-05, "loss": 0.3644, "step": 110 }, { "epoch": 0.11461318051575932, "grad_norm": 4.26953125, "learning_rate": 1.8863419293218722e-05, "loss": 0.2284, "step": 120 }, { "epoch": 0.12416427889207259, "grad_norm": 0.7594988942146301, "learning_rate": 1.876790830945559e-05, "loss": 0.2435, "step": 130 
}, { "epoch": 0.13371537726838587, "grad_norm": 1.365365743637085, "learning_rate": 1.8672397325692455e-05, "loss": 0.2184, "step": 140 }, { "epoch": 0.14326647564469913, "grad_norm": 9.257822036743164, "learning_rate": 1.857688634192932e-05, "loss": 0.2248, "step": 150 }, { "epoch": 0.15281757402101243, "grad_norm": 0.5103208422660828, "learning_rate": 1.848137535816619e-05, "loss": 0.243, "step": 160 }, { "epoch": 0.1623686723973257, "grad_norm": null, "learning_rate": 1.8385864374403058e-05, "loss": 0.2442, "step": 170 }, { "epoch": 0.17191977077363896, "grad_norm": 0.6302635669708252, "learning_rate": 1.8290353390639925e-05, "loss": 0.4167, "step": 180 }, { "epoch": 0.18147086914995225, "grad_norm": 4.381824493408203, "learning_rate": 1.819484240687679e-05, "loss": 0.2861, "step": 190 }, { "epoch": 0.19102196752626552, "grad_norm": 0.35794690251350403, "learning_rate": 1.8099331423113658e-05, "loss": 0.1059, "step": 200 }, { "epoch": 0.20057306590257878, "grad_norm": 0.4472959637641907, "learning_rate": 1.8003820439350528e-05, "loss": 0.0896, "step": 210 }, { "epoch": 0.21012416427889208, "grad_norm": 0.2821422517299652, "learning_rate": 1.7908309455587395e-05, "loss": 0.0837, "step": 220 }, { "epoch": 0.21967526265520534, "grad_norm": 26.638248443603516, "learning_rate": 1.781279847182426e-05, "loss": 0.1115, "step": 230 }, { "epoch": 0.22922636103151864, "grad_norm": 1.5657349824905396, "learning_rate": 1.7717287488061128e-05, "loss": 0.1829, "step": 240 }, { "epoch": 0.2387774594078319, "grad_norm": 0.2336825728416443, "learning_rate": 1.7621776504297995e-05, "loss": 0.0556, "step": 250 }, { "epoch": 0.24832855778414517, "grad_norm": 4.114994525909424, "learning_rate": 1.752626552053486e-05, "loss": 0.1532, "step": 260 }, { "epoch": 0.25787965616045844, "grad_norm": 13.862896919250488, "learning_rate": 1.743075453677173e-05, "loss": 0.2155, "step": 270 }, { "epoch": 0.26743075453677173, "grad_norm": 0.21343673765659332, "learning_rate": 
1.7335243553008598e-05, "loss": 0.0374, "step": 280 }, { "epoch": 0.276981852913085, "grad_norm": 0.1753835529088974, "learning_rate": 1.7239732569245464e-05, "loss": 0.108, "step": 290 }, { "epoch": 0.28653295128939826, "grad_norm": 0.24574324488639832, "learning_rate": 1.714422158548233e-05, "loss": 0.1175, "step": 300 }, { "epoch": 0.29608404966571156, "grad_norm": 23.268014907836914, "learning_rate": 1.7048710601719198e-05, "loss": 0.1532, "step": 310 }, { "epoch": 0.30563514804202485, "grad_norm": 0.1591091752052307, "learning_rate": 1.6953199617956068e-05, "loss": 0.0994, "step": 320 }, { "epoch": 0.3151862464183381, "grad_norm": 0.17243853211402893, "learning_rate": 1.6857688634192934e-05, "loss": 0.083, "step": 330 }, { "epoch": 0.3247373447946514, "grad_norm": 2.6030852794647217, "learning_rate": 1.67621776504298e-05, "loss": 0.2142, "step": 340 }, { "epoch": 0.3342884431709647, "grad_norm": 0.15623551607131958, "learning_rate": 1.6666666666666667e-05, "loss": 0.0821, "step": 350 }, { "epoch": 0.3438395415472779, "grad_norm": 1.7919316291809082, "learning_rate": 1.6571155682903534e-05, "loss": 0.1883, "step": 360 }, { "epoch": 0.3533906399235912, "grad_norm": 0.2467084378004074, "learning_rate": 1.6475644699140404e-05, "loss": 0.083, "step": 370 }, { "epoch": 0.3629417382999045, "grad_norm": 1.9838752746582031, "learning_rate": 1.6380133715377267e-05, "loss": 0.0919, "step": 380 }, { "epoch": 0.37249283667621774, "grad_norm": 0.13745100796222687, "learning_rate": 1.6284622731614137e-05, "loss": 0.2578, "step": 390 }, { "epoch": 0.38204393505253104, "grad_norm": 23.14975929260254, "learning_rate": 1.6189111747851004e-05, "loss": 0.0609, "step": 400 }, { "epoch": 0.39159503342884433, "grad_norm": 0.16147832572460175, "learning_rate": 1.609360076408787e-05, "loss": 0.1865, "step": 410 }, { "epoch": 0.40114613180515757, "grad_norm": 8.0061616897583, "learning_rate": 1.599808978032474e-05, "loss": 0.1216, "step": 420 }, { "epoch": 0.41069723018147086, 
"grad_norm": 1.0904359817504883, "learning_rate": 1.5902578796561604e-05, "loss": 0.137, "step": 430 }, { "epoch": 0.42024832855778416, "grad_norm": 2.3162689208984375, "learning_rate": 1.5807067812798474e-05, "loss": 0.1358, "step": 440 }, { "epoch": 0.4297994269340974, "grad_norm": 0.11886035650968552, "learning_rate": 1.571155682903534e-05, "loss": 0.0733, "step": 450 }, { "epoch": 0.4393505253104107, "grad_norm": 7.155008792877197, "learning_rate": 1.5616045845272207e-05, "loss": 0.1238, "step": 460 }, { "epoch": 0.448901623686724, "grad_norm": 0.11069323867559433, "learning_rate": 1.5520534861509077e-05, "loss": 0.1987, "step": 470 }, { "epoch": 0.4584527220630373, "grad_norm": 0.10282690078020096, "learning_rate": 1.542502387774594e-05, "loss": 0.091, "step": 480 }, { "epoch": 0.4680038204393505, "grad_norm": 0.1403094232082367, "learning_rate": 1.532951289398281e-05, "loss": 0.1213, "step": 490 }, { "epoch": 0.4775549188156638, "grad_norm": 0.1331929713487625, "learning_rate": 1.5234001910219675e-05, "loss": 0.0154, "step": 500 }, { "epoch": 0.4871060171919771, "grad_norm": 0.11407709866762161, "learning_rate": 1.5138490926456543e-05, "loss": 0.1221, "step": 510 }, { "epoch": 0.49665711556829034, "grad_norm": 0.5473654270172119, "learning_rate": 1.5042979942693412e-05, "loss": 0.1143, "step": 520 }, { "epoch": 0.5062082139446036, "grad_norm": 0.11086848378181458, "learning_rate": 1.4947468958930278e-05, "loss": 0.0141, "step": 530 }, { "epoch": 0.5157593123209169, "grad_norm": 78.81968688964844, "learning_rate": 1.4851957975167147e-05, "loss": 0.1552, "step": 540 }, { "epoch": 0.5253104106972302, "grad_norm": 6.011876106262207, "learning_rate": 1.4756446991404012e-05, "loss": 0.1232, "step": 550 }, { "epoch": 0.5348615090735435, "grad_norm": 0.15130910277366638, "learning_rate": 1.466093600764088e-05, "loss": 0.1164, "step": 560 }, { "epoch": 0.5444126074498568, "grad_norm": 0.0998225063085556, "learning_rate": 1.4565425023877747e-05, "loss": 0.1248, "step": 
570 }, { "epoch": 0.55396370582617, "grad_norm": 0.09413418173789978, "learning_rate": 1.4469914040114615e-05, "loss": 0.1258, "step": 580 }, { "epoch": 0.5635148042024832, "grad_norm": 30.505067825317383, "learning_rate": 1.4374403056351483e-05, "loss": 0.0788, "step": 590 }, { "epoch": 0.5730659025787965, "grad_norm": 0.10750491917133331, "learning_rate": 1.4278892072588348e-05, "loss": 0.1026, "step": 600 }, { "epoch": 0.5826170009551098, "grad_norm": 0.08975467830896378, "learning_rate": 1.4183381088825216e-05, "loss": 0.2408, "step": 610 }, { "epoch": 0.5921680993314231, "grad_norm": 0.12342803925275803, "learning_rate": 1.4087870105062083e-05, "loss": 0.0717, "step": 620 }, { "epoch": 0.6017191977077364, "grad_norm": 0.07816806435585022, "learning_rate": 1.3992359121298951e-05, "loss": 0.1131, "step": 630 }, { "epoch": 0.6112702960840497, "grad_norm": 0.06454802304506302, "learning_rate": 1.389684813753582e-05, "loss": 0.0096, "step": 640 }, { "epoch": 0.620821394460363, "grad_norm": 0.08184290677309036, "learning_rate": 1.3801337153772685e-05, "loss": 0.1253, "step": 650 }, { "epoch": 0.6303724928366762, "grad_norm": 0.07238755375146866, "learning_rate": 1.3705826170009553e-05, "loss": 0.0805, "step": 660 }, { "epoch": 0.6399235912129895, "grad_norm": 1.8935270309448242, "learning_rate": 1.361031518624642e-05, "loss": 0.1435, "step": 670 }, { "epoch": 0.6494746895893028, "grad_norm": 17.36036491394043, "learning_rate": 1.3514804202483288e-05, "loss": 0.0125, "step": 680 }, { "epoch": 0.6590257879656161, "grad_norm": 0.07514392584562302, "learning_rate": 1.3419293218720153e-05, "loss": 0.0106, "step": 690 }, { "epoch": 0.6685768863419294, "grad_norm": 4.228085041046143, "learning_rate": 1.3323782234957021e-05, "loss": 0.2031, "step": 700 }, { "epoch": 0.6781279847182426, "grad_norm": 0.21097038686275482, "learning_rate": 1.322827125119389e-05, "loss": 0.1307, "step": 710 }, { "epoch": 0.6876790830945558, "grad_norm": 0.10752805322408676, "learning_rate": 
1.3132760267430756e-05, "loss": 0.0989, "step": 720 }, { "epoch": 0.6972301814708691, "grad_norm": 51.11091995239258, "learning_rate": 1.3037249283667624e-05, "loss": 0.1572, "step": 730 }, { "epoch": 0.7067812798471824, "grad_norm": 0.08639833331108093, "learning_rate": 1.2941738299904489e-05, "loss": 0.0508, "step": 740 }, { "epoch": 0.7163323782234957, "grad_norm": 0.09963525086641312, "learning_rate": 1.2846227316141357e-05, "loss": 0.0763, "step": 750 }, { "epoch": 0.725883476599809, "grad_norm": 0.0683053508400917, "learning_rate": 1.2750716332378224e-05, "loss": 0.0567, "step": 760 }, { "epoch": 0.7354345749761223, "grad_norm": 40.62727737426758, "learning_rate": 1.2655205348615092e-05, "loss": 0.2471, "step": 770 }, { "epoch": 0.7449856733524355, "grad_norm": 0.07331220805644989, "learning_rate": 1.2559694364851959e-05, "loss": 0.1743, "step": 780 }, { "epoch": 0.7545367717287488, "grad_norm": 0.06410760432481766, "learning_rate": 1.2464183381088826e-05, "loss": 0.0624, "step": 790 }, { "epoch": 0.7640878701050621, "grad_norm": 0.11088142544031143, "learning_rate": 1.2368672397325694e-05, "loss": 0.0087, "step": 800 }, { "epoch": 0.7736389684813754, "grad_norm": 0.05074993893504143, "learning_rate": 1.227316141356256e-05, "loss": 0.1333, "step": 810 }, { "epoch": 0.7831900668576887, "grad_norm": 0.05052105337381363, "learning_rate": 1.2177650429799429e-05, "loss": 0.0829, "step": 820 }, { "epoch": 0.792741165234002, "grad_norm": 0.06240995600819588, "learning_rate": 1.2082139446036295e-05, "loss": 0.0074, "step": 830 }, { "epoch": 0.8022922636103151, "grad_norm": 0.06128745898604393, "learning_rate": 1.1986628462273162e-05, "loss": 0.0705, "step": 840 }, { "epoch": 0.8118433619866284, "grad_norm": 4.010462760925293, "learning_rate": 1.189111747851003e-05, "loss": 0.1351, "step": 850 }, { "epoch": 0.8213944603629417, "grad_norm": 0.07143828272819519, "learning_rate": 1.1795606494746897e-05, "loss": 0.0514, "step": 860 }, { "epoch": 0.830945558739255, 
"grad_norm": 0.06396259367465973, "learning_rate": 1.1700095510983764e-05, "loss": 0.0713, "step": 870 }, { "epoch": 0.8404966571155683, "grad_norm": 14.529672622680664, "learning_rate": 1.160458452722063e-05, "loss": 0.0513, "step": 880 }, { "epoch": 0.8500477554918816, "grad_norm": 1.4704737663269043, "learning_rate": 1.1509073543457498e-05, "loss": 0.353, "step": 890 }, { "epoch": 0.8595988538681948, "grad_norm": 0.06813743710517883, "learning_rate": 1.1413562559694367e-05, "loss": 0.144, "step": 900 }, { "epoch": 0.8691499522445081, "grad_norm": 0.7163823843002319, "learning_rate": 1.1318051575931233e-05, "loss": 0.0415, "step": 910 }, { "epoch": 0.8787010506208214, "grad_norm": 0.05734021216630936, "learning_rate": 1.12225405921681e-05, "loss": 0.1911, "step": 920 }, { "epoch": 0.8882521489971347, "grad_norm": 0.06162785366177559, "learning_rate": 1.1127029608404967e-05, "loss": 0.0711, "step": 930 }, { "epoch": 0.897803247373448, "grad_norm": 0.1327008605003357, "learning_rate": 1.1031518624641835e-05, "loss": 0.058, "step": 940 }, { "epoch": 0.9073543457497613, "grad_norm": 0.05325314775109291, "learning_rate": 1.0936007640878703e-05, "loss": 0.0376, "step": 950 }, { "epoch": 0.9169054441260746, "grad_norm": 0.7295445799827576, "learning_rate": 1.0840496657115568e-05, "loss": 0.083, "step": 960 }, { "epoch": 0.9264565425023877, "grad_norm": 0.0540502592921257, "learning_rate": 1.0744985673352436e-05, "loss": 0.1386, "step": 970 }, { "epoch": 0.936007640878701, "grad_norm": 0.1747369021177292, "learning_rate": 1.0649474689589303e-05, "loss": 0.0063, "step": 980 }, { "epoch": 0.9455587392550143, "grad_norm": 0.04095704108476639, "learning_rate": 1.0553963705826171e-05, "loss": 0.0961, "step": 990 }, { "epoch": 0.9551098376313276, "grad_norm": 1.786160945892334, "learning_rate": 1.0458452722063038e-05, "loss": 0.2077, "step": 1000 }, { "epoch": 0.9646609360076409, "grad_norm": 0.057904984802007675, "learning_rate": 1.0362941738299905e-05, "loss": 0.1009, 
"step": 1010 }, { "epoch": 0.9742120343839542, "grad_norm": 0.04530341923236847, "learning_rate": 1.0267430754536773e-05, "loss": 0.0077, "step": 1020 }, { "epoch": 0.9837631327602674, "grad_norm": 0.04884221404790878, "learning_rate": 1.017191977077364e-05, "loss": 0.0804, "step": 1030 }, { "epoch": 0.9933142311365807, "grad_norm": 5.464759349822998, "learning_rate": 1.0076408787010508e-05, "loss": 0.1108, "step": 1040 }, { "epoch": 1.0, "eval_loss": 0.10391418635845184, "eval_runtime": 1.2229, "eval_samples_per_second": 760.514, "eval_steps_per_second": 95.678, "step": 1047 }, { "epoch": 1.002865329512894, "grad_norm": 0.054792579263448715, "learning_rate": 9.980897803247374e-06, "loss": 0.0723, "step": 1050 }, { "epoch": 1.0124164278892072, "grad_norm": 2.5259604454040527, "learning_rate": 9.885386819484241e-06, "loss": 0.074, "step": 1060 }, { "epoch": 1.0219675262655206, "grad_norm": 0.06726188212633133, "learning_rate": 9.78987583572111e-06, "loss": 0.0709, "step": 1070 }, { "epoch": 1.0315186246418337, "grad_norm": 0.05034675449132919, "learning_rate": 9.694364851957976e-06, "loss": 0.0072, "step": 1080 }, { "epoch": 1.0410697230181472, "grad_norm": 1.9011842012405396, "learning_rate": 9.598853868194843e-06, "loss": 0.0803, "step": 1090 }, { "epoch": 1.0506208213944603, "grad_norm": 0.05382240563631058, "learning_rate": 9.50334288443171e-06, "loss": 0.0247, "step": 1100 }, { "epoch": 1.0601719197707737, "grad_norm": 0.22710120677947998, "learning_rate": 9.407831900668578e-06, "loss": 0.0061, "step": 1110 }, { "epoch": 1.069723018147087, "grad_norm": 0.042488373816013336, "learning_rate": 9.312320916905446e-06, "loss": 0.1183, "step": 1120 }, { "epoch": 1.0792741165234, "grad_norm": 2.047455072402954, "learning_rate": 9.216809933142312e-06, "loss": 0.0765, "step": 1130 }, { "epoch": 1.0888252148997135, "grad_norm": 61.58501052856445, "learning_rate": 9.121298949379179e-06, "loss": 0.2333, "step": 1140 }, { "epoch": 1.0983763132760267, "grad_norm": 
0.06981759518384933, "learning_rate": 9.025787965616046e-06, "loss": 0.2329, "step": 1150 }, { "epoch": 1.10792741165234, "grad_norm": 0.09120076149702072, "learning_rate": 8.930276981852914e-06, "loss": 0.0239, "step": 1160 }, { "epoch": 1.1174785100286533, "grad_norm": 2.0684778690338135, "learning_rate": 8.834765998089782e-06, "loss": 0.0902, "step": 1170 }, { "epoch": 1.1270296084049667, "grad_norm": 0.08632172644138336, "learning_rate": 8.739255014326649e-06, "loss": 0.1149, "step": 1180 }, { "epoch": 1.1365807067812799, "grad_norm": 0.6690634489059448, "learning_rate": 8.643744030563516e-06, "loss": 0.0071, "step": 1190 }, { "epoch": 1.146131805157593, "grad_norm": 0.09388808161020279, "learning_rate": 8.548233046800382e-06, "loss": 0.0717, "step": 1200 }, { "epoch": 1.1556829035339065, "grad_norm": 0.06623850017786026, "learning_rate": 8.45272206303725e-06, "loss": 0.0866, "step": 1210 }, { "epoch": 1.1652340019102196, "grad_norm": 0.05635674670338631, "learning_rate": 8.357211079274117e-06, "loss": 0.0914, "step": 1220 }, { "epoch": 1.174785100286533, "grad_norm": 0.0588347390294075, "learning_rate": 8.261700095510985e-06, "loss": 0.0461, "step": 1230 }, { "epoch": 1.1843361986628462, "grad_norm": 0.03934504836797714, "learning_rate": 8.166189111747852e-06, "loss": 0.1019, "step": 1240 }, { "epoch": 1.1938872970391594, "grad_norm": 0.051371876150369644, "learning_rate": 8.070678127984719e-06, "loss": 0.016, "step": 1250 }, { "epoch": 1.2034383954154728, "grad_norm": 0.061191458255052567, "learning_rate": 7.975167144221587e-06, "loss": 0.0077, "step": 1260 }, { "epoch": 1.212989493791786, "grad_norm": 0.0495857410132885, "learning_rate": 7.879656160458454e-06, "loss": 0.0051, "step": 1270 }, { "epoch": 1.2225405921680994, "grad_norm": 0.04128009453415871, "learning_rate": 7.78414517669532e-06, "loss": 0.317, "step": 1280 }, { "epoch": 1.2320916905444126, "grad_norm": 0.03453819081187248, "learning_rate": 7.688634192932188e-06, "loss": 0.0608, "step": 1290 }, 
{ "epoch": 1.2416427889207258, "grad_norm": 0.03668952360749245, "learning_rate": 7.593123209169055e-06, "loss": 0.0264, "step": 1300 }, { "epoch": 1.2511938872970392, "grad_norm": 0.03199330344796181, "learning_rate": 7.4976122254059225e-06, "loss": 0.0047, "step": 1310 }, { "epoch": 1.2607449856733524, "grad_norm": 1.9187037944793701, "learning_rate": 7.402101241642789e-06, "loss": 0.1462, "step": 1320 }, { "epoch": 1.2702960840496658, "grad_norm": 0.04411700740456581, "learning_rate": 7.306590257879657e-06, "loss": 0.0955, "step": 1330 }, { "epoch": 1.279847182425979, "grad_norm": 0.03471948206424713, "learning_rate": 7.211079274116523e-06, "loss": 0.0062, "step": 1340 }, { "epoch": 1.2893982808022924, "grad_norm": 0.042391568422317505, "learning_rate": 7.115568290353391e-06, "loss": 0.006, "step": 1350 }, { "epoch": 1.2989493791786055, "grad_norm": 0.04176805168390274, "learning_rate": 7.020057306590259e-06, "loss": 0.1142, "step": 1360 }, { "epoch": 1.3085004775549187, "grad_norm": 0.06825416535139084, "learning_rate": 6.924546322827126e-06, "loss": 0.1209, "step": 1370 }, { "epoch": 1.3180515759312321, "grad_norm": 0.04017266258597374, "learning_rate": 6.829035339063993e-06, "loss": 0.0045, "step": 1380 }, { "epoch": 1.3276026743075453, "grad_norm": 0.0732770562171936, "learning_rate": 6.73352435530086e-06, "loss": 0.1188, "step": 1390 }, { "epoch": 1.3371537726838587, "grad_norm": 0.04319130256772041, "learning_rate": 6.638013371537727e-06, "loss": 0.007, "step": 1400 }, { "epoch": 1.346704871060172, "grad_norm": 0.08936483412981033, "learning_rate": 6.542502387774594e-06, "loss": 0.0118, "step": 1410 }, { "epoch": 1.3562559694364853, "grad_norm": 0.035478316247463226, "learning_rate": 6.446991404011462e-06, "loss": 0.0048, "step": 1420 }, { "epoch": 1.3658070678127985, "grad_norm": 0.10484705865383148, "learning_rate": 6.3514804202483295e-06, "loss": 0.0056, "step": 1430 }, { "epoch": 1.3753581661891117, "grad_norm": 0.03144150972366333, "learning_rate": 
6.255969436485196e-06, "loss": 0.0758, "step": 1440 }, { "epoch": 1.384909264565425, "grad_norm": 0.12434408813714981, "learning_rate": 6.160458452722064e-06, "loss": 0.0049, "step": 1450 }, { "epoch": 1.3944603629417383, "grad_norm": 3.348506212234497, "learning_rate": 6.06494746895893e-06, "loss": 0.0309, "step": 1460 }, { "epoch": 1.4040114613180517, "grad_norm": 0.03951037675142288, "learning_rate": 5.969436485195798e-06, "loss": 0.0538, "step": 1470 }, { "epoch": 1.4135625596943648, "grad_norm": 0.04269490763545036, "learning_rate": 5.873925501432666e-06, "loss": 0.0044, "step": 1480 }, { "epoch": 1.4231136580706782, "grad_norm": 0.760600745677948, "learning_rate": 5.778414517669533e-06, "loss": 0.0973, "step": 1490 }, { "epoch": 1.4326647564469914, "grad_norm": 0.03149113059043884, "learning_rate": 5.6829035339064e-06, "loss": 0.0346, "step": 1500 }, { "epoch": 1.4422158548233046, "grad_norm": 0.05680393800139427, "learning_rate": 5.587392550143267e-06, "loss": 0.0043, "step": 1510 }, { "epoch": 1.451766953199618, "grad_norm": 0.03370094299316406, "learning_rate": 5.491881566380134e-06, "loss": 0.1295, "step": 1520 }, { "epoch": 1.4613180515759312, "grad_norm": 0.045079704374074936, "learning_rate": 5.396370582617001e-06, "loss": 0.0736, "step": 1530 }, { "epoch": 1.4708691499522444, "grad_norm": 0.7708030343055725, "learning_rate": 5.300859598853869e-06, "loss": 0.0042, "step": 1540 }, { "epoch": 1.4804202483285578, "grad_norm": 0.04070596769452095, "learning_rate": 5.2053486150907365e-06, "loss": 0.0301, "step": 1550 }, { "epoch": 1.4899713467048712, "grad_norm": 0.033276643604040146, "learning_rate": 5.109837631327603e-06, "loss": 0.0696, "step": 1560 }, { "epoch": 1.4995224450811844, "grad_norm": 0.04143739864230156, "learning_rate": 5.014326647564471e-06, "loss": 0.0753, "step": 1570 }, { "epoch": 1.5090735434574976, "grad_norm": 0.032175932079553604, "learning_rate": 4.918815663801337e-06, "loss": 0.1394, "step": 1580 }, { "epoch": 1.518624641833811, 
"grad_norm": 0.045357052236795425, "learning_rate": 4.823304680038205e-06, "loss": 0.0066, "step": 1590 }, { "epoch": 1.5281757402101241, "grad_norm": 0.04212405905127525, "learning_rate": 4.727793696275072e-06, "loss": 0.0758, "step": 1600 }, { "epoch": 1.5377268385864373, "grad_norm": 0.06627684831619263, "learning_rate": 4.632282712511939e-06, "loss": 0.0945, "step": 1610 }, { "epoch": 1.5472779369627507, "grad_norm": 0.04194959998130798, "learning_rate": 4.536771728748807e-06, "loss": 0.0046, "step": 1620 }, { "epoch": 1.5568290353390641, "grad_norm": 0.04786338284611702, "learning_rate": 4.441260744985674e-06, "loss": 0.0045, "step": 1630 }, { "epoch": 1.5663801337153773, "grad_norm": 0.030701184645295143, "learning_rate": 4.345749761222541e-06, "loss": 0.0699, "step": 1640 }, { "epoch": 1.5759312320916905, "grad_norm": 0.043966639786958694, "learning_rate": 4.250238777459409e-06, "loss": 0.122, "step": 1650 }, { "epoch": 1.585482330468004, "grad_norm": 0.04325714334845543, "learning_rate": 4.154727793696275e-06, "loss": 0.0051, "step": 1660 }, { "epoch": 1.595033428844317, "grad_norm": 0.03839458152651787, "learning_rate": 4.059216809933143e-06, "loss": 0.0041, "step": 1670 }, { "epoch": 1.6045845272206303, "grad_norm": 0.02976052649319172, "learning_rate": 3.96370582617001e-06, "loss": 0.0563, "step": 1680 }, { "epoch": 1.6141356255969437, "grad_norm": 55.89206314086914, "learning_rate": 3.868194842406877e-06, "loss": 0.0258, "step": 1690 }, { "epoch": 1.623686723973257, "grad_norm": 0.057242073118686676, "learning_rate": 3.772683858643744e-06, "loss": 0.0042, "step": 1700 }, { "epoch": 1.63323782234957, "grad_norm": 0.0714183896780014, "learning_rate": 3.6771728748806117e-06, "loss": 0.005, "step": 1710 }, { "epoch": 1.6427889207258835, "grad_norm": 0.0358208492398262, "learning_rate": 3.5816618911174787e-06, "loss": 0.0739, "step": 1720 }, { "epoch": 1.6523400191021969, "grad_norm": 0.039776891469955444, "learning_rate": 3.4861509073543457e-06, "loss": 
0.0685, "step": 1730 }, { "epoch": 1.66189111747851, "grad_norm": 0.03394331783056259, "learning_rate": 3.3906399235912136e-06, "loss": 0.0794, "step": 1740 }, { "epoch": 1.6714422158548232, "grad_norm": 0.031364619731903076, "learning_rate": 3.2951289398280806e-06, "loss": 0.118, "step": 1750 }, { "epoch": 1.6809933142311366, "grad_norm": 2.0534205436706543, "learning_rate": 3.1996179560649477e-06, "loss": 0.1878, "step": 1760 }, { "epoch": 1.6905444126074498, "grad_norm": 0.038952384144067764, "learning_rate": 3.104106972301815e-06, "loss": 0.0738, "step": 1770 }, { "epoch": 1.700095510983763, "grad_norm": 0.029340583831071854, "learning_rate": 3.008595988538682e-06, "loss": 0.005, "step": 1780 }, { "epoch": 1.7096466093600764, "grad_norm": 0.04813091456890106, "learning_rate": 2.9130850047755492e-06, "loss": 0.0675, "step": 1790 }, { "epoch": 1.7191977077363898, "grad_norm": 0.05901302769780159, "learning_rate": 2.8175740210124163e-06, "loss": 0.0479, "step": 1800 }, { "epoch": 1.728748806112703, "grad_norm": 0.044173464179039, "learning_rate": 2.722063037249284e-06, "loss": 0.0423, "step": 1810 }, { "epoch": 1.7382999044890162, "grad_norm": 0.03839905560016632, "learning_rate": 2.626552053486151e-06, "loss": 0.1425, "step": 1820 }, { "epoch": 1.7478510028653296, "grad_norm": 0.059842657297849655, "learning_rate": 2.5310410697230182e-06, "loss": 0.0042, "step": 1830 }, { "epoch": 1.7574021012416428, "grad_norm": 0.24521498382091522, "learning_rate": 2.4355300859598857e-06, "loss": 0.0525, "step": 1840 }, { "epoch": 1.766953199617956, "grad_norm": 0.5483675003051758, "learning_rate": 2.3400191021967527e-06, "loss": 0.0752, "step": 1850 }, { "epoch": 1.7765042979942693, "grad_norm": 0.036952149122953415, "learning_rate": 2.24450811843362e-06, "loss": 0.1074, "step": 1860 }, { "epoch": 1.7860553963705827, "grad_norm": 0.8135057091712952, "learning_rate": 2.1489971346704872e-06, "loss": 0.0048, "step": 1870 }, { "epoch": 1.795606494746896, "grad_norm": 
0.06449055671691895, "learning_rate": 2.0534861509073547e-06, "loss": 0.1038, "step": 1880 }, { "epoch": 1.8051575931232091, "grad_norm": 107.41304779052734, "learning_rate": 1.9579751671442217e-06, "loss": 0.0318, "step": 1890 }, { "epoch": 1.8147086914995225, "grad_norm": 0.04807087033987045, "learning_rate": 1.862464183381089e-06, "loss": 0.1689, "step": 1900 }, { "epoch": 1.8242597898758357, "grad_norm": 0.04132077470421791, "learning_rate": 1.7669531996179562e-06, "loss": 0.0041, "step": 1910 }, { "epoch": 1.8338108882521489, "grad_norm": 0.06250818073749542, "learning_rate": 1.6714422158548235e-06, "loss": 0.0052, "step": 1920 }, { "epoch": 1.8433619866284623, "grad_norm": 83.07594299316406, "learning_rate": 1.5759312320916905e-06, "loss": 0.174, "step": 1930 }, { "epoch": 1.8529130850047757, "grad_norm": 0.04571348428726196, "learning_rate": 1.480420248328558e-06, "loss": 0.0049, "step": 1940 }, { "epoch": 1.8624641833810889, "grad_norm": 0.04580092057585716, "learning_rate": 1.3849092645654252e-06, "loss": 0.0638, "step": 1950 }, { "epoch": 1.872015281757402, "grad_norm": 0.04569645971059799, "learning_rate": 1.2893982808022922e-06, "loss": 0.0551, "step": 1960 }, { "epoch": 1.8815663801337155, "grad_norm": 0.049619242548942566, "learning_rate": 1.1938872970391597e-06, "loss": 0.0436, "step": 1970 }, { "epoch": 1.8911174785100286, "grad_norm": 0.059808436781167984, "learning_rate": 1.0983763132760267e-06, "loss": 0.0653, "step": 1980 }, { "epoch": 1.9006685768863418, "grad_norm": 0.038019582629203796, "learning_rate": 1.002865329512894e-06, "loss": 0.0632, "step": 1990 }, { "epoch": 1.9102196752626552, "grad_norm": 0.037652261555194855, "learning_rate": 9.073543457497613e-07, "loss": 0.0676, "step": 2000 }, { "epoch": 1.9197707736389686, "grad_norm": 0.0339687243103981, "learning_rate": 8.118433619866285e-07, "loss": 0.0048, "step": 2010 }, { "epoch": 1.9293218720152816, "grad_norm": 0.043730951845645905, "learning_rate": 7.163323782234957e-07, "loss": 
0.005, "step": 2020 }, { "epoch": 1.938872970391595, "grad_norm": 0.044012073427438736, "learning_rate": 6.20821394460363e-07, "loss": 0.0738, "step": 2030 }, { "epoch": 1.9484240687679084, "grad_norm": 0.34402212500572205, "learning_rate": 5.253104106972302e-07, "loss": 0.0062, "step": 2040 }, { "epoch": 1.9579751671442216, "grad_norm": 0.06250176578760147, "learning_rate": 4.2979942693409743e-07, "loss": 0.0665, "step": 2050 }, { "epoch": 1.9675262655205348, "grad_norm": 0.06154881417751312, "learning_rate": 3.342884431709647e-07, "loss": 0.0266, "step": 2060 }, { "epoch": 1.9770773638968482, "grad_norm": 0.038224026560783386, "learning_rate": 2.3877745940783193e-07, "loss": 0.0039, "step": 2070 }, { "epoch": 1.9866284622731614, "grad_norm": 0.668195903301239, "learning_rate": 1.4326647564469915e-07, "loss": 0.0162, "step": 2080 }, { "epoch": 1.9961795606494745, "grad_norm": 0.048277534544467926, "learning_rate": 4.775549188156639e-08, "loss": 0.0049, "step": 2090 }, { "epoch": 2.0, "eval_loss": 0.10492703318595886, "eval_runtime": 1.2317, "eval_samples_per_second": 755.069, "eval_steps_per_second": 94.993, "step": 2094 } ], "logging_steps": 10, "max_steps": 2094, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4403992385931264.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }