{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996706915477497, "eval_steps": 500, "global_step": 1365, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021953896816684962, "grad_norm": 37.74675866275334, "learning_rate": 3.64963503649635e-09, "loss": 1.4854, "step": 1 }, { "epoch": 0.010976948408342482, "grad_norm": 34.58078409329761, "learning_rate": 1.824817518248175e-08, "loss": 1.4947, "step": 5 }, { "epoch": 0.021953896816684963, "grad_norm": 33.64622164796075, "learning_rate": 3.64963503649635e-08, "loss": 1.4655, "step": 10 }, { "epoch": 0.03293084522502744, "grad_norm": 33.96621368420792, "learning_rate": 5.474452554744526e-08, "loss": 1.4454, "step": 15 }, { "epoch": 0.043907793633369926, "grad_norm": 32.64608823456708, "learning_rate": 7.2992700729927e-08, "loss": 1.441, "step": 20 }, { "epoch": 0.054884742041712405, "grad_norm": 28.7407961705511, "learning_rate": 9.124087591240875e-08, "loss": 1.4399, "step": 25 }, { "epoch": 0.06586169045005488, "grad_norm": 26.49133383604883, "learning_rate": 1.0948905109489052e-07, "loss": 1.3907, "step": 30 }, { "epoch": 0.07683863885839737, "grad_norm": 28.558137483756873, "learning_rate": 1.2773722627737227e-07, "loss": 1.3469, "step": 35 }, { "epoch": 0.08781558726673985, "grad_norm": 16.332214000825115, "learning_rate": 1.45985401459854e-07, "loss": 1.2666, "step": 40 }, { "epoch": 0.09879253567508232, "grad_norm": 10.940594279582514, "learning_rate": 1.6423357664233575e-07, "loss": 1.2482, "step": 45 }, { "epoch": 0.10976948408342481, "grad_norm": 5.741537178926412, "learning_rate": 1.824817518248175e-07, "loss": 1.2249, "step": 50 }, { "epoch": 0.1207464324917673, "grad_norm": 5.018901761568715, "learning_rate": 2.0072992700729928e-07, "loss": 1.1839, "step": 55 }, { "epoch": 0.13172338090010977, "grad_norm": 4.321418760394092, "learning_rate": 2.1897810218978103e-07, "loss": 1.1806, "step": 60 }, { "epoch": 0.14270032930845225, "grad_norm": 3.6868934108802405, "learning_rate": 2.3722627737226276e-07, "loss": 1.1367, "step": 65 }, { "epoch": 0.15367727771679474, "grad_norm": 3.6056758938325633, "learning_rate": 2.5547445255474454e-07, "loss": 1.1194, "step": 70 }, { "epoch": 0.16465422612513722, "grad_norm": 2.9888643007945923, "learning_rate": 2.737226277372263e-07, "loss": 1.1277, "step": 75 }, { "epoch": 0.1756311745334797, "grad_norm": 3.1811524089107737, "learning_rate": 2.91970802919708e-07, "loss": 1.1041, "step": 80 }, { "epoch": 0.18660812294182216, "grad_norm": 2.7324489147282356, "learning_rate": 3.102189781021898e-07, "loss": 1.1085, "step": 85 }, { "epoch": 0.19758507135016465, "grad_norm": 2.612737685100592, "learning_rate": 3.284671532846715e-07, "loss": 1.0872, "step": 90 }, { "epoch": 0.20856201975850713, "grad_norm": 2.7784872016274806, "learning_rate": 3.467153284671533e-07, "loss": 1.1009, "step": 95 }, { "epoch": 0.21953896816684962, "grad_norm": 2.6806669714481695, "learning_rate": 3.64963503649635e-07, "loss": 1.0742, "step": 100 }, { "epoch": 0.2305159165751921, "grad_norm": 2.8589613220847774, "learning_rate": 3.8321167883211675e-07, "loss": 1.0928, "step": 105 }, { "epoch": 0.2414928649835346, "grad_norm": 2.6851891388693887, "learning_rate": 4.0145985401459856e-07, "loss": 1.079, "step": 110 }, { "epoch": 0.2524698133918771, "grad_norm": 2.3561189558185136, "learning_rate": 4.1970802919708026e-07, "loss": 1.0441, "step": 115 }, { "epoch": 0.26344676180021953, "grad_norm": 2.3680695140089796, "learning_rate": 4.3795620437956206e-07, "loss": 1.077, "step": 120 }, { "epoch": 0.27442371020856204, "grad_norm": 2.9917242722942095, "learning_rate": 4.5620437956204376e-07, "loss": 1.0486, "step": 125 }, { "epoch": 0.2854006586169045, "grad_norm": 2.331882690510773, "learning_rate": 4.744525547445255e-07, "loss": 1.0478, "step": 130 }, { "epoch": 0.29637760702524696, "grad_norm": 2.54506147430991, "learning_rate": 4.927007299270073e-07, "loss": 1.0501, "step": 135 }, { "epoch": 0.30735455543358947, "grad_norm": 3.343713651464699, "learning_rate": 4.999926370237027e-07, "loss": 1.0464, "step": 140 }, { "epoch": 0.31833150384193193, "grad_norm": 2.345761304992979, "learning_rate": 4.999476426280587e-07, "loss": 1.039, "step": 145 }, { "epoch": 0.32930845225027444, "grad_norm": 2.5391392458703286, "learning_rate": 4.998617517322294e-07, "loss": 1.0316, "step": 150 }, { "epoch": 0.3402854006586169, "grad_norm": 2.472489561391851, "learning_rate": 4.997349783897061e-07, "loss": 1.0417, "step": 155 }, { "epoch": 0.3512623490669594, "grad_norm": 2.3737303149457016, "learning_rate": 4.99567343343177e-07, "loss": 1.0195, "step": 160 }, { "epoch": 0.36223929747530187, "grad_norm": 2.3800187550942047, "learning_rate": 4.993588740211331e-07, "loss": 1.0474, "step": 165 }, { "epoch": 0.3732162458836443, "grad_norm": 2.330985495803907, "learning_rate": 4.991096045333809e-07, "loss": 1.0262, "step": 170 }, { "epoch": 0.38419319429198684, "grad_norm": 2.4303555873767935, "learning_rate": 4.988195756654605e-07, "loss": 1.0142, "step": 175 }, { "epoch": 0.3951701427003293, "grad_norm": 2.4094455102172585, "learning_rate": 4.984888348719731e-07, "loss": 1.0188, "step": 180 }, { "epoch": 0.4061470911086718, "grad_norm": 2.1476465324301057, "learning_rate": 4.981174362688158e-07, "loss": 1.0142, "step": 185 }, { "epoch": 0.41712403951701427, "grad_norm": 2.2526911826436615, "learning_rate": 4.977054406243274e-07, "loss": 1.0328, "step": 190 }, { "epoch": 0.4281009879253567, "grad_norm": 2.3224686636469443, "learning_rate": 4.972529153493455e-07, "loss": 0.9933, "step": 195 }, { "epoch": 0.43907793633369924, "grad_norm": 2.4302653074531437, "learning_rate": 4.967599344861768e-07, "loss": 0.9954, "step": 200 }, { "epoch": 0.4500548847420417, "grad_norm": 2.4835737176654904, "learning_rate": 4.96226578696482e-07, "loss": 0.9985, "step": 205 }, { "epoch": 0.4610318331503842, "grad_norm": 2.4230184538015354, "learning_rate": 4.956529352480782e-07, "loss": 0.9987, "step": 210 }, { "epoch": 0.47200878155872666, "grad_norm": 2.329153987703765, "learning_rate": 4.950390980006599e-07, "loss": 1.0098, "step": 215 }, { "epoch": 0.4829857299670692, "grad_norm": 2.1643982961769574, "learning_rate": 4.943851673904419e-07, "loss": 1.0133, "step": 220 }, { "epoch": 0.49396267837541163, "grad_norm": 2.3197403816872977, "learning_rate": 4.936912504137257e-07, "loss": 1.0142, "step": 225 }, { "epoch": 0.5049396267837541, "grad_norm": 2.34747990249977, "learning_rate": 4.929574606093926e-07, "loss": 0.9976, "step": 230 }, { "epoch": 0.5159165751920965, "grad_norm": 2.2485105183919107, "learning_rate": 4.92183918040327e-07, "loss": 0.9891, "step": 235 }, { "epoch": 0.5268935236004391, "grad_norm": 2.2790091103112977, "learning_rate": 4.913707492737708e-07, "loss": 1.002, "step": 240 }, { "epoch": 0.5378704720087816, "grad_norm": 2.239563511547285, "learning_rate": 4.905180873606156e-07, "loss": 0.9884, "step": 245 }, { "epoch": 0.5488474204171241, "grad_norm": 2.201197024971912, "learning_rate": 4.896260718136315e-07, "loss": 1.0053, "step": 250 }, { "epoch": 0.5598243688254665, "grad_norm": 2.4477006618500976, "learning_rate": 4.886948485846413e-07, "loss": 0.9905, "step": 255 }, { "epoch": 0.570801317233809, "grad_norm": 2.1778270375289357, "learning_rate": 4.877245700406389e-07, "loss": 1.006, "step": 260 }, { "epoch": 0.5817782656421515, "grad_norm": 2.470318023588166, "learning_rate": 4.867153949388592e-07, "loss": 1.0001, "step": 265 }, { "epoch": 0.5927552140504939, "grad_norm": 2.246482390471278, "learning_rate": 4.856674884008027e-07, "loss": 1.0016, "step": 270 }, { "epoch": 0.6037321624588364, "grad_norm": 2.555761228606553, "learning_rate": 4.845810218852175e-07, "loss": 0.9909, "step": 275 }, { "epoch": 0.6147091108671789, "grad_norm": 2.6187160779641405, "learning_rate": 4.834561731600457e-07, "loss": 0.9879, "step": 280 }, { "epoch": 0.6256860592755215, "grad_norm": 2.2540909179670288, "learning_rate": 4.822931262733367e-07, "loss": 0.9723, "step": 285 }, { "epoch": 0.6366630076838639, "grad_norm": 2.3392602017439468, "learning_rate": 4.810920715231334e-07, "loss": 0.9813, "step": 290 }, { "epoch": 0.6476399560922064, "grad_norm": 2.5885537238335607, "learning_rate": 4.798532054263356e-07, "loss": 0.9672, "step": 295 }, { "epoch": 0.6586169045005489, "grad_norm": 2.3642433721473712, "learning_rate": 4.785767306865457e-07, "loss": 0.9774, "step": 300 }, { "epoch": 0.6695938529088913, "grad_norm": 2.408185221870625, "learning_rate": 4.772628561609021e-07, "loss": 0.985, "step": 305 }, { "epoch": 0.6805708013172338, "grad_norm": 2.2324134890318246, "learning_rate": 4.7591179682590644e-07, "loss": 0.9725, "step": 310 }, { "epoch": 0.6915477497255763, "grad_norm": 2.322340624031069, "learning_rate": 4.7452377374224874e-07, "loss": 0.9806, "step": 315 }, { "epoch": 0.7025246981339188, "grad_norm": 2.211381205162829, "learning_rate": 4.730990140186373e-07, "loss": 0.999, "step": 320 }, { "epoch": 0.7135016465422612, "grad_norm": 2.2193259266838155, "learning_rate": 4.7163775077463963e-07, "loss": 0.9728, "step": 325 }, { "epoch": 0.7244785949506037, "grad_norm": 2.307395076998798, "learning_rate": 4.7014022310253875e-07, "loss": 0.9924, "step": 330 }, { "epoch": 0.7354555433589463, "grad_norm": 2.242602418548775, "learning_rate": 4.6860667602821316e-07, "loss": 0.9861, "step": 335 }, { "epoch": 0.7464324917672887, "grad_norm": 2.2686164540827547, "learning_rate": 4.670373604710456e-07, "loss": 0.9803, "step": 340 }, { "epoch": 0.7574094401756312, "grad_norm": 2.1375492767276048, "learning_rate": 4.6543253320286755e-07, "loss": 0.9617, "step": 345 }, { "epoch": 0.7683863885839737, "grad_norm": 2.2558323943911684, "learning_rate": 4.6379245680594625e-07, "loss": 0.9531, "step": 350 }, { "epoch": 0.7793633369923162, "grad_norm": 2.39500346963894, "learning_rate": 4.621173996300207e-07, "loss": 0.9897, "step": 355 }, { "epoch": 0.7903402854006586, "grad_norm": 2.636774268293073, "learning_rate": 4.6040763574839435e-07, "loss": 0.9899, "step": 360 }, { "epoch": 0.8013172338090011, "grad_norm": 2.3125464385328236, "learning_rate": 4.5866344491309104e-07, "loss": 0.9844, "step": 365 }, { "epoch": 0.8122941822173436, "grad_norm": 2.158764874076257, "learning_rate": 4.568851125090822e-07, "loss": 0.9789, "step": 370 }, { "epoch": 0.823271130625686, "grad_norm": 2.146303203312366, "learning_rate": 4.550729295075918e-07, "loss": 0.9894, "step": 375 }, { "epoch": 0.8342480790340285, "grad_norm": 2.1234461105972686, "learning_rate": 4.532271924184874e-07, "loss": 0.9573, "step": 380 }, { "epoch": 0.845225027442371, "grad_norm": 2.5991666840318524, "learning_rate": 4.513482032417655e-07, "loss": 0.978, "step": 385 }, { "epoch": 0.8562019758507134, "grad_norm": 2.459654668779688, "learning_rate": 4.494362694181384e-07, "loss": 0.9774, "step": 390 }, { "epoch": 0.867178924259056, "grad_norm": 2.247292046481084, "learning_rate": 4.474917037787297e-07, "loss": 0.9753, "step": 395 }, { "epoch": 0.8781558726673985, "grad_norm": 2.2211212349255574, "learning_rate": 4.455148244938901e-07, "loss": 0.9789, "step": 400 }, { "epoch": 0.889132821075741, "grad_norm": 2.3018670472595644, "learning_rate": 4.435059550211371e-07, "loss": 0.9748, "step": 405 }, { "epoch": 0.9001097694840834, "grad_norm": 2.1721337174418784, "learning_rate": 4.414654240522315e-07, "loss": 0.9741, "step": 410 }, { "epoch": 0.9110867178924259, "grad_norm": 2.28819746756527, "learning_rate": 4.3939356545939677e-07, "loss": 0.9756, "step": 415 }, { "epoch": 0.9220636663007684, "grad_norm": 2.4856430019366806, "learning_rate": 4.372907182406902e-07, "loss": 0.9599, "step": 420 }, { "epoch": 0.9330406147091108, "grad_norm": 2.279118517164672, "learning_rate": 4.3515722646453657e-07, "loss": 0.9766, "step": 425 }, { "epoch": 0.9440175631174533, "grad_norm": 2.283587407407711, "learning_rate": 4.329934392134314e-07, "loss": 0.9708, "step": 430 }, { "epoch": 0.9549945115257958, "grad_norm": 2.3436121330856676, "learning_rate": 4.3079971052682387e-07, "loss": 0.9752, "step": 435 }, { "epoch": 0.9659714599341384, "grad_norm": 2.33617606208673, "learning_rate": 4.2857639934318874e-07, "loss": 0.9719, "step": 440 }, { "epoch": 0.9769484083424808, "grad_norm": 2.2203473882036966, "learning_rate": 4.26323869441297e-07, "loss": 0.9613, "step": 445 }, { "epoch": 0.9879253567508233, "grad_norm": 2.5027842654915005, "learning_rate": 4.240424893806941e-07, "loss": 0.9864, "step": 450 }, { "epoch": 0.9989023051591658, "grad_norm": 2.333812012249256, "learning_rate": 4.217326324413962e-07, "loss": 0.9643, "step": 455 }, { "epoch": 0.9989023051591658, "eval_loss": 1.019766926765442, "eval_runtime": 4.8917, "eval_samples_per_second": 54.786, "eval_steps_per_second": 1.022, "step": 455 }, { "epoch": 1.0098792535675083, "grad_norm": 2.6384758953681495, "learning_rate": 4.1939467656281387e-07, "loss": 0.9411, "step": 460 }, { "epoch": 1.0208562019758507, "grad_norm": 2.4368918808468263, "learning_rate": 4.170290042819137e-07, "loss": 0.9304, "step": 465 }, { "epoch": 1.031833150384193, "grad_norm": 2.606275797390668, "learning_rate": 4.146360026706276e-07, "loss": 0.9401, "step": 470 }, { "epoch": 1.0428100987925357, "grad_norm": 2.3703029555878175, "learning_rate": 4.122160632725195e-07, "loss": 0.9411, "step": 475 }, { "epoch": 1.0537870472008781, "grad_norm": 2.3433873192058785, "learning_rate": 4.097695820387216e-07, "loss": 0.9314, "step": 480 }, { "epoch": 1.0647639956092205, "grad_norm": 2.3059951796342757, "learning_rate": 4.0729695926314813e-07, "loss": 0.9212, "step": 485 }, { "epoch": 1.0757409440175631, "grad_norm": 2.246905867491328, "learning_rate": 4.0479859951699934e-07, "loss": 0.9299, "step": 490 }, { "epoch": 1.0867178924259056, "grad_norm": 2.332204582181345, "learning_rate": 4.0227491158256564e-07, "loss": 0.9283, "step": 495 }, { "epoch": 1.0976948408342482, "grad_norm": 2.3052192318083096, "learning_rate": 3.9972630838634236e-07, "loss": 0.9081, "step": 500 }, { "epoch": 1.1086717892425906, "grad_norm": 2.2938816727121174, "learning_rate": 3.9715320693146653e-07, "loss": 0.9244, "step": 505 }, { "epoch": 1.119648737650933, "grad_norm": 2.3376274658971794, "learning_rate": 3.9455602822948695e-07, "loss": 0.9356, "step": 510 }, { "epoch": 1.1306256860592756, "grad_norm": 2.2234966705346437, "learning_rate": 3.9193519723147795e-07, "loss": 0.937, "step": 515 }, { "epoch": 1.141602634467618, "grad_norm": 2.3167888423276897, "learning_rate": 3.892911427585089e-07, "loss": 0.9253, "step": 520 }, { "epoch": 1.1525795828759604, "grad_norm": 2.2822984548495, "learning_rate": 3.8662429743148046e-07, "loss": 0.93, "step": 525 }, { "epoch": 1.163556531284303, "grad_norm": 2.3315853272899885, "learning_rate": 3.839350976003386e-07, "loss": 0.9307, "step": 530 }, { "epoch": 1.1745334796926454, "grad_norm": 2.225438330230274, "learning_rate": 3.8122398327267956e-07, "loss": 0.9191, "step": 535 }, { "epoch": 1.1855104281009878, "grad_norm": 2.282678706637911, "learning_rate": 3.7849139804175527e-07, "loss": 0.9242, "step": 540 }, { "epoch": 1.1964873765093305, "grad_norm": 2.4151431361330262, "learning_rate": 3.7573778901389264e-07, "loss": 0.9129, "step": 545 }, { "epoch": 1.2074643249176729, "grad_norm": 2.1432543425535986, "learning_rate": 3.729636067353377e-07, "loss": 0.9352, "step": 550 }, { "epoch": 1.2184412733260155, "grad_norm": 2.2103634060126045, "learning_rate": 3.701693051185375e-07, "loss": 0.9217, "step": 555 }, { "epoch": 1.2294182217343579, "grad_norm": 2.29979398895129, "learning_rate": 3.6735534136787034e-07, "loss": 0.9241, "step": 560 }, { "epoch": 1.2403951701427003, "grad_norm": 2.211918385111886, "learning_rate": 3.6452217590483843e-07, "loss": 0.9316, "step": 565 }, { "epoch": 1.2513721185510427, "grad_norm": 2.188755024370649, "learning_rate": 3.6167027229273296e-07, "loss": 0.9089, "step": 570 }, { "epoch": 1.2623490669593853, "grad_norm": 2.4206253706110576, "learning_rate": 3.5880009716078643e-07, "loss": 0.9285, "step": 575 }, { "epoch": 1.2733260153677277, "grad_norm": 2.2945442413712684, "learning_rate": 3.559121201278219e-07, "loss": 0.918, "step": 580 }, { "epoch": 1.2843029637760703, "grad_norm": 2.345758700673061, "learning_rate": 3.5300681372541473e-07, "loss": 0.9305, "step": 585 }, { "epoch": 1.2952799121844127, "grad_norm": 2.2789322285102314, "learning_rate": 3.5008465332057594e-07, "loss": 0.9281, "step": 590 }, { "epoch": 1.3062568605927551, "grad_norm": 2.243522596217181, "learning_rate": 3.471461170379732e-07, "loss": 0.914, "step": 595 }, { "epoch": 1.3172338090010978, "grad_norm": 2.323467820978964, "learning_rate": 3.4419168568169935e-07, "loss": 0.9361, "step": 600 }, { "epoch": 1.3282107574094402, "grad_norm": 2.194684758387474, "learning_rate": 3.4122184265660395e-07, "loss": 0.9311, "step": 605 }, { "epoch": 1.3391877058177828, "grad_norm": 2.2140499236992603, "learning_rate": 3.3823707388919736e-07, "loss": 0.9125, "step": 610 }, { "epoch": 1.3501646542261252, "grad_norm": 2.258349553066923, "learning_rate": 3.352378677481444e-07, "loss": 0.9426, "step": 615 }, { "epoch": 1.3611416026344676, "grad_norm": 2.358073970259772, "learning_rate": 3.3222471496435643e-07, "loss": 0.9188, "step": 620 }, { "epoch": 1.37211855104281, "grad_norm": 2.293914944788209, "learning_rate": 3.291981085506986e-07, "loss": 0.9046, "step": 625 }, { "epoch": 1.3830954994511526, "grad_norm": 2.239065960199277, "learning_rate": 3.2615854372132226e-07, "loss": 0.924, "step": 630 }, { "epoch": 1.394072447859495, "grad_norm": 2.236539103492653, "learning_rate": 3.231065178106384e-07, "loss": 0.9229, "step": 635 }, { "epoch": 1.4050493962678376, "grad_norm": 2.1945349736134707, "learning_rate": 3.2004253019194324e-07, "loss": 0.9272, "step": 640 }, { "epoch": 1.41602634467618, "grad_norm": 2.2398945956325513, "learning_rate": 3.169670821957112e-07, "loss": 0.9102, "step": 645 }, { "epoch": 1.4270032930845225, "grad_norm": 2.289801618738276, "learning_rate": 3.1388067702756655e-07, "loss": 0.9429, "step": 650 }, { "epoch": 1.4379802414928649, "grad_norm": 2.245541374176715, "learning_rate": 3.1078381968594895e-07, "loss": 0.9115, "step": 655 }, { "epoch": 1.4489571899012075, "grad_norm": 2.278894612214528, "learning_rate": 3.076770168794854e-07, "loss": 0.9398, "step": 660 }, { "epoch": 1.4599341383095499, "grad_norm": 2.3065149845422392, "learning_rate": 3.045607769440829e-07, "loss": 0.8946, "step": 665 }, { "epoch": 1.4709110867178925, "grad_norm": 2.3594569504860696, "learning_rate": 3.014356097597535e-07, "loss": 0.9137, "step": 670 }, { "epoch": 1.481888035126235, "grad_norm": 2.4349120794769354, "learning_rate": 2.983020266671886e-07, "loss": 0.9113, "step": 675 }, { "epoch": 1.4928649835345773, "grad_norm": 2.2209808300678318, "learning_rate": 2.951605403840921e-07, "loss": 0.8983, "step": 680 }, { "epoch": 1.5038419319429197, "grad_norm": 2.3286466229872635, "learning_rate": 2.920116649212909e-07, "loss": 0.9312, "step": 685 }, { "epoch": 1.5148188803512623, "grad_norm": 2.238157735155838, "learning_rate": 2.888559154986307e-07, "loss": 0.921, "step": 690 }, { "epoch": 1.525795828759605, "grad_norm": 2.2522331987696944, "learning_rate": 2.856938084606769e-07, "loss": 0.9027, "step": 695 }, { "epoch": 1.5367727771679474, "grad_norm": 2.225248865709702, "learning_rate": 2.825258611922292e-07, "loss": 0.9186, "step": 700 }, { "epoch": 1.5477497255762898, "grad_norm": 2.2372307174396036, "learning_rate": 2.793525920336677e-07, "loss": 0.9179, "step": 705 }, { "epoch": 1.5587266739846322, "grad_norm": 2.227278818178374, "learning_rate": 2.7617452019614144e-07, "loss": 0.9068, "step": 710 }, { "epoch": 1.5697036223929748, "grad_norm": 2.1887205060374133, "learning_rate": 2.7299216567661503e-07, "loss": 0.9128, "step": 715 }, { "epoch": 1.5806805708013172, "grad_norm": 2.1730778041819487, "learning_rate": 2.6980604917278675e-07, "loss": 0.9072, "step": 720 }, { "epoch": 1.5916575192096598, "grad_norm": 2.306897540604203, "learning_rate": 2.6661669199789174e-07, "loss": 0.9241, "step": 725 }, { "epoch": 1.6026344676180022, "grad_norm": 2.2513680878615956, "learning_rate": 2.6342461599540456e-07, "loss": 0.9183, "step": 730 }, { "epoch": 1.6136114160263446, "grad_norm": 2.2683353014176504, "learning_rate": 2.6023034345365515e-07, "loss": 0.9101, "step": 735 }, { "epoch": 1.624588364434687, "grad_norm": 2.3773000711715793, "learning_rate": 2.5703439702037155e-07, "loss": 0.9226, "step": 740 }, { "epoch": 1.6355653128430296, "grad_norm": 2.2707149058977047, "learning_rate": 2.538372996171648e-07, "loss": 0.9099, "step": 745 }, { "epoch": 1.6465422612513723, "grad_norm": 2.3321984673401808, "learning_rate": 2.506395743539677e-07, "loss": 0.9053, "step": 750 }, { "epoch": 1.6575192096597147, "grad_norm": 2.289265422242154, "learning_rate": 2.474417444434436e-07, "loss": 0.9167, "step": 755 }, { "epoch": 1.668496158068057, "grad_norm": 2.29419980236518, "learning_rate": 2.442443331153783e-07, "loss": 0.9255, "step": 760 }, { "epoch": 1.6794731064763995, "grad_norm": 2.2919378273617785, "learning_rate": 2.4104786353106926e-07, "loss": 0.9039, "step": 765 }, { "epoch": 1.690450054884742, "grad_norm": 2.2503081710593347, "learning_rate": 2.3785285869772525e-07, "loss": 0.908, "step": 770 }, { "epoch": 1.7014270032930845, "grad_norm": 2.273983108472163, "learning_rate": 2.3465984138289237e-07, "loss": 0.9178, "step": 775 }, { "epoch": 1.7124039517014271, "grad_norm": 2.2199735303427333, "learning_rate": 2.3146933402891812e-07, "loss": 0.9174, "step": 780 }, { "epoch": 1.7233809001097695, "grad_norm": 2.246832371307188, "learning_rate": 2.2828185866746987e-07, "loss": 0.9063, "step": 785 }, { "epoch": 1.734357848518112, "grad_norm": 2.262285299215957, "learning_rate": 2.2509793683411932e-07, "loss": 0.9282, "step": 790 }, { "epoch": 1.7453347969264543, "grad_norm": 2.179628826722484, "learning_rate": 2.2191808948300904e-07, "loss": 0.8962, "step": 795 }, { "epoch": 1.756311745334797, "grad_norm": 2.363447036072627, "learning_rate": 2.1874283690161364e-07, "loss": 0.9288, "step": 800 }, { "epoch": 1.7672886937431396, "grad_norm": 2.2413506946996025, "learning_rate": 2.1557269862561042e-07, "loss": 0.9105, "step": 805 }, { "epoch": 1.778265642151482, "grad_norm": 2.213099523042175, "learning_rate": 2.124081933538726e-07, "loss": 0.9152, "step": 810 }, { "epoch": 1.7892425905598244, "grad_norm": 2.3911262169971232, "learning_rate": 2.0924983886359966e-07, "loss": 0.9114, "step": 815 }, { "epoch": 1.8002195389681668, "grad_norm": 2.3443041782008, "learning_rate": 2.0609815192559863e-07, "loss": 0.906, "step": 820 }, { "epoch": 1.8111964873765092, "grad_norm": 2.2448633071392736, "learning_rate": 2.0295364821972993e-07, "loss": 0.9229, "step": 825 }, { "epoch": 1.8221734357848518, "grad_norm": 2.335369643205432, "learning_rate": 1.998168422505315e-07, "loss": 0.9055, "step": 830 }, { "epoch": 1.8331503841931944, "grad_norm": 2.260199044110363, "learning_rate": 1.966882472630356e-07, "loss": 0.916, "step": 835 }, { "epoch": 1.8441273326015368, "grad_norm": 2.1940831038305624, "learning_rate": 1.9356837515879137e-07, "loss": 0.9128, "step": 840 }, { "epoch": 1.8551042810098792, "grad_norm": 2.2395348963129176, "learning_rate": 1.904577364121077e-07, "loss": 0.9137, "step": 845 }, { "epoch": 1.8660812294182216, "grad_norm": 2.2788272374999288, "learning_rate": 1.873568399865288e-07, "loss": 0.8989, "step": 850 }, { "epoch": 1.8770581778265643, "grad_norm": 2.364940762438768, "learning_rate": 1.8426619325155772e-07, "loss": 0.9136, "step": 855 }, { "epoch": 1.8880351262349067, "grad_norm": 2.248184524351583, "learning_rate": 1.8118630189964055e-07, "loss": 0.9234, "step": 860 }, { "epoch": 1.8990120746432493, "grad_norm": 2.265297755363968, "learning_rate": 1.7811766986342457e-07, "loss": 0.9003, "step": 865 }, { "epoch": 1.9099890230515917, "grad_norm": 2.3424580199936447, "learning_rate": 1.7506079923330492e-07, "loss": 0.918, "step": 870 }, { "epoch": 1.920965971459934, "grad_norm": 2.262902517301985, "learning_rate": 1.7201619017527224e-07, "loss": 0.9305, "step": 875 }, { "epoch": 1.9319429198682765, "grad_norm": 2.2767358868319385, "learning_rate": 1.6898434084907548e-07, "loss": 0.9004, "step": 880 }, { "epoch": 1.942919868276619, "grad_norm": 2.233676995391588, "learning_rate": 1.6596574732671286e-07, "loss": 0.9107, "step": 885 }, { "epoch": 1.9538968166849617, "grad_norm": 2.2349418347267984, "learning_rate": 1.6296090351126445e-07, "loss": 0.9012, "step": 890 }, { "epoch": 1.9648737650933041, "grad_norm": 2.2644252517319043, "learning_rate": 1.5997030105607967e-07, "loss": 0.9055, "step": 895 }, { "epoch": 1.9758507135016465, "grad_norm": 2.3639914159405824, "learning_rate": 1.5699442928433255e-07, "loss": 0.8966, "step": 900 }, { "epoch": 1.986827661909989, "grad_norm": 2.2554267100734346, "learning_rate": 1.5403377510895898e-07, "loss": 0.9192, "step": 905 }, { "epoch": 1.9978046103183313, "grad_norm": 2.202331515306409, "learning_rate": 1.5108882295298748e-07, "loss": 0.9077, "step": 910 }, { "epoch": 2.0, "eval_loss": 0.9618459939956665, "eval_runtime": 3.545, "eval_samples_per_second": 75.599, "eval_steps_per_second": 1.41, "step": 911 }, { "epoch": 2.008781558726674, "grad_norm": 2.3776213420255745, "learning_rate": 1.4816005467027793e-07, "loss": 0.8907, "step": 915 }, { "epoch": 2.0197585071350166, "grad_norm": 2.2333706796571384, "learning_rate": 1.452479494666809e-07, "loss": 0.8927, "step": 920 }, { "epoch": 2.030735455543359, "grad_norm": 2.2112426814454964, "learning_rate": 1.4235298382162897e-07, "loss": 0.8852, "step": 925 }, { "epoch": 2.0417124039517014, "grad_norm": 2.236063023873273, "learning_rate": 1.3947563141017593e-07, "loss": 0.8955, "step": 930 }, { "epoch": 2.052689352360044, "grad_norm": 2.329499119971109, "learning_rate": 1.3661636302549355e-07, "loss": 0.8726, "step": 935 }, { "epoch": 2.063666300768386, "grad_norm": 2.226335466772911, "learning_rate": 1.3377564650184008e-07, "loss": 0.8844, "step": 940 }, { "epoch": 2.074643249176729, "grad_norm": 2.359491075414422, "learning_rate": 1.3095394663801344e-07, "loss": 0.8888, "step": 945 }, { "epoch": 2.0856201975850714, "grad_norm": 2.2422498605446894, "learning_rate": 1.2815172512130079e-07, "loss": 0.8946, "step": 950 }, { "epoch": 2.096597145993414, "grad_norm": 2.34035910547768, "learning_rate": 1.2536944045193644e-07, "loss": 0.8925, "step": 955 }, { "epoch": 2.1075740944017562, "grad_norm": 2.3092350865810105, "learning_rate": 1.2260754786808286e-07, "loss": 0.8769, "step": 960 }, { "epoch": 2.1185510428100987, "grad_norm": 2.285345952327998, "learning_rate": 1.198664992713437e-07, "loss": 0.8637, "step": 965 }, { "epoch": 2.129527991218441, "grad_norm": 2.31950118158217, "learning_rate": 1.1714674315282406e-07, "loss": 0.8851, "step": 970 }, { "epoch": 2.140504939626784, "grad_norm": 2.260100588550131, "learning_rate": 1.144487245197481e-07, "loss": 0.877, "step": 975 }, { "epoch": 2.1514818880351263, "grad_norm": 2.28406902577705, "learning_rate": 1.1177288482264652e-07, "loss": 0.8731, "step": 980 }, { "epoch": 2.1624588364434687, "grad_norm": 2.349769500619024, "learning_rate": 1.091196618831268e-07, "loss": 0.8741, "step": 985 }, { "epoch": 2.173435784851811, "grad_norm": 2.319775552144385, "learning_rate": 1.0648948982223657e-07, "loss": 0.9009, "step": 990 }, { "epoch": 2.1844127332601535, "grad_norm": 2.854032904166289, "learning_rate": 1.0388279898943222e-07, "loss": 0.8867, "step": 995 }, { "epoch": 2.1953896816684964, "grad_norm": 2.3639396187236508, "learning_rate": 1.0130001589216566e-07, "loss": 0.8848, "step": 1000 }, { "epoch": 2.2063666300768388, "grad_norm": 2.3793062354194143, "learning_rate": 9.874156312609835e-08, "loss": 0.8811, "step": 1005 }, { "epoch": 2.217343578485181, "grad_norm": 2.287306409448599, "learning_rate": 9.620785930595679e-08, "loss": 0.8742, "step": 1010 }, { "epoch": 2.2283205268935236, "grad_norm": 2.308427796984264, "learning_rate": 9.369931899703821e-08, "loss": 0.8879, "step": 1015 }, { "epoch": 2.239297475301866, "grad_norm": 2.266095319232965, "learning_rate": 9.121635264737923e-08, "loss": 0.8976, "step": 1020 }, { "epoch": 2.2502744237102084, "grad_norm": 2.290278874944272, "learning_rate": 8.87593665205987e-08, "loss": 0.8957, "step": 1025 }, { "epoch": 2.261251372118551, "grad_norm": 2.351021776661905, "learning_rate": 8.632876262942415e-08, "loss": 0.8833, "step": 1030 }, { "epoch": 2.2722283205268936, "grad_norm": 2.290071235509179, "learning_rate": 8.392493866991487e-08, "loss": 0.8728, "step": 1035 }, { "epoch": 2.283205268935236, "grad_norm": 2.2366625219176037, "learning_rate": 8.154828795639057e-08, "loss": 0.8925, "step": 1040 }, { "epoch": 2.2941822173435784, "grad_norm": 2.2935122329416253, "learning_rate": 7.9199199357077e-08, "loss": 0.8954, "step": 1045 }, { "epoch": 2.305159165751921, "grad_norm": 2.2174582268909346, "learning_rate": 7.687805723047952e-08, "loss": 0.8755, "step": 1050 }, { "epoch": 2.3161361141602637, "grad_norm": 2.320147905008012, "learning_rate": 7.45852413624943e-08, "loss": 0.8672, "step": 1055 }, { "epoch": 2.327113062568606, "grad_norm": 2.311040151424324, "learning_rate": 7.232112690426712e-08, "loss": 0.8732, "step": 1060 }, { "epoch": 2.3380900109769485, "grad_norm": 2.286645866168397, "learning_rate": 7.008608431081179e-08, "loss": 0.8862, "step": 1065 }, { "epoch": 2.349066959385291, "grad_norm": 2.243167376226312, "learning_rate": 6.78804792803955e-08, "loss": 0.8708, "step": 1070 }, { "epoch": 2.3600439077936333, "grad_norm": 2.315245531135358, "learning_rate": 6.570467269470375e-08, "loss": 0.8822, "step": 1075 }, { "epoch": 2.3710208562019757, "grad_norm": 2.268683761726241, "learning_rate": 6.355902055979253e-08, "loss": 0.879, "step": 1080 }, { "epoch": 2.3819978046103185, "grad_norm": 2.312610053338687, "learning_rate": 6.144387394783829e-08, "loss": 0.8647, "step": 1085 }, { "epoch": 2.392974753018661, "grad_norm": 2.2428629980309616, "learning_rate": 5.935957893969587e-08, "loss": 0.8762, "step": 1090 }, { "epoch": 2.4039517014270033, "grad_norm": 2.35765208556055, "learning_rate": 5.730647656827242e-08, "loss": 0.8841, "step": 1095 }, { "epoch": 2.4149286498353457, "grad_norm": 2.2494376360363164, "learning_rate": 5.528490276272732e-08, "loss": 0.8711, "step": 1100 }, { "epoch": 2.425905598243688, "grad_norm": 2.2596581762379793, "learning_rate": 5.329518829350788e-08, "loss": 0.8724, "step": 1105 }, { "epoch": 2.436882546652031, "grad_norm": 2.3221019724494982, "learning_rate": 5.1337658718227926e-08, "loss": 0.8716, "step": 1110 }, { "epoch": 2.4478594950603734, "grad_norm": 2.2548297711291467, "learning_rate": 4.941263432840062e-08, "loss": 0.8703, "step": 1115 }, { "epoch": 2.4588364434687158, "grad_norm": 2.2559502603970776, "learning_rate": 4.7520430097031855e-08, "loss": 0.891, "step": 1120 }, { "epoch": 2.469813391877058, "grad_norm": 2.348427252684778, "learning_rate": 4.566135562708437e-08, "loss": 0.8697, "step": 1125 }, { "epoch": 2.4807903402854006, "grad_norm": 2.216073741504324, "learning_rate": 4.383571510082051e-08, "loss": 0.8682, "step": 1130 }, { "epoch": 2.491767288693743, "grad_norm": 2.295163968424724, "learning_rate": 4.20438072300319e-08, "loss": 0.8837, "step": 1135 }, { "epoch": 2.5027442371020854, "grad_norm": 2.2516483968280343, "learning_rate": 4.028592520716387e-08, "loss": 0.8638, "step": 1140 }, { "epoch": 2.5137211855104282, "grad_norm": 2.269953195416105, "learning_rate": 3.8562356657343584e-08, "loss": 0.8788, "step": 1145 }, { "epoch": 2.5246981339187706, "grad_norm": 2.3024875486631133, "learning_rate": 3.6873383591318394e-08, "loss": 0.8714, "step": 1150 }, { "epoch": 2.535675082327113, "grad_norm": 2.2241713120206037, "learning_rate": 3.521928235931346e-08, "loss": 0.887, "step": 1155 }, { "epoch": 2.5466520307354554, "grad_norm": 2.2936145333162994, "learning_rate": 3.3600323605815107e-08, "loss": 0.87, "step": 1160 }, { "epoch": 2.5576289791437983, "grad_norm": 2.32545502497619, "learning_rate": 3.201677222528784e-08, "loss": 0.8686, "step": 1165 }, { "epoch": 2.5686059275521407, "grad_norm": 2.2489169681563195, "learning_rate": 3.0468887318832406e-08, "loss": 0.8746, "step": 1170 }, { "epoch": 2.579582875960483, "grad_norm": 2.2858684159165734, "learning_rate": 2.8956922151791547e-08, "loss": 0.8894, "step": 1175 }, { "epoch": 2.5905598243688255, "grad_norm": 2.2940920483922382, "learning_rate": 2.748112411231046e-08, "loss": 0.8682, "step": 1180 }, { "epoch": 2.601536772777168, "grad_norm": 2.2496855474263366, "learning_rate": 2.6041734670859488e-08, "loss": 0.8694, "step": 1185 }, { "epoch": 2.6125137211855103, "grad_norm": 2.3184342673634055, "learning_rate": 2.463898934072417e-08, "loss": 0.8742, "step": 1190 }, { "epoch": 2.6234906695938527, "grad_norm": 2.2891655132723576, "learning_rate": 2.3273117639470958e-08, "loss": 0.8692, "step": 1195 }, { "epoch": 2.6344676180021955, "grad_norm": 2.186621510859105, "learning_rate": 2.1944343051393173e-08, "loss": 0.8871, "step": 1200 }, { "epoch": 2.645444566410538, "grad_norm": 2.1909337313948467, "learning_rate": 2.0652882990944532e-08, "loss": 0.882, "step": 1205 }, { "epoch": 2.6564215148188803, "grad_norm": 2.3021129840080827, "learning_rate": 1.9398948767165774e-08, "loss": 0.8798, "step": 1210 }, { "epoch": 2.6673984632272227, "grad_norm": 2.233026500612387, "learning_rate": 1.818274554911034e-08, "loss": 0.87, "step": 1215 }, { "epoch": 2.6783754116355656, "grad_norm": 2.3034061687210925, "learning_rate": 1.7004472332274117e-08, "loss": 0.8878, "step": 1220 }, { "epoch": 2.689352360043908, "grad_norm": 2.2448202497203957, "learning_rate": 1.586432190603626e-08, "loss": 0.8924, "step": 1225 }, { "epoch": 2.7003293084522504, "grad_norm": 2.297719442799476, "learning_rate": 1.4762480822114731e-08, "loss": 0.8702, "step": 1230 }, { "epoch": 2.711306256860593, "grad_norm": 2.2378202672720273, "learning_rate": 1.3699129364042522e-08, "loss": 0.8907, "step": 1235 }, { "epoch": 2.722283205268935, "grad_norm": 2.2880262820978703, "learning_rate": 1.267444151766986e-08, "loss": 0.9005, "step": 1240 }, { "epoch": 2.7332601536772776, "grad_norm": 2.338012677352183, "learning_rate": 1.1688584942696366e-08, "loss": 0.8904, "step": 1245 }, { "epoch": 2.74423710208562, "grad_norm": 2.289922529284634, "learning_rate": 1.0741720945238731e-08, "loss": 0.8986, "step": 1250 }, { "epoch": 2.755214050493963, "grad_norm": 2.2470416957804273, "learning_rate": 9.834004451437699e-09, "loss": 0.8728, "step": 1255 }, { "epoch": 2.7661909989023052, "grad_norm": 2.2478830819509303, "learning_rate": 8.965583982108865e-09, "loss": 0.8754, "step": 1260 }, { "epoch": 2.7771679473106476, "grad_norm": 2.244470092578961, "learning_rate": 8.136601628441875e-09, "loss": 0.8828, "step": 1265 }, { "epoch": 2.78814489571899, "grad_norm": 2.2299781074531326, "learning_rate": 7.347193028751364e-09, "loss": 0.8755, "step": 1270 }, { "epoch": 2.7991218441273324, "grad_norm": 2.379183267890298, "learning_rate": 6.597487346283626e-09, "loss": 0.8697, "step": 1275 }, { "epoch": 2.8100987925356753, "grad_norm": 2.3021772414641837, "learning_rate": 5.8876072480831264e-09, "loss": 0.8849, "step": 1280 }, { "epoch": 2.8210757409440177, "grad_norm": 2.2739127597741144, "learning_rate": 5.217668884921505e-09, "loss": 0.8889, "step": 1285 }, { "epoch": 2.83205268935236, "grad_norm": 2.239037460712834, "learning_rate": 4.587781872293056e-09, "loss": 0.8636, "step": 1290 }, { "epoch": 2.8430296377607025, "grad_norm": 2.3065882078621436, "learning_rate": 3.998049272479431e-09, "loss": 0.8643, "step": 1295 }, { "epoch": 2.854006586169045, "grad_norm": 2.2987652140661265, "learning_rate": 3.4485675776863843e-09, "loss": 0.882, "step": 1300 }, { "epoch": 2.8649835345773873, "grad_norm": 2.2776905750235956, "learning_rate": 2.9394266942558976e-09, "loss": 0.8883, "step": 1305 }, { "epoch": 2.8759604829857297, "grad_norm": 2.2719810844272406, "learning_rate": 2.4707099279556164e-09, "loss": 0.9002, "step": 1310 }, { "epoch": 2.8869374313940726, "grad_norm": 2.393629212210359, "learning_rate": 2.04249397034828e-09, "loss": 0.8737, "step": 1315 }, { "epoch": 2.897914379802415, "grad_norm": 2.318705736365544, "learning_rate": 1.6548488862435095e-09, "loss": 0.8641, "step": 1320 }, { "epoch": 2.9088913282107574, "grad_norm": 2.1750311426072604, "learning_rate": 1.3078381022336715e-09, "loss": 0.8636, "step": 1325 }, { "epoch": 2.9198682766190998, "grad_norm": 2.2319410876071166, "learning_rate": 1.0015183963161811e-09, "loss": 0.8631, "step": 1330 }, { "epoch": 2.9308452250274426, "grad_norm": 2.2205987624431143, "learning_rate": 7.359398886032653e-10, "loss": 0.8888, "step": 1335 }, { "epoch": 2.941822173435785, "grad_norm": 2.255451203517086, "learning_rate": 5.111460331214124e-10, "loss": 0.8845, "step": 1340 }, { "epoch": 2.9527991218441274, "grad_norm": 2.360197105909993, "learning_rate": 3.271736107015033e-10, "loss": 0.8999, "step": 1345 }, { "epoch": 2.96377607025247, "grad_norm": 2.2839001061008553, "learning_rate": 1.8405272296045937e-10, "loss": 0.8876, "step": 1350 }, { "epoch": 2.974753018660812, "grad_norm": 2.240462357412488, "learning_rate": 8.180678737629287e-11, "loss": 0.8671, "step": 1355 }, { "epoch": 2.9857299670691546, "grad_norm": 2.282162777030651, "learning_rate": 2.0452533456311037e-11, "loss": 0.875, "step": 1360 }, { "epoch": 2.996706915477497, "grad_norm": 2.320056946149888, "learning_rate": 0.0, "loss": 0.8919, "step": 1365 }, { "epoch": 2.996706915477497, "eval_loss": 0.9529117345809937, "eval_runtime": 3.471, "eval_samples_per_second": 77.211, "eval_steps_per_second": 1.44, "step": 1365 }, { "epoch": 2.996706915477497, "step": 1365, "total_flos": 571501770178560.0, "train_loss": 0.9509169136648213, "train_runtime": 15780.8295, "train_samples_per_second": 22.162, "train_steps_per_second": 0.086 } ], "logging_steps": 5, "max_steps": 1365, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 571501770178560.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }