{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999709884243814, "eval_steps": 1000, "global_step": 17234, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000580231512373437, "grad_norm": 4.4758100509643555, "learning_rate": 6.264501160092807e-06, "loss": 10.4749, "step": 10 }, { "epoch": 0.001160463024746874, "grad_norm": 1.6773627996444702, "learning_rate": 1.322505800464037e-05, "loss": 9.159, "step": 20 }, { "epoch": 0.001740694537120311, "grad_norm": 1.5999170541763306, "learning_rate": 2.018561484918793e-05, "loss": 8.8189, "step": 30 }, { "epoch": 0.002320926049493748, "grad_norm": 1.9260104894638062, "learning_rate": 2.7146171693735496e-05, "loss": 8.4574, "step": 40 }, { "epoch": 0.002901157561867185, "grad_norm": 2.173593282699585, "learning_rate": 3.410672853828306e-05, "loss": 8.0835, "step": 50 }, { "epoch": 0.003481389074240622, "grad_norm": 1.5830281972885132, "learning_rate": 4.1067285382830626e-05, "loss": 7.7376, "step": 60 }, { "epoch": 0.004061620586614059, "grad_norm": 2.772728443145752, "learning_rate": 4.802784222737819e-05, "loss": 7.4168, "step": 70 }, { "epoch": 0.004641852098987496, "grad_norm": 1.511775016784668, "learning_rate": 5.498839907192575e-05, "loss": 7.1442, "step": 80 }, { "epoch": 0.005222083611360933, "grad_norm": 1.9058183431625366, "learning_rate": 6.194895591647331e-05, "loss": 6.9324, "step": 90 }, { "epoch": 0.00580231512373437, "grad_norm": 1.6976985931396484, "learning_rate": 6.890951276102087e-05, "loss": 6.8005, "step": 100 }, { "epoch": 0.006382546636107807, "grad_norm": 1.4346176385879517, "learning_rate": 7.587006960556844e-05, "loss": 6.6814, "step": 110 }, { "epoch": 0.006962778148481244, "grad_norm": 1.0364270210266113, "learning_rate": 8.283062645011599e-05, "loss": 6.5547, "step": 120 }, { "epoch": 0.007543009660854681, "grad_norm": 0.6528536677360535, "learning_rate": 8.979118329466357e-05, "loss": 6.4482, "step": 130 }, { "epoch": 0.008123241173228117, "grad_norm": 1.1468390226364136, "learning_rate": 9.675174013921112e-05, "loss": 6.3518, "step": 140 }, { "epoch": 0.008703472685601555, "grad_norm": 0.6249582171440125, "learning_rate": 0.0001037122969837587, "loss": 6.2749, "step": 150 }, { "epoch": 0.009283704197974993, "grad_norm": 0.9577043652534485, "learning_rate": 0.00011067285382830626, "loss": 6.2026, "step": 160 }, { "epoch": 0.009863935710348428, "grad_norm": 1.156731367111206, "learning_rate": 0.00011763341067285381, "loss": 6.1482, "step": 170 }, { "epoch": 0.010444167222721866, "grad_norm": 0.7919487357139587, "learning_rate": 0.0001245939675174014, "loss": 6.0907, "step": 180 }, { "epoch": 0.011024398735095304, "grad_norm": 0.5902596712112427, "learning_rate": 0.00013155452436194894, "loss": 6.0469, "step": 190 }, { "epoch": 0.01160463024746874, "grad_norm": 0.9712298512458801, "learning_rate": 0.00013851508120649652, "loss": 6.0128, "step": 200 }, { "epoch": 0.012184861759842177, "grad_norm": 0.6487208008766174, "learning_rate": 0.00014547563805104407, "loss": 5.949, "step": 210 }, { "epoch": 0.012765093272215615, "grad_norm": 0.6659431457519531, "learning_rate": 0.00015243619489559162, "loss": 5.9004, "step": 220 }, { "epoch": 0.01334532478458905, "grad_norm": 0.9973188042640686, "learning_rate": 0.0001593967517401392, "loss": 5.8727, "step": 230 }, { "epoch": 0.013925556296962488, "grad_norm": 0.592413067817688, "learning_rate": 0.00016635730858468675, "loss": 5.8594, "step": 240 }, { "epoch": 0.014505787809335926, "grad_norm": 0.6143619418144226, "learning_rate": 0.00017331786542923433, "loss": 5.8114, "step": 250 }, { "epoch": 0.015086019321709361, "grad_norm": 0.5780689120292664, "learning_rate": 0.00018027842227378188, "loss": 5.7829, "step": 260 }, { "epoch": 0.0156662508340828, "grad_norm": 0.41307076811790466, "learning_rate": 0.00018723897911832944, "loss": 5.7197, "step": 270 }, { "epoch": 0.016246482346456235, "grad_norm": 0.6880993247032166, "learning_rate": 0.00019419953596287701, "loss": 5.7168, "step": 280 }, { "epoch": 0.016826713858829674, "grad_norm": 0.4273562431335449, "learning_rate": 0.0002011600928074246, "loss": 5.6639, "step": 290 }, { "epoch": 0.01740694537120311, "grad_norm": 0.5025382041931152, "learning_rate": 0.00020812064965197212, "loss": 5.6305, "step": 300 }, { "epoch": 0.017987176883576546, "grad_norm": 0.7127647995948792, "learning_rate": 0.0002150812064965197, "loss": 5.5991, "step": 310 }, { "epoch": 0.018567408395949985, "grad_norm": 0.6494776010513306, "learning_rate": 0.00022204176334106727, "loss": 5.5961, "step": 320 }, { "epoch": 0.01914763990832342, "grad_norm": 0.43809765577316284, "learning_rate": 0.00022900232018561485, "loss": 5.5242, "step": 330 }, { "epoch": 0.019727871420696857, "grad_norm": 0.5514947175979614, "learning_rate": 0.00023596287703016238, "loss": 5.4885, "step": 340 }, { "epoch": 0.020308102933070296, "grad_norm": 0.7086557745933533, "learning_rate": 0.00024292343387470995, "loss": 5.4558, "step": 350 }, { "epoch": 0.020888334445443732, "grad_norm": 0.44333210587501526, "learning_rate": 0.0002498839907192575, "loss": 5.4249, "step": 360 }, { "epoch": 0.021468565957817168, "grad_norm": 0.5971847772598267, "learning_rate": 0.0002568445475638051, "loss": 5.3896, "step": 370 }, { "epoch": 0.022048797470190607, "grad_norm": 0.5358195900917053, "learning_rate": 0.0002638051044083526, "loss": 5.3647, "step": 380 }, { "epoch": 0.022629028982564043, "grad_norm": 0.4231407046318054, "learning_rate": 0.0002707656612529002, "loss": 5.3325, "step": 390 }, { "epoch": 0.02320926049493748, "grad_norm": 0.48789191246032715, "learning_rate": 0.00027772621809744777, "loss": 5.2922, "step": 400 }, { "epoch": 0.023789492007310918, "grad_norm": 0.46154582500457764, "learning_rate": 0.0002846867749419953, "loss": 5.2881, "step": 410 }, { "epoch": 0.024369723519684354, "grad_norm": 0.44972172379493713, "learning_rate": 0.00029164733178654287, "loss": 5.2397, "step": 420 }, { "epoch": 0.02494995503205779, "grad_norm": 0.505415678024292, "learning_rate": 0.0002986078886310905, "loss": 5.1841, "step": 430 }, { "epoch": 0.02553018654443123, "grad_norm": 0.42717623710632324, "learning_rate": 0.0003055684454756381, "loss": 5.1848, "step": 440 }, { "epoch": 0.026110418056804665, "grad_norm": 0.4216056168079376, "learning_rate": 0.0003125290023201856, "loss": 5.1447, "step": 450 }, { "epoch": 0.0266906495691781, "grad_norm": 0.5051509141921997, "learning_rate": 0.00031948955916473313, "loss": 5.1084, "step": 460 }, { "epoch": 0.02727088108155154, "grad_norm": 0.5205376744270325, "learning_rate": 0.0003264501160092807, "loss": 5.0462, "step": 470 }, { "epoch": 0.027851112593924976, "grad_norm": 0.5111084580421448, "learning_rate": 0.0003334106728538283, "loss": 5.0225, "step": 480 }, { "epoch": 0.028431344106298412, "grad_norm": 0.4395337402820587, "learning_rate": 0.00034037122969837584, "loss": 4.991, "step": 490 }, { "epoch": 0.02901157561867185, "grad_norm": 0.2879785895347595, "learning_rate": 0.00034733178654292344, "loss": 4.9628, "step": 500 }, { "epoch": 0.029591807131045287, "grad_norm": 0.3356530964374542, "learning_rate": 0.000354292343387471, "loss": 4.9165, "step": 510 }, { "epoch": 0.030172038643418723, "grad_norm": 0.39410287141799927, "learning_rate": 0.00036125290023201855, "loss": 4.8802, "step": 520 }, { "epoch": 0.030752270155792162, "grad_norm": 0.4210626184940338, "learning_rate": 0.00036821345707656604, "loss": 4.8403, "step": 530 }, { "epoch": 0.0313325016681656, "grad_norm": 0.4170067608356476, "learning_rate": 0.00037517401392111365, "loss": 4.8156, "step": 540 }, { "epoch": 0.031912733180539034, "grad_norm": 0.40876781940460205, "learning_rate": 0.0003821345707656612, "loss": 4.7932, "step": 550 }, { "epoch": 0.03249296469291247, "grad_norm": 0.3717671036720276, "learning_rate": 0.0003890951276102088, "loss": 4.7812, "step": 560 }, { "epoch": 0.03307319620528591, "grad_norm": 0.37275081872940063, "learning_rate": 0.00039605568445475636, "loss": 4.7324, "step": 570 }, { "epoch": 0.03365342771765935, "grad_norm": 0.32523536682128906, "learning_rate": 0.0004030162412993039, "loss": 4.6891, "step": 580 }, { "epoch": 0.034233659230032784, "grad_norm": 0.2909957468509674, "learning_rate": 0.0004099767981438515, "loss": 4.6555, "step": 590 }, { "epoch": 0.03481389074240622, "grad_norm": 0.40268951654434204, "learning_rate": 0.00041693735498839906, "loss": 4.622, "step": 600 }, { "epoch": 0.035394122254779656, "grad_norm": 0.433383584022522, "learning_rate": 0.00042389791183294656, "loss": 4.6122, "step": 610 }, { "epoch": 0.03597435376715309, "grad_norm": 0.3096088171005249, "learning_rate": 0.0004308584686774941, "loss": 4.5976, "step": 620 }, { "epoch": 0.036554585279526534, "grad_norm": 0.30540433526039124, "learning_rate": 0.0004378190255220417, "loss": 4.5569, "step": 630 }, { "epoch": 0.03713481679189997, "grad_norm": 0.3136671781539917, "learning_rate": 0.00044477958236658927, "loss": 4.5228, "step": 640 }, { "epoch": 0.037715048304273406, "grad_norm": 0.332621693611145, "learning_rate": 0.0004517401392111369, "loss": 4.4901, "step": 650 }, { "epoch": 0.03829527981664684, "grad_norm": 0.3817736804485321, "learning_rate": 0.0004587006960556844, "loss": 4.475, "step": 660 }, { "epoch": 0.03887551132902028, "grad_norm": 0.458741158246994, "learning_rate": 0.000465661252900232, "loss": 4.4545, "step": 670 }, { "epoch": 0.039455742841393714, "grad_norm": 0.27561265230178833, "learning_rate": 0.0004726218097447796, "loss": 4.4406, "step": 680 }, { "epoch": 0.040035974353767156, "grad_norm": 0.380633145570755, "learning_rate": 0.0004795823665893271, "loss": 4.4027, "step": 690 }, { "epoch": 0.04061620586614059, "grad_norm": 0.3662358820438385, "learning_rate": 0.00048654292343387463, "loss": 4.377, "step": 700 }, { "epoch": 0.04119643737851403, "grad_norm": 0.31104594469070435, "learning_rate": 0.0004935034802784222, "loss": 4.3399, "step": 710 }, { "epoch": 0.041776668890887464, "grad_norm": 0.43897074460983276, "learning_rate": 0.0005004640371229698, "loss": 4.3229, "step": 720 }, { "epoch": 0.0423569004032609, "grad_norm": 0.2685506343841553, "learning_rate": 0.0005074245939675173, "loss": 4.302, "step": 730 }, { "epoch": 0.042937131915634336, "grad_norm": 0.2662206292152405, "learning_rate": 0.0005143851508120649, "loss": 4.2533, "step": 740 }, { "epoch": 0.04351736342800778, "grad_norm": 0.31665244698524475, "learning_rate": 0.0005213457076566126, "loss": 4.2463, "step": 750 }, { "epoch": 0.044097594940381214, "grad_norm": 0.3573771119117737, "learning_rate": 0.0005283062645011601, "loss": 4.2177, "step": 760 }, { "epoch": 0.04467782645275465, "grad_norm": 0.3051789402961731, "learning_rate": 0.0005352668213457077, "loss": 4.2098, "step": 770 }, { "epoch": 0.045258057965128086, "grad_norm": 0.26946839690208435, "learning_rate": 0.0005422273781902551, "loss": 4.1739, "step": 780 }, { "epoch": 0.04583828947750152, "grad_norm": 0.21327945590019226, "learning_rate": 0.0005491879350348028, "loss": 4.151, "step": 790 }, { "epoch": 0.04641852098987496, "grad_norm": 0.28413307666778564, "learning_rate": 0.0005561484918793503, "loss": 4.1455, "step": 800 }, { "epoch": 0.0469987525022484, "grad_norm": 0.2847752869129181, "learning_rate": 0.0005631090487238979, "loss": 4.1166, "step": 810 }, { "epoch": 0.047578984014621836, "grad_norm": 0.25382527709007263, "learning_rate": 0.0005700696055684454, "loss": 4.0986, "step": 820 }, { "epoch": 0.04815921552699527, "grad_norm": 0.2375078797340393, "learning_rate": 0.000577030162412993, "loss": 4.0765, "step": 830 }, { "epoch": 0.04873944703936871, "grad_norm": 0.3032638430595398, "learning_rate": 0.0005839907192575406, "loss": 4.085, "step": 840 }, { "epoch": 0.049319678551742144, "grad_norm": 0.2454582005739212, "learning_rate": 0.0005909512761020882, "loss": 4.0505, "step": 850 }, { "epoch": 0.04989991006411558, "grad_norm": 0.23829826712608337, "learning_rate": 0.0005979118329466356, "loss": 4.0391, "step": 860 }, { "epoch": 0.05048014157648902, "grad_norm": 0.29694074392318726, "learning_rate": 0.0005999997293652579, "loss": 4.0195, "step": 870 }, { "epoch": 0.05106037308886246, "grad_norm": 0.20268426835536957, "learning_rate": 0.0005999984038085133, "loss": 4.0023, "step": 880 }, { "epoch": 0.051640604601235894, "grad_norm": 0.2563273310661316, "learning_rate": 0.000599995973626219, "loss": 3.98, "step": 890 }, { "epoch": 0.05222083611360933, "grad_norm": 0.26515451073646545, "learning_rate": 0.0005999924388273229, "loss": 3.9799, "step": 900 }, { "epoch": 0.052801067625982766, "grad_norm": 0.23011842370033264, "learning_rate": 0.0005999877994248407, "loss": 3.9592, "step": 910 }, { "epoch": 0.0533812991383562, "grad_norm": 0.21570523083209991, "learning_rate": 0.0005999820554358552, "loss": 3.9366, "step": 920 }, { "epoch": 0.053961530650729644, "grad_norm": 0.24623119831085205, "learning_rate": 0.0005999752068815162, "loss": 3.923, "step": 930 }, { "epoch": 0.05454176216310308, "grad_norm": 0.26557642221450806, "learning_rate": 0.0005999672537870409, "loss": 3.9114, "step": 940 }, { "epoch": 0.055121993675476516, "grad_norm": 0.23711174726486206, "learning_rate": 0.0005999581961817135, "loss": 3.9021, "step": 950 }, { "epoch": 0.05570222518784995, "grad_norm": 0.2636472284793854, "learning_rate": 0.000599948034098885, "loss": 3.8945, "step": 960 }, { "epoch": 0.05628245670022339, "grad_norm": 0.2139461785554886, "learning_rate": 0.000599936767575973, "loss": 3.8742, "step": 970 }, { "epoch": 0.056862688212596824, "grad_norm": 0.2411975860595703, "learning_rate": 0.0005999243966544624, "loss": 3.8627, "step": 980 }, { "epoch": 0.057442919724970266, "grad_norm": 0.22522902488708496, "learning_rate": 0.000599910921379904, "loss": 3.8439, "step": 990 }, { "epoch": 0.0580231512373437, "grad_norm": 0.2505146861076355, "learning_rate": 0.0005998963418019153, "loss": 3.8376, "step": 1000 }, { "epoch": 0.0580231512373437, "eval_loss": 3.7977514266967773, "eval_runtime": 3.2666, "eval_samples_per_second": 1325.524, "eval_steps_per_second": 2.755, "step": 1000 }, { "epoch": 0.05860338274971714, "grad_norm": 0.21931585669517517, "learning_rate": 0.0005998806579741798, "loss": 3.8196, "step": 1010 }, { "epoch": 0.059183614262090574, "grad_norm": 0.19973556697368622, "learning_rate": 0.0005998638699544469, "loss": 3.813, "step": 1020 }, { "epoch": 0.05976384577446401, "grad_norm": 0.21615122258663177, "learning_rate": 0.0005998459778045319, "loss": 3.7993, "step": 1030 }, { "epoch": 0.060344077286837446, "grad_norm": 0.18904747068881989, "learning_rate": 0.0005998269815903156, "loss": 3.8122, "step": 1040 }, { "epoch": 0.06092430879921089, "grad_norm": 0.20379868149757385, "learning_rate": 0.000599806881381744, "loss": 3.7891, "step": 1050 }, { "epoch": 0.061504540311584324, "grad_norm": 0.21616701781749725, "learning_rate": 0.0005997856772528283, "loss": 3.7768, "step": 1060 }, { "epoch": 0.06208477182395776, "grad_norm": 0.1838783323764801, "learning_rate": 0.0005997633692816442, "loss": 3.7744, "step": 1070 }, { "epoch": 0.0626650033363312, "grad_norm": 0.17894767224788666, "learning_rate": 0.0005997399575503321, "loss": 3.7667, "step": 1080 }, { "epoch": 0.06324523484870463, "grad_norm": 0.20992882549762726, "learning_rate": 0.0005997154421450963, "loss": 3.7449, "step": 1090 }, { "epoch": 0.06382546636107807, "grad_norm": 0.19586902856826782, "learning_rate": 0.0005996898231562051, "loss": 3.7423, "step": 1100 }, { "epoch": 0.0644056978734515, "grad_norm": 0.24105612933635712, "learning_rate": 0.0005996631006779903, "loss": 3.7223, "step": 1110 }, { "epoch": 0.06498592938582494, "grad_norm": 0.19526907801628113, "learning_rate": 0.0005996352748088471, "loss": 3.7189, "step": 1120 }, { "epoch": 0.06556616089819838, "grad_norm": 0.16144131124019623, "learning_rate": 0.000599606345651233, "loss": 3.7118, "step": 1130 }, { "epoch": 0.06614639241057182, "grad_norm": 0.167442187666893, "learning_rate": 0.0005995763133116683, "loss": 3.6986, "step": 1140 }, { "epoch": 0.06672662392294526, "grad_norm": 0.23503893613815308, "learning_rate": 0.0005995451779007352, "loss": 3.7049, "step": 1150 }, { "epoch": 0.0673068554353187, "grad_norm": 0.2096278965473175, "learning_rate": 0.0005995129395330776, "loss": 3.6865, "step": 1160 }, { "epoch": 0.06788708694769213, "grad_norm": 0.19825097918510437, "learning_rate": 0.0005994795983274004, "loss": 3.6712, "step": 1170 }, { "epoch": 0.06846731846006557, "grad_norm": 0.15405306220054626, "learning_rate": 0.0005994451544064696, "loss": 3.6711, "step": 1180 }, { "epoch": 0.069047549972439, "grad_norm": 0.563884437084198, "learning_rate": 0.0005994096078971111, "loss": 3.677, "step": 1190 }, { "epoch": 0.06962778148481244, "grad_norm": 0.1655234694480896, "learning_rate": 0.0005993729589302111, "loss": 3.7143, "step": 1200 }, { "epoch": 0.07020801299718588, "grad_norm": 0.15598031878471375, "learning_rate": 0.0005993352076407148, "loss": 3.6689, "step": 1210 }, { "epoch": 0.07078824450955931, "grad_norm": 0.14992448687553406, "learning_rate": 0.0005992963541676265, "loss": 3.6581, "step": 1220 }, { "epoch": 0.07136847602193275, "grad_norm": 0.1618255376815796, "learning_rate": 0.0005992563986540086, "loss": 3.642, "step": 1230 }, { "epoch": 0.07194870753430618, "grad_norm": 0.16188852488994598, "learning_rate": 0.0005992153412469816, "loss": 3.6399, "step": 1240 }, { "epoch": 0.07252893904667962, "grad_norm": 0.17180649936199188, "learning_rate": 0.0005991731820977231, "loss": 3.6252, "step": 1250 }, { "epoch": 0.07310917055905307, "grad_norm": 0.1691058874130249, "learning_rate": 0.0005991299213614678, "loss": 3.6244, "step": 1260 }, { "epoch": 0.0736894020714265, "grad_norm": 0.19470703601837158, "learning_rate": 0.0005990855591975059, "loss": 3.6199, "step": 1270 }, { "epoch": 0.07426963358379994, "grad_norm": 0.15482653677463531, "learning_rate": 0.0005990400957691835, "loss": 3.6176, "step": 1280 }, { "epoch": 0.07484986509617338, "grad_norm": 0.18342998623847961, "learning_rate": 0.000598993531243902, "loss": 3.6082, "step": 1290 }, { "epoch": 0.07543009660854681, "grad_norm": 0.17348110675811768, "learning_rate": 0.0005989458657931167, "loss": 3.6063, "step": 1300 }, { "epoch": 0.07601032812092025, "grad_norm": 0.1687677949666977, "learning_rate": 0.0005988970995923368, "loss": 3.6015, "step": 1310 }, { "epoch": 0.07659055963329368, "grad_norm": 0.19341568648815155, "learning_rate": 0.0005988472328211246, "loss": 3.5912, "step": 1320 }, { "epoch": 0.07717079114566712, "grad_norm": 0.15345478057861328, "learning_rate": 0.0005987962656630947, "loss": 3.586, "step": 1330 }, { "epoch": 0.07775102265804056, "grad_norm": 0.16126085817813873, "learning_rate": 0.0005987441983059136, "loss": 3.5797, "step": 1340 }, { "epoch": 0.07833125417041399, "grad_norm": 0.1716892272233963, "learning_rate": 0.0005986910309412986, "loss": 3.5751, "step": 1350 }, { "epoch": 0.07891148568278743, "grad_norm": 0.15669932961463928, "learning_rate": 0.0005986367637650177, "loss": 3.5799, "step": 1360 }, { "epoch": 0.07949171719516086, "grad_norm": 0.19878168404102325, "learning_rate": 0.0005985813969768884, "loss": 3.572, "step": 1370 }, { "epoch": 0.08007194870753431, "grad_norm": 0.1505119651556015, "learning_rate": 0.0005985249307807767, "loss": 3.567, "step": 1380 }, { "epoch": 0.08065218021990775, "grad_norm": 0.1548507809638977, "learning_rate": 0.0005984673653845972, "loss": 3.5427, "step": 1390 }, { "epoch": 0.08123241173228118, "grad_norm": 0.15786635875701904, "learning_rate": 0.0005984087010003119, "loss": 3.5637, "step": 1400 }, { "epoch": 0.08181264324465462, "grad_norm": 0.15546779334545135, "learning_rate": 0.0005983489378439289, "loss": 3.5475, "step": 1410 }, { "epoch": 0.08239287475702806, "grad_norm": 0.17267097532749176, "learning_rate": 0.0005982880761355026, "loss": 3.5519, "step": 1420 }, { "epoch": 0.08297310626940149, "grad_norm": 0.2120850831270218, "learning_rate": 0.0005982261160991321, "loss": 3.545, "step": 1430 }, { "epoch": 0.08355333778177493, "grad_norm": 0.1541440784931183, "learning_rate": 0.0005981630579629609, "loss": 3.5236, "step": 1440 }, { "epoch": 0.08413356929414836, "grad_norm": 0.1610753834247589, "learning_rate": 0.0005980989019591753, "loss": 3.5153, "step": 1450 }, { "epoch": 0.0847138008065218, "grad_norm": 0.1872093677520752, "learning_rate": 0.0005980336483240048, "loss": 3.5208, "step": 1460 }, { "epoch": 0.08529403231889524, "grad_norm": 0.15793032944202423, "learning_rate": 0.0005979672972977201, "loss": 3.5294, "step": 1470 }, { "epoch": 0.08587426383126867, "grad_norm": 0.1738296002149582, "learning_rate": 0.0005978998491246324, "loss": 3.5234, "step": 1480 }, { "epoch": 0.08645449534364211, "grad_norm": 0.1644987314939499, "learning_rate": 0.0005978313040530931, "loss": 3.515, "step": 1490 }, { "epoch": 0.08703472685601556, "grad_norm": 0.16707918047904968, "learning_rate": 0.0005977616623354923, "loss": 3.5014, "step": 1500 }, { "epoch": 0.08761495836838899, "grad_norm": 0.14812146127223969, "learning_rate": 0.0005976909242282581, "loss": 3.4923, "step": 1510 }, { "epoch": 0.08819518988076243, "grad_norm": 0.15653282403945923, "learning_rate": 0.0005976190899918555, "loss": 3.4899, "step": 1520 }, { "epoch": 0.08877542139313586, "grad_norm": 0.1531265377998352, "learning_rate": 0.0005975461598907858, "loss": 3.4939, "step": 1530 }, { "epoch": 0.0893556529055093, "grad_norm": 0.19499650597572327, "learning_rate": 0.0005974721341935854, "loss": 3.4776, "step": 1540 }, { "epoch": 0.08993588441788274, "grad_norm": 0.16522051393985748, "learning_rate": 0.0005973970131728245, "loss": 3.4843, "step": 1550 }, { "epoch": 0.09051611593025617, "grad_norm": 0.14911240339279175, "learning_rate": 0.0005973207971051066, "loss": 3.4854, "step": 1560 }, { "epoch": 0.09109634744262961, "grad_norm": 0.1797751784324646, "learning_rate": 0.0005972434862710673, "loss": 3.4814, "step": 1570 }, { "epoch": 0.09167657895500304, "grad_norm": 0.14958298206329346, "learning_rate": 0.0005971650809553729, "loss": 3.4791, "step": 1580 }, { "epoch": 0.09225681046737648, "grad_norm": 0.17834265530109406, "learning_rate": 0.0005970855814467205, "loss": 3.4633, "step": 1590 }, { "epoch": 0.09283704197974992, "grad_norm": 0.15738125145435333, "learning_rate": 0.0005970049880378353, "loss": 3.4676, "step": 1600 }, { "epoch": 0.09341727349212335, "grad_norm": 0.14483994245529175, "learning_rate": 0.0005969233010254707, "loss": 3.4661, "step": 1610 }, { "epoch": 0.0939975050044968, "grad_norm": 0.14126789569854736, "learning_rate": 0.0005968405207104068, "loss": 3.4571, "step": 1620 }, { "epoch": 0.09457773651687024, "grad_norm": 0.1578633040189743, "learning_rate": 0.0005967566473974495, "loss": 3.4558, "step": 1630 }, { "epoch": 0.09515796802924367, "grad_norm": 0.1565486639738083, "learning_rate": 0.000596671681395429, "loss": 3.4604, "step": 1640 }, { "epoch": 0.09573819954161711, "grad_norm": 0.13866451382637024, "learning_rate": 0.0005965856230171993, "loss": 3.4552, "step": 1650 }, { "epoch": 0.09631843105399054, "grad_norm": 0.2121124267578125, "learning_rate": 0.0005964984725796359, "loss": 3.4541, "step": 1660 }, { "epoch": 0.09689866256636398, "grad_norm": 0.17082008719444275, "learning_rate": 0.0005964102304036363, "loss": 3.4382, "step": 1670 }, { "epoch": 0.09747889407873742, "grad_norm": 0.20681622624397278, "learning_rate": 0.0005963208968141172, "loss": 3.4372, "step": 1680 }, { "epoch": 0.09805912559111085, "grad_norm": 0.1384105086326599, "learning_rate": 0.0005962304721400142, "loss": 3.4484, "step": 1690 }, { "epoch": 0.09863935710348429, "grad_norm": 0.16820856928825378, "learning_rate": 0.0005961389567142806, "loss": 3.4302, "step": 1700 }, { "epoch": 0.09921958861585772, "grad_norm": 0.16617996990680695, "learning_rate": 0.0005960463508738855, "loss": 3.4328, "step": 1710 }, { "epoch": 0.09979982012823116, "grad_norm": 0.16344214975833893, "learning_rate": 0.0005959526549598137, "loss": 3.4326, "step": 1720 }, { "epoch": 0.1003800516406046, "grad_norm": 0.16235540807247162, "learning_rate": 0.000595857869317063, "loss": 3.4271, "step": 1730 }, { "epoch": 0.10096028315297804, "grad_norm": 0.1524738371372223, "learning_rate": 0.0005957619942946442, "loss": 3.424, "step": 1740 }, { "epoch": 0.10154051466535148, "grad_norm": 0.18023791909217834, "learning_rate": 0.0005956650302455793, "loss": 3.4266, "step": 1750 }, { "epoch": 0.10212074617772492, "grad_norm": 0.17738115787506104, "learning_rate": 0.0005955669775268999, "loss": 3.4046, "step": 1760 }, { "epoch": 0.10270097769009835, "grad_norm": 0.13939271867275238, "learning_rate": 0.0005954678364996466, "loss": 3.4177, "step": 1770 }, { "epoch": 0.10328120920247179, "grad_norm": 0.18028447031974792, "learning_rate": 0.0005953676075288668, "loss": 3.4113, "step": 1780 }, { "epoch": 0.10386144071484522, "grad_norm": 0.15911422669887543, "learning_rate": 0.0005952662909836142, "loss": 3.4191, "step": 1790 }, { "epoch": 0.10444167222721866, "grad_norm": 0.15596607327461243, "learning_rate": 0.0005951638872369469, "loss": 3.3993, "step": 1800 }, { "epoch": 0.1050219037395921, "grad_norm": 0.15493981540203094, "learning_rate": 0.0005950603966659264, "loss": 3.4043, "step": 1810 }, { "epoch": 0.10560213525196553, "grad_norm": 0.1727568507194519, "learning_rate": 0.0005949558196516154, "loss": 3.4028, "step": 1820 }, { "epoch": 0.10618236676433897, "grad_norm": 0.1614874303340912, "learning_rate": 0.0005948501565790779, "loss": 3.3998, "step": 1830 }, { "epoch": 0.1067625982767124, "grad_norm": 0.13620299100875854, "learning_rate": 0.000594743407837376, "loss": 3.3896, "step": 1840 }, { "epoch": 0.10734282978908584, "grad_norm": 0.15391112864017487, "learning_rate": 0.0005946355738195701, "loss": 3.3823, "step": 1850 }, { "epoch": 0.10792306130145929, "grad_norm": 0.15937426686286926, "learning_rate": 0.0005945266549227162, "loss": 3.3893, "step": 1860 }, { "epoch": 0.10850329281383272, "grad_norm": 0.16253319382667542, "learning_rate": 0.0005944166515478649, "loss": 3.3905, "step": 1870 }, { "epoch": 0.10908352432620616, "grad_norm": 0.14502382278442383, "learning_rate": 0.0005943055641000604, "loss": 3.3836, "step": 1880 }, { "epoch": 0.1096637558385796, "grad_norm": 0.14128324389457703, "learning_rate": 0.0005941933929883384, "loss": 3.3854, "step": 1890 }, { "epoch": 0.11024398735095303, "grad_norm": 0.19345618784427643, "learning_rate": 0.0005940801386257244, "loss": 3.3746, "step": 1900 }, { "epoch": 0.11082421886332647, "grad_norm": 0.1499020904302597, "learning_rate": 0.000593965801429233, "loss": 3.3729, "step": 1910 }, { "epoch": 0.1114044503756999, "grad_norm": 0.14975206553936005, "learning_rate": 0.0005938503818198656, "loss": 3.3676, "step": 1920 }, { "epoch": 0.11198468188807334, "grad_norm": 0.13726426661014557, "learning_rate": 0.0005937338802226094, "loss": 3.373, "step": 1930 }, { "epoch": 0.11256491340044678, "grad_norm": 0.1749139279127121, "learning_rate": 0.0005936162970664355, "loss": 3.3761, "step": 1940 }, { "epoch": 0.11314514491282021, "grad_norm": 0.14197006821632385, "learning_rate": 0.0005934976327842974, "loss": 3.3513, "step": 1950 }, { "epoch": 0.11372537642519365, "grad_norm": 0.15288510918617249, "learning_rate": 0.0005933778878131294, "loss": 3.357, "step": 1960 }, { "epoch": 0.11430560793756708, "grad_norm": 0.1787514090538025, "learning_rate": 0.000593257062593845, "loss": 3.3642, "step": 1970 }, { "epoch": 0.11488583944994053, "grad_norm": 0.13630741834640503, "learning_rate": 0.0005931351575713353, "loss": 3.3614, "step": 1980 }, { "epoch": 0.11546607096231397, "grad_norm": 0.16102264821529388, "learning_rate": 0.0005930121731944674, "loss": 3.3523, "step": 1990 }, { "epoch": 0.1160463024746874, "grad_norm": 0.16226573288440704, "learning_rate": 0.0005928881099160826, "loss": 3.3595, "step": 2000 }, { "epoch": 0.1160463024746874, "eval_loss": 3.3178560733795166, "eval_runtime": 3.2576, "eval_samples_per_second": 1329.214, "eval_steps_per_second": 2.763, "step": 2000 }, { "epoch": 0.11662653398706084, "grad_norm": 0.14609858393669128, "learning_rate": 0.0005927629681929951, "loss": 3.3585, "step": 2010 }, { "epoch": 0.11720676549943428, "grad_norm": 0.14387281239032745, "learning_rate": 0.0005926367484859896, "loss": 3.3517, "step": 2020 }, { "epoch": 0.11778699701180771, "grad_norm": 0.14605766534805298, "learning_rate": 0.0005925094512598202, "loss": 3.3524, "step": 2030 }, { "epoch": 0.11836722852418115, "grad_norm": 0.22022885084152222, "learning_rate": 0.000592381076983209, "loss": 3.3356, "step": 2040 }, { "epoch": 0.11894746003655458, "grad_norm": 0.1847839504480362, "learning_rate": 0.0005922516261288431, "loss": 3.3441, "step": 2050 }, { "epoch": 0.11952769154892802, "grad_norm": 0.13915176689624786, "learning_rate": 0.0005921210991733745, "loss": 3.352, "step": 2060 }, { "epoch": 0.12010792306130146, "grad_norm": 0.1398390680551529, "learning_rate": 0.0005919894965974168, "loss": 3.3455, "step": 2070 }, { "epoch": 0.12068815457367489, "grad_norm": 0.1368722915649414, "learning_rate": 0.0005918568188855447, "loss": 3.3403, "step": 2080 }, { "epoch": 0.12126838608604833, "grad_norm": 0.16239017248153687, "learning_rate": 0.0005917230665262914, "loss": 3.3334, "step": 2090 }, { "epoch": 0.12184861759842178, "grad_norm": 0.14380386471748352, "learning_rate": 0.000591588240012147, "loss": 3.3294, "step": 2100 }, { "epoch": 0.12242884911079521, "grad_norm": 0.16626037657260895, "learning_rate": 0.0005914523398395569, "loss": 3.3425, "step": 2110 }, { "epoch": 0.12300908062316865, "grad_norm": 0.15981921553611755, "learning_rate": 0.0005913153665089197, "loss": 3.3403, "step": 2120 }, { "epoch": 0.12358931213554208, "grad_norm": 0.15275150537490845, "learning_rate": 0.0005911773205245857, "loss": 3.3261, "step": 2130 }, { "epoch": 0.12416954364791552, "grad_norm": 0.1598198413848877, "learning_rate": 0.0005910382023948546, "loss": 3.3264, "step": 2140 }, { "epoch": 0.12474977516028896, "grad_norm": 0.138661190867424, "learning_rate": 0.0005908980126319739, "loss": 3.3216, "step": 2150 }, { "epoch": 0.1253300066726624, "grad_norm": 0.15583263337612152, "learning_rate": 0.000590756751752137, "loss": 3.3204, "step": 2160 }, { "epoch": 0.12591023818503583, "grad_norm": 0.15883944928646088, "learning_rate": 0.0005906144202754813, "loss": 3.3274, "step": 2170 }, { "epoch": 0.12649046969740926, "grad_norm": 0.15031637251377106, "learning_rate": 0.0005904710187260862, "loss": 3.3224, "step": 2180 }, { "epoch": 0.1270707012097827, "grad_norm": 0.1994715929031372, "learning_rate": 0.0005903265476319712, "loss": 3.3204, "step": 2190 }, { "epoch": 0.12765093272215614, "grad_norm": 0.16986873745918274, "learning_rate": 0.000590181007525094, "loss": 3.327, "step": 2200 }, { "epoch": 0.12823116423452957, "grad_norm": 0.147616907954216, "learning_rate": 0.0005900343989413485, "loss": 3.3063, "step": 2210 }, { "epoch": 0.128811395746903, "grad_norm": 0.16532088816165924, "learning_rate": 0.0005898867224205629, "loss": 3.3198, "step": 2220 }, { "epoch": 0.12939162725927644, "grad_norm": 0.16687408089637756, "learning_rate": 0.0005897379785064977, "loss": 3.3193, "step": 2230 }, { "epoch": 0.12997185877164988, "grad_norm": 0.16683116555213928, "learning_rate": 0.0005895881677468434, "loss": 3.3078, "step": 2240 }, { "epoch": 0.13055209028402331, "grad_norm": 0.15461483597755432, "learning_rate": 0.000589437290693219, "loss": 3.3126, "step": 2250 }, { "epoch": 0.13113232179639675, "grad_norm": 0.1432589441537857, "learning_rate": 0.0005892853479011696, "loss": 3.3004, "step": 2260 }, { "epoch": 0.13171255330877019, "grad_norm": 0.1792496293783188, "learning_rate": 0.0005891323399301646, "loss": 3.2946, "step": 2270 }, { "epoch": 0.13229278482114365, "grad_norm": 0.15189994871616364, "learning_rate": 0.0005889782673435952, "loss": 3.3013, "step": 2280 }, { "epoch": 0.13287301633351709, "grad_norm": 0.15026351809501648, "learning_rate": 0.0005888231307087728, "loss": 3.295, "step": 2290 }, { "epoch": 0.13345324784589052, "grad_norm": 0.16199465095996857, "learning_rate": 0.0005886669305969269, "loss": 3.2955, "step": 2300 }, { "epoch": 0.13403347935826396, "grad_norm": 0.16704988479614258, "learning_rate": 0.0005885096675832027, "loss": 3.3057, "step": 2310 }, { "epoch": 0.1346137108706374, "grad_norm": 0.14401213824748993, "learning_rate": 0.0005883513422466588, "loss": 3.2876, "step": 2320 }, { "epoch": 0.13519394238301083, "grad_norm": 0.15336865186691284, "learning_rate": 0.000588191955170266, "loss": 3.2903, "step": 2330 }, { "epoch": 0.13577417389538426, "grad_norm": 0.16176366806030273, "learning_rate": 0.0005880315069409039, "loss": 3.2873, "step": 2340 }, { "epoch": 0.1363544054077577, "grad_norm": 0.14728406071662903, "learning_rate": 0.00058786999814936, "loss": 3.2862, "step": 2350 }, { "epoch": 0.13693463692013114, "grad_norm": 0.14426636695861816, "learning_rate": 0.0005877074293903264, "loss": 3.2786, "step": 2360 }, { "epoch": 0.13751486843250457, "grad_norm": 0.15023665130138397, "learning_rate": 0.0005875438012623984, "loss": 3.2888, "step": 2370 }, { "epoch": 0.138095099944878, "grad_norm": 0.1882687211036682, "learning_rate": 0.0005873791143680718, "loss": 3.2806, "step": 2380 }, { "epoch": 0.13867533145725144, "grad_norm": 0.14847789704799652, "learning_rate": 0.000587213369313741, "loss": 3.2698, "step": 2390 }, { "epoch": 0.13925556296962488, "grad_norm": 0.14070352911949158, "learning_rate": 0.0005870465667096969, "loss": 3.2782, "step": 2400 }, { "epoch": 0.13983579448199832, "grad_norm": 0.19226056337356567, "learning_rate": 0.0005868787071701238, "loss": 3.2639, "step": 2410 }, { "epoch": 0.14041602599437175, "grad_norm": 0.1776312291622162, "learning_rate": 0.0005867097913130982, "loss": 3.2792, "step": 2420 }, { "epoch": 0.1409962575067452, "grad_norm": 0.13482613861560822, "learning_rate": 0.0005865398197605863, "loss": 3.2834, "step": 2430 }, { "epoch": 0.14157648901911862, "grad_norm": 0.16731715202331543, "learning_rate": 0.0005863687931384408, "loss": 3.2773, "step": 2440 }, { "epoch": 0.14215672053149206, "grad_norm": 0.14542406797409058, "learning_rate": 0.0005861967120763997, "loss": 3.2676, "step": 2450 }, { "epoch": 0.1427369520438655, "grad_norm": 0.1490476280450821, "learning_rate": 0.0005860235772080836, "loss": 3.2783, "step": 2460 }, { "epoch": 0.14331718355623893, "grad_norm": 0.1446717530488968, "learning_rate": 0.0005858493891709932, "loss": 3.283, "step": 2470 }, { "epoch": 0.14389741506861237, "grad_norm": 0.1412891447544098, "learning_rate": 0.0005856741486065071, "loss": 3.2652, "step": 2480 }, { "epoch": 0.1444776465809858, "grad_norm": 0.14674563705921173, "learning_rate": 0.0005854978561598794, "loss": 3.2613, "step": 2490 }, { "epoch": 0.14505787809335924, "grad_norm": 0.14808981120586395, "learning_rate": 0.0005853205124802374, "loss": 3.2742, "step": 2500 }, { "epoch": 0.14563810960573267, "grad_norm": 0.14043253660202026, "learning_rate": 0.0005851421182205789, "loss": 3.2685, "step": 2510 }, { "epoch": 0.14621834111810614, "grad_norm": 0.1568257212638855, "learning_rate": 0.0005849626740377705, "loss": 3.2711, "step": 2520 }, { "epoch": 0.14679857263047957, "grad_norm": 0.13545943796634674, "learning_rate": 0.0005847821805925444, "loss": 3.2573, "step": 2530 }, { "epoch": 0.147378804142853, "grad_norm": 0.18863698840141296, "learning_rate": 0.0005846006385494964, "loss": 3.2526, "step": 2540 }, { "epoch": 0.14795903565522645, "grad_norm": 0.14628858864307404, "learning_rate": 0.0005844180485770832, "loss": 3.2629, "step": 2550 }, { "epoch": 0.14853926716759988, "grad_norm": 0.1624503880739212, "learning_rate": 0.0005842344113476202, "loss": 3.2529, "step": 2560 }, { "epoch": 0.14911949867997332, "grad_norm": 0.16218945384025574, "learning_rate": 0.0005840497275372792, "loss": 3.2548, "step": 2570 }, { "epoch": 0.14969973019234675, "grad_norm": 0.16516704857349396, "learning_rate": 0.0005838639978260851, "loss": 3.2501, "step": 2580 }, { "epoch": 0.1502799617047202, "grad_norm": 0.1366761326789856, "learning_rate": 0.0005836772228979142, "loss": 3.2467, "step": 2590 }, { "epoch": 0.15086019321709362, "grad_norm": 0.15526661276817322, "learning_rate": 0.0005834894034404913, "loss": 3.242, "step": 2600 }, { "epoch": 0.15144042472946706, "grad_norm": 0.1441916972398758, "learning_rate": 0.0005833005401453874, "loss": 3.2399, "step": 2610 }, { "epoch": 0.1520206562418405, "grad_norm": 0.1708252727985382, "learning_rate": 0.0005831106337080169, "loss": 3.2427, "step": 2620 }, { "epoch": 0.15260088775421393, "grad_norm": 0.14945155382156372, "learning_rate": 0.0005829196848276351, "loss": 3.2449, "step": 2630 }, { "epoch": 0.15318111926658737, "grad_norm": 0.1512700468301773, "learning_rate": 0.000582727694207336, "loss": 3.2438, "step": 2640 }, { "epoch": 0.1537613507789608, "grad_norm": 0.15101619064807892, "learning_rate": 0.0005825346625540491, "loss": 3.2396, "step": 2650 }, { "epoch": 0.15434158229133424, "grad_norm": 0.13658584654331207, "learning_rate": 0.000582340590578537, "loss": 3.2475, "step": 2660 }, { "epoch": 0.15492181380370768, "grad_norm": 0.16723176836967468, "learning_rate": 0.0005821454789953932, "loss": 3.2385, "step": 2670 }, { "epoch": 0.1555020453160811, "grad_norm": 0.16236084699630737, "learning_rate": 0.000581949328523039, "loss": 3.2287, "step": 2680 }, { "epoch": 0.15608227682845455, "grad_norm": 0.1473713517189026, "learning_rate": 0.0005817521398837209, "loss": 3.2335, "step": 2690 }, { "epoch": 0.15666250834082798, "grad_norm": 0.14422966539859772, "learning_rate": 0.0005815539138035082, "loss": 3.2217, "step": 2700 }, { "epoch": 0.15724273985320142, "grad_norm": 0.1676100343465805, "learning_rate": 0.00058135465101229, "loss": 3.2329, "step": 2710 }, { "epoch": 0.15782297136557485, "grad_norm": 0.14574168622493744, "learning_rate": 0.000581154352243773, "loss": 3.2278, "step": 2720 }, { "epoch": 0.1584032028779483, "grad_norm": 0.16981543600559235, "learning_rate": 0.000580953018235478, "loss": 3.229, "step": 2730 }, { "epoch": 0.15898343439032173, "grad_norm": 0.13945645093917847, "learning_rate": 0.0005807506497287379, "loss": 3.2297, "step": 2740 }, { "epoch": 0.15956366590269516, "grad_norm": 0.17302276194095612, "learning_rate": 0.0005805472474686949, "loss": 3.2227, "step": 2750 }, { "epoch": 0.16014389741506863, "grad_norm": 0.15059055387973785, "learning_rate": 0.0005803428122042974, "loss": 3.2288, "step": 2760 }, { "epoch": 0.16072412892744206, "grad_norm": 0.14908020198345184, "learning_rate": 0.0005801373446882973, "loss": 3.2293, "step": 2770 }, { "epoch": 0.1613043604398155, "grad_norm": 0.1653462052345276, "learning_rate": 0.0005799308456772478, "loss": 3.2189, "step": 2780 }, { "epoch": 0.16188459195218893, "grad_norm": 0.14483293890953064, "learning_rate": 0.0005797233159314997, "loss": 3.2239, "step": 2790 }, { "epoch": 0.16246482346456237, "grad_norm": 0.15277917683124542, "learning_rate": 0.0005795147562151992, "loss": 3.2155, "step": 2800 }, { "epoch": 0.1630450549769358, "grad_norm": 0.13660204410552979, "learning_rate": 0.0005793051672962852, "loss": 3.2183, "step": 2810 }, { "epoch": 0.16362528648930924, "grad_norm": 0.15595564246177673, "learning_rate": 0.0005790945499464861, "loss": 3.2163, "step": 2820 }, { "epoch": 0.16420551800168268, "grad_norm": 0.14608708024024963, "learning_rate": 0.0005788829049413167, "loss": 3.2222, "step": 2830 }, { "epoch": 0.1647857495140561, "grad_norm": 0.14129003882408142, "learning_rate": 0.0005786702330600764, "loss": 3.2115, "step": 2840 }, { "epoch": 0.16536598102642955, "grad_norm": 0.13925908505916595, "learning_rate": 0.0005784565350858453, "loss": 3.2115, "step": 2850 }, { "epoch": 0.16594621253880298, "grad_norm": 0.15094564855098724, "learning_rate": 0.0005782418118054816, "loss": 3.216, "step": 2860 }, { "epoch": 0.16652644405117642, "grad_norm": 0.1384998857975006, "learning_rate": 0.0005780260640096189, "loss": 3.2084, "step": 2870 }, { "epoch": 0.16710667556354986, "grad_norm": 0.15442876517772675, "learning_rate": 0.0005778092924926634, "loss": 3.2071, "step": 2880 }, { "epoch": 0.1676869070759233, "grad_norm": 0.16494965553283691, "learning_rate": 0.0005775914980527904, "loss": 3.2101, "step": 2890 }, { "epoch": 0.16826713858829673, "grad_norm": 0.16855239868164062, "learning_rate": 0.0005773726814919419, "loss": 3.2019, "step": 2900 }, { "epoch": 0.16884737010067016, "grad_norm": 0.1579483449459076, "learning_rate": 0.0005771528436158233, "loss": 3.209, "step": 2910 }, { "epoch": 0.1694276016130436, "grad_norm": 0.1417829543352127, "learning_rate": 0.0005769319852339008, "loss": 3.2019, "step": 2920 }, { "epoch": 0.17000783312541703, "grad_norm": 0.14454993605613708, "learning_rate": 0.0005767101071593979, "loss": 3.2047, "step": 2930 }, { "epoch": 0.17058806463779047, "grad_norm": 0.16087666153907776, "learning_rate": 0.0005764872102092931, "loss": 3.2062, "step": 2940 }, { "epoch": 0.1711682961501639, "grad_norm": 0.139312744140625, "learning_rate": 0.0005762632952043163, "loss": 3.1988, "step": 2950 }, { "epoch": 0.17174852766253734, "grad_norm": 0.15459179878234863, "learning_rate": 0.000576038362968946, "loss": 3.2002, "step": 2960 }, { "epoch": 0.17232875917491078, "grad_norm": 0.18820500373840332, "learning_rate": 0.0005758124143314062, "loss": 3.2035, "step": 2970 }, { "epoch": 0.17290899068728421, "grad_norm": 0.14626365900039673, "learning_rate": 0.0005755854501236635, "loss": 3.194, "step": 2980 }, { "epoch": 0.17348922219965765, "grad_norm": 0.14270606637001038, "learning_rate": 0.0005753574711814238, "loss": 3.1879, "step": 2990 }, { "epoch": 0.1740694537120311, "grad_norm": 0.15857936441898346, "learning_rate": 0.0005751284783441297, "loss": 3.207, "step": 3000 }, { "epoch": 0.1740694537120311, "eval_loss": 3.158046245574951, "eval_runtime": 3.2654, "eval_samples_per_second": 1326.029, "eval_steps_per_second": 2.756, "step": 3000 }, { "epoch": 0.17464968522440455, "grad_norm": 0.14403465390205383, "learning_rate": 0.0005748984724549565, "loss": 3.1895, "step": 3010 }, { "epoch": 0.17522991673677799, "grad_norm": 0.1392756998538971, "learning_rate": 0.0005746674543608101, "loss": 3.1942, "step": 3020 }, { "epoch": 0.17581014824915142, "grad_norm": 0.13957557082176208, "learning_rate": 0.0005744354249123234, "loss": 3.1969, "step": 3030 }, { "epoch": 0.17639037976152486, "grad_norm": 0.151198148727417, "learning_rate": 0.0005742023849638531, "loss": 3.1903, "step": 3040 }, { "epoch": 0.1769706112738983, "grad_norm": 0.14607684314250946, "learning_rate": 0.0005739683353734766, "loss": 3.2003, "step": 3050 }, { "epoch": 0.17755084278627173, "grad_norm": 0.13925622403621674, "learning_rate": 0.0005737332770029891, "loss": 3.1927, "step": 3060 }, { "epoch": 0.17813107429864516, "grad_norm": 0.13125456869602203, "learning_rate": 0.0005734972107179001, "loss": 3.1849, "step": 3070 }, { "epoch": 0.1787113058110186, "grad_norm": 0.16905735433101654, "learning_rate": 0.0005732601373874306, "loss": 3.187, "step": 3080 }, { "epoch": 0.17929153732339204, "grad_norm": 0.13563838601112366, "learning_rate": 0.0005730220578845091, "loss": 3.1853, "step": 3090 }, { "epoch": 0.17987176883576547, "grad_norm": 0.15470236539840698, "learning_rate": 0.0005727829730857695, "loss": 3.1906, "step": 3100 }, { "epoch": 0.1804520003481389, "grad_norm": 0.160013347864151, "learning_rate": 0.0005725428838715469, "loss": 3.1705, "step": 3110 }, { "epoch": 0.18103223186051234, "grad_norm": 0.14684250950813293, "learning_rate": 0.0005723017911258752, "loss": 3.1825, "step": 3120 }, { "epoch": 0.18161246337288578, "grad_norm": 0.1529027372598648, "learning_rate": 0.0005720596957364829, "loss": 3.1817, "step": 3130 }, { "epoch": 0.18219269488525922, "grad_norm": 0.13860736787319183, "learning_rate": 0.0005718165985947907, "loss": 3.1844, "step": 3140 }, { "epoch": 0.18277292639763265, "grad_norm": 0.14795511960983276, "learning_rate": 0.0005715725005959077, "loss": 3.1741, "step": 3150 }, { "epoch": 0.1833531579100061, "grad_norm": 0.1455545276403427, "learning_rate": 0.0005713274026386283, "loss": 3.1869, "step": 3160 }, { "epoch": 0.18393338942237952, "grad_norm": 0.14845995604991913, "learning_rate": 0.0005710813056254289, "loss": 3.1735, "step": 3170 }, { "epoch": 0.18451362093475296, "grad_norm": 0.14949209988117218, "learning_rate": 0.0005708342104624645, "loss": 3.178, "step": 3180 }, { "epoch": 0.1850938524471264, "grad_norm": 0.16276435554027557, "learning_rate": 0.0005705861180595653, "loss": 3.1712, "step": 3190 }, { "epoch": 0.18567408395949983, "grad_norm": 0.14152179658412933, "learning_rate": 0.0005703370293302335, "loss": 3.1752, "step": 3200 }, { "epoch": 0.18625431547187327, "grad_norm": 0.1554255187511444, "learning_rate": 0.00057008694519164, "loss": 3.169, "step": 3210 }, { "epoch": 0.1868345469842467, "grad_norm": 0.14890237152576447, "learning_rate": 0.0005698358665646207, "loss": 3.1706, "step": 3220 }, { "epoch": 0.18741477849662014, "grad_norm": 0.15197904407978058, "learning_rate": 0.0005695837943736735, "loss": 3.1691, "step": 3230 }, { "epoch": 0.1879950100089936, "grad_norm": 0.15369053184986115, "learning_rate": 0.0005693307295469547, "loss": 3.1678, "step": 3240 }, { "epoch": 0.18857524152136704, "grad_norm": 0.19938114285469055, "learning_rate": 0.0005690766730162752, "loss": 3.1706, "step": 3250 }, { "epoch": 0.18915547303374047, "grad_norm": 0.14962078630924225, "learning_rate": 0.0005688216257170979, "loss": 3.1665, "step": 3260 }, { "epoch": 0.1897357045461139, "grad_norm": 0.14826686680316925, "learning_rate": 0.0005685655885885337, "loss": 3.1478, "step": 3270 }, { "epoch": 0.19031593605848734, "grad_norm": 0.137392058968544, "learning_rate": 0.0005683085625733382, "loss": 3.1645, "step": 3280 }, { "epoch": 0.19089616757086078, "grad_norm": 0.15559589862823486, "learning_rate": 0.000568050548617908, "loss": 3.1674, "step": 3290 }, { "epoch": 0.19147639908323422, "grad_norm": 0.17506170272827148, "learning_rate": 0.0005677915476722775, "loss": 3.1606, "step": 3300 }, { "epoch": 0.19205663059560765, "grad_norm": 0.1602877825498581, "learning_rate": 0.0005675315606901155, "loss": 3.1586, "step": 3310 }, { "epoch": 0.1926368621079811, "grad_norm": 0.13343220949172974, "learning_rate": 0.0005672705886287211, "loss": 3.1553, "step": 3320 }, { "epoch": 0.19321709362035452, "grad_norm": 0.15390737354755402, "learning_rate": 0.0005670086324490208, "loss": 3.1687, "step": 3330 }, { "epoch": 0.19379732513272796, "grad_norm": 0.13513082265853882, "learning_rate": 0.0005667456931155647, "loss": 3.1543, "step": 3340 }, { "epoch": 0.1943775566451014, "grad_norm": 0.1489078551530838, "learning_rate": 0.0005664817715965231, "loss": 3.1623, "step": 3350 }, { "epoch": 0.19495778815747483, "grad_norm": 0.14149461686611176, "learning_rate": 0.0005662168688636826, "loss": 3.1487, "step": 3360 }, { "epoch": 0.19553801966984827, "grad_norm": 0.150479257106781, "learning_rate": 0.0005659509858924428, "loss": 3.1588, "step": 3370 }, { "epoch": 0.1961182511822217, "grad_norm": 0.15041102468967438, "learning_rate": 0.0005656841236618127, "loss": 3.155, "step": 3380 }, { "epoch": 0.19669848269459514, "grad_norm": 0.14053913950920105, "learning_rate": 0.0005654162831544068, "loss": 3.1581, "step": 3390 }, { "epoch": 0.19727871420696858, "grad_norm": 0.15485486388206482, "learning_rate": 0.0005651474653564421, "loss": 3.1465, "step": 3400 }, { "epoch": 0.197858945719342, "grad_norm": 0.1425885111093521, "learning_rate": 0.0005648776712577338, "loss": 3.1535, "step": 3410 }, { "epoch": 0.19843917723171545, "grad_norm": 0.1361316442489624, "learning_rate": 0.0005646069018516921, "loss": 3.1466, "step": 3420 }, { "epoch": 0.19901940874408888, "grad_norm": 0.15521439909934998, "learning_rate": 0.0005643351581353184, "loss": 3.1415, "step": 3430 }, { "epoch": 0.19959964025646232, "grad_norm": 0.14644280076026917, "learning_rate": 0.0005640624411092014, "loss": 3.1411, "step": 3440 }, { "epoch": 0.20017987176883575, "grad_norm": 0.14116531610488892, "learning_rate": 0.0005637887517775137, "loss": 3.1542, "step": 3450 }, { "epoch": 0.2007601032812092, "grad_norm": 0.1301729828119278, "learning_rate": 0.0005635140911480082, "loss": 3.1448, "step": 3460 }, { "epoch": 0.20134033479358263, "grad_norm": 0.16307103633880615, "learning_rate": 0.000563238460232014, "loss": 3.1397, "step": 3470 }, { "epoch": 0.2019205663059561, "grad_norm": 0.13141117990016937, "learning_rate": 0.0005629618600444332, "loss": 3.1469, "step": 3480 }, { "epoch": 0.20250079781832953, "grad_norm": 0.13741467893123627, "learning_rate": 0.0005626842916037365, "loss": 3.1419, "step": 3490 }, { "epoch": 0.20308102933070296, "grad_norm": 0.16112880408763885, "learning_rate": 0.0005624057559319601, "loss": 3.1449, "step": 3500 }, { "epoch": 0.2036612608430764, "grad_norm": 0.153072327375412, "learning_rate": 0.0005621262540547015, "loss": 3.1365, "step": 3510 }, { "epoch": 0.20424149235544983, "grad_norm": 0.1413891613483429, "learning_rate": 0.0005618457870011158, "loss": 3.1307, "step": 3520 }, { "epoch": 0.20482172386782327, "grad_norm": 0.15589068830013275, "learning_rate": 0.0005615643558039121, "loss": 3.1418, "step": 3530 }, { "epoch": 0.2054019553801967, "grad_norm": 0.12889379262924194, "learning_rate": 0.0005612819614993496, "loss": 3.1366, "step": 3540 }, { "epoch": 0.20598218689257014, "grad_norm": 0.14375300705432892, "learning_rate": 0.0005609986051272336, "loss": 3.13, "step": 3550 }, { "epoch": 0.20656241840494358, "grad_norm": 0.1587209552526474, "learning_rate": 0.000560714287730912, "loss": 3.1338, "step": 3560 }, { "epoch": 0.207142649917317, "grad_norm": 0.15273341536521912, "learning_rate": 0.0005604290103572714, "loss": 3.1393, "step": 3570 }, { "epoch": 0.20772288142969045, "grad_norm": 0.13435807824134827, "learning_rate": 0.0005601427740567328, "loss": 3.137, "step": 3580 }, { "epoch": 0.20830311294206388, "grad_norm": 0.1391715109348297, "learning_rate": 0.0005598555798832482, "loss": 3.1347, "step": 3590 }, { "epoch": 0.20888334445443732, "grad_norm": 0.16318084299564362, "learning_rate": 0.0005595674288942969, "loss": 3.1279, "step": 3600 }, { "epoch": 0.20946357596681076, "grad_norm": 0.1386035829782486, "learning_rate": 0.0005592783221508807, "loss": 3.1335, "step": 3610 }, { "epoch": 0.2100438074791842, "grad_norm": 0.14639577269554138, "learning_rate": 0.000558988260717521, "loss": 3.142, "step": 3620 }, { "epoch": 0.21062403899155763, "grad_norm": 0.13666051626205444, "learning_rate": 0.0005586972456622546, "loss": 3.1287, "step": 3630 }, { "epoch": 0.21120427050393106, "grad_norm": 0.14930284023284912, "learning_rate": 0.0005584052780566293, "loss": 3.1283, "step": 3640 }, { "epoch": 0.2117845020163045, "grad_norm": 0.13987945020198822, "learning_rate": 0.0005581123589757002, "loss": 3.1329, "step": 3650 }, { "epoch": 0.21236473352867793, "grad_norm": 0.1452946811914444, "learning_rate": 0.0005578184894980263, "loss": 3.1294, "step": 3660 }, { "epoch": 0.21294496504105137, "grad_norm": 0.15192043781280518, "learning_rate": 0.0005575236707056657, "loss": 3.1206, "step": 3670 }, { "epoch": 0.2135251965534248, "grad_norm": 0.16006827354431152, "learning_rate": 0.0005572279036841721, "loss": 3.1273, "step": 3680 }, { "epoch": 0.21410542806579824, "grad_norm": 0.18141302466392517, "learning_rate": 0.0005569311895225906, "loss": 3.1245, "step": 3690 }, { "epoch": 0.21468565957817168, "grad_norm": 0.14263153076171875, "learning_rate": 0.0005566335293134539, "loss": 3.1211, "step": 3700 }, { "epoch": 0.21526589109054511, "grad_norm": 0.1435001790523529, "learning_rate": 0.0005563349241527781, "loss": 3.1258, "step": 3710 }, { "epoch": 0.21584612260291858, "grad_norm": 0.15155887603759766, "learning_rate": 0.0005560353751400585, "loss": 3.1233, "step": 3720 }, { "epoch": 0.216426354115292, "grad_norm": 0.1545734703540802, "learning_rate": 0.0005557348833782663, "loss": 3.1292, "step": 3730 }, { "epoch": 0.21700658562766545, "grad_norm": 0.15549300611019135, "learning_rate": 0.0005554334499738433, "loss": 3.1142, "step": 3740 }, { "epoch": 0.21758681714003889, "grad_norm": 0.15990693867206573, "learning_rate": 0.000555131076036699, "loss": 3.125, "step": 3750 }, { "epoch": 0.21816704865241232, "grad_norm": 0.16630201041698456, "learning_rate": 0.0005548277626802058, "loss": 3.1216, "step": 3760 }, { "epoch": 0.21874728016478576, "grad_norm": 0.1408713161945343, "learning_rate": 0.0005545235110211954, "loss": 3.1111, "step": 3770 }, { "epoch": 0.2193275116771592, "grad_norm": 0.1488475650548935, "learning_rate": 0.0005542183221799544, "loss": 3.1253, "step": 3780 }, { "epoch": 0.21990774318953263, "grad_norm": 0.14259935915470123, "learning_rate": 0.0005539121972802198, "loss": 3.1179, "step": 3790 }, { "epoch": 0.22048797470190606, "grad_norm": 0.14055614173412323, "learning_rate": 0.0005536051374491757, "loss": 3.1113, "step": 3800 }, { "epoch": 0.2210682062142795, "grad_norm": 0.1665177196264267, "learning_rate": 0.0005532971438174485, "loss": 3.1197, "step": 3810 }, { "epoch": 0.22164843772665294, "grad_norm": 0.15349626541137695, "learning_rate": 0.0005529882175191031, "loss": 3.1086, "step": 3820 }, { "epoch": 0.22222866923902637, "grad_norm": 0.14321498572826385, "learning_rate": 0.0005526783596916385, "loss": 3.1161, "step": 3830 }, { "epoch": 0.2228089007513998, "grad_norm": 0.14768148958683014, "learning_rate": 0.0005523675714759835, "loss": 3.1164, "step": 3840 }, { "epoch": 0.22338913226377324, "grad_norm": 0.1546637862920761, "learning_rate": 0.000552055854016493, "loss": 3.1185, "step": 3850 }, { "epoch": 0.22396936377614668, "grad_norm": 0.16114896535873413, "learning_rate": 0.0005517432084609434, "loss": 3.1083, "step": 3860 }, { "epoch": 0.22454959528852012, "grad_norm": 0.13796792924404144, "learning_rate": 0.0005514296359605284, "loss": 3.102, "step": 3870 }, { "epoch": 0.22512982680089355, "grad_norm": 0.13948635756969452, "learning_rate": 0.0005511151376698546, "loss": 3.1079, "step": 3880 }, { "epoch": 0.225710058313267, "grad_norm": 0.13826532661914825, "learning_rate": 0.0005507997147469378, "loss": 3.107, "step": 3890 }, { "epoch": 0.22629028982564042, "grad_norm": 0.1437525451183319, "learning_rate": 0.0005504833683531981, "loss": 3.1076, "step": 3900 }, { "epoch": 0.22687052133801386, "grad_norm": 0.14256474375724792, "learning_rate": 0.0005501660996534563, "loss": 3.1056, "step": 3910 }, { "epoch": 0.2274507528503873, "grad_norm": 0.1531156748533249, "learning_rate": 0.0005498479098159289, "loss": 3.101, "step": 3920 }, { "epoch": 0.22803098436276073, "grad_norm": 0.16901366412639618, "learning_rate": 0.0005495288000122242, "loss": 3.0981, "step": 3930 }, { "epoch": 0.22861121587513417, "grad_norm": 0.1440243273973465, "learning_rate": 0.0005492087714173378, "loss": 3.1052, "step": 3940 }, { "epoch": 0.2291914473875076, "grad_norm": 0.1603139340877533, "learning_rate": 0.0005488878252096487, "loss": 3.105, "step": 3950 }, { "epoch": 0.22977167889988107, "grad_norm": 0.1588706523180008, "learning_rate": 0.0005485659625709144, "loss": 3.1107, "step": 3960 }, { "epoch": 0.2303519104122545, "grad_norm": 0.1452343761920929, "learning_rate": 0.0005482431846862667, "loss": 3.1074, "step": 3970 }, { "epoch": 0.23093214192462794, "grad_norm": 0.15799881517887115, "learning_rate": 0.0005479194927442078, "loss": 3.0985, "step": 3980 }, { "epoch": 0.23151237343700137, "grad_norm": 0.12657681107521057, "learning_rate": 0.0005475948879366053, "loss": 3.0958, "step": 3990 }, { "epoch": 0.2320926049493748, "grad_norm": 0.13606688380241394, "learning_rate": 0.000547269371458688, "loss": 3.0999, "step": 4000 }, { "epoch": 0.2320926049493748, "eval_loss": 3.0630993843078613, "eval_runtime": 3.264, "eval_samples_per_second": 1326.576, "eval_steps_per_second": 2.757, "step": 4000 }, { "epoch": 0.23267283646174824, "grad_norm": 0.16136619448661804, "learning_rate": 0.0005469429445090417, "loss": 3.1004, "step": 4010 }, { "epoch": 0.23325306797412168, "grad_norm": 0.14767828583717346, "learning_rate": 0.0005466156082896047, "loss": 3.1075, "step": 4020 }, { "epoch": 0.23383329948649512, "grad_norm": 0.1492021530866623, "learning_rate": 0.0005462873640056632, "loss": 3.1025, "step": 4030 }, { "epoch": 0.23441353099886855, "grad_norm": 0.14654645323753357, "learning_rate": 0.000545958212865847, "loss": 3.0966, "step": 4040 }, { "epoch": 0.234993762511242, "grad_norm": 0.15648731589317322, "learning_rate": 0.0005456281560821252, "loss": 3.0937, "step": 4050 }, { "epoch": 0.23557399402361542, "grad_norm": 0.13584694266319275, "learning_rate": 0.0005452971948698014, "loss": 3.1052, "step": 4060 }, { "epoch": 0.23615422553598886, "grad_norm": 0.13829472661018372, "learning_rate": 0.0005449653304475094, "loss": 3.0933, "step": 4070 }, { "epoch": 0.2367344570483623, "grad_norm": 0.16889816522598267, "learning_rate": 0.0005446325640372088, "loss": 3.0949, "step": 4080 }, { "epoch": 0.23731468856073573, "grad_norm": 0.12351599335670471, "learning_rate": 0.0005442988968641804, "loss": 3.0914, "step": 4090 }, { "epoch": 0.23789492007310917, "grad_norm": 0.14327877759933472, "learning_rate": 0.0005439643301570216, "loss": 3.0814, "step": 4100 }, { "epoch": 0.2384751515854826, "grad_norm": 0.15155468881130219, "learning_rate": 0.0005436288651476421, "loss": 3.0849, "step": 4110 }, { "epoch": 0.23905538309785604, "grad_norm": 0.14292922616004944, "learning_rate": 0.0005432925030712594, "loss": 3.0887, "step": 4120 }, { "epoch": 0.23963561461022947, "grad_norm": 0.14884264767169952, "learning_rate": 0.0005429552451663936, "loss": 3.0911, "step": 4130 }, { "epoch": 0.2402158461226029, "grad_norm": 0.1403530389070511, "learning_rate": 0.0005426170926748639, "loss": 3.0926, "step": 4140 }, { "epoch": 0.24079607763497635, "grad_norm": 0.14543718099594116, "learning_rate": 0.0005422780468417829, "loss": 3.0897, "step": 4150 }, { "epoch": 0.24137630914734978, "grad_norm": 0.12813718616962433, "learning_rate": 0.0005419381089155532, "loss": 3.0902, "step": 4160 }, { "epoch": 0.24195654065972322, "grad_norm": 0.13375824689865112, "learning_rate": 0.0005415972801478617, "loss": 3.0915, "step": 4170 }, { "epoch": 0.24253677217209665, "grad_norm": 0.14347635209560394, "learning_rate": 0.0005412555617936755, "loss": 3.0892, "step": 4180 }, { "epoch": 0.2431170036844701, "grad_norm": 0.14166522026062012, "learning_rate": 0.0005409129551112377, "loss": 3.0808, "step": 4190 }, { "epoch": 0.24369723519684355, "grad_norm": 0.13924048840999603, "learning_rate": 0.0005405694613620617, "loss": 3.0854, "step": 4200 }, { "epoch": 0.244277466709217, "grad_norm": 0.13338492810726166, "learning_rate": 0.0005402250818109276, "loss": 3.0836, "step": 4210 }, { "epoch": 0.24485769822159043, "grad_norm": 0.14531342685222626, "learning_rate": 0.0005398798177258768, "loss": 3.0971, "step": 4220 }, { "epoch": 0.24543792973396386, "grad_norm": 0.1432162970304489, "learning_rate": 0.0005395336703782082, "loss": 3.0838, "step": 4230 }, { "epoch": 0.2460181612463373, "grad_norm": 0.15475274622440338, "learning_rate": 0.0005391866410424722, "loss": 3.0764, "step": 4240 }, { "epoch": 0.24659839275871073, "grad_norm": 0.15521539747714996, "learning_rate": 0.0005388387309964675, "loss": 3.0837, "step": 4250 }, { "epoch": 0.24717862427108417, "grad_norm": 0.1430870145559311, "learning_rate": 0.0005384899415212351, "loss": 3.0889, "step": 4260 }, { "epoch": 0.2477588557834576, "grad_norm": 0.14807622134685516, "learning_rate": 0.0005381402739010545, "loss": 3.0769, "step": 4270 }, { "epoch": 0.24833908729583104, "grad_norm": 0.1509249359369278, "learning_rate": 0.0005377897294234385, "loss": 3.0815, "step": 4280 }, { "epoch": 0.24891931880820448, "grad_norm": 0.1451188027858734, "learning_rate": 0.0005374383093791287, "loss": 3.0766, "step": 4290 }, { "epoch": 0.2494995503205779, "grad_norm": 0.130240797996521, "learning_rate": 0.0005370860150620901, "loss": 3.0824, "step": 4300 }, { "epoch": 0.2500797818329513, "grad_norm": 0.14696471393108368, "learning_rate": 0.0005367328477695077, "loss": 3.0678, "step": 4310 }, { "epoch": 0.2506600133453248, "grad_norm": 0.13198255002498627, "learning_rate": 0.0005363788088017803, "loss": 3.0759, "step": 4320 }, { "epoch": 0.25124024485769825, "grad_norm": 0.1413690447807312, "learning_rate": 0.0005360238994625166, "loss": 3.0842, "step": 4330 }, { "epoch": 0.25182047637007166, "grad_norm": 0.1560727059841156, "learning_rate": 0.0005356681210585297, "loss": 3.074, "step": 4340 }, { "epoch": 0.2524007078824451, "grad_norm": 0.13727669417858124, "learning_rate": 0.0005353114748998332, "loss": 3.082, "step": 4350 }, { "epoch": 0.2529809393948185, "grad_norm": 0.1479531228542328, "learning_rate": 0.0005349539622996356, "loss": 3.0804, "step": 4360 }, { "epoch": 0.253561170907192, "grad_norm": 0.13756506145000458, "learning_rate": 0.0005345955845743358, "loss": 3.0829, "step": 4370 }, { "epoch": 0.2541414024195654, "grad_norm": 0.14778585731983185, "learning_rate": 0.0005342363430435177, "loss": 3.0785, "step": 4380 }, { "epoch": 0.25472163393193886, "grad_norm": 0.13227440416812897, "learning_rate": 0.0005338762390299467, "loss": 3.0776, "step": 4390 }, { "epoch": 0.25530186544431227, "grad_norm": 0.14178766310214996, "learning_rate": 0.0005335152738595634, "loss": 3.0799, "step": 4400 }, { "epoch": 0.25588209695668573, "grad_norm": 0.14833244681358337, "learning_rate": 0.0005331534488614794, "loss": 3.0674, "step": 4410 }, { "epoch": 0.25646232846905914, "grad_norm": 0.13829241693019867, "learning_rate": 0.0005327907653679721, "loss": 3.0643, "step": 4420 }, { "epoch": 0.2570425599814326, "grad_norm": 0.16908784210681915, "learning_rate": 0.0005324272247144802, "loss": 3.0649, "step": 4430 }, { "epoch": 0.257622791493806, "grad_norm": 0.14392369985580444, "learning_rate": 0.0005320628282395985, "loss": 3.0761, "step": 4440 }, { "epoch": 0.2582030230061795, "grad_norm": 0.16387993097305298, "learning_rate": 0.0005316975772850729, "loss": 3.0666, "step": 4450 }, { "epoch": 0.2587832545185529, "grad_norm": 0.13506962358951569, "learning_rate": 0.0005313314731957957, "loss": 3.0672, "step": 4460 }, { "epoch": 0.25936348603092635, "grad_norm": 0.1522989273071289, "learning_rate": 0.0005309645173198007, "loss": 3.0607, "step": 4470 }, { "epoch": 0.25994371754329976, "grad_norm": 0.13824021816253662, "learning_rate": 0.0005305967110082576, "loss": 3.0627, "step": 4480 }, { "epoch": 0.2605239490556732, "grad_norm": 0.13685718178749084, "learning_rate": 0.000530228055615468, "loss": 3.0612, "step": 4490 }, { "epoch": 0.26110418056804663, "grad_norm": 0.13309134542942047, "learning_rate": 0.0005298585524988594, "loss": 3.0548, "step": 4500 }, { "epoch": 0.2616844120804201, "grad_norm": 0.17121103405952454, "learning_rate": 0.0005294882030189812, "loss": 3.066, "step": 4510 }, { "epoch": 0.2622646435927935, "grad_norm": 0.13467055559158325, "learning_rate": 0.000529117008539499, "loss": 3.0606, "step": 4520 }, { "epoch": 0.26284487510516696, "grad_norm": 0.12970523536205292, "learning_rate": 0.0005287449704271896, "loss": 3.0553, "step": 4530 }, { "epoch": 0.26342510661754037, "grad_norm": 0.1509917676448822, "learning_rate": 0.0005283720900519365, "loss": 3.0571, "step": 4540 }, { "epoch": 0.26400533812991384, "grad_norm": 0.1372883915901184, "learning_rate": 0.0005279983687867243, "loss": 3.0635, "step": 4550 }, { "epoch": 0.2645855696422873, "grad_norm": 0.1482354998588562, "learning_rate": 0.0005276238080076335, "loss": 3.0619, "step": 4560 }, { "epoch": 0.2651658011546607, "grad_norm": 0.13884900510311127, "learning_rate": 0.0005272484090938365, "loss": 3.069, "step": 4570 }, { "epoch": 0.26574603266703417, "grad_norm": 0.14500798285007477, "learning_rate": 0.0005268721734275914, "loss": 3.0715, "step": 4580 }, { "epoch": 0.2663262641794076, "grad_norm": 0.1357218474149704, "learning_rate": 0.000526495102394237, "loss": 3.0584, "step": 4590 }, { "epoch": 0.26690649569178104, "grad_norm": 0.14025723934173584, "learning_rate": 0.0005261171973821887, "loss": 3.0613, "step": 4600 }, { "epoch": 0.26748672720415445, "grad_norm": 0.15253092348575592, "learning_rate": 0.0005257384597829322, "loss": 3.0584, "step": 4610 }, { "epoch": 0.2680669587165279, "grad_norm": 0.14573270082473755, "learning_rate": 0.0005253588909910191, "loss": 3.0634, "step": 4620 }, { "epoch": 0.2686471902289013, "grad_norm": 0.15005233883857727, "learning_rate": 0.0005249784924040614, "loss": 3.0526, "step": 4630 }, { "epoch": 0.2692274217412748, "grad_norm": 0.15314225852489471, "learning_rate": 0.0005245972654227265, "loss": 3.0635, "step": 4640 }, { "epoch": 0.2698076532536482, "grad_norm": 0.14412705600261688, "learning_rate": 0.0005242152114507321, "loss": 3.055, "step": 4650 }, { "epoch": 0.27038788476602166, "grad_norm": 0.15046367049217224, "learning_rate": 0.0005238323318948412, "loss": 3.066, "step": 4660 }, { "epoch": 0.27096811627839507, "grad_norm": 0.12618590891361237, "learning_rate": 0.0005234486281648559, "loss": 3.0433, "step": 4670 }, { "epoch": 0.27154834779076853, "grad_norm": 0.14097653329372406, "learning_rate": 0.000523064101673614, "loss": 3.0593, "step": 4680 }, { "epoch": 0.27212857930314194, "grad_norm": 0.14015048742294312, "learning_rate": 0.0005226787538369821, "loss": 3.057, "step": 4690 }, { "epoch": 0.2727088108155154, "grad_norm": 0.1534152328968048, "learning_rate": 0.0005222925860738513, "loss": 3.06, "step": 4700 }, { "epoch": 0.2732890423278888, "grad_norm": 0.1350966989994049, "learning_rate": 0.0005219055998061319, "loss": 3.0518, "step": 4710 }, { "epoch": 0.2738692738402623, "grad_norm": 0.15589705109596252, "learning_rate": 0.0005215177964587478, "loss": 3.0468, "step": 4720 }, { "epoch": 0.2744495053526357, "grad_norm": 0.14144299924373627, "learning_rate": 0.0005211291774596316, "loss": 3.0555, "step": 4730 }, { "epoch": 0.27502973686500914, "grad_norm": 0.14553704857826233, "learning_rate": 0.000520739744239719, "loss": 3.0531, "step": 4740 }, { "epoch": 0.27560996837738255, "grad_norm": 0.15157508850097656, "learning_rate": 0.0005203494982329441, "loss": 3.0504, "step": 4750 }, { "epoch": 0.276190199889756, "grad_norm": 0.14391539990901947, "learning_rate": 0.0005199584408762335, "loss": 3.0512, "step": 4760 }, { "epoch": 0.2767704314021294, "grad_norm": 0.1297539621591568, "learning_rate": 0.0005195665736095013, "loss": 3.036, "step": 4770 }, { "epoch": 0.2773506629145029, "grad_norm": 0.13723768293857574, "learning_rate": 0.0005191738978756439, "loss": 3.0532, "step": 4780 }, { "epoch": 0.2779308944268763, "grad_norm": 0.1422174870967865, "learning_rate": 0.0005187804151205345, "loss": 3.0605, "step": 4790 }, { "epoch": 0.27851112593924976, "grad_norm": 0.137346088886261, "learning_rate": 0.0005183861267930177, "loss": 3.0552, "step": 4800 }, { "epoch": 0.2790913574516232, "grad_norm": 0.13471810519695282, "learning_rate": 0.0005179910343449046, "loss": 3.0426, "step": 4810 }, { "epoch": 0.27967158896399663, "grad_norm": 0.12727439403533936, "learning_rate": 0.0005175951392309669, "loss": 3.0448, "step": 4820 }, { "epoch": 0.2802518204763701, "grad_norm": 0.13242101669311523, "learning_rate": 0.0005171984429089318, "loss": 3.0546, "step": 4830 }, { "epoch": 0.2808320519887435, "grad_norm": 0.14276637136936188, "learning_rate": 0.0005168009468394769, "loss": 3.0392, "step": 4840 }, { "epoch": 0.28141228350111697, "grad_norm": 0.1340208798646927, "learning_rate": 0.0005164026524862242, "loss": 3.0491, "step": 4850 }, { "epoch": 0.2819925150134904, "grad_norm": 0.14000356197357178, "learning_rate": 0.0005160035613157354, "loss": 3.0396, "step": 4860 }, { "epoch": 0.28257274652586384, "grad_norm": 0.15974439680576324, "learning_rate": 0.0005156036747975059, "loss": 3.0406, "step": 4870 }, { "epoch": 0.28315297803823725, "grad_norm": 0.1382746398448944, "learning_rate": 0.0005152029944039597, "loss": 3.0449, "step": 4880 }, { "epoch": 0.2837332095506107, "grad_norm": 0.14049001038074493, "learning_rate": 0.000514801521610444, "loss": 3.0463, "step": 4890 }, { "epoch": 0.2843134410629841, "grad_norm": 0.13699445128440857, "learning_rate": 0.0005143992578952238, "loss": 3.0393, "step": 4900 }, { "epoch": 0.2848936725753576, "grad_norm": 0.1515870988368988, "learning_rate": 0.0005139962047394761, "loss": 3.0399, "step": 4910 }, { "epoch": 0.285473904087731, "grad_norm": 0.1437605917453766, "learning_rate": 0.0005135923636272849, "loss": 3.0378, "step": 4920 }, { "epoch": 0.28605413560010445, "grad_norm": 0.13769088685512543, "learning_rate": 0.0005131877360456355, "loss": 3.0377, "step": 4930 }, { "epoch": 0.28663436711247786, "grad_norm": 0.15194256603717804, "learning_rate": 0.000512782323484409, "loss": 3.0399, "step": 4940 }, { "epoch": 0.2872145986248513, "grad_norm": 0.14672812819480896, "learning_rate": 0.0005123761274363769, "loss": 3.04, "step": 4950 }, { "epoch": 0.28779483013722473, "grad_norm": 0.13162557780742645, "learning_rate": 0.0005119691493971957, "loss": 3.0317, "step": 4960 }, { "epoch": 0.2883750616495982, "grad_norm": 0.13286751508712769, "learning_rate": 0.0005115613908654011, "loss": 3.0486, "step": 4970 }, { "epoch": 0.2889552931619716, "grad_norm": 0.13034851849079132, "learning_rate": 0.0005111528533424027, "loss": 3.0399, "step": 4980 }, { "epoch": 0.28953552467434507, "grad_norm": 0.1405908614397049, "learning_rate": 0.0005107435383324786, "loss": 3.0372, "step": 4990 }, { "epoch": 0.2901157561867185, "grad_norm": 0.16415055096149445, "learning_rate": 0.0005103334473427695, "loss": 3.0333, "step": 5000 }, { "epoch": 0.2901157561867185, "eval_loss": 2.9981322288513184, "eval_runtime": 3.2581, "eval_samples_per_second": 1329.001, "eval_steps_per_second": 2.762, "step": 5000 }, { "epoch": 0.29069598769909194, "grad_norm": 0.12301915884017944, "learning_rate": 0.0005099225818832731, "loss": 3.0312, "step": 5010 }, { "epoch": 0.29127621921146535, "grad_norm": 0.16767041385173798, "learning_rate": 0.0005095109434668395, "loss": 3.0247, "step": 5020 }, { "epoch": 0.2918564507238388, "grad_norm": 0.13234609365463257, "learning_rate": 0.0005090985336091642, "loss": 3.0348, "step": 5030 }, { "epoch": 0.2924366822362123, "grad_norm": 0.14020933210849762, "learning_rate": 0.0005086853538287835, "loss": 3.0317, "step": 5040 }, { "epoch": 0.2930169137485857, "grad_norm": 0.14580604434013367, "learning_rate": 0.0005082714056470687, "loss": 3.0321, "step": 5050 }, { "epoch": 0.29359714526095915, "grad_norm": 0.13627541065216064, "learning_rate": 0.0005078566905882205, "loss": 3.0318, "step": 5060 }, { "epoch": 0.29417737677333256, "grad_norm": 0.12629657983779907, "learning_rate": 0.0005074412101792631, "loss": 3.0284, "step": 5070 }, { "epoch": 0.294757608285706, "grad_norm": 0.13409367203712463, "learning_rate": 0.0005070249659500387, "loss": 3.0381, "step": 5080 }, { "epoch": 0.2953378397980794, "grad_norm": 0.1341470181941986, "learning_rate": 0.0005066079594332023, "loss": 3.0229, "step": 5090 }, { "epoch": 0.2959180713104529, "grad_norm": 0.1630919873714447, "learning_rate": 0.0005061901921642156, "loss": 3.0315, "step": 5100 }, { "epoch": 0.2964983028228263, "grad_norm": 0.12825888395309448, "learning_rate": 0.0005057716656813416, "loss": 3.0249, "step": 5110 }, { "epoch": 0.29707853433519976, "grad_norm": 0.1613105833530426, "learning_rate": 0.0005053523815256384, "loss": 3.0238, "step": 5120 }, { "epoch": 0.29765876584757317, "grad_norm": 0.14038483798503876, "learning_rate": 0.0005049323412409542, "loss": 3.0294, "step": 5130 }, { "epoch": 0.29823899735994663, "grad_norm": 0.16509568691253662, "learning_rate": 0.0005045115463739215, "loss": 3.0356, "step": 5140 }, { "epoch": 0.29881922887232004, "grad_norm": 0.14289237558841705, "learning_rate": 0.0005040899984739509, "loss": 3.0228, "step": 5150 }, { "epoch": 0.2993994603846935, "grad_norm": 0.14584140479564667, "learning_rate": 0.000503667699093226, "loss": 3.0294, "step": 5160 }, { "epoch": 0.2999796918970669, "grad_norm": 0.12970221042633057, "learning_rate": 0.0005032446497866973, "loss": 3.0321, "step": 5170 }, { "epoch": 0.3005599234094404, "grad_norm": 0.13744401931762695, "learning_rate": 0.0005028208521120769, "loss": 3.0236, "step": 5180 }, { "epoch": 0.3011401549218138, "grad_norm": 0.1317235380411148, "learning_rate": 0.0005023963076298321, "loss": 3.0254, "step": 5190 }, { "epoch": 0.30172038643418725, "grad_norm": 0.14213494956493378, "learning_rate": 0.0005019710179031801, "loss": 3.0275, "step": 5200 }, { "epoch": 0.30230061794656066, "grad_norm": 0.13712069392204285, "learning_rate": 0.0005015449844980823, "loss": 3.0249, "step": 5210 }, { "epoch": 0.3028808494589341, "grad_norm": 0.14411009848117828, "learning_rate": 0.0005011182089832381, "loss": 3.0215, "step": 5220 }, { "epoch": 0.30346108097130753, "grad_norm": 0.12583871185779572, "learning_rate": 0.0005006906929300799, "loss": 3.0275, "step": 5230 }, { "epoch": 0.304041312483681, "grad_norm": 0.14499635994434357, "learning_rate": 0.0005002624379127666, "loss": 3.0258, "step": 5240 }, { "epoch": 0.3046215439960544, "grad_norm": 0.14918765425682068, "learning_rate": 0.0004998334455081779, "loss": 3.0209, "step": 5250 }, { "epoch": 0.30520177550842786, "grad_norm": 0.13245496153831482, "learning_rate": 0.0004994037172959089, "loss": 3.0212, "step": 5260 }, { "epoch": 0.3057820070208013, "grad_norm": 0.12850724160671234, "learning_rate": 0.0004989732548582638, "loss": 3.0258, "step": 5270 }, { "epoch": 0.30636223853317474, "grad_norm": 0.1346123367547989, "learning_rate": 0.0004985420597802503, "loss": 3.0138, "step": 5280 }, { "epoch": 0.3069424700455482, "grad_norm": 0.14746621251106262, "learning_rate": 0.0004981101336495741, "loss": 3.0202, "step": 5290 }, { "epoch": 0.3075227015579216, "grad_norm": 0.140406534075737, "learning_rate": 0.0004976774780566324, "loss": 3.0276, "step": 5300 }, { "epoch": 0.30810293307029507, "grad_norm": 0.133416548371315, "learning_rate": 0.0004972440945945083, "loss": 3.0228, "step": 5310 }, { "epoch": 0.3086831645826685, "grad_norm": 0.140433207154274, "learning_rate": 0.0004968099848589651, "loss": 3.0219, "step": 5320 }, { "epoch": 0.30926339609504194, "grad_norm": 0.14963370561599731, "learning_rate": 0.0004963751504484403, "loss": 3.0119, "step": 5330 }, { "epoch": 0.30984362760741535, "grad_norm": 0.12273452430963516, "learning_rate": 0.0004959395929640401, "loss": 3.0136, "step": 5340 }, { "epoch": 0.3104238591197888, "grad_norm": 0.14232607185840607, "learning_rate": 0.0004955033140095322, "loss": 3.0088, "step": 5350 }, { "epoch": 0.3110040906321622, "grad_norm": 0.15276071429252625, "learning_rate": 0.0004950663151913419, "loss": 3.0189, "step": 5360 }, { "epoch": 0.3115843221445357, "grad_norm": 0.14110638201236725, "learning_rate": 0.0004946285981185446, "loss": 3.0273, "step": 5370 }, { "epoch": 0.3121645536569091, "grad_norm": 0.12971307337284088, "learning_rate": 0.0004941901644028601, "loss": 3.0181, "step": 5380 }, { "epoch": 0.31274478516928256, "grad_norm": 0.12775759398937225, "learning_rate": 0.0004937510156586474, "loss": 3.0108, "step": 5390 }, { "epoch": 0.31332501668165597, "grad_norm": 0.15120139718055725, "learning_rate": 0.0004933111535028983, "loss": 3.0142, "step": 5400 }, { "epoch": 0.31390524819402943, "grad_norm": 0.14965811371803284, "learning_rate": 0.0004928705795552312, "loss": 3.0137, "step": 5410 }, { "epoch": 0.31448547970640284, "grad_norm": 0.1459018588066101, "learning_rate": 0.0004924292954378856, "loss": 3.0146, "step": 5420 }, { "epoch": 0.3150657112187763, "grad_norm": 0.1286230981349945, "learning_rate": 0.0004919873027757159, "loss": 3.0162, "step": 5430 }, { "epoch": 0.3156459427311497, "grad_norm": 0.13560357689857483, "learning_rate": 0.0004915446031961854, "loss": 3.0129, "step": 5440 }, { "epoch": 0.3162261742435232, "grad_norm": 0.1419978141784668, "learning_rate": 0.0004911011983293601, "loss": 3.0115, "step": 5450 }, { "epoch": 0.3168064057558966, "grad_norm": 0.12910611927509308, "learning_rate": 0.0004906570898079032, "loss": 3.0151, "step": 5460 }, { "epoch": 0.31738663726827004, "grad_norm": 0.15491628646850586, "learning_rate": 0.0004902122792670692, "loss": 3.0118, "step": 5470 }, { "epoch": 0.31796686878064345, "grad_norm": 0.12448934465646744, "learning_rate": 0.0004897667683446967, "loss": 3.0119, "step": 5480 }, { "epoch": 0.3185471002930169, "grad_norm": 0.1288510411977768, "learning_rate": 0.0004893205586812036, "loss": 3.0078, "step": 5490 }, { "epoch": 0.3191273318053903, "grad_norm": 0.12903016805648804, "learning_rate": 0.000488873651919581, "loss": 3.0085, "step": 5500 }, { "epoch": 0.3197075633177638, "grad_norm": 0.14042973518371582, "learning_rate": 0.0004884260497053859, "loss": 3.0093, "step": 5510 }, { "epoch": 0.32028779483013725, "grad_norm": 0.13995361328125, "learning_rate": 0.0004879777536867369, "loss": 3.0009, "step": 5520 }, { "epoch": 0.32086802634251066, "grad_norm": 0.13979199528694153, "learning_rate": 0.00048752876551430677, "loss": 3.0089, "step": 5530 }, { "epoch": 0.3214482578548841, "grad_norm": 0.130417600274086, "learning_rate": 0.0004870790868413171, "loss": 3.0087, "step": 5540 }, { "epoch": 0.32202848936725753, "grad_norm": 0.13676275312900543, "learning_rate": 0.00048662871932353164, "loss": 3.0092, "step": 5550 }, { "epoch": 0.322608720879631, "grad_norm": 0.12869158387184143, "learning_rate": 0.00048617766461925104, "loss": 3.0074, "step": 5560 }, { "epoch": 0.3231889523920044, "grad_norm": 0.13846737146377563, "learning_rate": 0.0004857259243893058, "loss": 3.0079, "step": 5570 }, { "epoch": 0.32376918390437787, "grad_norm": 0.1349971890449524, "learning_rate": 0.0004852735002970509, "loss": 2.9915, "step": 5580 }, { "epoch": 0.3243494154167513, "grad_norm": 0.13398951292037964, "learning_rate": 0.000484820394008359, "loss": 2.9982, "step": 5590 }, { "epoch": 0.32492964692912474, "grad_norm": 0.13627557456493378, "learning_rate": 0.0004843666071916152, "loss": 3.0019, "step": 5600 }, { "epoch": 0.32550987844149815, "grad_norm": 0.13470283150672913, "learning_rate": 0.00048391214151771, "loss": 3.0015, "step": 5610 }, { "epoch": 0.3260901099538716, "grad_norm": 0.14207038283348083, "learning_rate": 0.0004834569986600336, "loss": 3.0051, "step": 5620 }, { "epoch": 0.326670341466245, "grad_norm": 0.13324964046478271, "learning_rate": 0.00048300118029446967, "loss": 2.9956, "step": 5630 }, { "epoch": 0.3272505729786185, "grad_norm": 0.15288645029067993, "learning_rate": 0.0004825446880993892, "loss": 3.0087, "step": 5640 }, { "epoch": 0.3278308044909919, "grad_norm": 0.13744772970676422, "learning_rate": 0.00048208752375564424, "loss": 3.0049, "step": 5650 }, { "epoch": 0.32841103600336535, "grad_norm": 0.13114534318447113, "learning_rate": 0.00048162968894656193, "loss": 2.9993, "step": 5660 }, { "epoch": 0.32899126751573876, "grad_norm": 0.1254429966211319, "learning_rate": 0.00048117118535793773, "loss": 2.9937, "step": 5670 }, { "epoch": 0.3295714990281122, "grad_norm": 0.15155521035194397, "learning_rate": 0.00048071201467803017, "loss": 3.0017, "step": 5680 }, { "epoch": 0.33015173054048563, "grad_norm": 0.1420249044895172, "learning_rate": 0.00048025217859755365, "loss": 3.017, "step": 5690 }, { "epoch": 0.3307319620528591, "grad_norm": 0.14615775644779205, "learning_rate": 0.0004797916788096728, "loss": 3.0052, "step": 5700 }, { "epoch": 0.3313121935652325, "grad_norm": 0.12851493060588837, "learning_rate": 0.00047933051700999605, "loss": 3.0041, "step": 5710 }, { "epoch": 0.33189242507760597, "grad_norm": 0.13371190428733826, "learning_rate": 0.00047886869489656956, "loss": 2.9879, "step": 5720 }, { "epoch": 0.3324726565899794, "grad_norm": 0.13223771750926971, "learning_rate": 0.0004784062141698707, "loss": 2.993, "step": 5730 }, { "epoch": 0.33305288810235284, "grad_norm": 0.13460920751094818, "learning_rate": 0.00047794307653280184, "loss": 2.9928, "step": 5740 }, { "epoch": 0.3336331196147263, "grad_norm": 0.12678171694278717, "learning_rate": 0.0004774792836906844, "loss": 3.0053, "step": 5750 }, { "epoch": 0.3342133511270997, "grad_norm": 0.14595790207386017, "learning_rate": 0.0004770148373512522, "loss": 2.9974, "step": 5760 }, { "epoch": 0.3347935826394732, "grad_norm": 0.1505734771490097, "learning_rate": 0.00047654973922464525, "loss": 3.0053, "step": 5770 }, { "epoch": 0.3353738141518466, "grad_norm": 0.13636811077594757, "learning_rate": 0.00047608399102340367, "loss": 2.9984, "step": 5780 }, { "epoch": 0.33595404566422005, "grad_norm": 0.14487333595752716, "learning_rate": 0.000475617594462461, "loss": 3.0013, "step": 5790 }, { "epoch": 0.33653427717659345, "grad_norm": 0.13392585515975952, "learning_rate": 0.00047515055125913825, "loss": 2.9897, "step": 5800 }, { "epoch": 0.3371145086889669, "grad_norm": 0.1241224929690361, "learning_rate": 0.0004746828631331376, "loss": 2.9918, "step": 5810 }, { "epoch": 0.3376947402013403, "grad_norm": 0.1381169706583023, "learning_rate": 0.00047421453180653553, "loss": 2.9874, "step": 5820 }, { "epoch": 0.3382749717137138, "grad_norm": 0.12413561344146729, "learning_rate": 0.00047374555900377716, "loss": 2.9928, "step": 5830 }, { "epoch": 0.3388552032260872, "grad_norm": 0.13286706805229187, "learning_rate": 0.0004732759464516694, "loss": 2.9907, "step": 5840 }, { "epoch": 0.33943543473846066, "grad_norm": 0.1558184027671814, "learning_rate": 0.0004728056958793749, "loss": 3.0036, "step": 5850 }, { "epoch": 0.34001566625083407, "grad_norm": 0.13220670819282532, "learning_rate": 0.0004723348090184056, "loss": 2.9945, "step": 5860 }, { "epoch": 0.34059589776320753, "grad_norm": 0.13015997409820557, "learning_rate": 0.00047186328760261603, "loss": 3.0005, "step": 5870 }, { "epoch": 0.34117612927558094, "grad_norm": 0.146441251039505, "learning_rate": 0.0004713911333681976, "loss": 2.9984, "step": 5880 }, { "epoch": 0.3417563607879544, "grad_norm": 0.12352869659662247, "learning_rate": 0.0004709183480536718, "loss": 2.9946, "step": 5890 }, { "epoch": 0.3423365923003278, "grad_norm": 0.12516902387142181, "learning_rate": 0.0004704449333998834, "loss": 2.9918, "step": 5900 }, { "epoch": 0.3429168238127013, "grad_norm": 0.14155182242393494, "learning_rate": 0.00046997089114999494, "loss": 2.9937, "step": 5910 }, { "epoch": 0.3434970553250747, "grad_norm": 0.12636148929595947, "learning_rate": 0.0004694962230494796, "loss": 2.9869, "step": 5920 }, { "epoch": 0.34407728683744815, "grad_norm": 0.14390048384666443, "learning_rate": 0.000469020930846115, "loss": 2.9759, "step": 5930 }, { "epoch": 0.34465751834982156, "grad_norm": 0.14705798029899597, "learning_rate": 0.0004685450162899768, "loss": 2.9876, "step": 5940 }, { "epoch": 0.345237749862195, "grad_norm": 0.13937653601169586, "learning_rate": 0.00046806848113343234, "loss": 2.9872, "step": 5950 }, { "epoch": 0.34581798137456843, "grad_norm": 0.13351042568683624, "learning_rate": 0.00046759132713113403, "loss": 2.986, "step": 5960 }, { "epoch": 0.3463982128869419, "grad_norm": 0.133000910282135, "learning_rate": 0.0004671135560400127, "loss": 2.9886, "step": 5970 }, { "epoch": 0.3469784443993153, "grad_norm": 0.1261400580406189, "learning_rate": 0.0004666351696192718, "loss": 2.9811, "step": 5980 }, { "epoch": 0.34755867591168876, "grad_norm": 0.13575439155101776, "learning_rate": 0.00046615616963038007, "loss": 2.9796, "step": 5990 }, { "epoch": 0.3481389074240622, "grad_norm": 0.13202066719532013, "learning_rate": 0.0004656765578370657, "loss": 2.9958, "step": 6000 }, { "epoch": 0.3481389074240622, "eval_loss": 2.949599027633667, "eval_runtime": 3.2655, "eval_samples_per_second": 1325.986, "eval_steps_per_second": 2.756, "step": 6000 }, { "epoch": 0.34871913893643564, "grad_norm": 0.14002783596515656, "learning_rate": 0.0004651963360053096, "loss": 2.9811, "step": 6010 }, { "epoch": 0.3492993704488091, "grad_norm": 0.1519598364830017, "learning_rate": 0.00046471550590333874, "loss": 2.9884, "step": 6020 }, { "epoch": 0.3498796019611825, "grad_norm": 0.1435564160346985, "learning_rate": 0.00046423406930162, "loss": 2.9831, "step": 6030 }, { "epoch": 0.35045983347355597, "grad_norm": 0.1241581067442894, "learning_rate": 0.0004637520279728534, "loss": 2.9801, "step": 6040 }, { "epoch": 0.3510400649859294, "grad_norm": 0.124722421169281, "learning_rate": 0.00046326938369196566, "loss": 2.9872, "step": 6050 }, { "epoch": 0.35162029649830284, "grad_norm": 0.12400694936513901, "learning_rate": 0.0004627861382361034, "loss": 2.9863, "step": 6060 }, { "epoch": 0.35220052801067625, "grad_norm": 0.14388398826122284, "learning_rate": 0.0004623022933846272, "loss": 2.973, "step": 6070 }, { "epoch": 0.3527807595230497, "grad_norm": 0.14111004769802094, "learning_rate": 0.0004618178509191045, "loss": 2.9902, "step": 6080 }, { "epoch": 0.3533609910354231, "grad_norm": 0.1257510930299759, "learning_rate": 0.000461332812623303, "loss": 2.9877, "step": 6090 }, { "epoch": 0.3539412225477966, "grad_norm": 0.1282566338777542, "learning_rate": 0.00046084718028318466, "loss": 2.9832, "step": 6100 }, { "epoch": 0.35452145406017, "grad_norm": 0.14325213432312012, "learning_rate": 0.00046036095568689864, "loss": 2.9782, "step": 6110 }, { "epoch": 0.35510168557254346, "grad_norm": 0.1563083529472351, "learning_rate": 0.0004598741406247748, "loss": 2.9793, "step": 6120 }, { "epoch": 0.35568191708491687, "grad_norm": 0.1327456384897232, "learning_rate": 0.0004593867368893172, "loss": 2.9843, "step": 6130 }, { "epoch": 0.35626214859729033, "grad_norm": 0.13930997252464294, "learning_rate": 0.0004588987462751975, "loss": 2.976, "step": 6140 }, { "epoch": 0.35684238010966374, "grad_norm": 0.1295255720615387, "learning_rate": 0.00045841017057924807, "loss": 2.9801, "step": 6150 }, { "epoch": 0.3574226116220372, "grad_norm": 0.1404607594013214, "learning_rate": 0.00045792101160045613, "loss": 2.9788, "step": 6160 }, { "epoch": 0.3580028431344106, "grad_norm": 0.12297389656305313, "learning_rate": 0.0004574312711399561, "loss": 2.9853, "step": 6170 }, { "epoch": 0.3585830746467841, "grad_norm": 0.15521986782550812, "learning_rate": 0.0004569409510010236, "loss": 2.9825, "step": 6180 }, { "epoch": 0.3591633061591575, "grad_norm": 0.12915629148483276, "learning_rate": 0.00045645005298906887, "loss": 2.984, "step": 6190 }, { "epoch": 0.35974353767153094, "grad_norm": 0.12852182984352112, "learning_rate": 0.00045595857891162964, "loss": 2.9703, "step": 6200 }, { "epoch": 0.36032376918390435, "grad_norm": 0.1300152987241745, "learning_rate": 0.00045546653057836517, "loss": 2.971, "step": 6210 }, { "epoch": 0.3609040006962778, "grad_norm": 0.13348935544490814, "learning_rate": 0.00045497390980104885, "loss": 2.9762, "step": 6220 }, { "epoch": 0.3614842322086513, "grad_norm": 0.13476519286632538, "learning_rate": 0.00045448071839356203, "loss": 2.9756, "step": 6230 }, { "epoch": 0.3620644637210247, "grad_norm": 0.13884297013282776, "learning_rate": 0.000453986958171887, "loss": 2.9829, "step": 6240 }, { "epoch": 0.36264469523339815, "grad_norm": 0.12928573787212372, "learning_rate": 0.00045349263095410087, "loss": 2.9752, "step": 6250 }, { "epoch": 0.36322492674577156, "grad_norm": 0.13350141048431396, "learning_rate": 0.000452997738560368, "loss": 2.9748, "step": 6260 }, { "epoch": 0.363805158258145, "grad_norm": 0.13747799396514893, "learning_rate": 0.00045250228281293423, "loss": 2.9705, "step": 6270 }, { "epoch": 0.36438538977051843, "grad_norm": 0.1344989687204361, "learning_rate": 0.00045200626553611943, "loss": 2.9801, "step": 6280 }, { "epoch": 0.3649656212828919, "grad_norm": 0.1321888118982315, "learning_rate": 0.00045150968855631104, "loss": 2.9781, "step": 6290 }, { "epoch": 0.3655458527952653, "grad_norm": 0.12561041116714478, "learning_rate": 0.0004510125537019577, "loss": 2.973, "step": 6300 }, { "epoch": 0.36612608430763877, "grad_norm": 0.13948814570903778, "learning_rate": 0.00045051486280356194, "loss": 2.9731, "step": 6310 }, { "epoch": 0.3667063158200122, "grad_norm": 0.12595129013061523, "learning_rate": 0.0004500166176936739, "loss": 2.9659, "step": 6320 }, { "epoch": 0.36728654733238564, "grad_norm": 0.12941335141658783, "learning_rate": 0.00044951782020688415, "loss": 2.973, "step": 6330 }, { "epoch": 0.36786677884475905, "grad_norm": 0.14215658605098724, "learning_rate": 0.00044901847217981736, "loss": 2.975, "step": 6340 }, { "epoch": 0.3684470103571325, "grad_norm": 0.12309448421001434, "learning_rate": 0.00044851857545112525, "loss": 2.9749, "step": 6350 }, { "epoch": 0.3690272418695059, "grad_norm": 0.12824192643165588, "learning_rate": 0.00044801813186147986, "loss": 2.9672, "step": 6360 }, { "epoch": 0.3696074733818794, "grad_norm": 0.12063992768526077, "learning_rate": 0.00044751714325356697, "loss": 2.9708, "step": 6370 }, { "epoch": 0.3701877048942528, "grad_norm": 0.12898465991020203, "learning_rate": 0.0004470156114720792, "loss": 2.9699, "step": 6380 }, { "epoch": 0.37076793640662625, "grad_norm": 0.1321457326412201, "learning_rate": 0.00044651353836370897, "loss": 2.9661, "step": 6390 }, { "epoch": 0.37134816791899966, "grad_norm": 0.13804246485233307, "learning_rate": 0.0004460109257771422, "loss": 2.9783, "step": 6400 }, { "epoch": 0.3719283994313731, "grad_norm": 0.12447643280029297, "learning_rate": 0.00044550777556305094, "loss": 2.9691, "step": 6410 }, { "epoch": 0.37250863094374653, "grad_norm": 0.1610770970582962, "learning_rate": 0.00044500408957408706, "loss": 2.972, "step": 6420 }, { "epoch": 0.37308886245612, "grad_norm": 0.1278504580259323, "learning_rate": 0.00044449986966487527, "loss": 2.9694, "step": 6430 }, { "epoch": 0.3736690939684934, "grad_norm": 0.13527578115463257, "learning_rate": 0.0004439951176920059, "loss": 2.9707, "step": 6440 }, { "epoch": 0.37424932548086687, "grad_norm": 0.14050637185573578, "learning_rate": 0.0004434898355140287, "loss": 2.9712, "step": 6450 }, { "epoch": 0.3748295569932403, "grad_norm": 0.1513315588235855, "learning_rate": 0.00044298402499144554, "loss": 2.9705, "step": 6460 }, { "epoch": 0.37540978850561374, "grad_norm": 0.1299854964017868, "learning_rate": 0.00044247768798670367, "loss": 2.9662, "step": 6470 }, { "epoch": 0.3759900200179872, "grad_norm": 0.1321675330400467, "learning_rate": 0.00044197082636418907, "loss": 2.9675, "step": 6480 }, { "epoch": 0.3765702515303606, "grad_norm": 0.1453583687543869, "learning_rate": 0.00044146344199021934, "loss": 2.9639, "step": 6490 }, { "epoch": 0.3771504830427341, "grad_norm": 0.13450521230697632, "learning_rate": 0.00044095553673303685, "loss": 2.9661, "step": 6500 }, { "epoch": 0.3777307145551075, "grad_norm": 0.13579097390174866, "learning_rate": 0.00044044711246280215, "loss": 2.9608, "step": 6510 }, { "epoch": 0.37831094606748095, "grad_norm": 0.1469910442829132, "learning_rate": 0.00043993817105158627, "loss": 2.9686, "step": 6520 }, { "epoch": 0.37889117757985435, "grad_norm": 0.1311839371919632, "learning_rate": 0.00043942871437336527, "loss": 2.9636, "step": 6530 }, { "epoch": 0.3794714090922278, "grad_norm": 0.15060357749462128, "learning_rate": 0.0004389187443040116, "loss": 2.9613, "step": 6540 }, { "epoch": 0.3800516406046012, "grad_norm": 0.13408997654914856, "learning_rate": 0.00043840826272128873, "loss": 2.9626, "step": 6550 }, { "epoch": 0.3806318721169747, "grad_norm": 0.1458410769701004, "learning_rate": 0.0004378972715048434, "loss": 2.9604, "step": 6560 }, { "epoch": 0.3812121036293481, "grad_norm": 0.13342171907424927, "learning_rate": 0.0004373857725361984, "loss": 2.9602, "step": 6570 }, { "epoch": 0.38179233514172156, "grad_norm": 0.12624911963939667, "learning_rate": 0.00043687376769874686, "loss": 2.9703, "step": 6580 }, { "epoch": 0.38237256665409497, "grad_norm": 0.13120518624782562, "learning_rate": 0.0004363612588777442, "loss": 2.9601, "step": 6590 }, { "epoch": 0.38295279816646843, "grad_norm": 0.1357596516609192, "learning_rate": 0.00043584824796030145, "loss": 2.9561, "step": 6600 }, { "epoch": 0.38353302967884184, "grad_norm": 0.1270647495985031, "learning_rate": 0.00043533473683537863, "loss": 2.9522, "step": 6610 }, { "epoch": 0.3841132611912153, "grad_norm": 0.1325126439332962, "learning_rate": 0.0004348207273937776, "loss": 2.9603, "step": 6620 }, { "epoch": 0.3846934927035887, "grad_norm": 0.13015331327915192, "learning_rate": 0.0004343062215281347, "loss": 2.955, "step": 6630 }, { "epoch": 0.3852737242159622, "grad_norm": 0.12867479026317596, "learning_rate": 0.00043379122113291465, "loss": 2.9692, "step": 6640 }, { "epoch": 0.3858539557283356, "grad_norm": 0.14423881471157074, "learning_rate": 0.00043327572810440283, "loss": 2.9539, "step": 6650 }, { "epoch": 0.38643418724070905, "grad_norm": 0.13097575306892395, "learning_rate": 0.00043275974434069846, "loss": 2.9576, "step": 6660 }, { "epoch": 0.38701441875308246, "grad_norm": 0.129910409450531, "learning_rate": 0.0004322432717417079, "loss": 2.9617, "step": 6670 }, { "epoch": 0.3875946502654559, "grad_norm": 0.13308489322662354, "learning_rate": 0.00043172631220913735, "loss": 2.9514, "step": 6680 }, { "epoch": 0.38817488177782933, "grad_norm": 0.12263292074203491, "learning_rate": 0.00043120886764648605, "loss": 2.9557, "step": 6690 }, { "epoch": 0.3887551132902028, "grad_norm": 0.1288110911846161, "learning_rate": 0.0004306909399590389, "loss": 2.9558, "step": 6700 }, { "epoch": 0.38933534480257626, "grad_norm": 0.12322728335857391, "learning_rate": 0.00043017253105386005, "loss": 2.9551, "step": 6710 }, { "epoch": 0.38991557631494966, "grad_norm": 0.1551227867603302, "learning_rate": 0.0004296536428397853, "loss": 2.9583, "step": 6720 }, { "epoch": 0.3904958078273231, "grad_norm": 0.12883497774600983, "learning_rate": 0.00042913427722741546, "loss": 2.9495, "step": 6730 }, { "epoch": 0.39107603933969654, "grad_norm": 0.12460558116436005, "learning_rate": 0.00042861443612910913, "loss": 2.9597, "step": 6740 }, { "epoch": 0.39165627085207, "grad_norm": 0.122388556599617, "learning_rate": 0.00042809412145897576, "loss": 2.9557, "step": 6750 }, { "epoch": 0.3922365023644434, "grad_norm": 0.12150498479604721, "learning_rate": 0.00042757333513286834, "loss": 2.9489, "step": 6760 }, { "epoch": 0.39281673387681687, "grad_norm": 0.15273340046405792, "learning_rate": 0.00042705207906837666, "loss": 2.9503, "step": 6770 }, { "epoch": 0.3933969653891903, "grad_norm": 0.13954737782478333, "learning_rate": 0.00042653035518482025, "loss": 2.9481, "step": 6780 }, { "epoch": 0.39397719690156374, "grad_norm": 0.15386004745960236, "learning_rate": 0.0004260081654032411, "loss": 2.9596, "step": 6790 }, { "epoch": 0.39455742841393715, "grad_norm": 0.1319696307182312, "learning_rate": 0.0004254855116463966, "loss": 2.9526, "step": 6800 }, { "epoch": 0.3951376599263106, "grad_norm": 0.14486876130104065, "learning_rate": 0.00042496239583875286, "loss": 2.9501, "step": 6810 }, { "epoch": 0.395717891438684, "grad_norm": 0.12461838871240616, "learning_rate": 0.0004244388199064768, "loss": 2.9519, "step": 6820 }, { "epoch": 0.3962981229510575, "grad_norm": 0.14132647216320038, "learning_rate": 0.00042391478577743006, "loss": 2.9533, "step": 6830 }, { "epoch": 0.3968783544634309, "grad_norm": 0.12907026708126068, "learning_rate": 0.00042339029538116104, "loss": 2.9451, "step": 6840 }, { "epoch": 0.39745858597580436, "grad_norm": 0.13801275193691254, "learning_rate": 0.0004228653506488984, "loss": 2.9382, "step": 6850 }, { "epoch": 0.39803881748817777, "grad_norm": 0.11962810158729553, "learning_rate": 0.00042233995351354366, "loss": 2.9501, "step": 6860 }, { "epoch": 0.39861904900055123, "grad_norm": 0.12804014980793, "learning_rate": 0.00042181410590966413, "loss": 2.9556, "step": 6870 }, { "epoch": 0.39919928051292464, "grad_norm": 0.1232592836022377, "learning_rate": 0.0004212878097734857, "loss": 2.9493, "step": 6880 }, { "epoch": 0.3997795120252981, "grad_norm": 0.12467402964830399, "learning_rate": 0.0004207610670428859, "loss": 2.9518, "step": 6890 }, { "epoch": 0.4003597435376715, "grad_norm": 0.13029509782791138, "learning_rate": 0.0004202338796573866, "loss": 2.9476, "step": 6900 }, { "epoch": 0.40093997505004497, "grad_norm": 0.13504283130168915, "learning_rate": 0.0004197062495581471, "loss": 2.9457, "step": 6910 }, { "epoch": 0.4015202065624184, "grad_norm": 0.12205976992845535, "learning_rate": 0.00041917817868795666, "loss": 2.9418, "step": 6920 }, { "epoch": 0.40210043807479184, "grad_norm": 0.14173905551433563, "learning_rate": 0.0004186496689912275, "loss": 2.9401, "step": 6930 }, { "epoch": 0.40268066958716525, "grad_norm": 0.131003275513649, "learning_rate": 0.00041812072241398764, "loss": 2.9416, "step": 6940 }, { "epoch": 0.4032609010995387, "grad_norm": 0.1430942267179489, "learning_rate": 0.00041759134090387396, "loss": 2.9526, "step": 6950 }, { "epoch": 0.4038411326119122, "grad_norm": 0.11908053606748581, "learning_rate": 0.00041706152641012435, "loss": 2.9457, "step": 6960 }, { "epoch": 0.4044213641242856, "grad_norm": 0.12189971655607224, "learning_rate": 0.0004165312808835716, "loss": 2.9497, "step": 6970 }, { "epoch": 0.40500159563665905, "grad_norm": 0.1238475888967514, "learning_rate": 0.00041600060627663515, "loss": 2.9426, "step": 6980 }, { "epoch": 0.40558182714903246, "grad_norm": 0.13269031047821045, "learning_rate": 0.00041546950454331437, "loss": 2.9441, "step": 6990 }, { "epoch": 0.4061620586614059, "grad_norm": 0.14216388761997223, "learning_rate": 0.0004149379776391817, "loss": 2.9443, "step": 7000 }, { "epoch": 0.4061620586614059, "eval_loss": 2.910210609436035, "eval_runtime": 3.2597, "eval_samples_per_second": 1328.339, "eval_steps_per_second": 2.761, "step": 7000 }, { "epoch": 0.40674229017377933, "grad_norm": 0.13298869132995605, "learning_rate": 0.0004144060275213747, "loss": 2.946, "step": 7010 }, { "epoch": 0.4073225216861528, "grad_norm": 0.14648084342479706, "learning_rate": 0.00041387365614858955, "loss": 2.9468, "step": 7020 }, { "epoch": 0.4079027531985262, "grad_norm": 0.13918638229370117, "learning_rate": 0.00041334086548107336, "loss": 2.9561, "step": 7030 }, { "epoch": 0.40848298471089967, "grad_norm": 0.1421622335910797, "learning_rate": 0.00041280765748061727, "loss": 2.9437, "step": 7040 }, { "epoch": 0.4090632162232731, "grad_norm": 0.1364564597606659, "learning_rate": 0.0004122740341105488, "loss": 2.9354, "step": 7050 }, { "epoch": 0.40964344773564654, "grad_norm": 0.1310495287179947, "learning_rate": 0.00041173999733572523, "loss": 2.9471, "step": 7060 }, { "epoch": 0.41022367924801995, "grad_norm": 0.14024296402931213, "learning_rate": 0.000411205549122526, "loss": 2.9372, "step": 7070 }, { "epoch": 0.4108039107603934, "grad_norm": 0.1430574357509613, "learning_rate": 0.0004106706914388452, "loss": 2.9468, "step": 7080 }, { "epoch": 0.4113841422727668, "grad_norm": 0.12103896588087082, "learning_rate": 0.00041013542625408504, "loss": 2.9463, "step": 7090 }, { "epoch": 0.4119643737851403, "grad_norm": 0.12720054388046265, "learning_rate": 0.00040959975553914787, "loss": 2.9427, "step": 7100 }, { "epoch": 0.4125446052975137, "grad_norm": 0.14135150611400604, "learning_rate": 0.0004090636812664295, "loss": 2.9407, "step": 7110 }, { "epoch": 0.41312483680988715, "grad_norm": 0.14666588604450226, "learning_rate": 0.0004085272054098115, "loss": 2.9435, "step": 7120 }, { "epoch": 0.41370506832226056, "grad_norm": 0.13804596662521362, "learning_rate": 0.0004079903299446541, "loss": 2.9365, "step": 7130 }, { "epoch": 0.414285299834634, "grad_norm": 0.1470736414194107, "learning_rate": 0.00040745305684778907, "loss": 2.9278, "step": 7140 }, { "epoch": 0.41486553134700743, "grad_norm": 0.12926244735717773, "learning_rate": 0.00040691538809751234, "loss": 2.9354, "step": 7150 }, { "epoch": 0.4154457628593809, "grad_norm": 0.1294509321451187, "learning_rate": 0.00040637732567357635, "loss": 2.9466, "step": 7160 }, { "epoch": 0.4160259943717543, "grad_norm": 0.12196213006973267, "learning_rate": 0.0004058388715571835, "loss": 2.9322, "step": 7170 }, { "epoch": 0.41660622588412777, "grad_norm": 0.15902066230773926, "learning_rate": 0.00040530002773097825, "loss": 2.9448, "step": 7180 }, { "epoch": 0.41718645739650123, "grad_norm": 0.11859998106956482, "learning_rate": 0.0004047607961790399, "loss": 2.9428, "step": 7190 }, { "epoch": 0.41776668890887464, "grad_norm": 0.13470393419265747, "learning_rate": 0.00040422117888687555, "loss": 2.942, "step": 7200 }, { "epoch": 0.4183469204212481, "grad_norm": 0.1288190484046936, "learning_rate": 0.0004036811778414125, "loss": 2.9362, "step": 7210 }, { "epoch": 0.4189271519336215, "grad_norm": 0.12759481370449066, "learning_rate": 0.0004031407950309915, "loss": 2.9447, "step": 7220 }, { "epoch": 0.419507383445995, "grad_norm": 0.13468439877033234, "learning_rate": 0.0004026000324453584, "loss": 2.9313, "step": 7230 }, { "epoch": 0.4200876149583684, "grad_norm": 0.12287794053554535, "learning_rate": 0.0004020588920756577, "loss": 2.9369, "step": 7240 }, { "epoch": 0.42066784647074185, "grad_norm": 0.12006892263889313, "learning_rate": 0.00040151737591442497, "loss": 2.9329, "step": 7250 }, { "epoch": 0.42124807798311525, "grad_norm": 0.13062633574008942, "learning_rate": 0.00040097548595557935, "loss": 2.9474, "step": 7260 }, { "epoch": 0.4218283094954887, "grad_norm": 0.12141095846891403, "learning_rate": 0.00040043322419441667, "loss": 2.9386, "step": 7270 }, { "epoch": 0.4224085410078621, "grad_norm": 0.13452979922294617, "learning_rate": 0.0003998905926276014, "loss": 2.9203, "step": 7280 }, { "epoch": 0.4229887725202356, "grad_norm": 0.13672851026058197, "learning_rate": 0.0003993475932531598, "loss": 2.9353, "step": 7290 }, { "epoch": 0.423569004032609, "grad_norm": 0.1266540139913559, "learning_rate": 0.0003988042280704724, "loss": 2.929, "step": 7300 }, { "epoch": 0.42414923554498246, "grad_norm": 0.1192171648144722, "learning_rate": 0.0003982604990802668, "loss": 2.9314, "step": 7310 }, { "epoch": 0.42472946705735587, "grad_norm": 0.11528236418962479, "learning_rate": 0.0003977164082846101, "loss": 2.9349, "step": 7320 }, { "epoch": 0.42530969856972933, "grad_norm": 0.12837885320186615, "learning_rate": 0.00039717195768690155, "loss": 2.9211, "step": 7330 }, { "epoch": 0.42588993008210274, "grad_norm": 0.1254536211490631, "learning_rate": 0.0003966271492918654, "loss": 2.9311, "step": 7340 }, { "epoch": 0.4264701615944762, "grad_norm": 0.12365511804819107, "learning_rate": 0.0003960819851055432, "loss": 2.9411, "step": 7350 }, { "epoch": 0.4270503931068496, "grad_norm": 0.14178220927715302, "learning_rate": 0.00039553646713528644, "loss": 2.9322, "step": 7360 }, { "epoch": 0.4276306246192231, "grad_norm": 0.13220851123332977, "learning_rate": 0.0003949905973897496, "loss": 2.9397, "step": 7370 }, { "epoch": 0.4282108561315965, "grad_norm": 0.12264362722635269, "learning_rate": 0.00039444437787888224, "loss": 2.9355, "step": 7380 }, { "epoch": 0.42879108764396995, "grad_norm": 0.12907512485980988, "learning_rate": 0.00039389781061392184, "loss": 2.9259, "step": 7390 }, { "epoch": 0.42937131915634336, "grad_norm": 0.1319524645805359, "learning_rate": 0.00039335089760738625, "loss": 2.9284, "step": 7400 }, { "epoch": 0.4299515506687168, "grad_norm": 0.1404864490032196, "learning_rate": 0.0003928036408730664, "loss": 2.932, "step": 7410 }, { "epoch": 0.43053178218109023, "grad_norm": 0.12499509751796722, "learning_rate": 0.00039225604242601914, "loss": 2.9313, "step": 7420 }, { "epoch": 0.4311120136934637, "grad_norm": 0.13161097466945648, "learning_rate": 0.0003917081042825591, "loss": 2.9261, "step": 7430 }, { "epoch": 0.43169224520583716, "grad_norm": 0.13262121379375458, "learning_rate": 0.000391159828460252, "loss": 2.9302, "step": 7440 }, { "epoch": 0.43227247671821056, "grad_norm": 0.13169781863689423, "learning_rate": 0.0003906112169779069, "loss": 2.9247, "step": 7450 }, { "epoch": 0.432852708230584, "grad_norm": 0.1297696828842163, "learning_rate": 0.00039006227185556865, "loss": 2.9422, "step": 7460 }, { "epoch": 0.43343293974295743, "grad_norm": 0.1292199194431305, "learning_rate": 0.00038951299511451077, "loss": 2.9232, "step": 7470 }, { "epoch": 0.4340131712553309, "grad_norm": 0.13055439293384552, "learning_rate": 0.0003889633887772278, "loss": 2.9246, "step": 7480 }, { "epoch": 0.4345934027677043, "grad_norm": 0.1166820153594017, "learning_rate": 0.0003884134548674278, "loss": 2.9361, "step": 7490 }, { "epoch": 0.43517363428007777, "grad_norm": 0.12382174283266068, "learning_rate": 0.00038786319541002487, "loss": 2.9221, "step": 7500 }, { "epoch": 0.4357538657924512, "grad_norm": 0.12510880827903748, "learning_rate": 0.0003873126124311323, "loss": 2.9289, "step": 7510 }, { "epoch": 0.43633409730482464, "grad_norm": 0.13196755945682526, "learning_rate": 0.000386761707958054, "loss": 2.9203, "step": 7520 }, { "epoch": 0.43691432881719805, "grad_norm": 0.13719266653060913, "learning_rate": 0.00038621048401927817, "loss": 2.9319, "step": 7530 }, { "epoch": 0.4374945603295715, "grad_norm": 0.13211804628372192, "learning_rate": 0.000385658942644469, "loss": 2.9326, "step": 7540 }, { "epoch": 0.4380747918419449, "grad_norm": 0.12999597191810608, "learning_rate": 0.0003851070858644596, "loss": 2.9239, "step": 7550 }, { "epoch": 0.4386550233543184, "grad_norm": 0.13165125250816345, "learning_rate": 0.0003845549157112445, "loss": 2.9312, "step": 7560 }, { "epoch": 0.4392352548666918, "grad_norm": 0.13743376731872559, "learning_rate": 0.00038400243421797206, "loss": 2.9254, "step": 7570 }, { "epoch": 0.43981548637906526, "grad_norm": 0.12621231377124786, "learning_rate": 0.00038344964341893684, "loss": 2.9203, "step": 7580 }, { "epoch": 0.44039571789143866, "grad_norm": 0.12167075276374817, "learning_rate": 0.00038289654534957266, "loss": 2.9281, "step": 7590 }, { "epoch": 0.44097594940381213, "grad_norm": 0.13523493707180023, "learning_rate": 0.0003823431420464444, "loss": 2.916, "step": 7600 }, { "epoch": 0.44155618091618554, "grad_norm": 0.11718156933784485, "learning_rate": 0.0003817894355472413, "loss": 2.9145, "step": 7610 }, { "epoch": 0.442136412428559, "grad_norm": 0.13470205664634705, "learning_rate": 0.0003812354278907683, "loss": 2.9173, "step": 7620 }, { "epoch": 0.4427166439409324, "grad_norm": 0.1286102533340454, "learning_rate": 0.00038068112111693984, "loss": 2.9249, "step": 7630 }, { "epoch": 0.44329687545330587, "grad_norm": 0.13669750094413757, "learning_rate": 0.00038012651726677146, "loss": 2.9239, "step": 7640 }, { "epoch": 0.4438771069656793, "grad_norm": 0.14638318121433258, "learning_rate": 0.0003795716183823728, "loss": 2.9306, "step": 7650 }, { "epoch": 0.44445733847805274, "grad_norm": 0.13569045066833496, "learning_rate": 0.00037901642650693944, "loss": 2.9168, "step": 7660 }, { "epoch": 0.4450375699904262, "grad_norm": 0.1257532387971878, "learning_rate": 0.00037846094368474613, "loss": 2.9242, "step": 7670 }, { "epoch": 0.4456178015027996, "grad_norm": 0.11852803826332092, "learning_rate": 0.0003779051719611389, "loss": 2.9209, "step": 7680 }, { "epoch": 0.4461980330151731, "grad_norm": 0.12594154477119446, "learning_rate": 0.0003773491133825273, "loss": 2.929, "step": 7690 }, { "epoch": 0.4467782645275465, "grad_norm": 0.12566526234149933, "learning_rate": 0.00037679276999637746, "loss": 2.9119, "step": 7700 }, { "epoch": 0.44735849603991995, "grad_norm": 0.13207079470157623, "learning_rate": 0.0003762361438512038, "loss": 2.917, "step": 7710 }, { "epoch": 0.44793872755229336, "grad_norm": 0.13788865506649017, "learning_rate": 0.00037567923699656226, "loss": 2.92, "step": 7720 }, { "epoch": 0.4485189590646668, "grad_norm": 0.13110986351966858, "learning_rate": 0.00037512205148304204, "loss": 2.9249, "step": 7730 }, { "epoch": 0.44909919057704023, "grad_norm": 0.1643168181180954, "learning_rate": 0.00037456458936225873, "loss": 2.9232, "step": 7740 }, { "epoch": 0.4496794220894137, "grad_norm": 0.14076946675777435, "learning_rate": 0.00037400685268684623, "loss": 2.9252, "step": 7750 }, { "epoch": 0.4502596536017871, "grad_norm": 0.1238834485411644, "learning_rate": 0.0003734488435104494, "loss": 2.9093, "step": 7760 }, { "epoch": 0.45083988511416057, "grad_norm": 0.11924099922180176, "learning_rate": 0.00037289056388771643, "loss": 2.9324, "step": 7770 }, { "epoch": 0.451420116626534, "grad_norm": 0.13720078766345978, "learning_rate": 0.0003723320158742914, "loss": 2.9154, "step": 7780 }, { "epoch": 0.45200034813890744, "grad_norm": 0.12532520294189453, "learning_rate": 0.00037177320152680663, "loss": 2.9228, "step": 7790 }, { "epoch": 0.45258057965128085, "grad_norm": 0.129350483417511, "learning_rate": 0.0003712141229028751, "loss": 2.9071, "step": 7800 }, { "epoch": 0.4531608111636543, "grad_norm": 0.12484076619148254, "learning_rate": 0.0003706547820610828, "loss": 2.9107, "step": 7810 }, { "epoch": 0.4537410426760277, "grad_norm": 0.12527912855148315, "learning_rate": 0.0003700951810609815, "loss": 2.9166, "step": 7820 }, { "epoch": 0.4543212741884012, "grad_norm": 0.1453130692243576, "learning_rate": 0.0003695353219630803, "loss": 2.9195, "step": 7830 }, { "epoch": 0.4549015057007746, "grad_norm": 0.1291913241147995, "learning_rate": 0.0003689752068288395, "loss": 2.9124, "step": 7840 }, { "epoch": 0.45548173721314805, "grad_norm": 0.12470022588968277, "learning_rate": 0.0003684148377206615, "loss": 2.9241, "step": 7850 }, { "epoch": 0.45606196872552146, "grad_norm": 0.1276790350675583, "learning_rate": 0.00036785421670188395, "loss": 2.9178, "step": 7860 }, { "epoch": 0.4566422002378949, "grad_norm": 0.15164950489997864, "learning_rate": 0.0003672933458367724, "loss": 2.9072, "step": 7870 }, { "epoch": 0.45722243175026833, "grad_norm": 0.14891022443771362, "learning_rate": 0.00036673222719051194, "loss": 2.9235, "step": 7880 }, { "epoch": 0.4578026632626418, "grad_norm": 0.1266569346189499, "learning_rate": 0.0003661708628292003, "loss": 2.9159, "step": 7890 }, { "epoch": 0.4583828947750152, "grad_norm": 0.12030439078807831, "learning_rate": 0.0003656092548198399, "loss": 2.912, "step": 7900 }, { "epoch": 0.45896312628738867, "grad_norm": 0.12590278685092926, "learning_rate": 0.00036504740523033016, "loss": 2.91, "step": 7910 }, { "epoch": 0.45954335779976213, "grad_norm": 0.1255042403936386, "learning_rate": 0.0003644853161294601, "loss": 2.9127, "step": 7920 }, { "epoch": 0.46012358931213554, "grad_norm": 0.1253713071346283, "learning_rate": 0.0003639229895869009, "loss": 2.9242, "step": 7930 }, { "epoch": 0.460703820824509, "grad_norm": 0.1254982203245163, "learning_rate": 0.0003633604276731975, "loss": 2.9115, "step": 7940 }, { "epoch": 0.4612840523368824, "grad_norm": 0.12157725542783737, "learning_rate": 0.00036279763245976207, "loss": 2.9114, "step": 7950 }, { "epoch": 0.4618642838492559, "grad_norm": 0.12421195954084396, "learning_rate": 0.00036223460601886537, "loss": 2.9083, "step": 7960 }, { "epoch": 0.4624445153616293, "grad_norm": 0.11870937049388885, "learning_rate": 0.00036167135042362977, "loss": 2.907, "step": 7970 }, { "epoch": 0.46302474687400275, "grad_norm": 0.12460967898368835, "learning_rate": 0.00036110786774802133, "loss": 2.9088, "step": 7980 }, { "epoch": 0.46360497838637615, "grad_norm": 0.1310334950685501, "learning_rate": 0.00036054416006684245, "loss": 2.9102, "step": 7990 }, { "epoch": 0.4641852098987496, "grad_norm": 0.12560488283634186, "learning_rate": 0.00035998022945572366, "loss": 2.9097, "step": 8000 }, { "epoch": 0.4641852098987496, "eval_loss": 2.875955820083618, "eval_runtime": 3.2545, "eval_samples_per_second": 1330.484, "eval_steps_per_second": 2.765, "step": 8000 }, { "epoch": 0.464765441411123, "grad_norm": 0.12761953473091125, "learning_rate": 0.00035941607799111675, "loss": 2.91, "step": 8010 }, { "epoch": 0.4653456729234965, "grad_norm": 0.1247384324669838, "learning_rate": 0.0003588517077502864, "loss": 2.9149, "step": 8020 }, { "epoch": 0.4659259044358699, "grad_norm": 0.14209751784801483, "learning_rate": 0.00035828712081130296, "loss": 2.9083, "step": 8030 }, { "epoch": 0.46650613594824336, "grad_norm": 0.12985317409038544, "learning_rate": 0.00035772231925303464, "loss": 2.9046, "step": 8040 }, { "epoch": 0.46708636746061677, "grad_norm": 0.14672869443893433, "learning_rate": 0.00035715730515514, "loss": 2.9113, "step": 8050 }, { "epoch": 0.46766659897299023, "grad_norm": 0.13361111283302307, "learning_rate": 0.0003565920805980602, "loss": 2.913, "step": 8060 }, { "epoch": 0.46824683048536364, "grad_norm": 0.12082985788583755, "learning_rate": 0.0003560266476630112, "loss": 2.9138, "step": 8070 }, { "epoch": 0.4688270619977371, "grad_norm": 0.1150035560131073, "learning_rate": 0.0003554610084319763, "loss": 2.9048, "step": 8080 }, { "epoch": 0.4694072935101105, "grad_norm": 0.1214471235871315, "learning_rate": 0.0003548951649876984, "loss": 2.9123, "step": 8090 }, { "epoch": 0.469987525022484, "grad_norm": 0.12934035062789917, "learning_rate": 0.0003543291194136723, "loss": 2.9028, "step": 8100 }, { "epoch": 0.4705677565348574, "grad_norm": 0.15276013314723969, "learning_rate": 0.00035376287379413723, "loss": 2.9031, "step": 8110 }, { "epoch": 0.47114798804723085, "grad_norm": 0.1335725337266922, "learning_rate": 0.00035319643021406886, "loss": 2.9124, "step": 8120 }, { "epoch": 0.47172821955960426, "grad_norm": 0.12289181351661682, "learning_rate": 0.00035262979075917166, "loss": 2.9053, "step": 8130 }, { "epoch": 0.4723084510719777, "grad_norm": 0.11827896535396576, "learning_rate": 0.0003520629575158715, "loss": 2.9138, "step": 8140 }, { "epoch": 0.4728886825843512, "grad_norm": 0.12505313754081726, "learning_rate": 0.0003514959325713078, "loss": 2.909, "step": 8150 }, { "epoch": 0.4734689140967246, "grad_norm": 0.1321611851453781, "learning_rate": 0.00035092871801332574, "loss": 2.9075, "step": 8160 }, { "epoch": 0.47404914560909805, "grad_norm": 0.12144722044467926, "learning_rate": 0.00035036131593046895, "loss": 2.9046, "step": 8170 }, { "epoch": 0.47462937712147146, "grad_norm": 0.11893021315336227, "learning_rate": 0.0003497937284119711, "loss": 2.9021, "step": 8180 }, { "epoch": 0.4752096086338449, "grad_norm": 0.13043691217899323, "learning_rate": 0.0003492259575477491, "loss": 2.9052, "step": 8190 }, { "epoch": 0.47578984014621833, "grad_norm": 0.12443230301141739, "learning_rate": 0.00034865800542839445, "loss": 2.9003, "step": 8200 }, { "epoch": 0.4763700716585918, "grad_norm": 0.1350659728050232, "learning_rate": 0.0003480898741451667, "loss": 2.9077, "step": 8210 }, { "epoch": 0.4769503031709652, "grad_norm": 0.13212652504444122, "learning_rate": 0.0003475215657899844, "loss": 2.8955, "step": 8220 }, { "epoch": 0.47753053468333867, "grad_norm": 0.13865076005458832, "learning_rate": 0.0003469530824554188, "loss": 2.9015, "step": 8230 }, { "epoch": 0.4781107661957121, "grad_norm": 0.1313691884279251, "learning_rate": 0.00034638442623468484, "loss": 2.9014, "step": 8240 }, { "epoch": 0.47869099770808554, "grad_norm": 0.13368923962116241, "learning_rate": 0.00034581559922163447, "loss": 2.8962, "step": 8250 }, { "epoch": 0.47927122922045895, "grad_norm": 0.12228936702013016, "learning_rate": 0.0003452466035107481, "loss": 2.8997, "step": 8260 }, { "epoch": 0.4798514607328324, "grad_norm": 0.12648892402648926, "learning_rate": 0.00034467744119712787, "loss": 2.9052, "step": 8270 }, { "epoch": 0.4804316922452058, "grad_norm": 0.12937045097351074, "learning_rate": 0.00034410811437648873, "loss": 2.9037, "step": 8280 }, { "epoch": 0.4810119237575793, "grad_norm": 0.12095940858125687, "learning_rate": 0.00034353862514515185, "loss": 2.9002, "step": 8290 }, { "epoch": 0.4815921552699527, "grad_norm": 0.11992644518613815, "learning_rate": 0.0003429689756000362, "loss": 2.9051, "step": 8300 }, { "epoch": 0.48217238678232616, "grad_norm": 0.1110587939620018, "learning_rate": 0.0003423991678386511, "loss": 2.9046, "step": 8310 }, { "epoch": 0.48275261829469956, "grad_norm": 0.11831989139318466, "learning_rate": 0.00034182920395908837, "loss": 2.9001, "step": 8320 }, { "epoch": 0.48333284980707303, "grad_norm": 0.11492130905389786, "learning_rate": 0.0003412590860600148, "loss": 2.8944, "step": 8330 }, { "epoch": 0.48391308131944644, "grad_norm": 0.12855441868305206, "learning_rate": 0.00034068881624066405, "loss": 2.8941, "step": 8340 }, { "epoch": 0.4844933128318199, "grad_norm": 0.12829254567623138, "learning_rate": 0.0003401183966008296, "loss": 2.8989, "step": 8350 }, { "epoch": 0.4850735443441933, "grad_norm": 0.1167573556303978, "learning_rate": 0.00033954782924085604, "loss": 2.9027, "step": 8360 }, { "epoch": 0.48565377585656677, "grad_norm": 0.12906575202941895, "learning_rate": 0.0003389771162616324, "loss": 2.893, "step": 8370 }, { "epoch": 0.4862340073689402, "grad_norm": 0.12219451367855072, "learning_rate": 0.00033840625976458357, "loss": 2.8971, "step": 8380 }, { "epoch": 0.48681423888131364, "grad_norm": 0.1430503875017166, "learning_rate": 0.00033783526185166295, "loss": 2.8945, "step": 8390 }, { "epoch": 0.4873944703936871, "grad_norm": 0.1279267519712448, "learning_rate": 0.00033726412462534454, "loss": 2.8969, "step": 8400 }, { "epoch": 0.4879747019060605, "grad_norm": 0.1239406168460846, "learning_rate": 0.00033669285018861567, "loss": 2.8994, "step": 8410 }, { "epoch": 0.488554933418434, "grad_norm": 0.1379164159297943, "learning_rate": 0.00033612144064496853, "loss": 2.8949, "step": 8420 }, { "epoch": 0.4891351649308074, "grad_norm": 0.12819483876228333, "learning_rate": 0.00033554989809839294, "loss": 2.897, "step": 8430 }, { "epoch": 0.48971539644318085, "grad_norm": 0.12451434880495071, "learning_rate": 0.00033497822465336854, "loss": 2.903, "step": 8440 }, { "epoch": 0.49029562795555426, "grad_norm": 0.1466275155544281, "learning_rate": 0.0003344064224148567, "loss": 2.8912, "step": 8450 }, { "epoch": 0.4908758594679277, "grad_norm": 0.12186205387115479, "learning_rate": 0.0003338344934882932, "loss": 2.8998, "step": 8460 }, { "epoch": 0.49145609098030113, "grad_norm": 0.12687867879867554, "learning_rate": 0.00033326243997958014, "loss": 2.8983, "step": 8470 }, { "epoch": 0.4920363224926746, "grad_norm": 0.12620693445205688, "learning_rate": 0.00033269026399507874, "loss": 2.895, "step": 8480 }, { "epoch": 0.492616554005048, "grad_norm": 0.1362224668264389, "learning_rate": 0.00033211796764160074, "loss": 2.9007, "step": 8490 }, { "epoch": 0.49319678551742147, "grad_norm": 0.1300470530986786, "learning_rate": 0.00033154555302640135, "loss": 2.8914, "step": 8500 }, { "epoch": 0.4937770170297949, "grad_norm": 0.12057654559612274, "learning_rate": 0.00033097302225717096, "loss": 2.8971, "step": 8510 }, { "epoch": 0.49435724854216834, "grad_norm": 0.13263335824012756, "learning_rate": 0.00033040037744202805, "loss": 2.8971, "step": 8520 }, { "epoch": 0.49493748005454175, "grad_norm": 0.12660051882266998, "learning_rate": 0.00032982762068951073, "loss": 2.8914, "step": 8530 }, { "epoch": 0.4955177115669152, "grad_norm": 0.12398383021354675, "learning_rate": 0.0003292547541085694, "loss": 2.8936, "step": 8540 }, { "epoch": 0.4960979430792886, "grad_norm": 0.1229000836610794, "learning_rate": 0.00032868177980855876, "loss": 2.888, "step": 8550 }, { "epoch": 0.4966781745916621, "grad_norm": 0.11801040917634964, "learning_rate": 0.0003281086998992303, "loss": 2.8909, "step": 8560 }, { "epoch": 0.4972584061040355, "grad_norm": 0.12945981323719025, "learning_rate": 0.0003275355164907241, "loss": 2.8878, "step": 8570 }, { "epoch": 0.49783863761640895, "grad_norm": 0.12002068758010864, "learning_rate": 0.0003269622316935618, "loss": 2.892, "step": 8580 }, { "epoch": 0.49841886912878236, "grad_norm": 0.12449994683265686, "learning_rate": 0.0003263888476186377, "loss": 2.8912, "step": 8590 }, { "epoch": 0.4989991006411558, "grad_norm": 0.13638156652450562, "learning_rate": 0.0003258153663772124, "loss": 2.8877, "step": 8600 }, { "epoch": 0.49957933215352923, "grad_norm": 0.12280316650867462, "learning_rate": 0.0003252417900809038, "loss": 2.8879, "step": 8610 }, { "epoch": 0.5001595636659026, "grad_norm": 0.12275322526693344, "learning_rate": 0.0003246681208416797, "loss": 2.8906, "step": 8620 }, { "epoch": 0.5007397951782762, "grad_norm": 0.1220172718167305, "learning_rate": 0.0003240943607718506, "loss": 2.8952, "step": 8630 }, { "epoch": 0.5013200266906496, "grad_norm": 0.11458177119493484, "learning_rate": 0.00032352051198406104, "loss": 2.902, "step": 8640 }, { "epoch": 0.501900258203023, "grad_norm": 0.12652765214443207, "learning_rate": 0.0003229465765912824, "loss": 2.9038, "step": 8650 }, { "epoch": 0.5024804897153965, "grad_norm": 0.12456042319536209, "learning_rate": 0.000322372556706805, "loss": 2.8844, "step": 8660 }, { "epoch": 0.5030607212277699, "grad_norm": 0.13799023628234863, "learning_rate": 0.0003217984544442301, "loss": 2.8987, "step": 8670 }, { "epoch": 0.5036409527401433, "grad_norm": 0.12474406510591507, "learning_rate": 0.00032122427191746234, "loss": 2.8976, "step": 8680 }, { "epoch": 0.5042211842525167, "grad_norm": 0.12724703550338745, "learning_rate": 0.00032065001124070207, "loss": 2.8862, "step": 8690 }, { "epoch": 0.5048014157648902, "grad_norm": 0.11946358531713486, "learning_rate": 0.0003200756745284371, "loss": 2.8926, "step": 8700 }, { "epoch": 0.5053816472772636, "grad_norm": 0.1258503645658493, "learning_rate": 0.0003195012638954354, "loss": 2.8932, "step": 8710 }, { "epoch": 0.505961878789637, "grad_norm": 0.12079302221536636, "learning_rate": 0.00031892678145673724, "loss": 2.8914, "step": 8720 }, { "epoch": 0.5065421103020105, "grad_norm": 0.12168605625629425, "learning_rate": 0.000318352229327647, "loss": 2.8867, "step": 8730 }, { "epoch": 0.507122341814384, "grad_norm": 0.13427579402923584, "learning_rate": 0.00031777760962372584, "loss": 2.8893, "step": 8740 }, { "epoch": 0.5077025733267574, "grad_norm": 0.1176985576748848, "learning_rate": 0.00031720292446078374, "loss": 2.8887, "step": 8750 }, { "epoch": 0.5082828048391308, "grad_norm": 0.12351604551076889, "learning_rate": 0.00031662817595487166, "loss": 2.8915, "step": 8760 }, { "epoch": 0.5088630363515042, "grad_norm": 0.1390778124332428, "learning_rate": 0.00031605336622227365, "loss": 2.8737, "step": 8770 }, { "epoch": 0.5094432678638777, "grad_norm": 0.11954103410243988, "learning_rate": 0.00031547849737949957, "loss": 2.8888, "step": 8780 }, { "epoch": 0.5100234993762511, "grad_norm": 0.12293373793363571, "learning_rate": 0.00031490357154327674, "loss": 2.8814, "step": 8790 }, { "epoch": 0.5106037308886245, "grad_norm": 0.12284509837627411, "learning_rate": 0.0003143285908305422, "loss": 2.8874, "step": 8800 }, { "epoch": 0.511183962400998, "grad_norm": 0.11924895644187927, "learning_rate": 0.00031375355735843523, "loss": 2.8813, "step": 8810 }, { "epoch": 0.5117641939133715, "grad_norm": 0.12003005295991898, "learning_rate": 0.00031317847324428924, "loss": 2.8836, "step": 8820 }, { "epoch": 0.5123444254257449, "grad_norm": 0.13070861995220184, "learning_rate": 0.00031260334060562416, "loss": 2.8851, "step": 8830 }, { "epoch": 0.5129246569381183, "grad_norm": 0.11900255084037781, "learning_rate": 0.0003120281615601387, "loss": 2.8827, "step": 8840 }, { "epoch": 0.5135048884504917, "grad_norm": 0.12470702081918716, "learning_rate": 0.0003114529382257024, "loss": 2.8916, "step": 8850 }, { "epoch": 0.5140851199628652, "grad_norm": 0.1312616765499115, "learning_rate": 0.0003108776727203478, "loss": 2.897, "step": 8860 }, { "epoch": 0.5146653514752386, "grad_norm": 0.13872870802879333, "learning_rate": 0.00031030236716226265, "loss": 2.8836, "step": 8870 }, { "epoch": 0.515245582987612, "grad_norm": 0.11608674377202988, "learning_rate": 0.00030972702366978237, "loss": 2.8875, "step": 8880 }, { "epoch": 0.5158258144999855, "grad_norm": 0.12205769121646881, "learning_rate": 0.000309151644361382, "loss": 2.8862, "step": 8890 }, { "epoch": 0.516406046012359, "grad_norm": 0.12009671330451965, "learning_rate": 0.0003085762313556683, "loss": 2.8797, "step": 8900 }, { "epoch": 0.5169862775247324, "grad_norm": 0.12120591104030609, "learning_rate": 0.0003080007867713724, "loss": 2.8905, "step": 8910 }, { "epoch": 0.5175665090371058, "grad_norm": 0.12842518091201782, "learning_rate": 0.00030742531272734153, "loss": 2.8747, "step": 8920 }, { "epoch": 0.5181467405494793, "grad_norm": 0.12532438337802887, "learning_rate": 0.00030684981134253123, "loss": 2.8892, "step": 8930 }, { "epoch": 0.5187269720618527, "grad_norm": 0.1295221596956253, "learning_rate": 0.0003062742847359981, "loss": 2.8842, "step": 8940 }, { "epoch": 0.5193072035742261, "grad_norm": 0.1296953707933426, "learning_rate": 0.00030569873502689116, "loss": 2.878, "step": 8950 }, { "epoch": 0.5198874350865995, "grad_norm": 0.14120282232761383, "learning_rate": 0.00030512316433444495, "loss": 2.8809, "step": 8960 }, { "epoch": 0.520467666598973, "grad_norm": 0.12610268592834473, "learning_rate": 0.000304547574777971, "loss": 2.8794, "step": 8970 }, { "epoch": 0.5210478981113464, "grad_norm": 0.11908390372991562, "learning_rate": 0.0003039719684768503, "loss": 2.8839, "step": 8980 }, { "epoch": 0.5216281296237198, "grad_norm": 0.13508306443691254, "learning_rate": 0.0003033963475505256, "loss": 2.8782, "step": 8990 }, { "epoch": 0.5222083611360933, "grad_norm": 0.12108524888753891, "learning_rate": 0.00030282071411849343, "loss": 2.879, "step": 9000 }, { "epoch": 0.5222083611360933, "eval_loss": 2.845144271850586, "eval_runtime": 3.2553, "eval_samples_per_second": 1330.14, "eval_steps_per_second": 2.765, "step": 9000 }, { "epoch": 0.5227885926484668, "grad_norm": 0.13046176731586456, "learning_rate": 0.00030224507030029627, "loss": 2.8809, "step": 9010 }, { "epoch": 0.5233688241608402, "grad_norm": 0.12113803625106812, "learning_rate": 0.0003016694182155152, "loss": 2.8839, "step": 9020 }, { "epoch": 0.5239490556732136, "grad_norm": 0.12337899953126907, "learning_rate": 0.0003010937599837613, "loss": 2.8821, "step": 9030 }, { "epoch": 0.524529287185587, "grad_norm": 0.11981160938739777, "learning_rate": 0.0003005180977246686, "loss": 2.888, "step": 9040 }, { "epoch": 0.5251095186979605, "grad_norm": 0.12357629835605621, "learning_rate": 0.0002999424335578858, "loss": 2.8804, "step": 9050 }, { "epoch": 0.5256897502103339, "grad_norm": 0.11688230186700821, "learning_rate": 0.00029936676960306863, "loss": 2.8891, "step": 9060 }, { "epoch": 0.5262699817227073, "grad_norm": 0.11743608117103577, "learning_rate": 0.0002987911079798723, "loss": 2.8685, "step": 9070 }, { "epoch": 0.5268502132350807, "grad_norm": 0.1338096410036087, "learning_rate": 0.0002982154508079428, "loss": 2.8758, "step": 9080 }, { "epoch": 0.5274304447474543, "grad_norm": 0.13182982802391052, "learning_rate": 0.0002976398002069105, "loss": 2.882, "step": 9090 }, { "epoch": 0.5280106762598277, "grad_norm": 0.12470164895057678, "learning_rate": 0.000297064158296381, "loss": 2.8817, "step": 9100 }, { "epoch": 0.5285909077722011, "grad_norm": 0.11741513013839722, "learning_rate": 0.0002964885271959282, "loss": 2.8768, "step": 9110 }, { "epoch": 0.5291711392845746, "grad_norm": 0.1364392340183258, "learning_rate": 0.0002959129090250863, "loss": 2.8822, "step": 9120 }, { "epoch": 0.529751370796948, "grad_norm": 0.12005024403333664, "learning_rate": 0.0002953373059033413, "loss": 2.8789, "step": 9130 }, { "epoch": 0.5303316023093214, "grad_norm": 0.1239180713891983, "learning_rate": 0.0002947617199501245, "loss": 2.8754, "step": 9140 }, { "epoch": 0.5309118338216948, "grad_norm": 0.12774530053138733, "learning_rate": 0.00029418615328480357, "loss": 2.8773, "step": 9150 }, { "epoch": 0.5314920653340683, "grad_norm": 0.11815381795167923, "learning_rate": 0.00029361060802667526, "loss": 2.8711, "step": 9160 }, { "epoch": 0.5320722968464418, "grad_norm": 0.12450312077999115, "learning_rate": 0.0002930350862949577, "loss": 2.8743, "step": 9170 }, { "epoch": 0.5326525283588152, "grad_norm": 0.12741632759571075, "learning_rate": 0.00029245959020878187, "loss": 2.8846, "step": 9180 }, { "epoch": 0.5332327598711886, "grad_norm": 0.12712997198104858, "learning_rate": 0.0002918841218871848, "loss": 2.8774, "step": 9190 }, { "epoch": 0.5338129913835621, "grad_norm": 0.11238303780555725, "learning_rate": 0.0002913086834491012, "loss": 2.8782, "step": 9200 }, { "epoch": 0.5343932228959355, "grad_norm": 0.1266774982213974, "learning_rate": 0.00029073327701335566, "loss": 2.883, "step": 9210 }, { "epoch": 0.5349734544083089, "grad_norm": 0.12266207486391068, "learning_rate": 0.00029015790469865484, "loss": 2.8735, "step": 9220 }, { "epoch": 0.5355536859206823, "grad_norm": 0.10979332774877548, "learning_rate": 0.0002895825686235799, "loss": 2.8791, "step": 9230 }, { "epoch": 0.5361339174330558, "grad_norm": 0.11939531564712524, "learning_rate": 0.0002890072709065787, "loss": 2.8745, "step": 9240 }, { "epoch": 0.5367141489454292, "grad_norm": 0.12080537527799606, "learning_rate": 0.0002884320136659575, "loss": 2.8775, "step": 9250 }, { "epoch": 0.5372943804578026, "grad_norm": 0.12394317239522934, "learning_rate": 0.00028785679901987394, "loss": 2.8734, "step": 9260 }, { "epoch": 0.537874611970176, "grad_norm": 0.12320924550294876, "learning_rate": 0.0002872816290863283, "loss": 2.8703, "step": 9270 }, { "epoch": 0.5384548434825496, "grad_norm": 0.12183520197868347, "learning_rate": 0.0002867065059831568, "loss": 2.8731, "step": 9280 }, { "epoch": 0.539035074994923, "grad_norm": 0.13638751208782196, "learning_rate": 0.0002861314318280229, "loss": 2.8725, "step": 9290 }, { "epoch": 0.5396153065072964, "grad_norm": 0.12684093415737152, "learning_rate": 0.0002855564087384098, "loss": 2.8714, "step": 9300 }, { "epoch": 0.5401955380196698, "grad_norm": 0.11322664469480515, "learning_rate": 0.00028498143883161277, "loss": 2.8693, "step": 9310 }, { "epoch": 0.5407757695320433, "grad_norm": 0.11759771406650543, "learning_rate": 0.00028440652422473124, "loss": 2.8679, "step": 9320 }, { "epoch": 0.5413560010444167, "grad_norm": 0.12511123716831207, "learning_rate": 0.0002838316670346612, "loss": 2.8744, "step": 9330 }, { "epoch": 0.5419362325567901, "grad_norm": 0.1160508468747139, "learning_rate": 0.00028325686937808673, "loss": 2.874, "step": 9340 }, { "epoch": 0.5425164640691637, "grad_norm": 0.11813979595899582, "learning_rate": 0.0002826821333714732, "loss": 2.8691, "step": 9350 }, { "epoch": 0.5430966955815371, "grad_norm": 0.11728700250387192, "learning_rate": 0.0002821074611310588, "loss": 2.8717, "step": 9360 }, { "epoch": 0.5436769270939105, "grad_norm": 0.12824493646621704, "learning_rate": 0.0002815328547728469, "loss": 2.875, "step": 9370 }, { "epoch": 0.5442571586062839, "grad_norm": 0.12653270363807678, "learning_rate": 0.0002809583164125983, "loss": 2.8682, "step": 9380 }, { "epoch": 0.5448373901186574, "grad_norm": 0.13113363087177277, "learning_rate": 0.00028038384816582337, "loss": 2.8583, "step": 9390 }, { "epoch": 0.5454176216310308, "grad_norm": 0.11145169287919998, "learning_rate": 0.0002798094521477744, "loss": 2.8714, "step": 9400 }, { "epoch": 0.5459978531434042, "grad_norm": 0.12025914341211319, "learning_rate": 0.0002792351304734378, "loss": 2.8689, "step": 9410 }, { "epoch": 0.5465780846557776, "grad_norm": 0.1347450315952301, "learning_rate": 0.000278660885257526, "loss": 2.8803, "step": 9420 }, { "epoch": 0.5471583161681511, "grad_norm": 0.11728854477405548, "learning_rate": 0.0002780867186144703, "loss": 2.8614, "step": 9430 }, { "epoch": 0.5477385476805245, "grad_norm": 0.1399793028831482, "learning_rate": 0.00027751263265841204, "loss": 2.8777, "step": 9440 }, { "epoch": 0.548318779192898, "grad_norm": 0.13229645788669586, "learning_rate": 0.0002769386295031961, "loss": 2.8723, "step": 9450 }, { "epoch": 0.5488990107052714, "grad_norm": 0.12199070304632187, "learning_rate": 0.00027636471126236213, "loss": 2.8577, "step": 9460 }, { "epoch": 0.5494792422176449, "grad_norm": 0.14131730794906616, "learning_rate": 0.0002757908800491373, "loss": 2.857, "step": 9470 }, { "epoch": 0.5500594737300183, "grad_norm": 0.1343252956867218, "learning_rate": 0.0002752171379764283, "loss": 2.8689, "step": 9480 }, { "epoch": 0.5506397052423917, "grad_norm": 0.1338685154914856, "learning_rate": 0.0002746434871568133, "loss": 2.8775, "step": 9490 }, { "epoch": 0.5512199367547651, "grad_norm": 0.12388128787279129, "learning_rate": 0.00027406992970253506, "loss": 2.8761, "step": 9500 }, { "epoch": 0.5518001682671386, "grad_norm": 0.12272147834300995, "learning_rate": 0.0002734964677254918, "loss": 2.8722, "step": 9510 }, { "epoch": 0.552380399779512, "grad_norm": 0.12000911682844162, "learning_rate": 0.00027292310333723086, "loss": 2.8743, "step": 9520 }, { "epoch": 0.5529606312918854, "grad_norm": 0.13635672628879547, "learning_rate": 0.00027234983864894, "loss": 2.8657, "step": 9530 }, { "epoch": 0.5535408628042588, "grad_norm": 0.12129581719636917, "learning_rate": 0.0002717766757714398, "loss": 2.8661, "step": 9540 }, { "epoch": 0.5541210943166324, "grad_norm": 0.11717355996370316, "learning_rate": 0.00027120361681517606, "loss": 2.8707, "step": 9550 }, { "epoch": 0.5547013258290058, "grad_norm": 0.12199341505765915, "learning_rate": 0.0002706306638902117, "loss": 2.8555, "step": 9560 }, { "epoch": 0.5552815573413792, "grad_norm": 0.1175154522061348, "learning_rate": 0.0002700578191062196, "loss": 2.8721, "step": 9570 }, { "epoch": 0.5558617888537526, "grad_norm": 0.12546683847904205, "learning_rate": 0.00026948508457247416, "loss": 2.8689, "step": 9580 }, { "epoch": 0.5564420203661261, "grad_norm": 0.11439734697341919, "learning_rate": 0.000268912462397844, "loss": 2.8552, "step": 9590 }, { "epoch": 0.5570222518784995, "grad_norm": 0.13139833509922028, "learning_rate": 0.00026833995469078404, "loss": 2.8728, "step": 9600 }, { "epoch": 0.5576024833908729, "grad_norm": 0.14722158014774323, "learning_rate": 0.00026776756355932743, "loss": 2.8594, "step": 9610 }, { "epoch": 0.5581827149032464, "grad_norm": 0.12206868082284927, "learning_rate": 0.00026719529111107846, "loss": 2.8713, "step": 9620 }, { "epoch": 0.5587629464156199, "grad_norm": 0.11777371913194656, "learning_rate": 0.00026662313945320404, "loss": 2.8656, "step": 9630 }, { "epoch": 0.5593431779279933, "grad_norm": 0.12058188021183014, "learning_rate": 0.00026605111069242664, "loss": 2.8712, "step": 9640 }, { "epoch": 0.5599234094403667, "grad_norm": 0.1278459131717682, "learning_rate": 0.00026547920693501616, "loss": 2.8686, "step": 9650 }, { "epoch": 0.5605036409527402, "grad_norm": 0.12272592633962631, "learning_rate": 0.00026490743028678194, "loss": 2.8636, "step": 9660 }, { "epoch": 0.5610838724651136, "grad_norm": 0.11543965339660645, "learning_rate": 0.00026433578285306567, "loss": 2.8592, "step": 9670 }, { "epoch": 0.561664103977487, "grad_norm": 0.11765621602535248, "learning_rate": 0.0002637642667387329, "loss": 2.867, "step": 9680 }, { "epoch": 0.5622443354898604, "grad_norm": 0.12996822595596313, "learning_rate": 0.0002631928840481662, "loss": 2.8669, "step": 9690 }, { "epoch": 0.5628245670022339, "grad_norm": 0.11992313712835312, "learning_rate": 0.00026262163688525606, "loss": 2.8576, "step": 9700 }, { "epoch": 0.5634047985146073, "grad_norm": 0.1216612309217453, "learning_rate": 0.00026205052735339457, "loss": 2.8656, "step": 9710 }, { "epoch": 0.5639850300269807, "grad_norm": 0.11923664063215256, "learning_rate": 0.00026147955755546686, "loss": 2.8625, "step": 9720 }, { "epoch": 0.5645652615393542, "grad_norm": 0.1174679845571518, "learning_rate": 0.00026090872959384353, "loss": 2.8589, "step": 9730 }, { "epoch": 0.5651454930517277, "grad_norm": 0.12439408898353577, "learning_rate": 0.00026033804557037304, "loss": 2.8573, "step": 9740 }, { "epoch": 0.5657257245641011, "grad_norm": 0.12268688529729843, "learning_rate": 0.0002597675075863735, "loss": 2.8612, "step": 9750 }, { "epoch": 0.5663059560764745, "grad_norm": 0.11994469910860062, "learning_rate": 0.0002591971177426256, "loss": 2.8667, "step": 9760 }, { "epoch": 0.5668861875888479, "grad_norm": 0.12739793956279755, "learning_rate": 0.0002586268781393648, "loss": 2.8657, "step": 9770 }, { "epoch": 0.5674664191012214, "grad_norm": 0.12942016124725342, "learning_rate": 0.00025805679087627267, "loss": 2.863, "step": 9780 }, { "epoch": 0.5680466506135948, "grad_norm": 0.12867708504199982, "learning_rate": 0.00025748685805247046, "loss": 2.8596, "step": 9790 }, { "epoch": 0.5686268821259682, "grad_norm": 0.1384700983762741, "learning_rate": 0.00025691708176651034, "loss": 2.8612, "step": 9800 }, { "epoch": 0.5692071136383416, "grad_norm": 0.11695626378059387, "learning_rate": 0.0002563474641163686, "loss": 2.8613, "step": 9810 }, { "epoch": 0.5697873451507152, "grad_norm": 0.12379258126020432, "learning_rate": 0.0002557780071994367, "loss": 2.8637, "step": 9820 }, { "epoch": 0.5703675766630886, "grad_norm": 0.13220758736133575, "learning_rate": 0.00025520871311251493, "loss": 2.8572, "step": 9830 }, { "epoch": 0.570947808175462, "grad_norm": 0.12004509568214417, "learning_rate": 0.00025463958395180377, "loss": 2.8614, "step": 9840 }, { "epoch": 0.5715280396878355, "grad_norm": 0.12457242608070374, "learning_rate": 0.0002540706218128962, "loss": 2.8606, "step": 9850 }, { "epoch": 0.5721082712002089, "grad_norm": 0.125260129570961, "learning_rate": 0.0002535018287907707, "loss": 2.8606, "step": 9860 }, { "epoch": 0.5726885027125823, "grad_norm": 0.11718660593032837, "learning_rate": 0.00025293320697978254, "loss": 2.86, "step": 9870 }, { "epoch": 0.5732687342249557, "grad_norm": 0.1096329316496849, "learning_rate": 0.0002523647584736568, "loss": 2.8743, "step": 9880 }, { "epoch": 0.5738489657373292, "grad_norm": 0.11327598243951797, "learning_rate": 0.0002517964853654806, "loss": 2.8492, "step": 9890 }, { "epoch": 0.5744291972497026, "grad_norm": 0.1237105280160904, "learning_rate": 0.0002512283897476949, "loss": 2.852, "step": 9900 }, { "epoch": 0.5750094287620761, "grad_norm": 0.11739984154701233, "learning_rate": 0.0002506604737120874, "loss": 2.8535, "step": 9910 }, { "epoch": 0.5755896602744495, "grad_norm": 0.12682320177555084, "learning_rate": 0.00025009273934978424, "loss": 2.8575, "step": 9920 }, { "epoch": 0.576169891786823, "grad_norm": 0.12347414344549179, "learning_rate": 0.00024952518875124305, "loss": 2.8596, "step": 9930 }, { "epoch": 0.5767501232991964, "grad_norm": 0.11207421123981476, "learning_rate": 0.0002489578240062444, "loss": 2.8563, "step": 9940 }, { "epoch": 0.5773303548115698, "grad_norm": 0.12151192873716354, "learning_rate": 0.0002483906472038848, "loss": 2.8513, "step": 9950 }, { "epoch": 0.5779105863239432, "grad_norm": 0.11661417037248611, "learning_rate": 0.00024782366043256876, "loss": 2.8538, "step": 9960 }, { "epoch": 0.5784908178363167, "grad_norm": 0.11908597499132156, "learning_rate": 0.0002472568657800007, "loss": 2.8549, "step": 9970 }, { "epoch": 0.5790710493486901, "grad_norm": 0.12369140982627869, "learning_rate": 0.00024669026533317816, "loss": 2.859, "step": 9980 }, { "epoch": 0.5796512808610635, "grad_norm": 0.12169597297906876, "learning_rate": 0.0002461238611783832, "loss": 2.8516, "step": 9990 }, { "epoch": 0.580231512373437, "grad_norm": 0.1137092188000679, "learning_rate": 0.0002455576554011753, "loss": 2.8506, "step": 10000 }, { "epoch": 0.580231512373437, "eval_loss": 2.8198139667510986, "eval_runtime": 3.2544, "eval_samples_per_second": 1330.504, "eval_steps_per_second": 2.765, "step": 10000 }, { "epoch": 0.5808117438858105, "grad_norm": 0.11945224553346634, "learning_rate": 0.00024499165008638355, "loss": 2.8527, "step": 10010 }, { "epoch": 0.5813919753981839, "grad_norm": 0.12194681167602539, "learning_rate": 0.0002444258473180986, "loss": 2.8676, "step": 10020 }, { "epoch": 0.5819722069105573, "grad_norm": 0.12587039172649384, "learning_rate": 0.00024386024917966563, "loss": 2.8468, "step": 10030 }, { "epoch": 0.5825524384229307, "grad_norm": 0.12192162871360779, "learning_rate": 0.0002432948577536762, "loss": 2.8484, "step": 10040 }, { "epoch": 0.5831326699353042, "grad_norm": 0.11401449888944626, "learning_rate": 0.00024272967512196093, "loss": 2.8636, "step": 10050 }, { "epoch": 0.5837129014476776, "grad_norm": 0.12227935343980789, "learning_rate": 0.0002421647033655812, "loss": 2.8497, "step": 10060 }, { "epoch": 0.584293132960051, "grad_norm": 0.11773716658353806, "learning_rate": 0.00024159994456482233, "loss": 2.857, "step": 10070 }, { "epoch": 0.5848733644724246, "grad_norm": 0.124253049492836, "learning_rate": 0.00024103540079918555, "loss": 2.8499, "step": 10080 }, { "epoch": 0.585453595984798, "grad_norm": 0.11704014986753464, "learning_rate": 0.00024047107414737985, "loss": 2.8522, "step": 10090 }, { "epoch": 0.5860338274971714, "grad_norm": 0.11885286867618561, "learning_rate": 0.0002399069666873153, "loss": 2.855, "step": 10100 }, { "epoch": 0.5866140590095448, "grad_norm": 0.12006965279579163, "learning_rate": 0.00023934308049609453, "loss": 2.8488, "step": 10110 }, { "epoch": 0.5871942905219183, "grad_norm": 0.12023113667964935, "learning_rate": 0.00023877941765000564, "loss": 2.8542, "step": 10120 }, { "epoch": 0.5877745220342917, "grad_norm": 0.12737338244915009, "learning_rate": 0.00023821598022451436, "loss": 2.8588, "step": 10130 }, { "epoch": 0.5883547535466651, "grad_norm": 0.11698620766401291, "learning_rate": 0.00023765277029425607, "loss": 2.8544, "step": 10140 }, { "epoch": 0.5889349850590385, "grad_norm": 0.12589864432811737, "learning_rate": 0.000237089789933029, "loss": 2.8448, "step": 10150 }, { "epoch": 0.589515216571412, "grad_norm": 0.11532309651374817, "learning_rate": 0.0002365270412137856, "loss": 2.8618, "step": 10160 }, { "epoch": 0.5900954480837854, "grad_norm": 0.10937913507223129, "learning_rate": 0.00023596452620862585, "loss": 2.8527, "step": 10170 }, { "epoch": 0.5906756795961589, "grad_norm": 0.11980416625738144, "learning_rate": 0.00023540224698878861, "loss": 2.8553, "step": 10180 }, { "epoch": 0.5912559111085323, "grad_norm": 0.11810686439275742, "learning_rate": 0.00023484020562464507, "loss": 2.8545, "step": 10190 }, { "epoch": 0.5918361426209058, "grad_norm": 0.11651547253131866, "learning_rate": 0.00023427840418569043, "loss": 2.8522, "step": 10200 }, { "epoch": 0.5924163741332792, "grad_norm": 0.11145967990159988, "learning_rate": 0.00023371684474053633, "loss": 2.8564, "step": 10210 }, { "epoch": 0.5929966056456526, "grad_norm": 0.11742381006479263, "learning_rate": 0.0002331555293569037, "loss": 2.8529, "step": 10220 }, { "epoch": 0.593576837158026, "grad_norm": 0.1287650465965271, "learning_rate": 0.00023259446010161425, "loss": 2.847, "step": 10230 }, { "epoch": 0.5941570686703995, "grad_norm": 0.12560808658599854, "learning_rate": 0.00023203363904058394, "loss": 2.8424, "step": 10240 }, { "epoch": 0.5947373001827729, "grad_norm": 0.13144509494304657, "learning_rate": 0.0002314730682388147, "loss": 2.8497, "step": 10250 }, { "epoch": 0.5953175316951463, "grad_norm": 0.11483640223741531, "learning_rate": 0.00023091274976038686, "loss": 2.8525, "step": 10260 }, { "epoch": 0.5958977632075197, "grad_norm": 0.12085619568824768, "learning_rate": 0.0002303526856684519, "loss": 2.846, "step": 10270 }, { "epoch": 0.5964779947198933, "grad_norm": 0.13581375777721405, "learning_rate": 0.00022979287802522423, "loss": 2.8471, "step": 10280 }, { "epoch": 0.5970582262322667, "grad_norm": 0.11522037535905838, "learning_rate": 0.00022923332889197447, "loss": 2.841, "step": 10290 }, { "epoch": 0.5976384577446401, "grad_norm": 0.1114853248000145, "learning_rate": 0.00022867404032902097, "loss": 2.8507, "step": 10300 }, { "epoch": 0.5982186892570136, "grad_norm": 0.1106984093785286, "learning_rate": 0.00022811501439572288, "loss": 2.8501, "step": 10310 }, { "epoch": 0.598798920769387, "grad_norm": 0.12095363438129425, "learning_rate": 0.0002275562531504724, "loss": 2.8392, "step": 10320 }, { "epoch": 0.5993791522817604, "grad_norm": 0.11527710407972336, "learning_rate": 0.00022699775865068667, "loss": 2.8498, "step": 10330 }, { "epoch": 0.5999593837941338, "grad_norm": 0.11631615459918976, "learning_rate": 0.00022643953295280127, "loss": 2.8526, "step": 10340 }, { "epoch": 0.6005396153065073, "grad_norm": 0.1107979491353035, "learning_rate": 0.0002258815781122614, "loss": 2.8488, "step": 10350 }, { "epoch": 0.6011198468188808, "grad_norm": 0.1126491129398346, "learning_rate": 0.00022532389618351532, "loss": 2.8404, "step": 10360 }, { "epoch": 0.6017000783312542, "grad_norm": 0.11740950495004654, "learning_rate": 0.00022476648922000646, "loss": 2.8499, "step": 10370 }, { "epoch": 0.6022803098436276, "grad_norm": 0.11938904970884323, "learning_rate": 0.00022420935927416547, "loss": 2.8547, "step": 10380 }, { "epoch": 0.6028605413560011, "grad_norm": 0.11484769731760025, "learning_rate": 0.00022365250839740338, "loss": 2.8392, "step": 10390 }, { "epoch": 0.6034407728683745, "grad_norm": 0.12051428109407425, "learning_rate": 0.0002230959386401032, "loss": 2.8416, "step": 10400 }, { "epoch": 0.6040210043807479, "grad_norm": 0.12364054471254349, "learning_rate": 0.00022253965205161326, "loss": 2.8343, "step": 10410 }, { "epoch": 0.6046012358931213, "grad_norm": 0.1125280112028122, "learning_rate": 0.00022198365068023892, "loss": 2.8441, "step": 10420 }, { "epoch": 0.6051814674054948, "grad_norm": 0.11715447157621384, "learning_rate": 0.00022142793657323558, "loss": 2.8391, "step": 10430 }, { "epoch": 0.6057616989178682, "grad_norm": 0.11433437466621399, "learning_rate": 0.00022087251177680086, "loss": 2.8549, "step": 10440 }, { "epoch": 0.6063419304302416, "grad_norm": 0.1222948208451271, "learning_rate": 0.00022031737833606686, "loss": 2.8406, "step": 10450 }, { "epoch": 0.6069221619426151, "grad_norm": 0.11805406212806702, "learning_rate": 0.0002197625382950932, "loss": 2.8415, "step": 10460 }, { "epoch": 0.6075023934549886, "grad_norm": 0.13002602756023407, "learning_rate": 0.00021920799369685892, "loss": 2.851, "step": 10470 }, { "epoch": 0.608082624967362, "grad_norm": 0.11929357796907425, "learning_rate": 0.00021865374658325544, "loss": 2.8437, "step": 10480 }, { "epoch": 0.6086628564797354, "grad_norm": 0.11752030998468399, "learning_rate": 0.00021809979899507876, "loss": 2.8532, "step": 10490 }, { "epoch": 0.6092430879921088, "grad_norm": 0.12201694399118423, "learning_rate": 0.00021754615297202168, "loss": 2.8474, "step": 10500 }, { "epoch": 0.6098233195044823, "grad_norm": 0.12019883096218109, "learning_rate": 0.00021699281055266706, "loss": 2.8422, "step": 10510 }, { "epoch": 0.6104035510168557, "grad_norm": 0.12413442134857178, "learning_rate": 0.00021643977377447954, "loss": 2.8316, "step": 10520 }, { "epoch": 0.6109837825292291, "grad_norm": 0.11983013898134232, "learning_rate": 0.00021588704467379862, "loss": 2.8448, "step": 10530 }, { "epoch": 0.6115640140416027, "grad_norm": 0.13365738093852997, "learning_rate": 0.0002153346252858306, "loss": 2.837, "step": 10540 }, { "epoch": 0.6121442455539761, "grad_norm": 0.13185539841651917, "learning_rate": 0.00021478251764464148, "loss": 2.8468, "step": 10550 }, { "epoch": 0.6127244770663495, "grad_norm": 0.1213960349559784, "learning_rate": 0.00021423072378314964, "loss": 2.8444, "step": 10560 }, { "epoch": 0.6133047085787229, "grad_norm": 0.12037312239408493, "learning_rate": 0.00021367924573311773, "loss": 2.8438, "step": 10570 }, { "epoch": 0.6138849400910964, "grad_norm": 0.12542636692523956, "learning_rate": 0.00021312808552514592, "loss": 2.8424, "step": 10580 }, { "epoch": 0.6144651716034698, "grad_norm": 0.14415085315704346, "learning_rate": 0.00021257724518866352, "loss": 2.8417, "step": 10590 }, { "epoch": 0.6150454031158432, "grad_norm": 0.1150176003575325, "learning_rate": 0.00021202672675192248, "loss": 2.8435, "step": 10600 }, { "epoch": 0.6156256346282166, "grad_norm": 0.11662835627794266, "learning_rate": 0.00021147653224198951, "loss": 2.8441, "step": 10610 }, { "epoch": 0.6162058661405901, "grad_norm": 0.11693531274795532, "learning_rate": 0.00021092666368473817, "loss": 2.8391, "step": 10620 }, { "epoch": 0.6167860976529635, "grad_norm": 0.11077579110860825, "learning_rate": 0.0002103771231048423, "loss": 2.8345, "step": 10630 }, { "epoch": 0.617366329165337, "grad_norm": 0.11653861403465271, "learning_rate": 0.00020982791252576773, "loss": 2.8448, "step": 10640 }, { "epoch": 0.6179465606777104, "grad_norm": 0.11749275773763657, "learning_rate": 0.00020927903396976552, "loss": 2.8558, "step": 10650 }, { "epoch": 0.6185267921900839, "grad_norm": 0.11677636206150055, "learning_rate": 0.00020873048945786382, "loss": 2.8353, "step": 10660 }, { "epoch": 0.6191070237024573, "grad_norm": 0.11745753139257431, "learning_rate": 0.00020818228100986106, "loss": 2.8494, "step": 10670 }, { "epoch": 0.6196872552148307, "grad_norm": 0.11747489869594574, "learning_rate": 0.00020763441064431827, "loss": 2.8397, "step": 10680 }, { "epoch": 0.6202674867272041, "grad_norm": 0.11356910318136215, "learning_rate": 0.00020708688037855138, "loss": 2.8472, "step": 10690 }, { "epoch": 0.6208477182395776, "grad_norm": 0.11063719540834427, "learning_rate": 0.00020653969222862435, "loss": 2.8508, "step": 10700 }, { "epoch": 0.621427949751951, "grad_norm": 0.10978058725595474, "learning_rate": 0.00020599284820934112, "loss": 2.8308, "step": 10710 }, { "epoch": 0.6220081812643244, "grad_norm": 0.11860186606645584, "learning_rate": 0.00020544635033423867, "loss": 2.8263, "step": 10720 }, { "epoch": 0.6225884127766979, "grad_norm": 0.1312050074338913, "learning_rate": 0.00020490020061557953, "loss": 2.8455, "step": 10730 }, { "epoch": 0.6231686442890714, "grad_norm": 0.13181331753730774, "learning_rate": 0.00020435440106434408, "loss": 2.8489, "step": 10740 }, { "epoch": 0.6237488758014448, "grad_norm": 0.1471181958913803, "learning_rate": 0.00020380895369022357, "loss": 2.8285, "step": 10750 }, { "epoch": 0.6243291073138182, "grad_norm": 0.12075991183519363, "learning_rate": 0.00020326386050161215, "loss": 2.8402, "step": 10760 }, { "epoch": 0.6249093388261916, "grad_norm": 0.1117480993270874, "learning_rate": 0.0002027191235056003, "loss": 2.8426, "step": 10770 }, { "epoch": 0.6254895703385651, "grad_norm": 0.11622477322816849, "learning_rate": 0.0002021747447079665, "loss": 2.8423, "step": 10780 }, { "epoch": 0.6260698018509385, "grad_norm": 0.11475232988595963, "learning_rate": 0.00020163072611317055, "loss": 2.835, "step": 10790 }, { "epoch": 0.6266500333633119, "grad_norm": 0.12252891808748245, "learning_rate": 0.00020108706972434606, "loss": 2.8381, "step": 10800 }, { "epoch": 0.6272302648756855, "grad_norm": 0.11319098621606827, "learning_rate": 0.00020054377754329258, "loss": 2.8326, "step": 10810 }, { "epoch": 0.6278104963880589, "grad_norm": 0.11103735119104385, "learning_rate": 0.00020000085157046902, "loss": 2.8292, "step": 10820 }, { "epoch": 0.6283907279004323, "grad_norm": 0.12254971265792847, "learning_rate": 0.00019945829380498556, "loss": 2.8379, "step": 10830 }, { "epoch": 0.6289709594128057, "grad_norm": 0.1253294050693512, "learning_rate": 0.00019891610624459674, "loss": 2.8404, "step": 10840 }, { "epoch": 0.6295511909251792, "grad_norm": 0.12701797485351562, "learning_rate": 0.0001983742908856942, "loss": 2.8331, "step": 10850 }, { "epoch": 0.6301314224375526, "grad_norm": 0.1351822167634964, "learning_rate": 0.00019783284972329845, "loss": 2.831, "step": 10860 }, { "epoch": 0.630711653949926, "grad_norm": 0.11504077911376953, "learning_rate": 0.00019729178475105292, "loss": 2.8397, "step": 10870 }, { "epoch": 0.6312918854622994, "grad_norm": 0.11900710314512253, "learning_rate": 0.00019675109796121523, "loss": 2.8328, "step": 10880 }, { "epoch": 0.6318721169746729, "grad_norm": 0.11879398673772812, "learning_rate": 0.00019621079134465096, "loss": 2.8275, "step": 10890 }, { "epoch": 0.6324523484870463, "grad_norm": 0.11795203387737274, "learning_rate": 0.00019567086689082562, "loss": 2.828, "step": 10900 }, { "epoch": 0.6330325799994198, "grad_norm": 0.1163572296500206, "learning_rate": 0.00019513132658779758, "loss": 2.8387, "step": 10910 }, { "epoch": 0.6336128115117932, "grad_norm": 0.11812139302492142, "learning_rate": 0.00019459217242221092, "loss": 2.8336, "step": 10920 }, { "epoch": 0.6341930430241667, "grad_norm": 0.11195320636034012, "learning_rate": 0.00019405340637928755, "loss": 2.8427, "step": 10930 }, { "epoch": 0.6347732745365401, "grad_norm": 0.11674754321575165, "learning_rate": 0.0001935150304428206, "loss": 2.8279, "step": 10940 }, { "epoch": 0.6353535060489135, "grad_norm": 0.11432943493127823, "learning_rate": 0.00019297704659516655, "loss": 2.8267, "step": 10950 }, { "epoch": 0.6359337375612869, "grad_norm": 0.12507887184619904, "learning_rate": 0.0001924394568172384, "loss": 2.8309, "step": 10960 }, { "epoch": 0.6365139690736604, "grad_norm": 0.12057894468307495, "learning_rate": 0.0001919022630884981, "loss": 2.8422, "step": 10970 }, { "epoch": 0.6370942005860338, "grad_norm": 0.11377721279859543, "learning_rate": 0.000191365467386949, "loss": 2.8381, "step": 10980 }, { "epoch": 0.6376744320984072, "grad_norm": 0.11800755560398102, "learning_rate": 0.00019082907168912932, "loss": 2.8331, "step": 10990 }, { "epoch": 0.6382546636107806, "grad_norm": 0.12301038950681686, "learning_rate": 0.00019029307797010402, "loss": 2.831, "step": 11000 }, { "epoch": 0.6382546636107806, "eval_loss": 2.796895742416382, "eval_runtime": 3.2627, "eval_samples_per_second": 1327.123, "eval_steps_per_second": 2.758, "step": 11000 }, { "epoch": 0.6388348951231542, "grad_norm": 0.1179603561758995, "learning_rate": 0.00018975748820345838, "loss": 2.8436, "step": 11010 }, { "epoch": 0.6394151266355276, "grad_norm": 0.13155020773410797, "learning_rate": 0.0001892223043612898, "loss": 2.8317, "step": 11020 }, { "epoch": 0.639995358147901, "grad_norm": 0.11468763649463654, "learning_rate": 0.00018868752841420122, "loss": 2.8284, "step": 11030 }, { "epoch": 0.6405755896602745, "grad_norm": 0.10960279405117035, "learning_rate": 0.00018815316233129393, "loss": 2.8286, "step": 11040 }, { "epoch": 0.6411558211726479, "grad_norm": 0.1298363208770752, "learning_rate": 0.00018761920808015966, "loss": 2.8326, "step": 11050 }, { "epoch": 0.6417360526850213, "grad_norm": 0.11535240709781647, "learning_rate": 0.00018708566762687403, "loss": 2.8281, "step": 11060 }, { "epoch": 0.6423162841973947, "grad_norm": 0.12528617680072784, "learning_rate": 0.00018655254293598866, "loss": 2.8179, "step": 11070 }, { "epoch": 0.6428965157097682, "grad_norm": 0.11952237784862518, "learning_rate": 0.00018601983597052468, "loss": 2.8294, "step": 11080 }, { "epoch": 0.6434767472221417, "grad_norm": 0.12121649086475372, "learning_rate": 0.00018548754869196496, "loss": 2.8336, "step": 11090 }, { "epoch": 0.6440569787345151, "grad_norm": 0.12465447187423706, "learning_rate": 0.00018495568306024687, "loss": 2.8314, "step": 11100 }, { "epoch": 0.6446372102468885, "grad_norm": 0.10858411341905594, "learning_rate": 0.00018442424103375563, "loss": 2.8191, "step": 11110 }, { "epoch": 0.645217441759262, "grad_norm": 0.1240803673863411, "learning_rate": 0.00018389322456931616, "loss": 2.8334, "step": 11120 }, { "epoch": 0.6457976732716354, "grad_norm": 0.11604313552379608, "learning_rate": 0.00018336263562218695, "loss": 2.8241, "step": 11130 }, { "epoch": 0.6463779047840088, "grad_norm": 0.10764401406049728, "learning_rate": 0.00018283247614605185, "loss": 2.8343, "step": 11140 }, { "epoch": 0.6469581362963822, "grad_norm": 0.11341771483421326, "learning_rate": 0.00018230274809301377, "loss": 2.8323, "step": 11150 }, { "epoch": 0.6475383678087557, "grad_norm": 0.11618595570325851, "learning_rate": 0.00018177345341358699, "loss": 2.8295, "step": 11160 }, { "epoch": 0.6481185993211291, "grad_norm": 0.11492364853620529, "learning_rate": 0.00018124459405668967, "loss": 2.8253, "step": 11170 }, { "epoch": 0.6486988308335025, "grad_norm": 0.12541726231575012, "learning_rate": 0.0001807161719696377, "loss": 2.8305, "step": 11180 }, { "epoch": 0.649279062345876, "grad_norm": 0.1240224838256836, "learning_rate": 0.0001801881890981362, "loss": 2.832, "step": 11190 }, { "epoch": 0.6498592938582495, "grad_norm": 0.12260005623102188, "learning_rate": 0.00017966064738627363, "loss": 2.8274, "step": 11200 }, { "epoch": 0.6504395253706229, "grad_norm": 0.11284399777650833, "learning_rate": 0.00017913354877651386, "loss": 2.8291, "step": 11210 }, { "epoch": 0.6510197568829963, "grad_norm": 0.11993937194347382, "learning_rate": 0.00017860689520968906, "loss": 2.8357, "step": 11220 }, { "epoch": 0.6515999883953697, "grad_norm": 0.11259515583515167, "learning_rate": 0.00017808068862499302, "loss": 2.8134, "step": 11230 }, { "epoch": 0.6521802199077432, "grad_norm": 0.1146656796336174, "learning_rate": 0.0001775549309599733, "loss": 2.8275, "step": 11240 }, { "epoch": 0.6527604514201166, "grad_norm": 0.11118417978286743, "learning_rate": 0.0001770296241505248, "loss": 2.8276, "step": 11250 }, { "epoch": 0.65334068293249, "grad_norm": 0.1155654564499855, "learning_rate": 0.00017650477013088218, "loss": 2.8333, "step": 11260 }, { "epoch": 0.6539209144448636, "grad_norm": 0.12370238453149796, "learning_rate": 0.000175980370833613, "loss": 2.8209, "step": 11270 }, { "epoch": 0.654501145957237, "grad_norm": 0.11332956701517105, "learning_rate": 0.00017545642818961045, "loss": 2.824, "step": 11280 }, { "epoch": 0.6550813774696104, "grad_norm": 0.11696597188711166, "learning_rate": 0.00017493294412808603, "loss": 2.8285, "step": 11290 }, { "epoch": 0.6556616089819838, "grad_norm": 0.11556991934776306, "learning_rate": 0.00017440992057656302, "loss": 2.833, "step": 11300 }, { "epoch": 0.6562418404943573, "grad_norm": 0.11072834581136703, "learning_rate": 0.000173887359460869, "loss": 2.8202, "step": 11310 }, { "epoch": 0.6568220720067307, "grad_norm": 0.12139474600553513, "learning_rate": 0.0001733652627051285, "loss": 2.8323, "step": 11320 }, { "epoch": 0.6574023035191041, "grad_norm": 0.11882605403661728, "learning_rate": 0.0001728436322317567, "loss": 2.8325, "step": 11330 }, { "epoch": 0.6579825350314775, "grad_norm": 0.10851707309484482, "learning_rate": 0.00017232246996145163, "loss": 2.8304, "step": 11340 }, { "epoch": 0.658562766543851, "grad_norm": 0.11566723883152008, "learning_rate": 0.0001718017778131873, "loss": 2.8359, "step": 11350 }, { "epoch": 0.6591429980562244, "grad_norm": 0.1224483922123909, "learning_rate": 0.00017128155770420673, "loss": 2.8246, "step": 11360 }, { "epoch": 0.6597232295685979, "grad_norm": 0.11472085118293762, "learning_rate": 0.00017076181155001492, "loss": 2.8274, "step": 11370 }, { "epoch": 0.6603034610809713, "grad_norm": 0.11463634669780731, "learning_rate": 0.00017024254126437149, "loss": 2.8208, "step": 11380 }, { "epoch": 0.6608836925933448, "grad_norm": 0.11640073359012604, "learning_rate": 0.00016972374875928427, "loss": 2.8351, "step": 11390 }, { "epoch": 0.6614639241057182, "grad_norm": 0.12146312743425369, "learning_rate": 0.00016920543594500147, "loss": 2.8249, "step": 11400 }, { "epoch": 0.6620441556180916, "grad_norm": 0.11683548241853714, "learning_rate": 0.00016868760473000524, "loss": 2.8281, "step": 11410 }, { "epoch": 0.662624387130465, "grad_norm": 0.11443763226270676, "learning_rate": 0.0001681702570210043, "loss": 2.8239, "step": 11420 }, { "epoch": 0.6632046186428385, "grad_norm": 0.1136617586016655, "learning_rate": 0.00016765339472292714, "loss": 2.827, "step": 11430 }, { "epoch": 0.6637848501552119, "grad_norm": 0.11093004792928696, "learning_rate": 0.00016713701973891472, "loss": 2.8359, "step": 11440 }, { "epoch": 0.6643650816675853, "grad_norm": 0.12110643088817596, "learning_rate": 0.00016662113397031413, "loss": 2.8164, "step": 11450 }, { "epoch": 0.6649453131799588, "grad_norm": 0.12236957252025604, "learning_rate": 0.00016610573931667065, "loss": 2.8295, "step": 11460 }, { "epoch": 0.6655255446923323, "grad_norm": 0.11643628776073456, "learning_rate": 0.0001655908376757214, "loss": 2.8199, "step": 11470 }, { "epoch": 0.6661057762047057, "grad_norm": 0.12198419123888016, "learning_rate": 0.00016507643094338818, "loss": 2.8234, "step": 11480 }, { "epoch": 0.6666860077170791, "grad_norm": 0.11697736382484436, "learning_rate": 0.00016456252101377042, "loss": 2.8309, "step": 11490 }, { "epoch": 0.6672662392294526, "grad_norm": 0.11377154290676117, "learning_rate": 0.00016404910977913824, "loss": 2.8174, "step": 11500 }, { "epoch": 0.667846470741826, "grad_norm": 0.1169874370098114, "learning_rate": 0.0001635361991299258, "loss": 2.8174, "step": 11510 }, { "epoch": 0.6684267022541994, "grad_norm": 0.11022408306598663, "learning_rate": 0.00016302379095472374, "loss": 2.8251, "step": 11520 }, { "epoch": 0.6690069337665728, "grad_norm": 0.11143022775650024, "learning_rate": 0.00016251188714027265, "loss": 2.832, "step": 11530 }, { "epoch": 0.6695871652789464, "grad_norm": 0.11829391121864319, "learning_rate": 0.00016200048957145597, "loss": 2.8181, "step": 11540 }, { "epoch": 0.6701673967913198, "grad_norm": 0.11668332666158676, "learning_rate": 0.00016148960013129303, "loss": 2.8163, "step": 11550 }, { "epoch": 0.6707476283036932, "grad_norm": 0.11444656550884247, "learning_rate": 0.0001609792207009325, "loss": 2.8171, "step": 11560 }, { "epoch": 0.6713278598160666, "grad_norm": 0.11538255959749222, "learning_rate": 0.00016046935315964476, "loss": 2.8192, "step": 11570 }, { "epoch": 0.6719080913284401, "grad_norm": 0.13890443742275238, "learning_rate": 0.0001599599993848155, "loss": 2.814, "step": 11580 }, { "epoch": 0.6724883228408135, "grad_norm": 0.10878733545541763, "learning_rate": 0.00015945116125193876, "loss": 2.8161, "step": 11590 }, { "epoch": 0.6730685543531869, "grad_norm": 0.11337769776582718, "learning_rate": 0.00015894284063460966, "loss": 2.8161, "step": 11600 }, { "epoch": 0.6736487858655603, "grad_norm": 0.1095629557967186, "learning_rate": 0.00015843503940451834, "loss": 2.8087, "step": 11610 }, { "epoch": 0.6742290173779338, "grad_norm": 0.1378069370985031, "learning_rate": 0.00015792775943144165, "loss": 2.8151, "step": 11620 }, { "epoch": 0.6748092488903072, "grad_norm": 0.1202809140086174, "learning_rate": 0.00015742100258323794, "loss": 2.831, "step": 11630 }, { "epoch": 0.6753894804026807, "grad_norm": 0.12298610061407089, "learning_rate": 0.00015691477072583894, "loss": 2.8247, "step": 11640 }, { "epoch": 0.6759697119150541, "grad_norm": 0.11947082728147507, "learning_rate": 0.00015640906572324319, "loss": 2.8238, "step": 11650 }, { "epoch": 0.6765499434274276, "grad_norm": 0.11039472371339798, "learning_rate": 0.00015590388943750988, "loss": 2.8267, "step": 11660 }, { "epoch": 0.677130174939801, "grad_norm": 0.11807908117771149, "learning_rate": 0.0001553992437287505, "loss": 2.8222, "step": 11670 }, { "epoch": 0.6777104064521744, "grad_norm": 0.11934113502502441, "learning_rate": 0.00015489513045512386, "loss": 2.8193, "step": 11680 }, { "epoch": 0.6782906379645478, "grad_norm": 0.11163033545017242, "learning_rate": 0.00015439155147282764, "loss": 2.8137, "step": 11690 }, { "epoch": 0.6788708694769213, "grad_norm": 0.11381068080663681, "learning_rate": 0.0001538885086360923, "loss": 2.8202, "step": 11700 }, { "epoch": 0.6794511009892947, "grad_norm": 0.11011006683111191, "learning_rate": 0.0001533860037971747, "loss": 2.8213, "step": 11710 }, { "epoch": 0.6800313325016681, "grad_norm": 0.11611464619636536, "learning_rate": 0.0001528840388063497, "loss": 2.8216, "step": 11720 }, { "epoch": 0.6806115640140415, "grad_norm": 0.10734301805496216, "learning_rate": 0.0001523826155119055, "loss": 2.8188, "step": 11730 }, { "epoch": 0.6811917955264151, "grad_norm": 0.12189003825187683, "learning_rate": 0.00015188173576013482, "loss": 2.8206, "step": 11740 }, { "epoch": 0.6817720270387885, "grad_norm": 0.11146776378154755, "learning_rate": 0.0001513814013953296, "loss": 2.8176, "step": 11750 }, { "epoch": 0.6823522585511619, "grad_norm": 0.11531021445989609, "learning_rate": 0.0001508816142597733, "loss": 2.8192, "step": 11760 }, { "epoch": 0.6829324900635354, "grad_norm": 0.11541693657636642, "learning_rate": 0.00015038237619373443, "loss": 2.8219, "step": 11770 }, { "epoch": 0.6835127215759088, "grad_norm": 0.11345332115888596, "learning_rate": 0.0001498836890354602, "loss": 2.8024, "step": 11780 }, { "epoch": 0.6840929530882822, "grad_norm": 0.10796009749174118, "learning_rate": 0.00014938555462116842, "loss": 2.8119, "step": 11790 }, { "epoch": 0.6846731846006556, "grad_norm": 0.11463455855846405, "learning_rate": 0.00014888797478504261, "loss": 2.8119, "step": 11800 }, { "epoch": 0.6852534161130291, "grad_norm": 0.11192594468593597, "learning_rate": 0.00014839095135922372, "loss": 2.8252, "step": 11810 }, { "epoch": 0.6858336476254026, "grad_norm": 0.11805829405784607, "learning_rate": 0.000147894486173804, "loss": 2.8095, "step": 11820 }, { "epoch": 0.686413879137776, "grad_norm": 0.11721805483102798, "learning_rate": 0.00014739858105682053, "loss": 2.8123, "step": 11830 }, { "epoch": 0.6869941106501494, "grad_norm": 0.11619780957698822, "learning_rate": 0.0001469032378342475, "loss": 2.8177, "step": 11840 }, { "epoch": 0.6875743421625229, "grad_norm": 0.10933215916156769, "learning_rate": 0.00014640845832999087, "loss": 2.8078, "step": 11850 }, { "epoch": 0.6881545736748963, "grad_norm": 0.11362309753894806, "learning_rate": 0.0001459142443658805, "loss": 2.8103, "step": 11860 }, { "epoch": 0.6887348051872697, "grad_norm": 0.10805781930685043, "learning_rate": 0.00014542059776166382, "loss": 2.8073, "step": 11870 }, { "epoch": 0.6893150366996431, "grad_norm": 0.124758280813694, "learning_rate": 0.00014492752033499977, "loss": 2.8133, "step": 11880 }, { "epoch": 0.6898952682120166, "grad_norm": 0.11096182465553284, "learning_rate": 0.00014443501390145057, "loss": 2.8061, "step": 11890 }, { "epoch": 0.69047549972439, "grad_norm": 0.1132817193865776, "learning_rate": 0.00014394308027447685, "loss": 2.8209, "step": 11900 }, { "epoch": 0.6910557312367634, "grad_norm": 0.10996360331773758, "learning_rate": 0.00014345172126542966, "loss": 2.8161, "step": 11910 }, { "epoch": 0.6916359627491369, "grad_norm": 0.11297384649515152, "learning_rate": 0.0001429609386835442, "loss": 2.8116, "step": 11920 }, { "epoch": 0.6922161942615104, "grad_norm": 0.12191120535135269, "learning_rate": 0.00014247073433593373, "loss": 2.8156, "step": 11930 }, { "epoch": 0.6927964257738838, "grad_norm": 0.11631318181753159, "learning_rate": 0.00014198111002758154, "loss": 2.8225, "step": 11940 }, { "epoch": 0.6933766572862572, "grad_norm": 0.14487071335315704, "learning_rate": 0.00014149206756133595, "loss": 2.8153, "step": 11950 }, { "epoch": 0.6939568887986306, "grad_norm": 0.11780226230621338, "learning_rate": 0.00014100360873790248, "loss": 2.8163, "step": 11960 }, { "epoch": 0.6945371203110041, "grad_norm": 0.11396613717079163, "learning_rate": 0.00014051573535583766, "loss": 2.8101, "step": 11970 }, { "epoch": 0.6951173518233775, "grad_norm": 0.11514125019311905, "learning_rate": 0.00014002844921154233, "loss": 2.819, "step": 11980 }, { "epoch": 0.6956975833357509, "grad_norm": 0.11687569320201874, "learning_rate": 0.00013954175209925513, "loss": 2.8106, "step": 11990 }, { "epoch": 0.6962778148481245, "grad_norm": 0.11218845099210739, "learning_rate": 0.00013905564581104607, "loss": 2.8156, "step": 12000 }, { "epoch": 0.6962778148481245, "eval_loss": 2.778130531311035, "eval_runtime": 3.2555, "eval_samples_per_second": 1330.053, "eval_steps_per_second": 2.765, "step": 12000 }, { "epoch": 0.6968580463604979, "grad_norm": 0.11513704061508179, "learning_rate": 0.000138570132136809, "loss": 2.8185, "step": 12010 }, { "epoch": 0.6974382778728713, "grad_norm": 0.12384956330060959, "learning_rate": 0.00013808521286425644, "loss": 2.8159, "step": 12020 }, { "epoch": 0.6980185093852447, "grad_norm": 0.11136494576931, "learning_rate": 0.0001376008897789119, "loss": 2.8196, "step": 12030 }, { "epoch": 0.6985987408976182, "grad_norm": 0.11704517900943756, "learning_rate": 0.00013711716466410353, "loss": 2.8118, "step": 12040 }, { "epoch": 0.6991789724099916, "grad_norm": 0.11521551758050919, "learning_rate": 0.00013663403930095827, "loss": 2.8131, "step": 12050 }, { "epoch": 0.699759203922365, "grad_norm": 0.10568945109844208, "learning_rate": 0.00013615151546839382, "loss": 2.8098, "step": 12060 }, { "epoch": 0.7003394354347384, "grad_norm": 0.1213884949684143, "learning_rate": 0.00013566959494311386, "loss": 2.8091, "step": 12070 }, { "epoch": 0.7009196669471119, "grad_norm": 0.11004059761762619, "learning_rate": 0.00013518827949960015, "loss": 2.8238, "step": 12080 }, { "epoch": 0.7014998984594853, "grad_norm": 0.11095508933067322, "learning_rate": 0.00013470757091010649, "loss": 2.8116, "step": 12090 }, { "epoch": 0.7020801299718588, "grad_norm": 0.11275944113731384, "learning_rate": 0.00013422747094465234, "loss": 2.8109, "step": 12100 }, { "epoch": 0.7026603614842322, "grad_norm": 0.11312493681907654, "learning_rate": 0.00013374798137101595, "loss": 2.814, "step": 12110 }, { "epoch": 0.7032405929966057, "grad_norm": 0.10738647729158401, "learning_rate": 0.00013326910395472833, "loss": 2.8111, "step": 12120 }, { "epoch": 0.7038208245089791, "grad_norm": 0.11198966205120087, "learning_rate": 0.00013279084045906623, "loss": 2.806, "step": 12130 }, { "epoch": 0.7044010560213525, "grad_norm": 0.11718153953552246, "learning_rate": 0.00013231319264504594, "loss": 2.8186, "step": 12140 }, { "epoch": 0.7049812875337259, "grad_norm": 0.11054380983114243, "learning_rate": 0.00013183616227141674, "loss": 2.8144, "step": 12150 }, { "epoch": 0.7055615190460994, "grad_norm": 0.11579257249832153, "learning_rate": 0.0001313597510946543, "loss": 2.8101, "step": 12160 }, { "epoch": 0.7061417505584728, "grad_norm": 0.10710903257131577, "learning_rate": 0.00013088396086895476, "loss": 2.8104, "step": 12170 }, { "epoch": 0.7067219820708462, "grad_norm": 0.11220473051071167, "learning_rate": 0.00013040879334622738, "loss": 2.8049, "step": 12180 }, { "epoch": 0.7073022135832197, "grad_norm": 0.10872667282819748, "learning_rate": 0.00012993425027608884, "loss": 2.8175, "step": 12190 }, { "epoch": 0.7078824450955932, "grad_norm": 0.10861840099096298, "learning_rate": 0.00012946033340585641, "loss": 2.8072, "step": 12200 }, { "epoch": 0.7084626766079666, "grad_norm": 0.11558268964290619, "learning_rate": 0.00012898704448054162, "loss": 2.8034, "step": 12210 }, { "epoch": 0.70904290812034, "grad_norm": 0.11709378659725189, "learning_rate": 0.00012851438524284382, "loss": 2.8047, "step": 12220 }, { "epoch": 0.7096231396327135, "grad_norm": 0.12139759957790375, "learning_rate": 0.00012804235743314401, "loss": 2.8056, "step": 12230 }, { "epoch": 0.7102033711450869, "grad_norm": 0.11130308359861374, "learning_rate": 0.00012757096278949792, "loss": 2.8138, "step": 12240 }, { "epoch": 0.7107836026574603, "grad_norm": 0.1112653836607933, "learning_rate": 0.00012710020304763003, "loss": 2.8004, "step": 12250 }, { "epoch": 0.7113638341698337, "grad_norm": 0.11182957142591476, "learning_rate": 0.00012663007994092703, "loss": 2.8064, "step": 12260 }, { "epoch": 0.7119440656822072, "grad_norm": 0.13386094570159912, "learning_rate": 0.00012616059520043145, "loss": 2.8148, "step": 12270 }, { "epoch": 0.7125242971945807, "grad_norm": 0.11641652137041092, "learning_rate": 0.0001256917505548352, "loss": 2.8102, "step": 12280 }, { "epoch": 0.7131045287069541, "grad_norm": 0.10916447639465332, "learning_rate": 0.00012522354773047352, "loss": 2.8148, "step": 12290 }, { "epoch": 0.7136847602193275, "grad_norm": 0.10887318104505539, "learning_rate": 0.0001247559884513182, "loss": 2.8047, "step": 12300 }, { "epoch": 0.714264991731701, "grad_norm": 0.11701834946870804, "learning_rate": 0.0001242890744389715, "loss": 2.8144, "step": 12310 }, { "epoch": 0.7148452232440744, "grad_norm": 0.10473381727933884, "learning_rate": 0.00012382280741265968, "loss": 2.8057, "step": 12320 }, { "epoch": 0.7154254547564478, "grad_norm": 0.10586260259151459, "learning_rate": 0.00012335718908922685, "loss": 2.8032, "step": 12330 }, { "epoch": 0.7160056862688212, "grad_norm": 0.10688824206590652, "learning_rate": 0.00012289222118312822, "loss": 2.8054, "step": 12340 }, { "epoch": 0.7165859177811947, "grad_norm": 0.11233460903167725, "learning_rate": 0.0001224279054064247, "loss": 2.801, "step": 12350 }, { "epoch": 0.7171661492935681, "grad_norm": 0.10600557923316956, "learning_rate": 0.00012196424346877541, "loss": 2.8035, "step": 12360 }, { "epoch": 0.7177463808059416, "grad_norm": 0.11300963163375854, "learning_rate": 0.00012150123707743219, "loss": 2.8098, "step": 12370 }, { "epoch": 0.718326612318315, "grad_norm": 0.11773265898227692, "learning_rate": 0.00012103888793723312, "loss": 2.8103, "step": 12380 }, { "epoch": 0.7189068438306885, "grad_norm": 0.11092250049114227, "learning_rate": 0.00012057719775059602, "loss": 2.8028, "step": 12390 }, { "epoch": 0.7194870753430619, "grad_norm": 0.10554751008749008, "learning_rate": 0.00012011616821751271, "loss": 2.8044, "step": 12400 }, { "epoch": 0.7200673068554353, "grad_norm": 0.1148175522685051, "learning_rate": 0.0001196558010355422, "loss": 2.8099, "step": 12410 }, { "epoch": 0.7206475383678087, "grad_norm": 0.10981535166501999, "learning_rate": 0.00011919609789980458, "loss": 2.7991, "step": 12420 }, { "epoch": 0.7212277698801822, "grad_norm": 0.11188452690839767, "learning_rate": 0.00011873706050297508, "loss": 2.8067, "step": 12430 }, { "epoch": 0.7218080013925556, "grad_norm": 0.11328940838575363, "learning_rate": 0.00011827869053527727, "loss": 2.8049, "step": 12440 }, { "epoch": 0.722388232904929, "grad_norm": 0.11542364954948425, "learning_rate": 0.00011782098968447774, "loss": 2.7988, "step": 12450 }, { "epoch": 0.7229684644173026, "grad_norm": 0.11087549477815628, "learning_rate": 0.00011736395963587857, "loss": 2.8102, "step": 12460 }, { "epoch": 0.723548695929676, "grad_norm": 0.11298040300607681, "learning_rate": 0.00011690760207231256, "loss": 2.8063, "step": 12470 }, { "epoch": 0.7241289274420494, "grad_norm": 0.10775293409824371, "learning_rate": 0.00011645191867413596, "loss": 2.8065, "step": 12480 }, { "epoch": 0.7247091589544228, "grad_norm": 0.11240221560001373, "learning_rate": 0.00011599691111922272, "loss": 2.8062, "step": 12490 }, { "epoch": 0.7252893904667963, "grad_norm": 0.1069854348897934, "learning_rate": 0.00011554258108295859, "loss": 2.79, "step": 12500 }, { "epoch": 0.7258696219791697, "grad_norm": 0.11566832661628723, "learning_rate": 0.00011508893023823393, "loss": 2.7977, "step": 12510 }, { "epoch": 0.7264498534915431, "grad_norm": 0.11771980673074722, "learning_rate": 0.00011463596025543905, "loss": 2.803, "step": 12520 }, { "epoch": 0.7270300850039165, "grad_norm": 0.11435101926326752, "learning_rate": 0.0001141836728024567, "loss": 2.7985, "step": 12530 }, { "epoch": 0.72761031651629, "grad_norm": 0.10902056097984314, "learning_rate": 0.0001137320695446566, "loss": 2.8096, "step": 12540 }, { "epoch": 0.7281905480286635, "grad_norm": 0.10939980298280716, "learning_rate": 0.0001132811521448896, "loss": 2.8121, "step": 12550 }, { "epoch": 0.7287707795410369, "grad_norm": 0.10922636091709137, "learning_rate": 0.00011283092226348031, "loss": 2.8093, "step": 12560 }, { "epoch": 0.7293510110534103, "grad_norm": 0.10520195960998535, "learning_rate": 0.00011238138155822275, "loss": 2.8031, "step": 12570 }, { "epoch": 0.7299312425657838, "grad_norm": 0.10655706375837326, "learning_rate": 0.00011193253168437253, "loss": 2.8083, "step": 12580 }, { "epoch": 0.7305114740781572, "grad_norm": 0.11627507954835892, "learning_rate": 0.00011148437429464215, "loss": 2.7994, "step": 12590 }, { "epoch": 0.7310917055905306, "grad_norm": 0.1093965470790863, "learning_rate": 0.00011103691103919401, "loss": 2.8054, "step": 12600 }, { "epoch": 0.731671937102904, "grad_norm": 0.113887257874012, "learning_rate": 0.00011059014356563458, "loss": 2.7963, "step": 12610 }, { "epoch": 0.7322521686152775, "grad_norm": 0.10929399728775024, "learning_rate": 0.00011014407351900879, "loss": 2.8033, "step": 12620 }, { "epoch": 0.7328324001276509, "grad_norm": 0.11176785826683044, "learning_rate": 0.00010969870254179285, "loss": 2.8061, "step": 12630 }, { "epoch": 0.7334126316400243, "grad_norm": 0.10631275177001953, "learning_rate": 0.00010925403227388973, "loss": 2.8107, "step": 12640 }, { "epoch": 0.7339928631523978, "grad_norm": 0.11108485609292984, "learning_rate": 0.00010881006435262179, "loss": 2.8059, "step": 12650 }, { "epoch": 0.7345730946647713, "grad_norm": 0.10749488323926926, "learning_rate": 0.00010836680041272536, "loss": 2.8004, "step": 12660 }, { "epoch": 0.7351533261771447, "grad_norm": 0.10994744300842285, "learning_rate": 0.00010792424208634495, "loss": 2.8093, "step": 12670 }, { "epoch": 0.7357335576895181, "grad_norm": 0.10910103470087051, "learning_rate": 0.00010748239100302627, "loss": 2.7928, "step": 12680 }, { "epoch": 0.7363137892018915, "grad_norm": 0.10835743695497513, "learning_rate": 0.0001070412487897117, "loss": 2.8077, "step": 12690 }, { "epoch": 0.736894020714265, "grad_norm": 0.10580655187368393, "learning_rate": 0.00010660081707073288, "loss": 2.7991, "step": 12700 }, { "epoch": 0.7374742522266384, "grad_norm": 0.10928157716989517, "learning_rate": 0.00010616109746780546, "loss": 2.7905, "step": 12710 }, { "epoch": 0.7380544837390118, "grad_norm": 0.10654684156179428, "learning_rate": 0.00010572209160002339, "loss": 2.8021, "step": 12720 }, { "epoch": 0.7386347152513854, "grad_norm": 0.10834140330553055, "learning_rate": 0.00010528380108385186, "loss": 2.805, "step": 12730 }, { "epoch": 0.7392149467637588, "grad_norm": 0.1152142882347107, "learning_rate": 0.00010484622753312279, "loss": 2.7916, "step": 12740 }, { "epoch": 0.7397951782761322, "grad_norm": 0.10981319844722748, "learning_rate": 0.0001044093725590277, "loss": 2.8029, "step": 12750 }, { "epoch": 0.7403754097885056, "grad_norm": 0.1065368577837944, "learning_rate": 0.00010397323777011229, "loss": 2.8048, "step": 12760 }, { "epoch": 0.7409556413008791, "grad_norm": 0.10563939809799194, "learning_rate": 0.00010353782477227083, "loss": 2.8058, "step": 12770 }, { "epoch": 0.7415358728132525, "grad_norm": 0.11117275804281235, "learning_rate": 0.00010310313516873922, "loss": 2.7985, "step": 12780 }, { "epoch": 0.7421161043256259, "grad_norm": 0.11544723808765411, "learning_rate": 0.00010266917056009036, "loss": 2.8001, "step": 12790 }, { "epoch": 0.7426963358379993, "grad_norm": 0.11005005240440369, "learning_rate": 0.00010223593254422733, "loss": 2.7954, "step": 12800 }, { "epoch": 0.7432765673503728, "grad_norm": 0.11374104768037796, "learning_rate": 0.0001018034227163779, "loss": 2.8053, "step": 12810 }, { "epoch": 0.7438567988627462, "grad_norm": 0.11264318227767944, "learning_rate": 0.00010137164266908854, "loss": 2.8029, "step": 12820 }, { "epoch": 0.7444370303751197, "grad_norm": 0.10718287527561188, "learning_rate": 0.00010094059399221855, "loss": 2.7964, "step": 12830 }, { "epoch": 0.7450172618874931, "grad_norm": 0.11395127326250076, "learning_rate": 0.00010051027827293457, "loss": 2.8057, "step": 12840 }, { "epoch": 0.7455974933998666, "grad_norm": 0.11251317709684372, "learning_rate": 0.00010008069709570378, "loss": 2.8036, "step": 12850 }, { "epoch": 0.74617772491224, "grad_norm": 0.1180030032992363, "learning_rate": 9.965185204228941e-05, "loss": 2.8016, "step": 12860 }, { "epoch": 0.7467579564246134, "grad_norm": 0.12361141294240952, "learning_rate": 9.922374469174372e-05, "loss": 2.7891, "step": 12870 }, { "epoch": 0.7473381879369868, "grad_norm": 0.11456003040075302, "learning_rate": 9.879637662040275e-05, "loss": 2.8028, "step": 12880 }, { "epoch": 0.7479184194493603, "grad_norm": 0.11008987575769424, "learning_rate": 9.83697494018808e-05, "loss": 2.8093, "step": 12890 }, { "epoch": 0.7484986509617337, "grad_norm": 0.11017616838216782, "learning_rate": 9.794386460706356e-05, "loss": 2.8005, "step": 12900 }, { "epoch": 0.7490788824741071, "grad_norm": 0.11627316474914551, "learning_rate": 9.751872380410378e-05, "loss": 2.799, "step": 12910 }, { "epoch": 0.7496591139864806, "grad_norm": 0.11369270831346512, "learning_rate": 9.709432855841436e-05, "loss": 2.7941, "step": 12920 }, { "epoch": 0.7502393454988541, "grad_norm": 0.10983362793922424, "learning_rate": 9.667068043266302e-05, "loss": 2.7996, "step": 12930 }, { "epoch": 0.7508195770112275, "grad_norm": 0.10419350117444992, "learning_rate": 9.624778098676652e-05, "loss": 2.8052, "step": 12940 }, { "epoch": 0.7513998085236009, "grad_norm": 0.10500075668096542, "learning_rate": 9.582563177788487e-05, "loss": 2.7993, "step": 12950 }, { "epoch": 0.7519800400359744, "grad_norm": 0.10765775293111801, "learning_rate": 9.540423436041585e-05, "loss": 2.7964, "step": 12960 }, { "epoch": 0.7525602715483478, "grad_norm": 0.10872151702642441, "learning_rate": 9.49835902859888e-05, "loss": 2.7876, "step": 12970 }, { "epoch": 0.7531405030607212, "grad_norm": 0.10935165733098984, "learning_rate": 9.456370110345927e-05, "loss": 2.8003, "step": 12980 }, { "epoch": 0.7537207345730946, "grad_norm": 0.1083398386836052, "learning_rate": 9.414456835890322e-05, "loss": 2.7945, "step": 12990 }, { "epoch": 0.7543009660854681, "grad_norm": 0.10846253484487534, "learning_rate": 9.372619359561121e-05, "loss": 2.799, "step": 13000 }, { "epoch": 0.7543009660854681, "eval_loss": 2.7616169452667236, "eval_runtime": 3.2768, "eval_samples_per_second": 1321.408, "eval_steps_per_second": 2.747, "step": 13000 }, { "epoch": 0.7548811975978416, "grad_norm": 0.10937865823507309, "learning_rate": 9.330857835408318e-05, "loss": 2.7962, "step": 13010 }, { "epoch": 0.755461429110215, "grad_norm": 0.10633205622434616, "learning_rate": 9.289172417202205e-05, "loss": 2.7989, "step": 13020 }, { "epoch": 0.7560416606225884, "grad_norm": 0.11001235246658325, "learning_rate": 9.247563258432861e-05, "loss": 2.7955, "step": 13030 }, { "epoch": 0.7566218921349619, "grad_norm": 0.10847952216863632, "learning_rate": 9.206030512309566e-05, "loss": 2.7959, "step": 13040 }, { "epoch": 0.7572021236473353, "grad_norm": 0.10858704149723053, "learning_rate": 9.164574331760246e-05, "loss": 2.7965, "step": 13050 }, { "epoch": 0.7577823551597087, "grad_norm": 0.10710106790065765, "learning_rate": 9.123194869430888e-05, "loss": 2.7921, "step": 13060 }, { "epoch": 0.7583625866720821, "grad_norm": 0.10932508111000061, "learning_rate": 9.081892277685026e-05, "loss": 2.7921, "step": 13070 }, { "epoch": 0.7589428181844556, "grad_norm": 0.11362321674823761, "learning_rate": 9.040666708603125e-05, "loss": 2.7981, "step": 13080 }, { "epoch": 0.759523049696829, "grad_norm": 0.10791613906621933, "learning_rate": 8.999518313982039e-05, "loss": 2.7993, "step": 13090 }, { "epoch": 0.7601032812092025, "grad_norm": 0.11038652807474136, "learning_rate": 8.958447245334476e-05, "loss": 2.7922, "step": 13100 }, { "epoch": 0.7606835127215759, "grad_norm": 0.11153964698314667, "learning_rate": 8.91745365388841e-05, "loss": 2.8016, "step": 13110 }, { "epoch": 0.7612637442339494, "grad_norm": 0.10748942941427231, "learning_rate": 8.876537690586529e-05, "loss": 2.791, "step": 13120 }, { "epoch": 0.7618439757463228, "grad_norm": 0.1106482520699501, "learning_rate": 8.83569950608572e-05, "loss": 2.8008, "step": 13130 }, { "epoch": 0.7624242072586962, "grad_norm": 0.10443028807640076, "learning_rate": 8.794939250756441e-05, "loss": 2.7936, "step": 13140 }, { "epoch": 0.7630044387710696, "grad_norm": 0.11383570730686188, "learning_rate": 8.754257074682222e-05, "loss": 2.7912, "step": 13150 }, { "epoch": 0.7635846702834431, "grad_norm": 0.10836578160524368, "learning_rate": 8.713653127659105e-05, "loss": 2.7939, "step": 13160 }, { "epoch": 0.7641649017958165, "grad_norm": 0.10870825499296188, "learning_rate": 8.673127559195066e-05, "loss": 2.7991, "step": 13170 }, { "epoch": 0.7647451333081899, "grad_norm": 0.10718671977519989, "learning_rate": 8.632680518509492e-05, "loss": 2.7879, "step": 13180 }, { "epoch": 0.7653253648205635, "grad_norm": 0.11277935653924942, "learning_rate": 8.592312154532637e-05, "loss": 2.7947, "step": 13190 }, { "epoch": 0.7659055963329369, "grad_norm": 0.11088382452726364, "learning_rate": 8.552022615905038e-05, "loss": 2.7996, "step": 13200 }, { "epoch": 0.7664858278453103, "grad_norm": 0.10912182927131653, "learning_rate": 8.511812050977003e-05, "loss": 2.7943, "step": 13210 }, { "epoch": 0.7670660593576837, "grad_norm": 0.10919041931629181, "learning_rate": 8.471680607808035e-05, "loss": 2.7992, "step": 13220 }, { "epoch": 0.7676462908700572, "grad_norm": 0.10616286844015121, "learning_rate": 8.431628434166309e-05, "loss": 2.7977, "step": 13230 }, { "epoch": 0.7682265223824306, "grad_norm": 0.10572168231010437, "learning_rate": 8.391655677528143e-05, "loss": 2.7959, "step": 13240 }, { "epoch": 0.768806753894804, "grad_norm": 0.10937794297933578, "learning_rate": 8.3517624850774e-05, "loss": 2.793, "step": 13250 }, { "epoch": 0.7693869854071774, "grad_norm": 0.10820769518613815, "learning_rate": 8.311949003704996e-05, "loss": 2.7991, "step": 13260 }, { "epoch": 0.769967216919551, "grad_norm": 0.10802992433309555, "learning_rate": 8.272215380008343e-05, "loss": 2.7965, "step": 13270 }, { "epoch": 0.7705474484319244, "grad_norm": 0.10747858881950378, "learning_rate": 8.232561760290794e-05, "loss": 2.7957, "step": 13280 }, { "epoch": 0.7711276799442978, "grad_norm": 0.11238089948892593, "learning_rate": 8.192988290561157e-05, "loss": 2.7922, "step": 13290 }, { "epoch": 0.7717079114566712, "grad_norm": 0.1034981980919838, "learning_rate": 8.153495116533056e-05, "loss": 2.789, "step": 13300 }, { "epoch": 0.7722881429690447, "grad_norm": 0.10910629481077194, "learning_rate": 8.11408238362453e-05, "loss": 2.7899, "step": 13310 }, { "epoch": 0.7728683744814181, "grad_norm": 0.11309719830751419, "learning_rate": 8.07475023695737e-05, "loss": 2.7978, "step": 13320 }, { "epoch": 0.7734486059937915, "grad_norm": 0.10908596217632294, "learning_rate": 8.035498821356664e-05, "loss": 2.7938, "step": 13330 }, { "epoch": 0.7740288375061649, "grad_norm": 0.11714279651641846, "learning_rate": 7.996328281350252e-05, "loss": 2.7967, "step": 13340 }, { "epoch": 0.7746090690185384, "grad_norm": 0.10943669080734253, "learning_rate": 7.957238761168135e-05, "loss": 2.7803, "step": 13350 }, { "epoch": 0.7751893005309118, "grad_norm": 0.11171719431877136, "learning_rate": 7.918230404742045e-05, "loss": 2.7941, "step": 13360 }, { "epoch": 0.7757695320432852, "grad_norm": 0.10363152623176575, "learning_rate": 7.879303355704834e-05, "loss": 2.8043, "step": 13370 }, { "epoch": 0.7763497635556587, "grad_norm": 0.1147744432091713, "learning_rate": 7.840457757389968e-05, "loss": 2.8022, "step": 13380 }, { "epoch": 0.7769299950680322, "grad_norm": 0.10682083666324615, "learning_rate": 7.801693752831012e-05, "loss": 2.7914, "step": 13390 }, { "epoch": 0.7775102265804056, "grad_norm": 0.11352023482322693, "learning_rate": 7.763011484761082e-05, "loss": 2.7958, "step": 13400 }, { "epoch": 0.778090458092779, "grad_norm": 0.10785870254039764, "learning_rate": 7.724411095612366e-05, "loss": 2.7971, "step": 13410 }, { "epoch": 0.7786706896051525, "grad_norm": 0.10762759298086166, "learning_rate": 7.68589272751551e-05, "loss": 2.7916, "step": 13420 }, { "epoch": 0.7792509211175259, "grad_norm": 0.10556434839963913, "learning_rate": 7.647456522299207e-05, "loss": 2.784, "step": 13430 }, { "epoch": 0.7798311526298993, "grad_norm": 0.1077750101685524, "learning_rate": 7.609102621489577e-05, "loss": 2.7906, "step": 13440 }, { "epoch": 0.7804113841422727, "grad_norm": 0.10472170263528824, "learning_rate": 7.570831166309693e-05, "loss": 2.7833, "step": 13450 }, { "epoch": 0.7809916156546463, "grad_norm": 0.1061674952507019, "learning_rate": 7.532642297679093e-05, "loss": 2.796, "step": 13460 }, { "epoch": 0.7815718471670197, "grad_norm": 0.10716653615236282, "learning_rate": 7.494536156213151e-05, "loss": 2.791, "step": 13470 }, { "epoch": 0.7821520786793931, "grad_norm": 0.11008104681968689, "learning_rate": 7.456512882222703e-05, "loss": 2.7874, "step": 13480 }, { "epoch": 0.7827323101917665, "grad_norm": 0.11095033586025238, "learning_rate": 7.418572615713413e-05, "loss": 2.7874, "step": 13490 }, { "epoch": 0.78331254170414, "grad_norm": 0.10690274834632874, "learning_rate": 7.380715496385316e-05, "loss": 2.7897, "step": 13500 }, { "epoch": 0.7838927732165134, "grad_norm": 0.10463336110115051, "learning_rate": 7.34294166363231e-05, "loss": 2.7965, "step": 13510 }, { "epoch": 0.7844730047288868, "grad_norm": 0.10628803819417953, "learning_rate": 7.30525125654157e-05, "loss": 2.7878, "step": 13520 }, { "epoch": 0.7850532362412602, "grad_norm": 0.10758186876773834, "learning_rate": 7.267644413893152e-05, "loss": 2.7893, "step": 13530 }, { "epoch": 0.7856334677536337, "grad_norm": 0.10785481333732605, "learning_rate": 7.230121274159384e-05, "loss": 2.7896, "step": 13540 }, { "epoch": 0.7862136992660071, "grad_norm": 0.10700030624866486, "learning_rate": 7.192681975504382e-05, "loss": 2.786, "step": 13550 }, { "epoch": 0.7867939307783806, "grad_norm": 0.10182949900627136, "learning_rate": 7.155326655783597e-05, "loss": 2.7889, "step": 13560 }, { "epoch": 0.787374162290754, "grad_norm": 0.10802864283323288, "learning_rate": 7.118055452543193e-05, "loss": 2.7946, "step": 13570 }, { "epoch": 0.7879543938031275, "grad_norm": 0.10849913954734802, "learning_rate": 7.080868503019672e-05, "loss": 2.786, "step": 13580 }, { "epoch": 0.7885346253155009, "grad_norm": 0.10770730674266815, "learning_rate": 7.043765944139264e-05, "loss": 2.7804, "step": 13590 }, { "epoch": 0.7891148568278743, "grad_norm": 0.11441770195960999, "learning_rate": 7.006747912517475e-05, "loss": 2.79, "step": 13600 }, { "epoch": 0.7896950883402477, "grad_norm": 0.10908571630716324, "learning_rate": 6.9698145444586e-05, "loss": 2.7897, "step": 13610 }, { "epoch": 0.7902753198526212, "grad_norm": 0.10705877095460892, "learning_rate": 6.932965975955134e-05, "loss": 2.7857, "step": 13620 }, { "epoch": 0.7908555513649946, "grad_norm": 0.11635982990264893, "learning_rate": 6.896202342687397e-05, "loss": 2.7888, "step": 13630 }, { "epoch": 0.791435782877368, "grad_norm": 0.1107436865568161, "learning_rate": 6.859523780022911e-05, "loss": 2.7902, "step": 13640 }, { "epoch": 0.7920160143897415, "grad_norm": 0.11131720244884491, "learning_rate": 6.822930423016003e-05, "loss": 2.7982, "step": 13650 }, { "epoch": 0.792596245902115, "grad_norm": 0.10535065829753876, "learning_rate": 6.786422406407247e-05, "loss": 2.7838, "step": 13660 }, { "epoch": 0.7931764774144884, "grad_norm": 0.10784085094928741, "learning_rate": 6.749999864622973e-05, "loss": 2.7778, "step": 13670 }, { "epoch": 0.7937567089268618, "grad_norm": 0.10266363620758057, "learning_rate": 6.713662931774818e-05, "loss": 2.7929, "step": 13680 }, { "epoch": 0.7943369404392353, "grad_norm": 0.11121921241283417, "learning_rate": 6.677411741659145e-05, "loss": 2.787, "step": 13690 }, { "epoch": 0.7949171719516087, "grad_norm": 0.10687406361103058, "learning_rate": 6.641246427756657e-05, "loss": 2.7915, "step": 13700 }, { "epoch": 0.7954974034639821, "grad_norm": 0.10604474693536758, "learning_rate": 6.605167123231822e-05, "loss": 2.7816, "step": 13710 }, { "epoch": 0.7960776349763555, "grad_norm": 0.10484491288661957, "learning_rate": 6.569173960932404e-05, "loss": 2.7844, "step": 13720 }, { "epoch": 0.796657866488729, "grad_norm": 0.10788851231336594, "learning_rate": 6.533267073389034e-05, "loss": 2.7815, "step": 13730 }, { "epoch": 0.7972380980011025, "grad_norm": 0.10421809554100037, "learning_rate": 6.49744659281459e-05, "loss": 2.7953, "step": 13740 }, { "epoch": 0.7978183295134759, "grad_norm": 0.10567434132099152, "learning_rate": 6.461712651103859e-05, "loss": 2.7898, "step": 13750 }, { "epoch": 0.7983985610258493, "grad_norm": 0.10381162911653519, "learning_rate": 6.426065379832959e-05, "loss": 2.7902, "step": 13760 }, { "epoch": 0.7989787925382228, "grad_norm": 0.10707089304924011, "learning_rate": 6.390504910258867e-05, "loss": 2.7923, "step": 13770 }, { "epoch": 0.7995590240505962, "grad_norm": 0.10568366944789886, "learning_rate": 6.355031373318961e-05, "loss": 2.793, "step": 13780 }, { "epoch": 0.8001392555629696, "grad_norm": 0.10662976652383804, "learning_rate": 6.319644899630514e-05, "loss": 2.7954, "step": 13790 }, { "epoch": 0.800719487075343, "grad_norm": 0.10822783410549164, "learning_rate": 6.28434561949024e-05, "loss": 2.7875, "step": 13800 }, { "epoch": 0.8012997185877165, "grad_norm": 0.10903995484113693, "learning_rate": 6.249133662873783e-05, "loss": 2.7952, "step": 13810 }, { "epoch": 0.8018799501000899, "grad_norm": 0.11016574501991272, "learning_rate": 6.214009159435254e-05, "loss": 2.7833, "step": 13820 }, { "epoch": 0.8024601816124634, "grad_norm": 0.10669629275798798, "learning_rate": 6.178972238506758e-05, "loss": 2.7966, "step": 13830 }, { "epoch": 0.8030404131248368, "grad_norm": 0.10725666582584381, "learning_rate": 6.144023029097891e-05, "loss": 2.781, "step": 13840 }, { "epoch": 0.8036206446372103, "grad_norm": 0.10259473323822021, "learning_rate": 6.10916165989533e-05, "loss": 2.7858, "step": 13850 }, { "epoch": 0.8042008761495837, "grad_norm": 0.10819372534751892, "learning_rate": 6.0743882592622736e-05, "loss": 2.782, "step": 13860 }, { "epoch": 0.8047811076619571, "grad_norm": 0.09982424229383469, "learning_rate": 6.039702955238026e-05, "loss": 2.7767, "step": 13870 }, { "epoch": 0.8053613391743305, "grad_norm": 0.11254626512527466, "learning_rate": 6.005105875537515e-05, "loss": 2.7773, "step": 13880 }, { "epoch": 0.805941570686704, "grad_norm": 0.10880761593580246, "learning_rate": 5.970597147550808e-05, "loss": 2.7925, "step": 13890 }, { "epoch": 0.8065218021990774, "grad_norm": 0.10454876720905304, "learning_rate": 5.936176898342649e-05, "loss": 2.7887, "step": 13900 }, { "epoch": 0.8071020337114508, "grad_norm": 0.10871117562055588, "learning_rate": 5.9018452546520165e-05, "loss": 2.7914, "step": 13910 }, { "epoch": 0.8076822652238244, "grad_norm": 0.10645408183336258, "learning_rate": 5.8676023428916175e-05, "loss": 2.7946, "step": 13920 }, { "epoch": 0.8082624967361978, "grad_norm": 0.11597729474306107, "learning_rate": 5.83344828914743e-05, "loss": 2.7917, "step": 13930 }, { "epoch": 0.8088427282485712, "grad_norm": 0.1034785658121109, "learning_rate": 5.799383219178264e-05, "loss": 2.7912, "step": 13940 }, { "epoch": 0.8094229597609446, "grad_norm": 0.10739534348249435, "learning_rate": 5.7654072584152787e-05, "loss": 2.7848, "step": 13950 }, { "epoch": 0.8100031912733181, "grad_norm": 0.10825861990451813, "learning_rate": 5.731520531961505e-05, "loss": 2.7908, "step": 13960 }, { "epoch": 0.8105834227856915, "grad_norm": 0.10880185663700104, "learning_rate": 5.697723164591441e-05, "loss": 2.7904, "step": 13970 }, { "epoch": 0.8111636542980649, "grad_norm": 0.1085624098777771, "learning_rate": 5.6640152807505236e-05, "loss": 2.7839, "step": 13980 }, { "epoch": 0.8117438858104383, "grad_norm": 0.10740832984447479, "learning_rate": 5.630397004554713e-05, "loss": 2.7858, "step": 13990 }, { "epoch": 0.8123241173228118, "grad_norm": 0.10401804000139236, "learning_rate": 5.596868459790025e-05, "loss": 2.7802, "step": 14000 }, { "epoch": 0.8123241173228118, "eval_loss": 2.749423027038574, "eval_runtime": 3.2586, "eval_samples_per_second": 1328.792, "eval_steps_per_second": 2.762, "step": 14000 }, { "epoch": 0.8129043488351853, "grad_norm": 0.10784956812858582, "learning_rate": 5.563429769912071e-05, "loss": 2.7852, "step": 14010 }, { "epoch": 0.8134845803475587, "grad_norm": 0.10523492097854614, "learning_rate": 5.530081058045606e-05, "loss": 2.7856, "step": 14020 }, { "epoch": 0.8140648118599321, "grad_norm": 0.10354667156934738, "learning_rate": 5.4968224469840935e-05, "loss": 2.7826, "step": 14030 }, { "epoch": 0.8146450433723056, "grad_norm": 0.10460636019706726, "learning_rate": 5.4636540591892164e-05, "loss": 2.7844, "step": 14040 }, { "epoch": 0.815225274884679, "grad_norm": 0.11116158217191696, "learning_rate": 5.430576016790453e-05, "loss": 2.7879, "step": 14050 }, { "epoch": 0.8158055063970524, "grad_norm": 0.11445162445306778, "learning_rate": 5.3975884415846206e-05, "loss": 2.7847, "step": 14060 }, { "epoch": 0.8163857379094258, "grad_norm": 0.10757939517498016, "learning_rate": 5.3646914550354204e-05, "loss": 2.7884, "step": 14070 }, { "epoch": 0.8169659694217993, "grad_norm": 0.10770777612924576, "learning_rate": 5.331885178273015e-05, "loss": 2.775, "step": 14080 }, { "epoch": 0.8175462009341727, "grad_norm": 0.10863149166107178, "learning_rate": 5.2991697320935486e-05, "loss": 2.7883, "step": 14090 }, { "epoch": 0.8181264324465461, "grad_norm": 0.10049009323120117, "learning_rate": 5.266545236958718e-05, "loss": 2.7878, "step": 14100 }, { "epoch": 0.8187066639589196, "grad_norm": 0.104975625872612, "learning_rate": 5.2340118129953346e-05, "loss": 2.7806, "step": 14110 }, { "epoch": 0.8192868954712931, "grad_norm": 0.10563846677541733, "learning_rate": 5.201569579994865e-05, "loss": 2.7807, "step": 14120 }, { "epoch": 0.8198671269836665, "grad_norm": 0.10182633996009827, "learning_rate": 5.1692186574130324e-05, "loss": 2.7782, "step": 14130 }, { "epoch": 0.8204473584960399, "grad_norm": 0.10903611779212952, "learning_rate": 5.1369591643692896e-05, "loss": 2.7792, "step": 14140 }, { "epoch": 0.8210275900084134, "grad_norm": 0.10453125089406967, "learning_rate": 5.1047912196464944e-05, "loss": 2.7814, "step": 14150 }, { "epoch": 0.8216078215207868, "grad_norm": 0.11026264727115631, "learning_rate": 5.072714941690387e-05, "loss": 2.7847, "step": 14160 }, { "epoch": 0.8221880530331602, "grad_norm": 0.10732634365558624, "learning_rate": 5.040730448609166e-05, "loss": 2.7716, "step": 14170 }, { "epoch": 0.8227682845455336, "grad_norm": 0.10351432114839554, "learning_rate": 5.008837858173113e-05, "loss": 2.7883, "step": 14180 }, { "epoch": 0.8233485160579072, "grad_norm": 0.10946208238601685, "learning_rate": 4.9770372878140575e-05, "loss": 2.786, "step": 14190 }, { "epoch": 0.8239287475702806, "grad_norm": 0.1038416251540184, "learning_rate": 4.9453288546250494e-05, "loss": 2.7799, "step": 14200 }, { "epoch": 0.824508979082654, "grad_norm": 0.10568647086620331, "learning_rate": 4.913712675359861e-05, "loss": 2.7874, "step": 14210 }, { "epoch": 0.8250892105950274, "grad_norm": 0.10334275662899017, "learning_rate": 4.882188866432568e-05, "loss": 2.7835, "step": 14220 }, { "epoch": 0.8256694421074009, "grad_norm": 0.10559739917516708, "learning_rate": 4.850757543917144e-05, "loss": 2.7791, "step": 14230 }, { "epoch": 0.8262496736197743, "grad_norm": 0.1026688888669014, "learning_rate": 4.819418823546999e-05, "loss": 2.7777, "step": 14240 }, { "epoch": 0.8268299051321477, "grad_norm": 0.10159046947956085, "learning_rate": 4.788172820714611e-05, "loss": 2.7876, "step": 14250 }, { "epoch": 0.8274101366445211, "grad_norm": 0.114133320748806, "learning_rate": 4.7570196504710026e-05, "loss": 2.7777, "step": 14260 }, { "epoch": 0.8279903681568946, "grad_norm": 0.10327325016260147, "learning_rate": 4.725959427525432e-05, "loss": 2.7976, "step": 14270 }, { "epoch": 0.828570599669268, "grad_norm": 0.10618502646684647, "learning_rate": 4.694992266244889e-05, "loss": 2.7904, "step": 14280 }, { "epoch": 0.8291508311816415, "grad_norm": 0.10732074081897736, "learning_rate": 4.6641182806537e-05, "loss": 2.7724, "step": 14290 }, { "epoch": 0.8297310626940149, "grad_norm": 0.10467931628227234, "learning_rate": 4.63333758443313e-05, "loss": 2.7843, "step": 14300 }, { "epoch": 0.8303112942063884, "grad_norm": 0.10281146317720413, "learning_rate": 4.6026502909209004e-05, "loss": 2.7842, "step": 14310 }, { "epoch": 0.8308915257187618, "grad_norm": 0.1023208498954773, "learning_rate": 4.572056513110867e-05, "loss": 2.774, "step": 14320 }, { "epoch": 0.8314717572311352, "grad_norm": 0.10323374718427658, "learning_rate": 4.541556363652511e-05, "loss": 2.7755, "step": 14330 }, { "epoch": 0.8320519887435086, "grad_norm": 0.10136920213699341, "learning_rate": 4.5111499548505727e-05, "loss": 2.7814, "step": 14340 }, { "epoch": 0.8326322202558821, "grad_norm": 0.10571028292179108, "learning_rate": 4.4808373986646565e-05, "loss": 2.7878, "step": 14350 }, { "epoch": 0.8332124517682555, "grad_norm": 0.10252848267555237, "learning_rate": 4.45061880670874e-05, "loss": 2.7754, "step": 14360 }, { "epoch": 0.8337926832806289, "grad_norm": 0.10471548140048981, "learning_rate": 4.420494290250869e-05, "loss": 2.7767, "step": 14370 }, { "epoch": 0.8343729147930025, "grad_norm": 0.10701679438352585, "learning_rate": 4.390463960212658e-05, "loss": 2.7792, "step": 14380 }, { "epoch": 0.8349531463053759, "grad_norm": 0.10377515107393265, "learning_rate": 4.3605279271689264e-05, "loss": 2.7829, "step": 14390 }, { "epoch": 0.8355333778177493, "grad_norm": 0.10350141674280167, "learning_rate": 4.330686301347298e-05, "loss": 2.7861, "step": 14400 }, { "epoch": 0.8361136093301227, "grad_norm": 0.10299152880907059, "learning_rate": 4.300939192627742e-05, "loss": 2.7891, "step": 14410 }, { "epoch": 0.8366938408424962, "grad_norm": 0.1038345992565155, "learning_rate": 4.2712867105422465e-05, "loss": 2.7812, "step": 14420 }, { "epoch": 0.8372740723548696, "grad_norm": 0.10262761265039444, "learning_rate": 4.241728964274352e-05, "loss": 2.7784, "step": 14430 }, { "epoch": 0.837854303867243, "grad_norm": 0.10034337639808655, "learning_rate": 4.212266062658777e-05, "loss": 2.7857, "step": 14440 }, { "epoch": 0.8384345353796164, "grad_norm": 0.10054679960012436, "learning_rate": 4.1828981141810104e-05, "loss": 2.7783, "step": 14450 }, { "epoch": 0.83901476689199, "grad_norm": 0.10352133959531784, "learning_rate": 4.15362522697691e-05, "loss": 2.7936, "step": 14460 }, { "epoch": 0.8395949984043634, "grad_norm": 0.10465723276138306, "learning_rate": 4.124447508832332e-05, "loss": 2.7692, "step": 14470 }, { "epoch": 0.8401752299167368, "grad_norm": 0.10384640097618103, "learning_rate": 4.095365067182665e-05, "loss": 2.781, "step": 14480 }, { "epoch": 0.8407554614291102, "grad_norm": 0.10312188416719437, "learning_rate": 4.066378009112523e-05, "loss": 2.7767, "step": 14490 }, { "epoch": 0.8413356929414837, "grad_norm": 0.10447024554014206, "learning_rate": 4.037486441355288e-05, "loss": 2.7832, "step": 14500 }, { "epoch": 0.8419159244538571, "grad_norm": 0.10162138938903809, "learning_rate": 4.008690470292732e-05, "loss": 2.7786, "step": 14510 }, { "epoch": 0.8424961559662305, "grad_norm": 0.09777431935071945, "learning_rate": 3.979990201954653e-05, "loss": 2.7792, "step": 14520 }, { "epoch": 0.8430763874786039, "grad_norm": 0.10050346702337265, "learning_rate": 3.9513857420184216e-05, "loss": 2.7866, "step": 14530 }, { "epoch": 0.8436566189909774, "grad_norm": 0.10209480673074722, "learning_rate": 3.922877195808678e-05, "loss": 2.7886, "step": 14540 }, { "epoch": 0.8442368505033508, "grad_norm": 0.10496553033590317, "learning_rate": 3.894464668296864e-05, "loss": 2.7854, "step": 14550 }, { "epoch": 0.8448170820157243, "grad_norm": 0.10205195099115372, "learning_rate": 3.8661482641008866e-05, "loss": 2.7869, "step": 14560 }, { "epoch": 0.8453973135280977, "grad_norm": 0.10940441489219666, "learning_rate": 3.837928087484711e-05, "loss": 2.7799, "step": 14570 }, { "epoch": 0.8459775450404712, "grad_norm": 0.10287832468748093, "learning_rate": 3.8098042423579766e-05, "loss": 2.7804, "step": 14580 }, { "epoch": 0.8465577765528446, "grad_norm": 0.0999421551823616, "learning_rate": 3.781776832275639e-05, "loss": 2.7835, "step": 14590 }, { "epoch": 0.847138008065218, "grad_norm": 0.10340355336666107, "learning_rate": 3.753845960437557e-05, "loss": 2.7831, "step": 14600 }, { "epoch": 0.8477182395775914, "grad_norm": 0.10355892032384872, "learning_rate": 3.72601172968812e-05, "loss": 2.7749, "step": 14610 }, { "epoch": 0.8482984710899649, "grad_norm": 0.10467097908258438, "learning_rate": 3.6982742425158886e-05, "loss": 2.7834, "step": 14620 }, { "epoch": 0.8488787026023383, "grad_norm": 0.1060672402381897, "learning_rate": 3.670633601053182e-05, "loss": 2.7801, "step": 14630 }, { "epoch": 0.8494589341147117, "grad_norm": 0.10443491488695145, "learning_rate": 3.643089907075759e-05, "loss": 2.7896, "step": 14640 }, { "epoch": 0.8500391656270853, "grad_norm": 0.1023486852645874, "learning_rate": 3.6156432620023726e-05, "loss": 2.7691, "step": 14650 }, { "epoch": 0.8506193971394587, "grad_norm": 0.10417921096086502, "learning_rate": 3.5882937668944476e-05, "loss": 2.7703, "step": 14660 }, { "epoch": 0.8511996286518321, "grad_norm": 0.10138606280088425, "learning_rate": 3.561041522455691e-05, "loss": 2.7885, "step": 14670 }, { "epoch": 0.8517798601642055, "grad_norm": 0.10121186077594757, "learning_rate": 3.5338866290317204e-05, "loss": 2.7721, "step": 14680 }, { "epoch": 0.852360091676579, "grad_norm": 0.10391680151224136, "learning_rate": 3.506829186609691e-05, "loss": 2.7818, "step": 14690 }, { "epoch": 0.8529403231889524, "grad_norm": 0.10207725316286087, "learning_rate": 3.479869294817955e-05, "loss": 2.775, "step": 14700 }, { "epoch": 0.8535205547013258, "grad_norm": 0.10676626861095428, "learning_rate": 3.4530070529256524e-05, "loss": 2.7759, "step": 14710 }, { "epoch": 0.8541007862136992, "grad_norm": 0.10105539858341217, "learning_rate": 3.42624255984237e-05, "loss": 2.7855, "step": 14720 }, { "epoch": 0.8546810177260727, "grad_norm": 0.10040144622325897, "learning_rate": 3.399575914117777e-05, "loss": 2.7736, "step": 14730 }, { "epoch": 0.8552612492384462, "grad_norm": 0.10322125256061554, "learning_rate": 3.3730072139412456e-05, "loss": 2.7834, "step": 14740 }, { "epoch": 0.8558414807508196, "grad_norm": 0.10220754891633987, "learning_rate": 3.3465365571415315e-05, "loss": 2.7692, "step": 14750 }, { "epoch": 0.856421712263193, "grad_norm": 0.10107099264860153, "learning_rate": 3.3201640411863584e-05, "loss": 2.7672, "step": 14760 }, { "epoch": 0.8570019437755665, "grad_norm": 0.10284842550754547, "learning_rate": 3.293889763182089e-05, "loss": 2.7851, "step": 14770 }, { "epoch": 0.8575821752879399, "grad_norm": 0.10386528819799423, "learning_rate": 3.26771381987337e-05, "loss": 2.7787, "step": 14780 }, { "epoch": 0.8581624068003133, "grad_norm": 0.1039406880736351, "learning_rate": 3.241636307642769e-05, "loss": 2.7838, "step": 14790 }, { "epoch": 0.8587426383126867, "grad_norm": 0.1034376472234726, "learning_rate": 3.2156573225104145e-05, "loss": 2.7794, "step": 14800 }, { "epoch": 0.8593228698250602, "grad_norm": 0.10199546813964844, "learning_rate": 3.189776960133645e-05, "loss": 2.7806, "step": 14810 }, { "epoch": 0.8599031013374336, "grad_norm": 0.10086624324321747, "learning_rate": 3.163995315806681e-05, "loss": 2.7666, "step": 14820 }, { "epoch": 0.860483332849807, "grad_norm": 0.10021676123142242, "learning_rate": 3.138312484460228e-05, "loss": 2.7738, "step": 14830 }, { "epoch": 0.8610635643621805, "grad_norm": 0.10465867072343826, "learning_rate": 3.112728560661164e-05, "loss": 2.7786, "step": 14840 }, { "epoch": 0.861643795874554, "grad_norm": 0.10076703131198883, "learning_rate": 3.0872436386121776e-05, "loss": 2.7705, "step": 14850 }, { "epoch": 0.8622240273869274, "grad_norm": 0.10121941566467285, "learning_rate": 3.061857812151414e-05, "loss": 2.7737, "step": 14860 }, { "epoch": 0.8628042588993008, "grad_norm": 0.10309196263551712, "learning_rate": 3.0365711747521538e-05, "loss": 2.7783, "step": 14870 }, { "epoch": 0.8633844904116743, "grad_norm": 0.10456740111112595, "learning_rate": 3.011383819522446e-05, "loss": 2.7809, "step": 14880 }, { "epoch": 0.8639647219240477, "grad_norm": 0.1025143563747406, "learning_rate": 2.986295839204764e-05, "loss": 2.7813, "step": 14890 }, { "epoch": 0.8645449534364211, "grad_norm": 0.10585116595029831, "learning_rate": 2.961307326175688e-05, "loss": 2.7738, "step": 14900 }, { "epoch": 0.8651251849487945, "grad_norm": 0.10203658789396286, "learning_rate": 2.936418372445527e-05, "loss": 2.7777, "step": 14910 }, { "epoch": 0.865705416461168, "grad_norm": 0.10538860410451889, "learning_rate": 2.911629069658037e-05, "loss": 2.7757, "step": 14920 }, { "epoch": 0.8662856479735415, "grad_norm": 0.10184674710035324, "learning_rate": 2.8869395090900037e-05, "loss": 2.7797, "step": 14930 }, { "epoch": 0.8668658794859149, "grad_norm": 0.10757064819335938, "learning_rate": 2.862349781650991e-05, "loss": 2.7837, "step": 14940 }, { "epoch": 0.8674461109982883, "grad_norm": 0.09947676211595535, "learning_rate": 2.8378599778829492e-05, "loss": 2.7764, "step": 14950 }, { "epoch": 0.8680263425106618, "grad_norm": 0.0980169028043747, "learning_rate": 2.8134701879598965e-05, "loss": 2.7877, "step": 14960 }, { "epoch": 0.8686065740230352, "grad_norm": 0.09837668389081955, "learning_rate": 2.7891805016876057e-05, "loss": 2.7806, "step": 14970 }, { "epoch": 0.8691868055354086, "grad_norm": 0.09911120682954788, "learning_rate": 2.7649910085032277e-05, "loss": 2.7807, "step": 14980 }, { "epoch": 0.869767037047782, "grad_norm": 0.09837288409471512, "learning_rate": 2.7409017974750257e-05, "loss": 2.7677, "step": 14990 }, { "epoch": 0.8703472685601555, "grad_norm": 0.10560393333435059, "learning_rate": 2.7169129573019943e-05, "loss": 2.7785, "step": 15000 }, { "epoch": 0.8703472685601555, "eval_loss": 2.7414441108703613, "eval_runtime": 3.2661, "eval_samples_per_second": 1325.755, "eval_steps_per_second": 2.756, "step": 15000 }, { "epoch": 0.870927500072529, "grad_norm": 0.09839779883623123, "learning_rate": 2.6930245763135504e-05, "loss": 2.7759, "step": 15010 }, { "epoch": 0.8715077315849024, "grad_norm": 0.09770379960536957, "learning_rate": 2.6692367424692272e-05, "loss": 2.787, "step": 15020 }, { "epoch": 0.8720879630972758, "grad_norm": 0.09834130108356476, "learning_rate": 2.645549543358304e-05, "loss": 2.7731, "step": 15030 }, { "epoch": 0.8726681946096493, "grad_norm": 0.1047162264585495, "learning_rate": 2.6219630661995528e-05, "loss": 2.7832, "step": 15040 }, { "epoch": 0.8732484261220227, "grad_norm": 0.10111907124519348, "learning_rate": 2.5984773978408257e-05, "loss": 2.779, "step": 15050 }, { "epoch": 0.8738286576343961, "grad_norm": 0.10093654692173004, "learning_rate": 2.5750926247588322e-05, "loss": 2.768, "step": 15060 }, { "epoch": 0.8744088891467695, "grad_norm": 0.10071719437837601, "learning_rate": 2.551808833058755e-05, "loss": 2.7867, "step": 15070 }, { "epoch": 0.874989120659143, "grad_norm": 0.10237322747707367, "learning_rate": 2.5286261084739445e-05, "loss": 2.7838, "step": 15080 }, { "epoch": 0.8755693521715164, "grad_norm": 0.09815766662359238, "learning_rate": 2.5055445363656358e-05, "loss": 2.7839, "step": 15090 }, { "epoch": 0.8761495836838898, "grad_norm": 0.10203532874584198, "learning_rate": 2.482564201722581e-05, "loss": 2.7878, "step": 15100 }, { "epoch": 0.8767298151962634, "grad_norm": 0.10766585171222687, "learning_rate": 2.4596851891607884e-05, "loss": 2.7823, "step": 15110 }, { "epoch": 0.8773100467086368, "grad_norm": 0.09876078367233276, "learning_rate": 2.4369075829231766e-05, "loss": 2.7762, "step": 15120 }, { "epoch": 0.8778902782210102, "grad_norm": 0.10014016181230545, "learning_rate": 2.414231466879274e-05, "loss": 2.7733, "step": 15130 }, { "epoch": 0.8784705097333836, "grad_norm": 0.10114018619060516, "learning_rate": 2.3916569245249306e-05, "loss": 2.7861, "step": 15140 }, { "epoch": 0.8790507412457571, "grad_norm": 0.10012462735176086, "learning_rate": 2.3691840389819526e-05, "loss": 2.7635, "step": 15150 }, { "epoch": 0.8796309727581305, "grad_norm": 0.10367590934038162, "learning_rate": 2.3468128929978757e-05, "loss": 2.7727, "step": 15160 }, { "epoch": 0.8802112042705039, "grad_norm": 0.10224179178476334, "learning_rate": 2.3245435689456015e-05, "loss": 2.7712, "step": 15170 }, { "epoch": 0.8807914357828773, "grad_norm": 0.0989450216293335, "learning_rate": 2.302376148823102e-05, "loss": 2.7761, "step": 15180 }, { "epoch": 0.8813716672952508, "grad_norm": 0.10036759078502655, "learning_rate": 2.2803107142531617e-05, "loss": 2.7815, "step": 15190 }, { "epoch": 0.8819518988076243, "grad_norm": 0.10400567203760147, "learning_rate": 2.2583473464830005e-05, "loss": 2.7826, "step": 15200 }, { "epoch": 0.8825321303199977, "grad_norm": 0.09990741312503815, "learning_rate": 2.2364861263840507e-05, "loss": 2.7869, "step": 15210 }, { "epoch": 0.8831123618323711, "grad_norm": 0.10067487508058548, "learning_rate": 2.2147271344516128e-05, "loss": 2.7771, "step": 15220 }, { "epoch": 0.8836925933447446, "grad_norm": 0.10068360716104507, "learning_rate": 2.1930704508045714e-05, "loss": 2.781, "step": 15230 }, { "epoch": 0.884272824857118, "grad_norm": 0.10076344013214111, "learning_rate": 2.171516155185117e-05, "loss": 2.7793, "step": 15240 }, { "epoch": 0.8848530563694914, "grad_norm": 0.0988764762878418, "learning_rate": 2.1500643269584027e-05, "loss": 2.772, "step": 15250 }, { "epoch": 0.8854332878818648, "grad_norm": 0.09937159717082977, "learning_rate": 2.1287150451123224e-05, "loss": 2.7786, "step": 15260 }, { "epoch": 0.8860135193942383, "grad_norm": 0.10244645178318024, "learning_rate": 2.1074683882571675e-05, "loss": 2.7752, "step": 15270 }, { "epoch": 0.8865937509066117, "grad_norm": 0.09691537171602249, "learning_rate": 2.0863244346253517e-05, "loss": 2.7735, "step": 15280 }, { "epoch": 0.8871739824189852, "grad_norm": 0.09877140074968338, "learning_rate": 2.065283262071128e-05, "loss": 2.777, "step": 15290 }, { "epoch": 0.8877542139313586, "grad_norm": 0.09832227975130081, "learning_rate": 2.044344948070289e-05, "loss": 2.7718, "step": 15300 }, { "epoch": 0.8883344454437321, "grad_norm": 0.09934905916452408, "learning_rate": 2.02350956971992e-05, "loss": 2.7725, "step": 15310 }, { "epoch": 0.8889146769561055, "grad_norm": 0.09960002452135086, "learning_rate": 2.0027772037380463e-05, "loss": 2.77, "step": 15320 }, { "epoch": 0.8894949084684789, "grad_norm": 0.10142461210489273, "learning_rate": 1.9821479264634234e-05, "loss": 2.7781, "step": 15330 }, { "epoch": 0.8900751399808524, "grad_norm": 0.09648580849170685, "learning_rate": 1.96162181385521e-05, "loss": 2.7774, "step": 15340 }, { "epoch": 0.8906553714932258, "grad_norm": 0.09822871536016464, "learning_rate": 1.9411989414926953e-05, "loss": 2.7718, "step": 15350 }, { "epoch": 0.8912356030055992, "grad_norm": 0.1000954881310463, "learning_rate": 1.9208793845750504e-05, "loss": 2.7763, "step": 15360 }, { "epoch": 0.8918158345179726, "grad_norm": 0.10170748084783554, "learning_rate": 1.9006632179209925e-05, "loss": 2.78, "step": 15370 }, { "epoch": 0.8923960660303462, "grad_norm": 0.10458207130432129, "learning_rate": 1.8805505159685807e-05, "loss": 2.77, "step": 15380 }, { "epoch": 0.8929762975427196, "grad_norm": 0.09986699372529984, "learning_rate": 1.8605413527748823e-05, "loss": 2.776, "step": 15390 }, { "epoch": 0.893556529055093, "grad_norm": 0.09813553094863892, "learning_rate": 1.8406358020157364e-05, "loss": 2.7711, "step": 15400 }, { "epoch": 0.8941367605674664, "grad_norm": 0.09960541874170303, "learning_rate": 1.8208339369854663e-05, "loss": 2.7781, "step": 15410 }, { "epoch": 0.8947169920798399, "grad_norm": 0.09737250953912735, "learning_rate": 1.801135830596605e-05, "loss": 2.7657, "step": 15420 }, { "epoch": 0.8952972235922133, "grad_norm": 0.0949782207608223, "learning_rate": 1.7815415553796575e-05, "loss": 2.7705, "step": 15430 }, { "epoch": 0.8958774551045867, "grad_norm": 0.09773328900337219, "learning_rate": 1.762051183482788e-05, "loss": 2.7684, "step": 15440 }, { "epoch": 0.8964576866169601, "grad_norm": 0.09638100862503052, "learning_rate": 1.7426647866715925e-05, "loss": 2.7724, "step": 15450 }, { "epoch": 0.8970379181293336, "grad_norm": 0.09620904177427292, "learning_rate": 1.7233824363288118e-05, "loss": 2.7738, "step": 15460 }, { "epoch": 0.897618149641707, "grad_norm": 0.09929810464382172, "learning_rate": 1.7042042034540783e-05, "loss": 2.7754, "step": 15470 }, { "epoch": 0.8981983811540805, "grad_norm": 0.09778960049152374, "learning_rate": 1.6851301586636613e-05, "loss": 2.7766, "step": 15480 }, { "epoch": 0.8987786126664539, "grad_norm": 0.09684190899133682, "learning_rate": 1.6661603721901873e-05, "loss": 2.7777, "step": 15490 }, { "epoch": 0.8993588441788274, "grad_norm": 0.09664195775985718, "learning_rate": 1.6472949138823967e-05, "loss": 2.7859, "step": 15500 }, { "epoch": 0.8999390756912008, "grad_norm": 0.10036718100309372, "learning_rate": 1.628533853204883e-05, "loss": 2.7713, "step": 15510 }, { "epoch": 0.9005193072035742, "grad_norm": 0.09811628609895706, "learning_rate": 1.6098772592378417e-05, "loss": 2.7733, "step": 15520 }, { "epoch": 0.9010995387159476, "grad_norm": 0.09862551838159561, "learning_rate": 1.591325200676795e-05, "loss": 2.7701, "step": 15530 }, { "epoch": 0.9016797702283211, "grad_norm": 0.09947618097066879, "learning_rate": 1.5728777458323803e-05, "loss": 2.7771, "step": 15540 }, { "epoch": 0.9022600017406945, "grad_norm": 0.09834101796150208, "learning_rate": 1.554534962630053e-05, "loss": 2.7768, "step": 15550 }, { "epoch": 0.902840233253068, "grad_norm": 0.10113567858934402, "learning_rate": 1.5362969186098594e-05, "loss": 2.7682, "step": 15560 }, { "epoch": 0.9034204647654415, "grad_norm": 0.0977102592587471, "learning_rate": 1.5181636809261921e-05, "loss": 2.7769, "step": 15570 }, { "epoch": 0.9040006962778149, "grad_norm": 0.09831026196479797, "learning_rate": 1.5001353163475283e-05, "loss": 2.7681, "step": 15580 }, { "epoch": 0.9045809277901883, "grad_norm": 0.09537149965763092, "learning_rate": 1.4822118912561943e-05, "loss": 2.7628, "step": 15590 }, { "epoch": 0.9051611593025617, "grad_norm": 0.09654498845338821, "learning_rate": 1.4643934716481253e-05, "loss": 2.7676, "step": 15600 }, { "epoch": 0.9057413908149352, "grad_norm": 0.09738855808973312, "learning_rate": 1.446680123132603e-05, "loss": 2.7744, "step": 15610 }, { "epoch": 0.9063216223273086, "grad_norm": 0.10082467645406723, "learning_rate": 1.4290719109320382e-05, "loss": 2.7706, "step": 15620 }, { "epoch": 0.906901853839682, "grad_norm": 0.10283984988927841, "learning_rate": 1.4115688998817043e-05, "loss": 2.7742, "step": 15630 }, { "epoch": 0.9074820853520554, "grad_norm": 0.09994236379861832, "learning_rate": 1.3941711544295287e-05, "loss": 2.7638, "step": 15640 }, { "epoch": 0.908062316864429, "grad_norm": 0.09737379103899002, "learning_rate": 1.3768787386358282e-05, "loss": 2.7715, "step": 15650 }, { "epoch": 0.9086425483768024, "grad_norm": 0.09915235638618469, "learning_rate": 1.3596917161730902e-05, "loss": 2.7694, "step": 15660 }, { "epoch": 0.9092227798891758, "grad_norm": 0.09791626036167145, "learning_rate": 1.3426101503257358e-05, "loss": 2.7628, "step": 15670 }, { "epoch": 0.9098030114015492, "grad_norm": 0.09681922197341919, "learning_rate": 1.3256341039898766e-05, "loss": 2.7741, "step": 15680 }, { "epoch": 0.9103832429139227, "grad_norm": 0.09645412862300873, "learning_rate": 1.3087636396730949e-05, "loss": 2.7704, "step": 15690 }, { "epoch": 0.9109634744262961, "grad_norm": 0.09795381873846054, "learning_rate": 1.2919988194942011e-05, "loss": 2.7666, "step": 15700 }, { "epoch": 0.9115437059386695, "grad_norm": 0.09636548161506653, "learning_rate": 1.2753397051830294e-05, "loss": 2.7763, "step": 15710 }, { "epoch": 0.9121239374510429, "grad_norm": 0.0992702841758728, "learning_rate": 1.2587863580801794e-05, "loss": 2.7693, "step": 15720 }, { "epoch": 0.9127041689634164, "grad_norm": 0.09708980470895767, "learning_rate": 1.2423388391368083e-05, "loss": 2.7696, "step": 15730 }, { "epoch": 0.9132844004757898, "grad_norm": 0.09657064080238342, "learning_rate": 1.2259972089144054e-05, "loss": 2.7799, "step": 15740 }, { "epoch": 0.9138646319881633, "grad_norm": 0.09743205457925797, "learning_rate": 1.2097615275845617e-05, "loss": 2.7683, "step": 15750 }, { "epoch": 0.9144448635005367, "grad_norm": 0.09803003072738647, "learning_rate": 1.1936318549287638e-05, "loss": 2.7731, "step": 15760 }, { "epoch": 0.9150250950129102, "grad_norm": 0.0977969542145729, "learning_rate": 1.1776082503381468e-05, "loss": 2.778, "step": 15770 }, { "epoch": 0.9156053265252836, "grad_norm": 0.0986003428697586, "learning_rate": 1.1616907728133084e-05, "loss": 2.7794, "step": 15780 }, { "epoch": 0.916185558037657, "grad_norm": 0.09887285530567169, "learning_rate": 1.1458794809640693e-05, "loss": 2.7743, "step": 15790 }, { "epoch": 0.9167657895500304, "grad_norm": 0.10056151449680328, "learning_rate": 1.1301744330092522e-05, "loss": 2.7739, "step": 15800 }, { "epoch": 0.9173460210624039, "grad_norm": 0.09636414051055908, "learning_rate": 1.1145756867765033e-05, "loss": 2.7772, "step": 15810 }, { "epoch": 0.9179262525747773, "grad_norm": 0.09793318808078766, "learning_rate": 1.0990832997020282e-05, "loss": 2.7729, "step": 15820 }, { "epoch": 0.9185064840871507, "grad_norm": 0.09378232061862946, "learning_rate": 1.0836973288304229e-05, "loss": 2.7783, "step": 15830 }, { "epoch": 0.9190867155995243, "grad_norm": 0.09904693067073822, "learning_rate": 1.0684178308144498e-05, "loss": 2.7697, "step": 15840 }, { "epoch": 0.9196669471118977, "grad_norm": 0.0982363149523735, "learning_rate": 1.0532448619148115e-05, "loss": 2.7712, "step": 15850 }, { "epoch": 0.9202471786242711, "grad_norm": 0.0995451807975769, "learning_rate": 1.038178477999978e-05, "loss": 2.7702, "step": 15860 }, { "epoch": 0.9208274101366445, "grad_norm": 0.09749618917703629, "learning_rate": 1.0232187345459431e-05, "loss": 2.771, "step": 15870 }, { "epoch": 0.921407641649018, "grad_norm": 0.09808894246816635, "learning_rate": 1.0083656866360646e-05, "loss": 2.7706, "step": 15880 }, { "epoch": 0.9219878731613914, "grad_norm": 0.09838584810495377, "learning_rate": 9.936193889608012e-06, "loss": 2.7656, "step": 15890 }, { "epoch": 0.9225681046737648, "grad_norm": 0.10016359388828278, "learning_rate": 9.789798958175832e-06, "loss": 2.7749, "step": 15900 }, { "epoch": 0.9231483361861382, "grad_norm": 0.09670013934373856, "learning_rate": 9.64447261110548e-06, "loss": 2.7693, "step": 15910 }, { "epoch": 0.9237285676985117, "grad_norm": 0.09639087319374084, "learning_rate": 9.500215383503784e-06, "loss": 2.7675, "step": 15920 }, { "epoch": 0.9243087992108852, "grad_norm": 0.09851641952991486, "learning_rate": 9.357027806541084e-06, "loss": 2.7748, "step": 15930 }, { "epoch": 0.9248890307232586, "grad_norm": 0.10145829617977142, "learning_rate": 9.214910407448871e-06, "loss": 2.7841, "step": 15940 }, { "epoch": 0.925469262235632, "grad_norm": 0.09769120067358017, "learning_rate": 9.073863709518426e-06, "loss": 2.7703, "step": 15950 }, { "epoch": 0.9260494937480055, "grad_norm": 0.09475893527269363, "learning_rate": 8.933888232098408e-06, "loss": 2.7703, "step": 15960 }, { "epoch": 0.9266297252603789, "grad_norm": 0.09624000638723373, "learning_rate": 8.794984490593171e-06, "loss": 2.7753, "step": 15970 }, { "epoch": 0.9272099567727523, "grad_norm": 0.09569297730922699, "learning_rate": 8.657152996460958e-06, "loss": 2.7635, "step": 15980 }, { "epoch": 0.9277901882851257, "grad_norm": 0.10107609629631042, "learning_rate": 8.520394257211605e-06, "loss": 2.7714, "step": 15990 }, { "epoch": 0.9283704197974992, "grad_norm": 0.09753672778606415, "learning_rate": 8.384708776405236e-06, "loss": 2.7706, "step": 16000 }, { "epoch": 0.9283704197974992, "eval_loss": 2.7369606494903564, "eval_runtime": 3.2559, "eval_samples_per_second": 1329.896, "eval_steps_per_second": 2.764, "step": 16000 }, { "epoch": 0.9289506513098726, "grad_norm": 0.09548928588628769, "learning_rate": 8.25009705364994e-06, "loss": 2.7754, "step": 16010 }, { "epoch": 0.929530882822246, "grad_norm": 0.09287203848361969, "learning_rate": 8.116559584600201e-06, "loss": 2.7777, "step": 16020 }, { "epoch": 0.9301111143346195, "grad_norm": 0.0972280502319336, "learning_rate": 7.984096860955036e-06, "loss": 2.781, "step": 16030 }, { "epoch": 0.930691345846993, "grad_norm": 0.09617298096418381, "learning_rate": 7.852709370455922e-06, "loss": 2.7692, "step": 16040 }, { "epoch": 0.9312715773593664, "grad_norm": 0.09682459384202957, "learning_rate": 7.72239759688551e-06, "loss": 2.7742, "step": 16050 }, { "epoch": 0.9318518088717398, "grad_norm": 0.09648177772760391, "learning_rate": 7.593162020065313e-06, "loss": 2.7783, "step": 16060 }, { "epoch": 0.9324320403841133, "grad_norm": 0.09511367976665497, "learning_rate": 7.4650031158542845e-06, "loss": 2.7706, "step": 16070 }, { "epoch": 0.9330122718964867, "grad_norm": 0.09434488415718079, "learning_rate": 7.337921356146981e-06, "loss": 2.7694, "step": 16080 }, { "epoch": 0.9335925034088601, "grad_norm": 0.09737717360258102, "learning_rate": 7.211917208871665e-06, "loss": 2.7674, "step": 16090 }, { "epoch": 0.9341727349212335, "grad_norm": 0.09725455194711685, "learning_rate": 7.086991137988906e-06, "loss": 2.7639, "step": 16100 }, { "epoch": 0.9347529664336071, "grad_norm": 0.10136746615171432, "learning_rate": 6.963143603489518e-06, "loss": 2.7677, "step": 16110 }, { "epoch": 0.9353331979459805, "grad_norm": 0.09756675362586975, "learning_rate": 6.840375061393122e-06, "loss": 2.765, "step": 16120 }, { "epoch": 0.9359134294583539, "grad_norm": 0.09939330816268921, "learning_rate": 6.718685963746318e-06, "loss": 2.7751, "step": 16130 }, { "epoch": 0.9364936609707273, "grad_norm": 0.09836092591285706, "learning_rate": 6.598076758621118e-06, "loss": 2.7828, "step": 16140 }, { "epoch": 0.9370738924831008, "grad_norm": 0.09677501767873764, "learning_rate": 6.4785478901133506e-06, "loss": 2.769, "step": 16150 }, { "epoch": 0.9376541239954742, "grad_norm": 0.097322478890419, "learning_rate": 6.360099798340656e-06, "loss": 2.7656, "step": 16160 }, { "epoch": 0.9382343555078476, "grad_norm": 0.09472298622131348, "learning_rate": 6.242732919441462e-06, "loss": 2.7737, "step": 16170 }, { "epoch": 0.938814587020221, "grad_norm": 0.09517394751310349, "learning_rate": 6.126447685572844e-06, "loss": 2.7807, "step": 16180 }, { "epoch": 0.9393948185325945, "grad_norm": 0.09591302275657654, "learning_rate": 6.011244524909198e-06, "loss": 2.7774, "step": 16190 }, { "epoch": 0.939975050044968, "grad_norm": 0.09797896444797516, "learning_rate": 5.8971238616407405e-06, "loss": 2.7637, "step": 16200 }, { "epoch": 0.9405552815573414, "grad_norm": 0.09744720160961151, "learning_rate": 5.7840861159715425e-06, "loss": 2.7773, "step": 16210 }, { "epoch": 0.9411355130697148, "grad_norm": 0.09814444929361343, "learning_rate": 5.672131704118565e-06, "loss": 2.7741, "step": 16220 }, { "epoch": 0.9417157445820883, "grad_norm": 0.09604529291391373, "learning_rate": 5.561261038309628e-06, "loss": 2.7727, "step": 16230 }, { "epoch": 0.9422959760944617, "grad_norm": 0.09737398475408554, "learning_rate": 5.4514745267821404e-06, "loss": 2.7737, "step": 16240 }, { "epoch": 0.9428762076068351, "grad_norm": 0.09697815030813217, "learning_rate": 5.342772573781507e-06, "loss": 2.7638, "step": 16250 }, { "epoch": 0.9434564391192085, "grad_norm": 0.09917178004980087, "learning_rate": 5.235155579559725e-06, "loss": 2.7709, "step": 16260 }, { "epoch": 0.944036670631582, "grad_norm": 0.096290223300457, "learning_rate": 5.128623940373888e-06, "loss": 2.7674, "step": 16270 }, { "epoch": 0.9446169021439554, "grad_norm": 0.09504272043704987, "learning_rate": 5.023178048484589e-06, "loss": 2.7694, "step": 16280 }, { "epoch": 0.9451971336563288, "grad_norm": 0.09743209183216095, "learning_rate": 4.91881829215468e-06, "loss": 2.781, "step": 16290 }, { "epoch": 0.9457773651687024, "grad_norm": 0.09843679517507553, "learning_rate": 4.815545055647718e-06, "loss": 2.776, "step": 16300 }, { "epoch": 0.9463575966810758, "grad_norm": 0.0955999493598938, "learning_rate": 4.713358719226523e-06, "loss": 2.7789, "step": 16310 }, { "epoch": 0.9469378281934492, "grad_norm": 0.09576351940631866, "learning_rate": 4.612259659151984e-06, "loss": 2.7716, "step": 16320 }, { "epoch": 0.9475180597058226, "grad_norm": 0.09730935841798782, "learning_rate": 4.512248247681394e-06, "loss": 2.7802, "step": 16330 }, { "epoch": 0.9480982912181961, "grad_norm": 0.09646177291870117, "learning_rate": 4.413324853067213e-06, "loss": 2.7765, "step": 16340 }, { "epoch": 0.9486785227305695, "grad_norm": 0.09553349018096924, "learning_rate": 4.3154898395557744e-06, "loss": 2.778, "step": 16350 }, { "epoch": 0.9492587542429429, "grad_norm": 0.09604230523109436, "learning_rate": 4.218743567385852e-06, "loss": 2.78, "step": 16360 }, { "epoch": 0.9498389857553163, "grad_norm": 0.09518173336982727, "learning_rate": 4.123086392787289e-06, "loss": 2.7695, "step": 16370 }, { "epoch": 0.9504192172676899, "grad_norm": 0.09625556319952011, "learning_rate": 4.0285186679799406e-06, "loss": 2.7694, "step": 16380 }, { "epoch": 0.9509994487800633, "grad_norm": 0.09755248576402664, "learning_rate": 3.935040741171969e-06, "loss": 2.7625, "step": 16390 }, { "epoch": 0.9515796802924367, "grad_norm": 0.09465952962636948, "learning_rate": 3.842652956558945e-06, "loss": 2.7658, "step": 16400 }, { "epoch": 0.9521599118048101, "grad_norm": 0.0960998460650444, "learning_rate": 3.7513556543223855e-06, "loss": 2.7846, "step": 16410 }, { "epoch": 0.9527401433171836, "grad_norm": 0.09892145544290543, "learning_rate": 3.6611491706284856e-06, "loss": 2.7708, "step": 16420 }, { "epoch": 0.953320374829557, "grad_norm": 0.09714221954345703, "learning_rate": 3.572033837626953e-06, "loss": 2.7874, "step": 16430 }, { "epoch": 0.9539006063419304, "grad_norm": 0.09727420657873154, "learning_rate": 3.484009983449809e-06, "loss": 2.7834, "step": 16440 }, { "epoch": 0.9544808378543038, "grad_norm": 0.09665530920028687, "learning_rate": 3.397077932210124e-06, "loss": 2.7726, "step": 16450 }, { "epoch": 0.9550610693666773, "grad_norm": 0.09558922797441483, "learning_rate": 3.3112380040008156e-06, "loss": 2.7723, "step": 16460 }, { "epoch": 0.9556413008790507, "grad_norm": 0.0972527414560318, "learning_rate": 3.2264905148934208e-06, "loss": 2.772, "step": 16470 }, { "epoch": 0.9562215323914242, "grad_norm": 0.09882599860429764, "learning_rate": 3.142835776937158e-06, "loss": 2.7685, "step": 16480 }, { "epoch": 0.9568017639037976, "grad_norm": 0.09505190700292587, "learning_rate": 3.060274098157467e-06, "loss": 2.7694, "step": 16490 }, { "epoch": 0.9573819954161711, "grad_norm": 0.09600254893302917, "learning_rate": 2.9788057825551714e-06, "loss": 2.7778, "step": 16500 }, { "epoch": 0.9579622269285445, "grad_norm": 0.09696151316165924, "learning_rate": 2.8984311301050835e-06, "loss": 2.784, "step": 16510 }, { "epoch": 0.9585424584409179, "grad_norm": 0.09621264785528183, "learning_rate": 2.819150436755135e-06, "loss": 2.7668, "step": 16520 }, { "epoch": 0.9591226899532914, "grad_norm": 0.09673577547073364, "learning_rate": 2.7409639944251162e-06, "loss": 2.774, "step": 16530 }, { "epoch": 0.9597029214656648, "grad_norm": 0.09513070434331894, "learning_rate": 2.6638720910056697e-06, "loss": 2.7783, "step": 16540 }, { "epoch": 0.9602831529780382, "grad_norm": 0.09311112761497498, "learning_rate": 2.587875010357332e-06, "loss": 2.7665, "step": 16550 }, { "epoch": 0.9608633844904116, "grad_norm": 0.09406144171953201, "learning_rate": 2.5129730323092622e-06, "loss": 2.7671, "step": 16560 }, { "epoch": 0.9614436160027852, "grad_norm": 0.09770730882883072, "learning_rate": 2.439166432658446e-06, "loss": 2.7673, "step": 16570 }, { "epoch": 0.9620238475151586, "grad_norm": 0.09938254207372665, "learning_rate": 2.366455483168428e-06, "loss": 2.7637, "step": 16580 }, { "epoch": 0.962604079027532, "grad_norm": 0.09504234790802002, "learning_rate": 2.2948404515686136e-06, "loss": 2.7708, "step": 16590 }, { "epoch": 0.9631843105399054, "grad_norm": 0.09619156271219254, "learning_rate": 2.2243216015530362e-06, "loss": 2.7716, "step": 16600 }, { "epoch": 0.9637645420522789, "grad_norm": 0.09520803391933441, "learning_rate": 2.1548991927794244e-06, "loss": 2.771, "step": 16610 }, { "epoch": 0.9643447735646523, "grad_norm": 0.09521950781345367, "learning_rate": 2.0865734808684697e-06, "loss": 2.7679, "step": 16620 }, { "epoch": 0.9649250050770257, "grad_norm": 0.09744451195001602, "learning_rate": 2.0193447174025268e-06, "loss": 2.7715, "step": 16630 }, { "epoch": 0.9655052365893991, "grad_norm": 0.09531662613153458, "learning_rate": 1.953213149924948e-06, "loss": 2.7824, "step": 16640 }, { "epoch": 0.9660854681017726, "grad_norm": 0.09525689482688904, "learning_rate": 1.8881790219391512e-06, "loss": 2.7694, "step": 16650 }, { "epoch": 0.9666656996141461, "grad_norm": 0.09457177668809891, "learning_rate": 1.8242425729075527e-06, "loss": 2.7588, "step": 16660 }, { "epoch": 0.9672459311265195, "grad_norm": 0.09685463458299637, "learning_rate": 1.7614040382508687e-06, "loss": 2.7714, "step": 16670 }, { "epoch": 0.9678261626388929, "grad_norm": 0.09774652868509293, "learning_rate": 1.6996636493471494e-06, "loss": 2.7683, "step": 16680 }, { "epoch": 0.9684063941512664, "grad_norm": 0.09525836259126663, "learning_rate": 1.6390216335309792e-06, "loss": 2.77, "step": 16690 }, { "epoch": 0.9689866256636398, "grad_norm": 0.09421420842409134, "learning_rate": 1.5794782140926775e-06, "loss": 2.7723, "step": 16700 }, { "epoch": 0.9695668571760132, "grad_norm": 0.09693361073732376, "learning_rate": 1.5210336102772668e-06, "loss": 2.772, "step": 16710 }, { "epoch": 0.9701470886883866, "grad_norm": 0.09740012139081955, "learning_rate": 1.463688037283972e-06, "loss": 2.7673, "step": 16720 }, { "epoch": 0.9707273202007601, "grad_norm": 0.09596629440784454, "learning_rate": 1.4074417062651221e-06, "loss": 2.7878, "step": 16730 }, { "epoch": 0.9713075517131335, "grad_norm": 0.09561031311750412, "learning_rate": 1.3522948243256503e-06, "loss": 2.7728, "step": 16740 }, { "epoch": 0.971887783225507, "grad_norm": 0.09793524444103241, "learning_rate": 1.2982475945221615e-06, "loss": 2.7718, "step": 16750 }, { "epoch": 0.9724680147378804, "grad_norm": 0.09407012164592743, "learning_rate": 1.245300215862166e-06, "loss": 2.7797, "step": 16760 }, { "epoch": 0.9730482462502539, "grad_norm": 0.09444325417280197, "learning_rate": 1.1934528833035139e-06, "loss": 2.7725, "step": 16770 }, { "epoch": 0.9736284777626273, "grad_norm": 0.09787797182798386, "learning_rate": 1.1427057877534951e-06, "loss": 2.7691, "step": 16780 }, { "epoch": 0.9742087092750007, "grad_norm": 0.09456036239862442, "learning_rate": 1.09305911606824e-06, "loss": 2.7766, "step": 16790 }, { "epoch": 0.9747889407873742, "grad_norm": 0.095250204205513, "learning_rate": 1.044513051051954e-06, "loss": 2.7701, "step": 16800 }, { "epoch": 0.9753691722997476, "grad_norm": 0.09521818906068802, "learning_rate": 9.970677714563835e-07, "loss": 2.7734, "step": 16810 }, { "epoch": 0.975949403812121, "grad_norm": 0.09462135285139084, "learning_rate": 9.507234519800178e-07, "loss": 2.7705, "step": 16820 }, { "epoch": 0.9765296353244944, "grad_norm": 0.09560775011777878, "learning_rate": 9.054802632674551e-07, "loss": 2.7691, "step": 16830 }, { "epoch": 0.977109866836868, "grad_norm": 0.09410873800516129, "learning_rate": 8.61338371908904e-07, "loss": 2.7787, "step": 16840 }, { "epoch": 0.9776900983492414, "grad_norm": 0.09606259316205978, "learning_rate": 8.18297940439383e-07, "loss": 2.7766, "step": 16850 }, { "epoch": 0.9782703298616148, "grad_norm": 0.09549134224653244, "learning_rate": 7.763591273382885e-07, "loss": 2.7701, "step": 16860 }, { "epoch": 0.9788505613739882, "grad_norm": 0.09225918352603912, "learning_rate": 7.355220870287615e-07, "loss": 2.7635, "step": 16870 }, { "epoch": 0.9794307928863617, "grad_norm": 0.09305543452501297, "learning_rate": 6.95786969876988e-07, "loss": 2.7659, "step": 16880 }, { "epoch": 0.9800110243987351, "grad_norm": 0.09393244236707687, "learning_rate": 6.571539221918997e-07, "loss": 2.7743, "step": 16890 }, { "epoch": 0.9805912559111085, "grad_norm": 0.09278815984725952, "learning_rate": 6.196230862244078e-07, "loss": 2.78, "step": 16900 }, { "epoch": 0.9811714874234819, "grad_norm": 0.09347451478242874, "learning_rate": 5.831946001669697e-07, "loss": 2.7747, "step": 16910 }, { "epoch": 0.9817517189358554, "grad_norm": 0.09540887176990509, "learning_rate": 5.478685981530894e-07, "loss": 2.7758, "step": 16920 }, { "epoch": 0.9823319504482289, "grad_norm": 0.09621070325374603, "learning_rate": 5.136452102567856e-07, "loss": 2.7713, "step": 16930 }, { "epoch": 0.9829121819606023, "grad_norm": 0.09409264475107193, "learning_rate": 4.805245624922238e-07, "loss": 2.7778, "step": 16940 }, { "epoch": 0.9834924134729757, "grad_norm": 0.09619985520839691, "learning_rate": 4.4850677681301795e-07, "loss": 2.7701, "step": 16950 }, { "epoch": 0.9840726449853492, "grad_norm": 0.09401355683803558, "learning_rate": 4.1759197111206344e-07, "loss": 2.7689, "step": 16960 }, { "epoch": 0.9846528764977226, "grad_norm": 0.09698129445314407, "learning_rate": 3.877802592209045e-07, "loss": 2.7703, "step": 16970 }, { "epoch": 0.985233108010096, "grad_norm": 0.09333529323339462, "learning_rate": 3.590717509093677e-07, "loss": 2.7784, "step": 16980 }, { "epoch": 0.9858133395224694, "grad_norm": 0.09353555738925934, "learning_rate": 3.3146655188519557e-07, "loss": 2.7687, "step": 16990 }, { "epoch": 0.9863935710348429, "grad_norm": 0.09438835084438324, "learning_rate": 3.0496476379364697e-07, "loss": 2.7665, "step": 17000 }, { "epoch": 0.9863935710348429, "eval_loss": 2.735684633255005, "eval_runtime": 3.2561, "eval_samples_per_second": 1329.798, "eval_steps_per_second": 2.764, "step": 17000 }, { "epoch": 0.9869738025472163, "grad_norm": 0.09504197537899017, "learning_rate": 2.7956648421703087e-07, "loss": 2.7762, "step": 17010 }, { "epoch": 0.9875540340595897, "grad_norm": 0.09602217376232147, "learning_rate": 2.5527180667453963e-07, "loss": 2.7673, "step": 17020 }, { "epoch": 0.9881342655719633, "grad_norm": 0.09483738243579865, "learning_rate": 2.3208082062168288e-07, "loss": 2.7705, "step": 17030 }, { "epoch": 0.9887144970843367, "grad_norm": 0.09395676851272583, "learning_rate": 2.0999361145008775e-07, "loss": 2.7692, "step": 17040 }, { "epoch": 0.9892947285967101, "grad_norm": 0.09432484954595566, "learning_rate": 1.8901026048719902e-07, "loss": 2.7707, "step": 17050 }, { "epoch": 0.9898749601090835, "grad_norm": 0.09382540732622147, "learning_rate": 1.6913084499587948e-07, "loss": 2.7788, "step": 17060 }, { "epoch": 0.990455191621457, "grad_norm": 0.09619873762130737, "learning_rate": 1.5035543817427663e-07, "loss": 2.7604, "step": 17070 }, { "epoch": 0.9910354231338304, "grad_norm": 0.09365525841712952, "learning_rate": 1.3268410915532323e-07, "loss": 2.7785, "step": 17080 }, { "epoch": 0.9916156546462038, "grad_norm": 0.09718578308820724, "learning_rate": 1.1611692300680376e-07, "loss": 2.7745, "step": 17090 }, { "epoch": 0.9921958861585772, "grad_norm": 0.0956762507557869, "learning_rate": 1.0065394073075494e-07, "loss": 2.7813, "step": 17100 }, { "epoch": 0.9927761176709508, "grad_norm": 0.09347262978553772, "learning_rate": 8.629521926353244e-08, "loss": 2.7714, "step": 17110 }, { "epoch": 0.9933563491833242, "grad_norm": 0.09415694326162338, "learning_rate": 7.304081147544439e-08, "loss": 2.7837, "step": 17120 }, { "epoch": 0.9939365806956976, "grad_norm": 0.09390881657600403, "learning_rate": 6.089076617058486e-08, "loss": 2.7725, "step": 17130 }, { "epoch": 0.994516812208071, "grad_norm": 0.09363935142755508, "learning_rate": 4.984512808673402e-08, "loss": 2.776, "step": 17140 }, { "epoch": 0.9950970437204445, "grad_norm": 0.0957217812538147, "learning_rate": 3.9903937895091606e-08, "loss": 2.7731, "step": 17150 }, { "epoch": 0.9956772752328179, "grad_norm": 0.09717927128076553, "learning_rate": 3.1067232200110426e-08, "loss": 2.7703, "step": 17160 }, { "epoch": 0.9962575067451913, "grad_norm": 0.09413953870534897, "learning_rate": 2.333504353952964e-08, "loss": 2.7733, "step": 17170 }, { "epoch": 0.9968377382575647, "grad_norm": 0.09774868190288544, "learning_rate": 1.670740038400842e-08, "loss": 2.7658, "step": 17180 }, { "epoch": 0.9974179697699382, "grad_norm": 0.09658750146627426, "learning_rate": 1.1184327137292448e-08, "loss": 2.7734, "step": 17190 }, { "epoch": 0.9979982012823116, "grad_norm": 0.0932522714138031, "learning_rate": 6.765844135847576e-09, "loss": 2.7708, "step": 17200 }, { "epoch": 0.9985784327946851, "grad_norm": 0.09543392807245255, "learning_rate": 3.4519676490596393e-09, "loss": 2.7746, "step": 17210 }, { "epoch": 0.9991586643070585, "grad_norm": 0.09391433745622635, "learning_rate": 1.2427098789347111e-09, "loss": 2.7707, "step": 17220 }, { "epoch": 0.999738895819432, "grad_norm": 0.0975637212395668, "learning_rate": 1.3807896016571064e-10, "loss": 2.77, "step": 17230 }, { "epoch": 0.9999709884243814, "step": 17234, "total_flos": 4.402536853133695e+19, "train_loss": 3.082940493684724, "train_runtime": 20985.9807, "train_samples_per_second": 420.462, "train_steps_per_second": 0.821 } ], "logging_steps": 10, "max_steps": 17234, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.402536853133695e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }