{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999709884243814, "eval_steps": 1000, "global_step": 17234, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000580231512373437, "grad_norm": 4.982813358306885, "learning_rate": 8.352668213457076e-06, "loss": 10.6252, "step": 10 }, { "epoch": 0.001160463024746874, "grad_norm": 1.8926478624343872, "learning_rate": 1.763341067285383e-05, "loss": 9.4385, "step": 20 }, { "epoch": 0.001740694537120311, "grad_norm": 1.5721722841262817, "learning_rate": 2.691415313225058e-05, "loss": 9.0542, "step": 30 }, { "epoch": 0.002320926049493748, "grad_norm": 1.3773772716522217, "learning_rate": 3.619489559164733e-05, "loss": 8.6743, "step": 40 }, { "epoch": 0.002901157561867185, "grad_norm": 1.4442760944366455, "learning_rate": 4.547563805104408e-05, "loss": 8.236, "step": 50 }, { "epoch": 0.003481389074240622, "grad_norm": 1.8863673210144043, "learning_rate": 5.475638051044084e-05, "loss": 7.8448, "step": 60 }, { "epoch": 0.004061620586614059, "grad_norm": 2.472522020339966, "learning_rate": 6.40371229698376e-05, "loss": 7.5377, "step": 70 }, { "epoch": 0.004641852098987496, "grad_norm": 0.7553613781929016, "learning_rate": 7.331786542923434e-05, "loss": 7.2639, "step": 80 }, { "epoch": 0.005222083611360933, "grad_norm": 1.2483851909637451, "learning_rate": 8.25986078886311e-05, "loss": 7.0251, "step": 90 }, { "epoch": 0.00580231512373437, "grad_norm": 0.6070403456687927, "learning_rate": 9.187935034802784e-05, "loss": 6.8806, "step": 100 }, { "epoch": 0.006382546636107807, "grad_norm": 0.6660650968551636, "learning_rate": 0.0001011600928074246, "loss": 6.7344, "step": 110 }, { "epoch": 0.006962778148481244, "grad_norm": 0.4268014132976532, "learning_rate": 0.00011044083526682135, "loss": 6.5871, "step": 120 }, { "epoch": 0.007543009660854681, "grad_norm": 1.2828865051269531, "learning_rate": 0.0001197215777262181, "loss": 6.4627, "step": 130 }, { "epoch": 0.008123241173228117, "grad_norm": 0.5586172938346863, "learning_rate": 0.00012900232018561483, "loss": 6.3749, "step": 140 }, { "epoch": 0.008703472685601555, "grad_norm": 0.6511664986610413, "learning_rate": 0.0001382830626450116, "loss": 6.2813, "step": 150 }, { "epoch": 0.009283704197974993, "grad_norm": 0.5605055689811707, "learning_rate": 0.00014756380510440838, "loss": 6.224, "step": 160 }, { "epoch": 0.009863935710348428, "grad_norm": 0.626899003982544, "learning_rate": 0.00015684454756380512, "loss": 6.1366, "step": 170 }, { "epoch": 0.010444167222721866, "grad_norm": 0.5936773419380188, "learning_rate": 0.00016612529002320186, "loss": 6.0792, "step": 180 }, { "epoch": 0.011024398735095304, "grad_norm": 0.40546998381614685, "learning_rate": 0.0001754060324825986, "loss": 6.0285, "step": 190 }, { "epoch": 0.01160463024746874, "grad_norm": 0.7223477363586426, "learning_rate": 0.00018468677494199538, "loss": 5.997, "step": 200 }, { "epoch": 0.012184861759842177, "grad_norm": 0.43312889337539673, "learning_rate": 0.00019396751740139212, "loss": 5.9333, "step": 210 }, { "epoch": 0.012765093272215615, "grad_norm": 0.37224727869033813, "learning_rate": 0.00020324825986078887, "loss": 5.8913, "step": 220 }, { "epoch": 0.01334532478458905, "grad_norm": 1.0582356452941895, "learning_rate": 0.0002125290023201856, "loss": 5.8474, "step": 230 }, { "epoch": 0.013925556296962488, "grad_norm": 0.4765668213367462, "learning_rate": 0.00022180974477958238, "loss": 5.846, "step": 240 }, { "epoch": 0.014505787809335926, "grad_norm": 1.033646583557129, "learning_rate": 0.00023109048723897916, "loss": 5.7797, "step": 250 }, { "epoch": 0.015086019321709361, "grad_norm": 0.6201047301292419, "learning_rate": 0.00024037122969837587, "loss": 5.7681, "step": 260 }, { "epoch": 0.0156662508340828, "grad_norm": 0.4362906217575073, "learning_rate": 0.0002496519721577726, "loss": 5.7067, "step": 270 }, { "epoch": 0.016246482346456235, "grad_norm": 0.55441814661026, "learning_rate": 0.0002589327146171694, "loss": 5.6734, "step": 280 }, { "epoch": 0.016826713858829674, "grad_norm": 0.4820825457572937, "learning_rate": 0.00026821345707656616, "loss": 5.6405, "step": 290 }, { "epoch": 0.01740694537120311, "grad_norm": 0.5360891819000244, "learning_rate": 0.0002774941995359629, "loss": 5.5969, "step": 300 }, { "epoch": 0.017987176883576546, "grad_norm": 0.3845142126083374, "learning_rate": 0.00028677494199535965, "loss": 5.5684, "step": 310 }, { "epoch": 0.018567408395949985, "grad_norm": 0.5543452501296997, "learning_rate": 0.0002960556844547564, "loss": 5.5412, "step": 320 }, { "epoch": 0.01914763990832342, "grad_norm": 0.4227641522884369, "learning_rate": 0.00030533642691415314, "loss": 5.4778, "step": 330 }, { "epoch": 0.019727871420696857, "grad_norm": 0.5194647312164307, "learning_rate": 0.0003146171693735499, "loss": 5.4761, "step": 340 }, { "epoch": 0.020308102933070296, "grad_norm": 0.5091412663459778, "learning_rate": 0.0003238979118329466, "loss": 5.4325, "step": 350 }, { "epoch": 0.020888334445443732, "grad_norm": 0.5310063362121582, "learning_rate": 0.0003331786542923434, "loss": 5.3834, "step": 360 }, { "epoch": 0.021468565957817168, "grad_norm": 0.4743990898132324, "learning_rate": 0.00034245939675174017, "loss": 5.3568, "step": 370 }, { "epoch": 0.022048797470190607, "grad_norm": 0.5041143894195557, "learning_rate": 0.0003517401392111369, "loss": 5.3173, "step": 380 }, { "epoch": 0.022629028982564043, "grad_norm": 0.7070014476776123, "learning_rate": 0.00036102088167053366, "loss": 5.3042, "step": 390 }, { "epoch": 0.02320926049493748, "grad_norm": 0.31528109312057495, "learning_rate": 0.0003703016241299304, "loss": 5.2866, "step": 400 }, { "epoch": 0.023789492007310918, "grad_norm": 0.582568883895874, "learning_rate": 0.0003795823665893272, "loss": 5.2333, "step": 410 }, { "epoch": 0.024369723519684354, "grad_norm": 0.41179174184799194, "learning_rate": 0.0003888631090487239, "loss": 5.1997, "step": 420 }, { "epoch": 0.02494995503205779, "grad_norm": 0.6085094809532166, "learning_rate": 0.00039814385150812063, "loss": 5.1513, "step": 430 }, { "epoch": 0.02553018654443123, "grad_norm": 0.4752216935157776, "learning_rate": 0.00040742459396751746, "loss": 5.1338, "step": 440 }, { "epoch": 0.026110418056804665, "grad_norm": 0.4710049629211426, "learning_rate": 0.0004167053364269141, "loss": 5.1104, "step": 450 }, { "epoch": 0.0266906495691781, "grad_norm": 0.3671312630176544, "learning_rate": 0.0004259860788863109, "loss": 5.0645, "step": 460 }, { "epoch": 0.02727088108155154, "grad_norm": 0.34694692492485046, "learning_rate": 0.00043526682134570766, "loss": 5.0086, "step": 470 }, { "epoch": 0.027851112593924976, "grad_norm": 0.43389248847961426, "learning_rate": 0.00044454756380510443, "loss": 4.9914, "step": 480 }, { "epoch": 0.028431344106298412, "grad_norm": 0.3050656020641327, "learning_rate": 0.0004538283062645012, "loss": 4.9517, "step": 490 }, { "epoch": 0.02901157561867185, "grad_norm": 0.4305351674556732, "learning_rate": 0.000463109048723898, "loss": 4.9351, "step": 500 }, { "epoch": 0.029591807131045287, "grad_norm": 0.4160853624343872, "learning_rate": 0.0004723897911832947, "loss": 4.8845, "step": 510 }, { "epoch": 0.030172038643418723, "grad_norm": 0.31704947352409363, "learning_rate": 0.00048167053364269147, "loss": 4.8519, "step": 520 }, { "epoch": 0.030752270155792162, "grad_norm": 0.32812002301216125, "learning_rate": 0.0004909512761020881, "loss": 4.8192, "step": 530 }, { "epoch": 0.0313325016681656, "grad_norm": 0.3280174136161804, "learning_rate": 0.0005002320185614849, "loss": 4.7774, "step": 540 }, { "epoch": 0.031912733180539034, "grad_norm": 0.3379286825656891, "learning_rate": 0.0005095127610208817, "loss": 4.7648, "step": 550 }, { "epoch": 0.03249296469291247, "grad_norm": 0.3835337460041046, "learning_rate": 0.0005187935034802784, "loss": 4.73, "step": 560 }, { "epoch": 0.03307319620528591, "grad_norm": 0.3576149046421051, "learning_rate": 0.0005280742459396752, "loss": 4.7075, "step": 570 }, { "epoch": 0.03365342771765935, "grad_norm": 0.3289433717727661, "learning_rate": 0.000537354988399072, "loss": 4.6726, "step": 580 }, { "epoch": 0.034233659230032784, "grad_norm": 0.3066006898880005, "learning_rate": 0.0005466357308584688, "loss": 4.6341, "step": 590 }, { "epoch": 0.03481389074240622, "grad_norm": 0.3446463942527771, "learning_rate": 0.0005559164733178655, "loss": 4.6156, "step": 600 }, { "epoch": 0.035394122254779656, "grad_norm": 0.4418504536151886, "learning_rate": 0.0005651972157772622, "loss": 4.5849, "step": 610 }, { "epoch": 0.03597435376715309, "grad_norm": 0.25342825055122375, "learning_rate": 0.000574477958236659, "loss": 4.5786, "step": 620 }, { "epoch": 0.036554585279526534, "grad_norm": 0.3860240876674652, "learning_rate": 0.0005837587006960557, "loss": 4.5424, "step": 630 }, { "epoch": 0.03713481679189997, "grad_norm": 0.27333933115005493, "learning_rate": 0.0005930394431554525, "loss": 4.5084, "step": 640 }, { "epoch": 0.037715048304273406, "grad_norm": 0.3511396050453186, "learning_rate": 0.0006023201856148492, "loss": 4.4868, "step": 650 }, { "epoch": 0.03829527981664684, "grad_norm": 0.23285244405269623, "learning_rate": 0.0006116009280742459, "loss": 4.455, "step": 660 }, { "epoch": 0.03887551132902028, "grad_norm": 0.2970772981643677, "learning_rate": 0.0006208816705336427, "loss": 4.4163, "step": 670 }, { "epoch": 0.039455742841393714, "grad_norm": 0.3687499761581421, "learning_rate": 0.0006301624129930395, "loss": 4.409, "step": 680 }, { "epoch": 0.040035974353767156, "grad_norm": 0.2673681676387787, "learning_rate": 0.0006394431554524361, "loss": 4.3984, "step": 690 }, { "epoch": 0.04061620586614059, "grad_norm": 0.25875669717788696, "learning_rate": 0.0006487238979118329, "loss": 4.356, "step": 700 }, { "epoch": 0.04119643737851403, "grad_norm": 0.2780872881412506, "learning_rate": 0.0006580046403712297, "loss": 4.3205, "step": 710 }, { "epoch": 0.041776668890887464, "grad_norm": 0.3448280990123749, "learning_rate": 0.0006672853828306265, "loss": 4.3108, "step": 720 }, { "epoch": 0.0423569004032609, "grad_norm": 0.3050176799297333, "learning_rate": 0.0006765661252900232, "loss": 4.2965, "step": 730 }, { "epoch": 0.042937131915634336, "grad_norm": 0.2748768627643585, "learning_rate": 0.00068584686774942, "loss": 4.2485, "step": 740 }, { "epoch": 0.04351736342800778, "grad_norm": 0.29153186082839966, "learning_rate": 0.0006951276102088168, "loss": 4.2393, "step": 750 }, { "epoch": 0.044097594940381214, "grad_norm": 0.27588367462158203, "learning_rate": 0.0007044083526682135, "loss": 4.2037, "step": 760 }, { "epoch": 0.04467782645275465, "grad_norm": 0.3229079842567444, "learning_rate": 0.0007136890951276103, "loss": 4.1929, "step": 770 }, { "epoch": 0.045258057965128086, "grad_norm": 0.3054658770561218, "learning_rate": 0.000722969837587007, "loss": 4.1895, "step": 780 }, { "epoch": 0.04583828947750152, "grad_norm": 0.22277499735355377, "learning_rate": 0.0007322505800464037, "loss": 4.1598, "step": 790 }, { "epoch": 0.04641852098987496, "grad_norm": 0.21997177600860596, "learning_rate": 0.0007415313225058005, "loss": 4.1381, "step": 800 }, { "epoch": 0.0469987525022484, "grad_norm": 0.30231335759162903, "learning_rate": 0.0007508120649651973, "loss": 4.1192, "step": 810 }, { "epoch": 0.047578984014621836, "grad_norm": 0.24960269033908844, "learning_rate": 0.0007600928074245941, "loss": 4.1015, "step": 820 }, { "epoch": 0.04815921552699527, "grad_norm": 0.2396165281534195, "learning_rate": 0.0007693735498839907, "loss": 4.0808, "step": 830 }, { "epoch": 0.04873944703936871, "grad_norm": 0.32354429364204407, "learning_rate": 0.0007786542923433875, "loss": 4.0807, "step": 840 }, { "epoch": 0.049319678551742144, "grad_norm": 0.22621972858905792, "learning_rate": 0.0007879350348027843, "loss": 4.0691, "step": 850 }, { "epoch": 0.04989991006411558, "grad_norm": 0.2520240843296051, "learning_rate": 0.0007972157772621809, "loss": 4.0462, "step": 860 }, { "epoch": 0.05048014157648902, "grad_norm": 0.21748529374599457, "learning_rate": 0.0007999996391536774, "loss": 4.0232, "step": 870 }, { "epoch": 0.05106037308886246, "grad_norm": 0.2697643041610718, "learning_rate": 0.0007999978717446846, "loss": 4.011, "step": 880 }, { "epoch": 0.051640604601235894, "grad_norm": 0.23635219037532806, "learning_rate": 0.0007999946315016254, "loss": 3.992, "step": 890 }, { "epoch": 0.05222083611360933, "grad_norm": 0.22652658820152283, "learning_rate": 0.0007999899184364306, "loss": 3.9888, "step": 900 }, { "epoch": 0.052801067625982766, "grad_norm": 0.19167619943618774, "learning_rate": 0.0007999837325664544, "loss": 3.9702, "step": 910 }, { "epoch": 0.0533812991383562, "grad_norm": 0.28191596269607544, "learning_rate": 0.0007999760739144737, "loss": 3.9576, "step": 920 }, { "epoch": 0.053961530650729644, "grad_norm": 0.1901404857635498, "learning_rate": 0.0007999669425086884, "loss": 3.9437, "step": 930 }, { "epoch": 0.05454176216310308, "grad_norm": 0.21451230347156525, "learning_rate": 0.0007999563383827214, "loss": 3.9262, "step": 940 }, { "epoch": 0.055121993675476516, "grad_norm": 0.2342328280210495, "learning_rate": 0.0007999442615756181, "loss": 3.9164, "step": 950 }, { "epoch": 0.05570222518784995, "grad_norm": 0.2367662936449051, "learning_rate": 0.0007999307121318467, "loss": 3.9063, "step": 960 }, { "epoch": 0.05628245670022339, "grad_norm": 0.17875801026821136, "learning_rate": 0.0007999156901012974, "loss": 3.8941, "step": 970 }, { "epoch": 0.056862688212596824, "grad_norm": 0.19872689247131348, "learning_rate": 0.0007998991955392833, "loss": 3.8877, "step": 980 }, { "epoch": 0.057442919724970266, "grad_norm": 0.1925002783536911, "learning_rate": 0.0007998812285065388, "loss": 3.8654, "step": 990 }, { "epoch": 0.0580231512373437, "grad_norm": 0.2752264738082886, "learning_rate": 0.0007998617890692205, "loss": 3.8765, "step": 1000 }, { "epoch": 0.0580231512373437, "eval_loss": 3.819340944290161, "eval_runtime": 3.5215, "eval_samples_per_second": 1229.588, "eval_steps_per_second": 4.827, "step": 1000 }, { "epoch": 0.05860338274971714, "grad_norm": 0.17380791902542114, "learning_rate": 0.0007998408772989065, "loss": 3.8481, "step": 1010 }, { "epoch": 0.059183614262090574, "grad_norm": 0.2224138081073761, "learning_rate": 0.0007998184932725959, "loss": 3.8338, "step": 1020 }, { "epoch": 0.05976384577446401, "grad_norm": 0.20416170358657837, "learning_rate": 0.0007997946370727093, "loss": 3.8239, "step": 1030 }, { "epoch": 0.060344077286837446, "grad_norm": 0.2043341100215912, "learning_rate": 0.0007997693087870876, "loss": 3.8333, "step": 1040 }, { "epoch": 0.06092430879921089, "grad_norm": 0.1625361144542694, "learning_rate": 0.0007997425085089921, "loss": 3.8209, "step": 1050 }, { "epoch": 0.061504540311584324, "grad_norm": 0.19766223430633545, "learning_rate": 0.0007997142363371045, "loss": 3.8001, "step": 1060 }, { "epoch": 0.06208477182395776, "grad_norm": 0.21795207262039185, "learning_rate": 0.0007996844923755257, "loss": 3.7986, "step": 1070 }, { "epoch": 0.0626650033363312, "grad_norm": 0.190157949924469, "learning_rate": 0.0007996532767337762, "loss": 3.796, "step": 1080 }, { "epoch": 0.06324523484870463, "grad_norm": 0.16628651320934296, "learning_rate": 0.000799620589526795, "loss": 3.773, "step": 1090 }, { "epoch": 0.06382546636107807, "grad_norm": 0.17414052784442902, "learning_rate": 0.0007995864308749402, "loss": 3.7789, "step": 1100 }, { "epoch": 0.0644056978734515, "grad_norm": 0.21780098974704742, "learning_rate": 0.0007995508009039872, "loss": 3.7625, "step": 1110 }, { "epoch": 0.06498592938582494, "grad_norm": 0.1593518704175949, "learning_rate": 0.0007995136997451295, "loss": 3.7483, "step": 1120 }, { "epoch": 0.06556616089819838, "grad_norm": 0.1702566295862198, "learning_rate": 0.0007994751275349775, "loss": 3.7412, "step": 1130 }, { "epoch": 0.06614639241057182, "grad_norm": 0.19132129848003387, "learning_rate": 0.0007994350844155578, "loss": 3.7359, "step": 1140 }, { "epoch": 0.06672662392294526, "grad_norm": 0.20573927462100983, "learning_rate": 0.0007993935705343137, "loss": 3.7341, "step": 1150 }, { "epoch": 0.0673068554353187, "grad_norm": 0.17265363037586212, "learning_rate": 0.0007993505860441036, "loss": 3.7142, "step": 1160 }, { "epoch": 0.06788708694769213, "grad_norm": 0.1894044727087021, "learning_rate": 0.0007993061311032006, "loss": 3.7054, "step": 1170 }, { "epoch": 0.06846731846006557, "grad_norm": 0.18005013465881348, "learning_rate": 0.0007992602058752929, "loss": 3.7086, "step": 1180 }, { "epoch": 0.069047549972439, "grad_norm": 0.16466042399406433, "learning_rate": 0.0007992128105294816, "loss": 3.7024, "step": 1190 }, { "epoch": 0.06962778148481244, "grad_norm": 0.1829235851764679, "learning_rate": 0.0007991639452402816, "loss": 3.6982, "step": 1200 }, { "epoch": 0.07020801299718588, "grad_norm": 0.19115886092185974, "learning_rate": 0.0007991136101876199, "loss": 3.6917, "step": 1210 }, { "epoch": 0.07078824450955931, "grad_norm": 0.285118043422699, "learning_rate": 0.0007990618055568354, "loss": 3.7085, "step": 1220 }, { "epoch": 0.07136847602193275, "grad_norm": 0.13975660502910614, "learning_rate": 0.0007990085315386782, "loss": 3.6949, "step": 1230 }, { "epoch": 0.07194870753430618, "grad_norm": 0.1724250465631485, "learning_rate": 0.0007989537883293089, "loss": 3.6793, "step": 1240 }, { "epoch": 0.07252893904667962, "grad_norm": 0.1736043244600296, "learning_rate": 0.0007988975761302977, "loss": 3.6598, "step": 1250 }, { "epoch": 0.07310917055905307, "grad_norm": 0.1797361671924591, "learning_rate": 0.0007988398951486238, "loss": 3.6595, "step": 1260 }, { "epoch": 0.0736894020714265, "grad_norm": 0.19887861609458923, "learning_rate": 0.0007987807455966746, "loss": 3.6594, "step": 1270 }, { "epoch": 0.07426963358379994, "grad_norm": 0.16201245784759521, "learning_rate": 0.0007987201276922447, "loss": 3.6473, "step": 1280 }, { "epoch": 0.07484986509617338, "grad_norm": 0.16996973752975464, "learning_rate": 0.000798658041658536, "loss": 3.6488, "step": 1290 }, { "epoch": 0.07543009660854681, "grad_norm": 0.1770084798336029, "learning_rate": 0.0007985944877241557, "loss": 3.6434, "step": 1300 }, { "epoch": 0.07601032812092025, "grad_norm": 0.1701856404542923, "learning_rate": 0.0007985294661231159, "loss": 3.6482, "step": 1310 }, { "epoch": 0.07659055963329368, "grad_norm": 0.16800357401371002, "learning_rate": 0.0007984629770948329, "loss": 3.6301, "step": 1320 }, { "epoch": 0.07717079114566712, "grad_norm": 0.20723000168800354, "learning_rate": 0.0007983950208841264, "loss": 3.6221, "step": 1330 }, { "epoch": 0.07775102265804056, "grad_norm": 0.17594030499458313, "learning_rate": 0.0007983255977412182, "loss": 3.6214, "step": 1340 }, { "epoch": 0.07833125417041399, "grad_norm": 0.1696191132068634, "learning_rate": 0.0007982547079217316, "loss": 3.6193, "step": 1350 }, { "epoch": 0.07891148568278743, "grad_norm": 0.1643722951412201, "learning_rate": 0.0007981823516866904, "loss": 3.6155, "step": 1360 }, { "epoch": 0.07949171719516086, "grad_norm": 0.1793804168701172, "learning_rate": 0.0007981085293025179, "loss": 3.6091, "step": 1370 }, { "epoch": 0.08007194870753431, "grad_norm": 0.17910826206207275, "learning_rate": 0.0007980332410410357, "loss": 3.6108, "step": 1380 }, { "epoch": 0.08065218021990775, "grad_norm": 0.15728121995925903, "learning_rate": 0.0007979564871794632, "loss": 3.5945, "step": 1390 }, { "epoch": 0.08123241173228118, "grad_norm": 0.16888955235481262, "learning_rate": 0.0007978782680004159, "loss": 3.6014, "step": 1400 }, { "epoch": 0.08181264324465462, "grad_norm": 0.17229601740837097, "learning_rate": 0.0007977985837919053, "loss": 3.5951, "step": 1410 }, { "epoch": 0.08239287475702806, "grad_norm": 0.15012314915657043, "learning_rate": 0.0007977174348473369, "loss": 3.5914, "step": 1420 }, { "epoch": 0.08297310626940149, "grad_norm": 0.17115098237991333, "learning_rate": 0.0007976348214655095, "loss": 3.59, "step": 1430 }, { "epoch": 0.08355333778177493, "grad_norm": 0.1689465194940567, "learning_rate": 0.0007975507439506146, "loss": 3.5698, "step": 1440 }, { "epoch": 0.08413356929414836, "grad_norm": 0.18002712726593018, "learning_rate": 0.0007974652026122339, "loss": 3.564, "step": 1450 }, { "epoch": 0.0847138008065218, "grad_norm": 0.15876726806163788, "learning_rate": 0.00079737819776534, "loss": 3.5663, "step": 1460 }, { "epoch": 0.08529403231889524, "grad_norm": 0.16440439224243164, "learning_rate": 0.0007972897297302936, "loss": 3.5736, "step": 1470 }, { "epoch": 0.08587426383126867, "grad_norm": 0.17444723844528198, "learning_rate": 0.0007971997988328433, "loss": 3.5654, "step": 1480 }, { "epoch": 0.08645449534364211, "grad_norm": 0.15380242466926575, "learning_rate": 0.0007971084054041243, "loss": 3.5619, "step": 1490 }, { "epoch": 0.08703472685601556, "grad_norm": 0.15826934576034546, "learning_rate": 0.0007970155497806564, "loss": 3.5484, "step": 1500 }, { "epoch": 0.08761495836838899, "grad_norm": 0.18467594683170319, "learning_rate": 0.0007969212323043442, "loss": 3.5422, "step": 1510 }, { "epoch": 0.08819518988076243, "grad_norm": 0.15650728344917297, "learning_rate": 0.0007968254533224741, "loss": 3.5374, "step": 1520 }, { "epoch": 0.08877542139313586, "grad_norm": 0.153532475233078, "learning_rate": 0.0007967282131877146, "loss": 3.5459, "step": 1530 }, { "epoch": 0.0893556529055093, "grad_norm": 0.15811505913734436, "learning_rate": 0.0007966295122581139, "loss": 3.5297, "step": 1540 }, { "epoch": 0.08993588441788274, "grad_norm": 0.17548500001430511, "learning_rate": 0.0007965293508970995, "loss": 3.5323, "step": 1550 }, { "epoch": 0.09051611593025617, "grad_norm": 0.16509811580181122, "learning_rate": 0.0007964277294734755, "loss": 3.5329, "step": 1560 }, { "epoch": 0.09109634744262961, "grad_norm": 0.17707116901874542, "learning_rate": 0.0007963246483614231, "loss": 3.5294, "step": 1570 }, { "epoch": 0.09167657895500304, "grad_norm": 0.19381259381771088, "learning_rate": 0.0007962201079404974, "loss": 3.5281, "step": 1580 }, { "epoch": 0.09225681046737648, "grad_norm": 0.1358375996351242, "learning_rate": 0.0007961141085956275, "loss": 3.5087, "step": 1590 }, { "epoch": 0.09283704197974992, "grad_norm": 0.1562700718641281, "learning_rate": 0.0007960066507171139, "loss": 3.5138, "step": 1600 }, { "epoch": 0.09341727349212335, "grad_norm": 0.17380204796791077, "learning_rate": 0.0007958977347006276, "loss": 3.5167, "step": 1610 }, { "epoch": 0.0939975050044968, "grad_norm": 0.1646881252527237, "learning_rate": 0.0007957873609472092, "loss": 3.5136, "step": 1620 }, { "epoch": 0.09457773651687024, "grad_norm": 0.1501435488462448, "learning_rate": 0.0007956755298632661, "loss": 3.5039, "step": 1630 }, { "epoch": 0.09515796802924367, "grad_norm": 0.17160342633724213, "learning_rate": 0.0007955622418605722, "loss": 3.5067, "step": 1640 }, { "epoch": 0.09573819954161711, "grad_norm": 0.18148139119148254, "learning_rate": 0.0007954474973562658, "loss": 3.5095, "step": 1650 }, { "epoch": 0.09631843105399054, "grad_norm": 0.14332358539104462, "learning_rate": 0.0007953312967728481, "loss": 3.5016, "step": 1660 }, { "epoch": 0.09689866256636398, "grad_norm": 0.17075377702713013, "learning_rate": 0.0007952136405381819, "loss": 3.4886, "step": 1670 }, { "epoch": 0.09747889407873742, "grad_norm": 0.15740430355072021, "learning_rate": 0.0007950945290854897, "loss": 3.4912, "step": 1680 }, { "epoch": 0.09805912559111085, "grad_norm": 0.15826788544654846, "learning_rate": 0.0007949739628533523, "loss": 3.4975, "step": 1690 }, { "epoch": 0.09863935710348429, "grad_norm": 0.15989266335964203, "learning_rate": 0.0007948519422857075, "loss": 3.484, "step": 1700 }, { "epoch": 0.09921958861585772, "grad_norm": 0.1579332947731018, "learning_rate": 0.0007947284678318475, "loss": 3.484, "step": 1710 }, { "epoch": 0.09979982012823116, "grad_norm": 0.1635797768831253, "learning_rate": 0.0007946035399464184, "loss": 3.4871, "step": 1720 }, { "epoch": 0.1003800516406046, "grad_norm": 0.14496110379695892, "learning_rate": 0.0007944771590894175, "loss": 3.4796, "step": 1730 }, { "epoch": 0.10096028315297804, "grad_norm": 0.15867535769939423, "learning_rate": 0.0007943493257261924, "loss": 3.4776, "step": 1740 }, { "epoch": 0.10154051466535148, "grad_norm": 0.17466506361961365, "learning_rate": 0.0007942200403274392, "loss": 3.4817, "step": 1750 }, { "epoch": 0.10212074617772492, "grad_norm": 0.17878016829490662, "learning_rate": 0.0007940893033692, "loss": 3.4582, "step": 1760 }, { "epoch": 0.10270097769009835, "grad_norm": 0.19181516766548157, "learning_rate": 0.0007939571153328622, "loss": 3.4704, "step": 1770 }, { "epoch": 0.10328120920247179, "grad_norm": 0.1467759758234024, "learning_rate": 0.0007938234767051559, "loss": 3.4627, "step": 1780 }, { "epoch": 0.10386144071484522, "grad_norm": 0.14268510043621063, "learning_rate": 0.0007936883879781524, "loss": 3.4737, "step": 1790 }, { "epoch": 0.10444167222721866, "grad_norm": 0.22379091382026672, "learning_rate": 0.0007935518496492627, "loss": 3.4569, "step": 1800 }, { "epoch": 0.1050219037395921, "grad_norm": 0.15746822953224182, "learning_rate": 0.0007934138622212352, "loss": 3.4557, "step": 1810 }, { "epoch": 0.10560213525196553, "grad_norm": 0.15994954109191895, "learning_rate": 0.000793274426202154, "loss": 3.4563, "step": 1820 }, { "epoch": 0.10618236676433897, "grad_norm": 0.18349260091781616, "learning_rate": 0.0007931335421054373, "loss": 3.4568, "step": 1830 }, { "epoch": 0.1067625982767124, "grad_norm": 0.1704210340976715, "learning_rate": 0.0007929912104498348, "loss": 3.4526, "step": 1840 }, { "epoch": 0.10734282978908584, "grad_norm": 0.1595371514558792, "learning_rate": 0.0007928474317594269, "loss": 3.4438, "step": 1850 }, { "epoch": 0.10792306130145929, "grad_norm": 0.15416350960731506, "learning_rate": 0.0007927022065636216, "loss": 3.4525, "step": 1860 }, { "epoch": 0.10850329281383272, "grad_norm": 0.16923953592777252, "learning_rate": 0.0007925555353971534, "loss": 3.4509, "step": 1870 }, { "epoch": 0.10908352432620616, "grad_norm": 0.15602600574493408, "learning_rate": 0.0007924074188000807, "loss": 3.4403, "step": 1880 }, { "epoch": 0.1096637558385796, "grad_norm": 0.18738558888435364, "learning_rate": 0.0007922578573177846, "loss": 3.4459, "step": 1890 }, { "epoch": 0.11024398735095303, "grad_norm": 0.1813129186630249, "learning_rate": 0.000792106851500966, "loss": 3.4349, "step": 1900 }, { "epoch": 0.11082421886332647, "grad_norm": 0.1548515260219574, "learning_rate": 0.0007919544019056441, "loss": 3.4311, "step": 1910 }, { "epoch": 0.1114044503756999, "grad_norm": 0.15673480927944183, "learning_rate": 0.0007918005090931543, "loss": 3.4254, "step": 1920 }, { "epoch": 0.11198468188807334, "grad_norm": 0.1796632558107376, "learning_rate": 0.000791645173630146, "loss": 3.4285, "step": 1930 }, { "epoch": 0.11256491340044678, "grad_norm": 0.1732536405324936, "learning_rate": 0.0007914883960885808, "loss": 3.4351, "step": 1940 }, { "epoch": 0.11314514491282021, "grad_norm": 0.15486814081668854, "learning_rate": 0.00079133017704573, "loss": 3.4121, "step": 1950 }, { "epoch": 0.11372537642519365, "grad_norm": 0.1571420431137085, "learning_rate": 0.0007911705170841727, "loss": 3.4077, "step": 1960 }, { "epoch": 0.11430560793756708, "grad_norm": 0.1603892296552658, "learning_rate": 0.0007910094167917933, "loss": 3.4191, "step": 1970 }, { "epoch": 0.11488583944994053, "grad_norm": 0.15975409746170044, "learning_rate": 0.0007908468767617805, "loss": 3.4258, "step": 1980 }, { "epoch": 0.11546607096231397, "grad_norm": 0.16867893934249878, "learning_rate": 0.0007906828975926233, "loss": 3.4162, "step": 1990 }, { "epoch": 0.1160463024746874, "grad_norm": 0.16505342721939087, "learning_rate": 0.0007905174798881103, "loss": 3.421, "step": 2000 }, { "epoch": 0.1160463024746874, "eval_loss": 3.3706634044647217, "eval_runtime": 3.5135, "eval_samples_per_second": 1232.376, "eval_steps_per_second": 4.838, "step": 2000 }, { "epoch": 0.11662653398706084, "grad_norm": 0.15656636655330658, "learning_rate": 0.0007903506242573268, "loss": 3.4124, "step": 2010 }, { "epoch": 0.11720676549943428, "grad_norm": 0.16360080242156982, "learning_rate": 0.0007901823313146529, "loss": 3.4098, "step": 2020 }, { "epoch": 0.11778699701180771, "grad_norm": 0.17425882816314697, "learning_rate": 0.0007900126016797604, "loss": 3.4109, "step": 2030 }, { "epoch": 0.11836722852418115, "grad_norm": 0.17185156047344208, "learning_rate": 0.000789841435977612, "loss": 3.3932, "step": 2040 }, { "epoch": 0.11894746003655458, "grad_norm": 0.1598571389913559, "learning_rate": 0.0007896688348384576, "loss": 3.4023, "step": 2050 }, { "epoch": 0.11952769154892802, "grad_norm": 0.1520805060863495, "learning_rate": 0.0007894947988978327, "loss": 3.4118, "step": 2060 }, { "epoch": 0.12010792306130146, "grad_norm": 0.1758430153131485, "learning_rate": 0.000789319328796556, "loss": 3.4096, "step": 2070 }, { "epoch": 0.12068815457367489, "grad_norm": 0.14718368649482727, "learning_rate": 0.0007891424251807264, "loss": 3.392, "step": 2080 }, { "epoch": 0.12126838608604833, "grad_norm": 0.18458478152751923, "learning_rate": 0.0007889640887017219, "loss": 3.3979, "step": 2090 }, { "epoch": 0.12184861759842178, "grad_norm": 0.15048211812973022, "learning_rate": 0.000788784320016196, "loss": 3.3915, "step": 2100 }, { "epoch": 0.12242884911079521, "grad_norm": 0.17065855860710144, "learning_rate": 0.000788603119786076, "loss": 3.3989, "step": 2110 }, { "epoch": 0.12300908062316865, "grad_norm": 0.15301364660263062, "learning_rate": 0.0007884204886785598, "loss": 3.4006, "step": 2120 }, { "epoch": 0.12358931213554208, "grad_norm": 0.14875450730323792, "learning_rate": 0.0007882364273661145, "loss": 3.3869, "step": 2130 }, { "epoch": 0.12416954364791552, "grad_norm": 0.17466358840465546, "learning_rate": 0.0007880509365264729, "loss": 3.3902, "step": 2140 }, { "epoch": 0.12474977516028896, "grad_norm": 0.14540773630142212, "learning_rate": 0.000787864016842632, "loss": 3.3867, "step": 2150 }, { "epoch": 0.1253300066726624, "grad_norm": 0.15955834090709686, "learning_rate": 0.0007876756690028495, "loss": 3.3836, "step": 2160 }, { "epoch": 0.12591023818503583, "grad_norm": 0.1485227346420288, "learning_rate": 0.0007874858937006419, "loss": 3.3884, "step": 2170 }, { "epoch": 0.12649046969740926, "grad_norm": 0.15775996446609497, "learning_rate": 0.0007872946916347817, "loss": 3.3855, "step": 2180 }, { "epoch": 0.1270707012097827, "grad_norm": 0.15353278815746307, "learning_rate": 0.000787102063509295, "loss": 3.3804, "step": 2190 }, { "epoch": 0.12765093272215614, "grad_norm": 0.15105192363262177, "learning_rate": 0.0007869080100334588, "loss": 3.3826, "step": 2200 }, { "epoch": 0.12823116423452957, "grad_norm": 0.2278457134962082, "learning_rate": 0.0007867125319217981, "loss": 3.3656, "step": 2210 }, { "epoch": 0.128811395746903, "grad_norm": 0.15736441314220428, "learning_rate": 0.0007865156298940839, "loss": 3.3822, "step": 2220 }, { "epoch": 0.12939162725927644, "grad_norm": 0.16988937556743622, "learning_rate": 0.0007863173046753303, "loss": 3.382, "step": 2230 }, { "epoch": 0.12997185877164988, "grad_norm": 0.16401201486587524, "learning_rate": 0.0007861175569957913, "loss": 3.3703, "step": 2240 }, { "epoch": 0.13055209028402331, "grad_norm": 0.15762518346309662, "learning_rate": 0.0007859163875909588, "loss": 3.3706, "step": 2250 }, { "epoch": 0.13113232179639675, "grad_norm": 0.176237553358078, "learning_rate": 0.0007857137972015596, "loss": 3.3637, "step": 2260 }, { "epoch": 0.13171255330877019, "grad_norm": 0.18969546258449554, "learning_rate": 0.0007855097865735529, "loss": 3.3611, "step": 2270 }, { "epoch": 0.13229278482114365, "grad_norm": 0.15781009197235107, "learning_rate": 0.000785304356458127, "loss": 3.3694, "step": 2280 }, { "epoch": 0.13287301633351709, "grad_norm": 0.15106526017189026, "learning_rate": 0.0007850975076116973, "loss": 3.3578, "step": 2290 }, { "epoch": 0.13345324784589052, "grad_norm": 0.1673547476530075, "learning_rate": 0.0007848892407959027, "loss": 3.3598, "step": 2300 }, { "epoch": 0.13403347935826396, "grad_norm": 0.1558987945318222, "learning_rate": 0.0007846795567776037, "loss": 3.3683, "step": 2310 }, { "epoch": 0.1346137108706374, "grad_norm": 0.16403828561306, "learning_rate": 0.0007844684563288786, "loss": 3.3497, "step": 2320 }, { "epoch": 0.13519394238301083, "grad_norm": 0.14873118698596954, "learning_rate": 0.0007842559402270213, "loss": 3.3534, "step": 2330 }, { "epoch": 0.13577417389538426, "grad_norm": 0.1887115091085434, "learning_rate": 0.0007840420092545386, "loss": 3.3528, "step": 2340 }, { "epoch": 0.1363544054077577, "grad_norm": 0.15806806087493896, "learning_rate": 0.0007838266641991467, "loss": 3.3522, "step": 2350 }, { "epoch": 0.13693463692013114, "grad_norm": 0.1499200314283371, "learning_rate": 0.0007836099058537686, "loss": 3.3459, "step": 2360 }, { "epoch": 0.13751486843250457, "grad_norm": 0.1501178741455078, "learning_rate": 0.0007833917350165313, "loss": 3.3545, "step": 2370 }, { "epoch": 0.138095099944878, "grad_norm": 0.18025530874729156, "learning_rate": 0.0007831721524907625, "loss": 3.3455, "step": 2380 }, { "epoch": 0.13867533145725144, "grad_norm": 0.16151362657546997, "learning_rate": 0.0007829511590849882, "loss": 3.3324, "step": 2390 }, { "epoch": 0.13925556296962488, "grad_norm": 0.17848755419254303, "learning_rate": 0.0007827287556129292, "loss": 3.3386, "step": 2400 }, { "epoch": 0.13983579448199832, "grad_norm": 0.1609233319759369, "learning_rate": 0.0007825049428934985, "loss": 3.3316, "step": 2410 }, { "epoch": 0.14041602599437175, "grad_norm": 0.1828456073999405, "learning_rate": 0.0007822797217507978, "loss": 3.3386, "step": 2420 }, { "epoch": 0.1409962575067452, "grad_norm": 0.15143896639347076, "learning_rate": 0.0007820530930141152, "loss": 3.3475, "step": 2430 }, { "epoch": 0.14157648901911862, "grad_norm": 0.15408799052238464, "learning_rate": 0.0007818250575179211, "loss": 3.3437, "step": 2440 }, { "epoch": 0.14215672053149206, "grad_norm": 0.1719510555267334, "learning_rate": 0.0007815956161018664, "loss": 3.3345, "step": 2450 }, { "epoch": 0.1427369520438655, "grad_norm": 0.19798249006271362, "learning_rate": 0.0007813647696107783, "loss": 3.3467, "step": 2460 }, { "epoch": 0.14331718355623893, "grad_norm": 0.139098659157753, "learning_rate": 0.0007811325188946578, "loss": 3.3497, "step": 2470 }, { "epoch": 0.14389741506861237, "grad_norm": 0.16338998079299927, "learning_rate": 0.0007808988648086763, "loss": 3.3298, "step": 2480 }, { "epoch": 0.1444776465809858, "grad_norm": 0.154744490981102, "learning_rate": 0.0007806638082131726, "loss": 3.3264, "step": 2490 }, { "epoch": 0.14505787809335924, "grad_norm": 0.1869029700756073, "learning_rate": 0.0007804273499736499, "loss": 3.3287, "step": 2500 }, { "epoch": 0.14563810960573267, "grad_norm": 0.17387431859970093, "learning_rate": 0.000780189490960772, "loss": 3.3321, "step": 2510 }, { "epoch": 0.14621834111810614, "grad_norm": 0.15834355354309082, "learning_rate": 0.0007799502320503609, "loss": 3.3351, "step": 2520 }, { "epoch": 0.14679857263047957, "grad_norm": 0.17102797329425812, "learning_rate": 0.0007797095741233927, "loss": 3.3287, "step": 2530 }, { "epoch": 0.147378804142853, "grad_norm": 0.1568601429462433, "learning_rate": 0.0007794675180659953, "loss": 3.3222, "step": 2540 }, { "epoch": 0.14795903565522645, "grad_norm": 0.16121257841587067, "learning_rate": 0.0007792240647694444, "loss": 3.3271, "step": 2550 }, { "epoch": 0.14853926716759988, "grad_norm": 0.163092240691185, "learning_rate": 0.0007789792151301605, "loss": 3.3268, "step": 2560 }, { "epoch": 0.14911949867997332, "grad_norm": 0.1544051468372345, "learning_rate": 0.0007787329700497058, "loss": 3.3192, "step": 2570 }, { "epoch": 0.14969973019234675, "grad_norm": 0.15779772400856018, "learning_rate": 0.0007784853304347802, "loss": 3.3189, "step": 2580 }, { "epoch": 0.1502799617047202, "grad_norm": 0.17438490688800812, "learning_rate": 0.0007782362971972191, "loss": 3.3112, "step": 2590 }, { "epoch": 0.15086019321709362, "grad_norm": 0.14386020600795746, "learning_rate": 0.0007779858712539885, "loss": 3.3104, "step": 2600 }, { "epoch": 0.15144042472946706, "grad_norm": 0.18188470602035522, "learning_rate": 0.0007777340535271833, "loss": 3.3044, "step": 2610 }, { "epoch": 0.1520206562418405, "grad_norm": 0.15126991271972656, "learning_rate": 0.0007774808449440226, "loss": 3.3138, "step": 2620 }, { "epoch": 0.15260088775421393, "grad_norm": 0.19542071223258972, "learning_rate": 0.000777226246436847, "loss": 3.314, "step": 2630 }, { "epoch": 0.15318111926658737, "grad_norm": 0.16422171890735626, "learning_rate": 0.0007769702589431149, "loss": 3.3102, "step": 2640 }, { "epoch": 0.1537613507789608, "grad_norm": 0.17245811223983765, "learning_rate": 0.0007767128834053989, "loss": 3.3111, "step": 2650 }, { "epoch": 0.15434158229133424, "grad_norm": 0.152163565158844, "learning_rate": 0.0007764541207713828, "loss": 3.3109, "step": 2660 }, { "epoch": 0.15492181380370768, "grad_norm": 0.18762722611427307, "learning_rate": 0.0007761939719938577, "loss": 3.305, "step": 2670 }, { "epoch": 0.1555020453160811, "grad_norm": 0.14804594218730927, "learning_rate": 0.0007759324380307187, "loss": 3.2969, "step": 2680 }, { "epoch": 0.15608227682845455, "grad_norm": 0.1481277346611023, "learning_rate": 0.0007756695198449613, "loss": 3.3079, "step": 2690 }, { "epoch": 0.15666250834082798, "grad_norm": 0.1659483164548874, "learning_rate": 0.0007754052184046777, "loss": 3.2927, "step": 2700 }, { "epoch": 0.15724273985320142, "grad_norm": 0.1500425487756729, "learning_rate": 0.0007751395346830535, "loss": 3.3052, "step": 2710 }, { "epoch": 0.15782297136557485, "grad_norm": 0.1832876354455948, "learning_rate": 0.000774872469658364, "loss": 3.298, "step": 2720 }, { "epoch": 0.1584032028779483, "grad_norm": 0.1515001356601715, "learning_rate": 0.0007746040243139707, "loss": 3.2962, "step": 2730 }, { "epoch": 0.15898343439032173, "grad_norm": 0.17293982207775116, "learning_rate": 0.0007743341996383173, "loss": 3.3018, "step": 2740 }, { "epoch": 0.15956366590269516, "grad_norm": 0.21347884833812714, "learning_rate": 0.0007740629966249266, "loss": 3.2926, "step": 2750 }, { "epoch": 0.16014389741506863, "grad_norm": 0.16939662396907806, "learning_rate": 0.0007737904162723965, "loss": 3.3017, "step": 2760 }, { "epoch": 0.16072412892744206, "grad_norm": 0.17285092175006866, "learning_rate": 0.0007735164595843966, "loss": 3.299, "step": 2770 }, { "epoch": 0.1613043604398155, "grad_norm": 0.15439069271087646, "learning_rate": 0.0007732411275696638, "loss": 3.2837, "step": 2780 }, { "epoch": 0.16188459195218893, "grad_norm": 0.16417156159877777, "learning_rate": 0.0007729644212419997, "loss": 3.2933, "step": 2790 }, { "epoch": 0.16246482346456237, "grad_norm": 0.17124611139297485, "learning_rate": 0.0007726863416202658, "loss": 3.2876, "step": 2800 }, { "epoch": 0.1630450549769358, "grad_norm": 0.14375852048397064, "learning_rate": 0.0007724068897283804, "loss": 3.2908, "step": 2810 }, { "epoch": 0.16362528648930924, "grad_norm": 0.18919309973716736, "learning_rate": 0.0007721260665953148, "loss": 3.2899, "step": 2820 }, { "epoch": 0.16420551800168268, "grad_norm": 0.1679265946149826, "learning_rate": 0.000771843873255089, "loss": 3.2908, "step": 2830 }, { "epoch": 0.1647857495140561, "grad_norm": 0.1555187702178955, "learning_rate": 0.0007715603107467686, "loss": 3.2853, "step": 2840 }, { "epoch": 0.16536598102642955, "grad_norm": 0.15940934419631958, "learning_rate": 0.0007712753801144605, "loss": 3.2826, "step": 2850 }, { "epoch": 0.16594621253880298, "grad_norm": 0.1721728891134262, "learning_rate": 0.0007709890824073089, "loss": 3.2859, "step": 2860 }, { "epoch": 0.16652644405117642, "grad_norm": 0.14665627479553223, "learning_rate": 0.0007707014186794921, "loss": 3.2787, "step": 2870 }, { "epoch": 0.16710667556354986, "grad_norm": 0.16204817593097687, "learning_rate": 0.0007704123899902181, "loss": 3.2733, "step": 2880 }, { "epoch": 0.1676869070759233, "grad_norm": 0.17132225632667542, "learning_rate": 0.0007701219974037207, "loss": 3.2787, "step": 2890 }, { "epoch": 0.16826713858829673, "grad_norm": 0.16214324533939362, "learning_rate": 0.000769830241989256, "loss": 3.2762, "step": 2900 }, { "epoch": 0.16884737010067016, "grad_norm": 0.164068341255188, "learning_rate": 0.0007695371248210978, "loss": 3.282, "step": 2910 }, { "epoch": 0.1694276016130436, "grad_norm": 0.15656143426895142, "learning_rate": 0.0007692426469785345, "loss": 3.2735, "step": 2920 }, { "epoch": 0.17000783312541703, "grad_norm": 0.15498484671115875, "learning_rate": 0.000768946809545864, "loss": 3.2737, "step": 2930 }, { "epoch": 0.17058806463779047, "grad_norm": 0.15789471566677094, "learning_rate": 0.0007686496136123909, "loss": 3.2717, "step": 2940 }, { "epoch": 0.1711682961501639, "grad_norm": 0.15315581858158112, "learning_rate": 0.0007683510602724218, "loss": 3.2707, "step": 2950 }, { "epoch": 0.17174852766253734, "grad_norm": 0.15418575704097748, "learning_rate": 0.0007680511506252615, "loss": 3.275, "step": 2960 }, { "epoch": 0.17232875917491078, "grad_norm": 0.15866506099700928, "learning_rate": 0.0007677498857752084, "loss": 3.2747, "step": 2970 }, { "epoch": 0.17290899068728421, "grad_norm": 0.15650290250778198, "learning_rate": 0.0007674472668315514, "loss": 3.2665, "step": 2980 }, { "epoch": 0.17348922219965765, "grad_norm": 0.14542441070079803, "learning_rate": 0.0007671432949085652, "loss": 3.2647, "step": 2990 }, { "epoch": 0.1740694537120311, "grad_norm": 0.1615607589483261, "learning_rate": 0.0007668379711255063, "loss": 3.279, "step": 3000 }, { "epoch": 0.1740694537120311, "eval_loss": 3.219675302505493, "eval_runtime": 3.4975, "eval_samples_per_second": 1238.02, "eval_steps_per_second": 4.861, "step": 3000 }, { "epoch": 0.17464968522440455, "grad_norm": 0.17387358844280243, "learning_rate": 0.0007665312966066087, "loss": 3.2614, "step": 3010 }, { "epoch": 0.17522991673677799, "grad_norm": 0.1694205105304718, "learning_rate": 0.0007662232724810802, "loss": 3.2644, "step": 3020 }, { "epoch": 0.17581014824915142, "grad_norm": 0.18072472512722015, "learning_rate": 0.000765913899883098, "loss": 3.2664, "step": 3030 }, { "epoch": 0.17639037976152486, "grad_norm": 0.14656253159046173, "learning_rate": 0.0007656031799518042, "loss": 3.2651, "step": 3040 }, { "epoch": 0.1769706112738983, "grad_norm": 0.15717099606990814, "learning_rate": 0.0007652911138313022, "loss": 3.2697, "step": 3050 }, { "epoch": 0.17755084278627173, "grad_norm": 0.15262934565544128, "learning_rate": 0.0007649777026706523, "loss": 3.2673, "step": 3060 }, { "epoch": 0.17813107429864516, "grad_norm": 0.14670057594776154, "learning_rate": 0.0007646629476238669, "loss": 3.264, "step": 3070 }, { "epoch": 0.1787113058110186, "grad_norm": 0.15649975836277008, "learning_rate": 0.0007643468498499075, "loss": 3.2582, "step": 3080 }, { "epoch": 0.17929153732339204, "grad_norm": 0.16560803353786469, "learning_rate": 0.0007640294105126788, "loss": 3.2579, "step": 3090 }, { "epoch": 0.17987176883576547, "grad_norm": 0.18301266431808472, "learning_rate": 0.0007637106307810261, "loss": 3.2663, "step": 3100 }, { "epoch": 0.1804520003481389, "grad_norm": 0.16271339356899261, "learning_rate": 0.0007633905118287293, "loss": 3.2444, "step": 3110 }, { "epoch": 0.18103223186051234, "grad_norm": 0.17710572481155396, "learning_rate": 0.0007630690548345003, "loss": 3.2561, "step": 3120 }, { "epoch": 0.18161246337288578, "grad_norm": 0.17552264034748077, "learning_rate": 0.0007627462609819774, "loss": 3.255, "step": 3130 }, { "epoch": 0.18219269488525922, "grad_norm": 0.1480962038040161, "learning_rate": 0.000762422131459721, "loss": 3.2549, "step": 3140 }, { "epoch": 0.18277292639763265, "grad_norm": 0.16545799374580383, "learning_rate": 0.0007620966674612104, "loss": 3.2437, "step": 3150 }, { "epoch": 0.1833531579100061, "grad_norm": 0.16670258343219757, "learning_rate": 0.0007617698701848378, "loss": 3.2568, "step": 3160 }, { "epoch": 0.18393338942237952, "grad_norm": 0.1615840345621109, "learning_rate": 0.0007614417408339054, "loss": 3.2476, "step": 3170 }, { "epoch": 0.18451362093475296, "grad_norm": 0.1525503695011139, "learning_rate": 0.0007611122806166194, "loss": 3.2508, "step": 3180 }, { "epoch": 0.1850938524471264, "grad_norm": 0.16565243899822235, "learning_rate": 0.0007607814907460872, "loss": 3.2441, "step": 3190 }, { "epoch": 0.18567408395949983, "grad_norm": 0.1544932723045349, "learning_rate": 0.0007604493724403115, "loss": 3.2474, "step": 3200 }, { "epoch": 0.18625431547187327, "grad_norm": 0.1691979318857193, "learning_rate": 0.0007601159269221868, "loss": 3.247, "step": 3210 }, { "epoch": 0.1868345469842467, "grad_norm": 0.16041205823421478, "learning_rate": 0.0007597811554194945, "loss": 3.2453, "step": 3220 }, { "epoch": 0.18741477849662014, "grad_norm": 0.14651961624622345, "learning_rate": 0.0007594450591648982, "loss": 3.2413, "step": 3230 }, { "epoch": 0.1879950100089936, "grad_norm": 0.16929645836353302, "learning_rate": 0.0007591076393959396, "loss": 3.2378, "step": 3240 }, { "epoch": 0.18857524152136704, "grad_norm": 0.16702421009540558, "learning_rate": 0.0007587688973550335, "loss": 3.2465, "step": 3250 }, { "epoch": 0.18915547303374047, "grad_norm": 0.1797904670238495, "learning_rate": 0.0007584288342894639, "loss": 3.2411, "step": 3260 }, { "epoch": 0.1897357045461139, "grad_norm": 0.16431447863578796, "learning_rate": 0.0007580874514513784, "loss": 3.227, "step": 3270 }, { "epoch": 0.19031593605848734, "grad_norm": 0.1436549872159958, "learning_rate": 0.0007577447500977844, "loss": 3.2415, "step": 3280 }, { "epoch": 0.19089616757086078, "grad_norm": 0.16937850415706635, "learning_rate": 0.0007574007314905441, "loss": 3.2432, "step": 3290 }, { "epoch": 0.19147639908323422, "grad_norm": 0.16476495563983917, "learning_rate": 0.0007570553968963701, "loss": 3.2408, "step": 3300 }, { "epoch": 0.19205663059560765, "grad_norm": 0.14324772357940674, "learning_rate": 0.0007567087475868207, "loss": 3.2314, "step": 3310 }, { "epoch": 0.1926368621079811, "grad_norm": 0.1664527803659439, "learning_rate": 0.0007563607848382948, "loss": 3.2295, "step": 3320 }, { "epoch": 0.19321709362035452, "grad_norm": 0.14981228113174438, "learning_rate": 0.0007560115099320278, "loss": 3.2426, "step": 3330 }, { "epoch": 0.19379732513272796, "grad_norm": 0.1476903110742569, "learning_rate": 0.0007556609241540864, "loss": 3.2294, "step": 3340 }, { "epoch": 0.1943775566451014, "grad_norm": 0.1611299067735672, "learning_rate": 0.0007553090287953642, "loss": 3.2408, "step": 3350 }, { "epoch": 0.19495778815747483, "grad_norm": 0.15556120872497559, "learning_rate": 0.0007549558251515769, "loss": 3.2244, "step": 3360 }, { "epoch": 0.19553801966984827, "grad_norm": 0.1537899374961853, "learning_rate": 0.0007546013145232572, "loss": 3.2359, "step": 3370 }, { "epoch": 0.1961182511822217, "grad_norm": 0.15119978785514832, "learning_rate": 0.0007542454982157503, "loss": 3.2316, "step": 3380 }, { "epoch": 0.19669848269459514, "grad_norm": 0.16069358587265015, "learning_rate": 0.0007538883775392092, "loss": 3.2321, "step": 3390 }, { "epoch": 0.19727871420696858, "grad_norm": 0.1364680528640747, "learning_rate": 0.0007535299538085896, "loss": 3.2247, "step": 3400 }, { "epoch": 0.197858945719342, "grad_norm": 0.1613670140504837, "learning_rate": 0.0007531702283436453, "loss": 3.2293, "step": 3410 }, { "epoch": 0.19843917723171545, "grad_norm": 0.1605822890996933, "learning_rate": 0.0007528092024689229, "loss": 3.2264, "step": 3420 }, { "epoch": 0.19901940874408888, "grad_norm": 0.1686430126428604, "learning_rate": 0.000752446877513758, "loss": 3.2212, "step": 3430 }, { "epoch": 0.19959964025646232, "grad_norm": 0.1641404628753662, "learning_rate": 0.0007520832548122686, "loss": 3.2151, "step": 3440 }, { "epoch": 0.20017987176883575, "grad_norm": 0.15074734389781952, "learning_rate": 0.0007517183357033517, "loss": 3.2269, "step": 3450 }, { "epoch": 0.2007601032812092, "grad_norm": 0.1816314458847046, "learning_rate": 0.0007513521215306776, "loss": 3.2218, "step": 3460 }, { "epoch": 0.20134033479358263, "grad_norm": 0.17646701633930206, "learning_rate": 0.0007509846136426856, "loss": 3.2226, "step": 3470 }, { "epoch": 0.2019205663059561, "grad_norm": 0.14577239751815796, "learning_rate": 0.0007506158133925777, "loss": 3.2231, "step": 3480 }, { "epoch": 0.20250079781832953, "grad_norm": 0.16330170631408691, "learning_rate": 0.0007502457221383156, "loss": 3.2189, "step": 3490 }, { "epoch": 0.20308102933070296, "grad_norm": 0.1625996083021164, "learning_rate": 0.0007498743412426137, "loss": 3.2264, "step": 3500 }, { "epoch": 0.2036612608430764, "grad_norm": 0.18696647882461548, "learning_rate": 0.0007495016720729354, "loss": 3.2106, "step": 3510 }, { "epoch": 0.20424149235544983, "grad_norm": 0.14726507663726807, "learning_rate": 0.0007491277160014878, "loss": 3.2119, "step": 3520 }, { "epoch": 0.20482172386782327, "grad_norm": 0.15068233013153076, "learning_rate": 0.0007487524744052162, "loss": 3.2167, "step": 3530 }, { "epoch": 0.2054019553801967, "grad_norm": 0.14645299315452576, "learning_rate": 0.0007483759486657995, "loss": 3.2111, "step": 3540 }, { "epoch": 0.20598218689257014, "grad_norm": 0.17312663793563843, "learning_rate": 0.0007479981401696449, "loss": 3.2087, "step": 3550 }, { "epoch": 0.20656241840494358, "grad_norm": 0.14498692750930786, "learning_rate": 0.0007476190503078829, "loss": 3.2108, "step": 3560 }, { "epoch": 0.207142649917317, "grad_norm": 0.16870485246181488, "learning_rate": 0.000747238680476362, "loss": 3.2173, "step": 3570 }, { "epoch": 0.20772288142969045, "grad_norm": 0.16222164034843445, "learning_rate": 0.0007468570320756439, "loss": 3.2127, "step": 3580 }, { "epoch": 0.20830311294206388, "grad_norm": 0.16587336361408234, "learning_rate": 0.0007464741065109978, "loss": 3.219, "step": 3590 }, { "epoch": 0.20888334445443732, "grad_norm": 0.15786835551261902, "learning_rate": 0.0007460899051923959, "loss": 3.2096, "step": 3600 }, { "epoch": 0.20946357596681076, "grad_norm": 0.1677330583333969, "learning_rate": 0.0007457044295345076, "loss": 3.2106, "step": 3610 }, { "epoch": 0.2100438074791842, "grad_norm": 0.1512485295534134, "learning_rate": 0.0007453176809566948, "loss": 3.2195, "step": 3620 }, { "epoch": 0.21062403899155763, "grad_norm": 0.14546190202236176, "learning_rate": 0.0007449296608830063, "loss": 3.2074, "step": 3630 }, { "epoch": 0.21120427050393106, "grad_norm": 0.18463167548179626, "learning_rate": 0.0007445403707421724, "loss": 3.2107, "step": 3640 }, { "epoch": 0.2117845020163045, "grad_norm": 0.15126778185367584, "learning_rate": 0.0007441498119676004, "loss": 3.2075, "step": 3650 }, { "epoch": 0.21236473352867793, "grad_norm": 0.15070891380310059, "learning_rate": 0.0007437579859973686, "loss": 3.212, "step": 3660 }, { "epoch": 0.21294496504105137, "grad_norm": 0.15106479823589325, "learning_rate": 0.000743364894274221, "loss": 3.2019, "step": 3670 }, { "epoch": 0.2135251965534248, "grad_norm": 0.15685296058654785, "learning_rate": 0.0007429705382455628, "loss": 3.2068, "step": 3680 }, { "epoch": 0.21410542806579824, "grad_norm": 0.16195222735404968, "learning_rate": 0.0007425749193634543, "loss": 3.2056, "step": 3690 }, { "epoch": 0.21468565957817168, "grad_norm": 0.1727381944656372, "learning_rate": 0.0007421780390846053, "loss": 3.1972, "step": 3700 }, { "epoch": 0.21526589109054511, "grad_norm": 0.1869836002588272, "learning_rate": 0.0007417798988703709, "loss": 3.2045, "step": 3710 }, { "epoch": 0.21584612260291858, "grad_norm": 0.15097570419311523, "learning_rate": 0.0007413805001867449, "loss": 3.2041, "step": 3720 }, { "epoch": 0.216426354115292, "grad_norm": 0.1638958752155304, "learning_rate": 0.0007409798445043551, "loss": 3.2092, "step": 3730 }, { "epoch": 0.21700658562766545, "grad_norm": 0.15390229225158691, "learning_rate": 0.0007405779332984578, "loss": 3.1981, "step": 3740 }, { "epoch": 0.21758681714003889, "grad_norm": 0.15282894670963287, "learning_rate": 0.0007401747680489321, "loss": 3.199, "step": 3750 }, { "epoch": 0.21816704865241232, "grad_norm": 0.15143954753875732, "learning_rate": 0.0007397703502402746, "loss": 3.2032, "step": 3760 }, { "epoch": 0.21874728016478576, "grad_norm": 0.15200908482074738, "learning_rate": 0.000739364681361594, "loss": 3.1899, "step": 3770 }, { "epoch": 0.2193275116771592, "grad_norm": 0.1549234390258789, "learning_rate": 0.0007389577629066059, "loss": 3.2031, "step": 3780 }, { "epoch": 0.21990774318953263, "grad_norm": 0.14660657942295074, "learning_rate": 0.0007385495963736265, "loss": 3.1994, "step": 3790 }, { "epoch": 0.22048797470190606, "grad_norm": 0.14468826353549957, "learning_rate": 0.0007381401832655676, "loss": 3.1943, "step": 3800 }, { "epoch": 0.2210682062142795, "grad_norm": 0.15662981569766998, "learning_rate": 0.0007377295250899314, "loss": 3.2025, "step": 3810 }, { "epoch": 0.22164843772665294, "grad_norm": 0.1512591540813446, "learning_rate": 0.0007373176233588043, "loss": 3.1934, "step": 3820 }, { "epoch": 0.22222866923902637, "grad_norm": 0.14835985004901886, "learning_rate": 0.0007369044795888513, "loss": 3.1953, "step": 3830 }, { "epoch": 0.2228089007513998, "grad_norm": 0.14297959208488464, "learning_rate": 0.0007364900953013115, "loss": 3.1897, "step": 3840 }, { "epoch": 0.22338913226377324, "grad_norm": 0.16208681464195251, "learning_rate": 0.0007360744720219908, "loss": 3.1975, "step": 3850 }, { "epoch": 0.22396936377614668, "grad_norm": 0.15817061066627502, "learning_rate": 0.000735657611281258, "loss": 3.1872, "step": 3860 }, { "epoch": 0.22454959528852012, "grad_norm": 0.14679491519927979, "learning_rate": 0.0007352395146140379, "loss": 3.1845, "step": 3870 }, { "epoch": 0.22512982680089355, "grad_norm": 0.16666549444198608, "learning_rate": 0.0007348201835598062, "loss": 3.1888, "step": 3880 }, { "epoch": 0.225710058313267, "grad_norm": 0.1385534107685089, "learning_rate": 0.0007343996196625838, "loss": 3.1817, "step": 3890 }, { "epoch": 0.22629028982564042, "grad_norm": 0.14590144157409668, "learning_rate": 0.000733977824470931, "loss": 3.1853, "step": 3900 }, { "epoch": 0.22687052133801386, "grad_norm": 0.1565076857805252, "learning_rate": 0.0007335547995379419, "loss": 3.188, "step": 3910 }, { "epoch": 0.2274507528503873, "grad_norm": 0.15521582961082458, "learning_rate": 0.0007331305464212387, "loss": 3.1849, "step": 3920 }, { "epoch": 0.22803098436276073, "grad_norm": 0.16882893443107605, "learning_rate": 0.0007327050666829656, "loss": 3.1815, "step": 3930 }, { "epoch": 0.22861121587513417, "grad_norm": 0.14680877327919006, "learning_rate": 0.0007322783618897838, "loss": 3.1869, "step": 3940 }, { "epoch": 0.2291914473875076, "grad_norm": 0.14188314974308014, "learning_rate": 0.000731850433612865, "loss": 3.1843, "step": 3950 }, { "epoch": 0.22977167889988107, "grad_norm": 0.14620228111743927, "learning_rate": 0.0007314212834278859, "loss": 3.1949, "step": 3960 }, { "epoch": 0.2303519104122545, "grad_norm": 0.17595823109149933, "learning_rate": 0.0007309909129150225, "loss": 3.186, "step": 3970 }, { "epoch": 0.23093214192462794, "grad_norm": 0.16758275032043457, "learning_rate": 0.0007305593236589438, "loss": 3.1764, "step": 3980 }, { "epoch": 0.23151237343700137, "grad_norm": 0.14021886885166168, "learning_rate": 0.0007301265172488071, "loss": 3.1828, "step": 3990 }, { "epoch": 0.2320926049493748, "grad_norm": 0.1383085399866104, "learning_rate": 0.0007296924952782507, "loss": 3.1821, "step": 4000 }, { "epoch": 0.2320926049493748, "eval_loss": 3.1375861167907715, "eval_runtime": 3.4977, "eval_samples_per_second": 1237.972, "eval_steps_per_second": 4.86, "step": 4000 }, { "epoch": 0.23267283646174824, "grad_norm": 0.14542756974697113, "learning_rate": 0.0007292572593453891, "loss": 3.1815, "step": 4010 }, { "epoch": 0.23325306797412168, "grad_norm": 0.16569748520851135, "learning_rate": 0.0007288208110528064, "loss": 3.1914, "step": 4020 }, { "epoch": 0.23383329948649512, "grad_norm": 0.16292521357536316, "learning_rate": 0.0007283831520075511, "loss": 3.1794, "step": 4030 }, { "epoch": 0.23441353099886855, "grad_norm": 0.15363453328609467, "learning_rate": 0.0007279442838211295, "loss": 3.1763, "step": 4040 }, { "epoch": 0.234993762511242, "grad_norm": 0.16711680591106415, "learning_rate": 0.0007275042081095004, "loss": 3.179, "step": 4050 }, { "epoch": 0.23557399402361542, "grad_norm": 0.14706681668758392, "learning_rate": 0.0007270629264930686, "loss": 3.1855, "step": 4060 }, { "epoch": 0.23615422553598886, "grad_norm": 0.14695675671100616, "learning_rate": 0.0007266204405966793, "loss": 3.1733, "step": 4070 }, { "epoch": 0.2367344570483623, "grad_norm": 0.1567709594964981, "learning_rate": 0.0007261767520496119, "loss": 3.1746, "step": 4080 }, { "epoch": 0.23731468856073573, "grad_norm": 0.16033339500427246, "learning_rate": 0.0007257318624855739, "loss": 3.1694, "step": 4090 }, { "epoch": 0.23789492007310917, "grad_norm": 0.15018244087696075, "learning_rate": 0.0007252857735426956, "loss": 3.1671, "step": 4100 }, { "epoch": 0.2384751515854826, "grad_norm": 0.17193986475467682, "learning_rate": 0.0007248384868635229, "loss": 3.1705, "step": 4110 }, { "epoch": 0.23905538309785604, "grad_norm": 0.15580850839614868, "learning_rate": 0.0007243900040950125, "loss": 3.1741, "step": 4120 }, { "epoch": 0.23963561461022947, "grad_norm": 0.16209259629249573, "learning_rate": 0.0007239403268885249, "loss": 3.174, "step": 4130 }, { "epoch": 0.2402158461226029, "grad_norm": 0.157977893948555, "learning_rate": 0.0007234894568998186, "loss": 3.177, "step": 4140 }, { "epoch": 0.24079607763497635, "grad_norm": 0.16139239072799683, "learning_rate": 0.000723037395789044, "loss": 3.1737, "step": 4150 }, { "epoch": 0.24137630914734978, "grad_norm": 0.16818347573280334, "learning_rate": 0.0007225841452207376, "loss": 3.1745, "step": 4160 }, { "epoch": 0.24195654065972322, "grad_norm": 0.16484715044498444, "learning_rate": 0.0007221297068638157, "loss": 3.1697, "step": 4170 }, { "epoch": 0.24253677217209665, "grad_norm": 0.15860258042812347, "learning_rate": 0.0007216740823915676, "loss": 3.1716, "step": 4180 }, { "epoch": 0.2431170036844701, "grad_norm": 0.15132510662078857, "learning_rate": 0.0007212172734816503, "loss": 3.1664, "step": 4190 }, { "epoch": 0.24369723519684355, "grad_norm": 0.1616482436656952, "learning_rate": 0.0007207592818160823, "loss": 3.1675, "step": 4200 }, { "epoch": 0.244277466709217, "grad_norm": 0.16150842607021332, "learning_rate": 0.0007203001090812368, "loss": 3.1652, "step": 4210 }, { "epoch": 0.24485769822159043, "grad_norm": 0.13779228925704956, "learning_rate": 0.0007198397569678359, "loss": 3.1731, "step": 4220 }, { "epoch": 0.24543792973396386, "grad_norm": 0.1547456681728363, "learning_rate": 0.0007193782271709444, "loss": 3.1652, "step": 4230 }, { "epoch": 0.2460181612463373, "grad_norm": 0.15583071112632751, "learning_rate": 0.0007189155213899631, "loss": 3.1561, "step": 4240 }, { "epoch": 0.24659839275871073, "grad_norm": 0.16655480861663818, "learning_rate": 0.0007184516413286234, "loss": 3.1615, "step": 4250 }, { "epoch": 0.24717862427108417, "grad_norm": 0.13721682131290436, "learning_rate": 0.0007179865886949802, "loss": 3.1692, "step": 4260 }, { "epoch": 0.2477588557834576, "grad_norm": 0.15880094468593597, "learning_rate": 0.0007175203652014061, "loss": 3.1625, "step": 4270 }, { "epoch": 0.24833908729583104, "grad_norm": 0.16666831076145172, "learning_rate": 0.0007170529725645848, "loss": 3.1677, "step": 4280 }, { "epoch": 0.24891931880820448, "grad_norm": 0.15433961153030396, "learning_rate": 0.0007165844125055049, "loss": 3.1637, "step": 4290 }, { "epoch": 0.2494995503205779, "grad_norm": 0.14771400392055511, "learning_rate": 0.0007161146867494535, "loss": 3.1649, "step": 4300 }, { "epoch": 0.2500797818329513, "grad_norm": 0.13733576238155365, "learning_rate": 0.0007156437970260104, "loss": 3.1562, "step": 4310 }, { "epoch": 0.2506600133453248, "grad_norm": 0.14560434222221375, "learning_rate": 0.0007151717450690405, "loss": 3.1578, "step": 4320 }, { "epoch": 0.25124024485769825, "grad_norm": 0.13674066960811615, "learning_rate": 0.0007146985326166889, "loss": 3.1671, "step": 4330 }, { "epoch": 0.25182047637007166, "grad_norm": 0.1438528448343277, "learning_rate": 0.0007142241614113732, "loss": 3.156, "step": 4340 }, { "epoch": 0.2524007078824451, "grad_norm": 0.14920951426029205, "learning_rate": 0.0007137486331997778, "loss": 3.1662, "step": 4350 }, { "epoch": 0.2529809393948185, "grad_norm": 0.1858551949262619, "learning_rate": 0.0007132719497328476, "loss": 3.1625, "step": 4360 }, { "epoch": 0.253561170907192, "grad_norm": 0.15135957300662994, "learning_rate": 0.0007127941127657811, "loss": 3.161, "step": 4370 }, { "epoch": 0.2541414024195654, "grad_norm": 0.14163395762443542, "learning_rate": 0.0007123151240580238, "loss": 3.164, "step": 4380 }, { "epoch": 0.25472163393193886, "grad_norm": 0.1525522619485855, "learning_rate": 0.0007118349853732624, "loss": 3.1664, "step": 4390 }, { "epoch": 0.25530186544431227, "grad_norm": 0.14989475905895233, "learning_rate": 0.000711353698479418, "loss": 3.1623, "step": 4400 }, { "epoch": 0.25588209695668573, "grad_norm": 0.13239869475364685, "learning_rate": 0.0007108712651486393, "loss": 3.1514, "step": 4410 }, { "epoch": 0.25646232846905914, "grad_norm": 0.1713549792766571, "learning_rate": 0.0007103876871572963, "loss": 3.1484, "step": 4420 }, { "epoch": 0.2570425599814326, "grad_norm": 0.1533433198928833, "learning_rate": 0.0007099029662859738, "loss": 3.1525, "step": 4430 }, { "epoch": 0.257622791493806, "grad_norm": 0.15039394795894623, "learning_rate": 0.0007094171043194648, "loss": 3.16, "step": 4440 }, { "epoch": 0.2582030230061795, "grad_norm": 0.15499238669872284, "learning_rate": 0.0007089301030467639, "loss": 3.1514, "step": 4450 }, { "epoch": 0.2587832545185529, "grad_norm": 0.16344475746154785, "learning_rate": 0.0007084419642610611, "loss": 3.147, "step": 4460 }, { "epoch": 0.25936348603092635, "grad_norm": 0.1554035246372223, "learning_rate": 0.0007079526897597343, "loss": 3.1498, "step": 4470 }, { "epoch": 0.25994371754329976, "grad_norm": 0.15636684000492096, "learning_rate": 0.0007074622813443435, "loss": 3.1508, "step": 4480 }, { "epoch": 0.2605239490556732, "grad_norm": 0.135266974568367, "learning_rate": 0.000706970740820624, "loss": 3.1444, "step": 4490 }, { "epoch": 0.26110418056804663, "grad_norm": 0.15683424472808838, "learning_rate": 0.0007064780699984793, "loss": 3.1442, "step": 4500 }, { "epoch": 0.2616844120804201, "grad_norm": 0.1561397761106491, "learning_rate": 0.000705984270691975, "loss": 3.1539, "step": 4510 }, { "epoch": 0.2622646435927935, "grad_norm": 0.13371412456035614, "learning_rate": 0.0007054893447193321, "loss": 3.1477, "step": 4520 }, { "epoch": 0.26284487510516696, "grad_norm": 0.14937594532966614, "learning_rate": 0.0007049932939029196, "loss": 3.1415, "step": 4530 }, { "epoch": 0.26342510661754037, "grad_norm": 0.18092121183872223, "learning_rate": 0.0007044961200692488, "loss": 3.1424, "step": 4540 }, { "epoch": 0.26400533812991384, "grad_norm": 0.15069086849689484, "learning_rate": 0.0007039978250489657, "loss": 3.148, "step": 4550 }, { "epoch": 0.2645855696422873, "grad_norm": 0.15044303238391876, "learning_rate": 0.0007034984106768448, "loss": 3.1478, "step": 4560 }, { "epoch": 0.2651658011546607, "grad_norm": 0.15583853423595428, "learning_rate": 0.0007029978787917821, "loss": 3.1527, "step": 4570 }, { "epoch": 0.26574603266703417, "grad_norm": 0.12967164814472198, "learning_rate": 0.0007024962312367886, "loss": 3.1543, "step": 4580 }, { "epoch": 0.2663262641794076, "grad_norm": 0.15050862729549408, "learning_rate": 0.0007019934698589827, "loss": 3.1481, "step": 4590 }, { "epoch": 0.26690649569178104, "grad_norm": 0.13848599791526794, "learning_rate": 0.0007014895965095851, "loss": 3.1446, "step": 4600 }, { "epoch": 0.26748672720415445, "grad_norm": 0.14600566029548645, "learning_rate": 0.0007009846130439098, "loss": 3.1471, "step": 4610 }, { "epoch": 0.2680669587165279, "grad_norm": 0.12978219985961914, "learning_rate": 0.000700478521321359, "loss": 3.1469, "step": 4620 }, { "epoch": 0.2686471902289013, "grad_norm": 0.17083846032619476, "learning_rate": 0.0006999713232054153, "loss": 3.1345, "step": 4630 }, { "epoch": 0.2692274217412748, "grad_norm": 0.1470998227596283, "learning_rate": 0.0006994630205636354, "loss": 3.1458, "step": 4640 }, { "epoch": 0.2698076532536482, "grad_norm": 0.15643703937530518, "learning_rate": 0.0006989536152676429, "loss": 3.145, "step": 4650 }, { "epoch": 0.27038788476602166, "grad_norm": 0.1556815654039383, "learning_rate": 0.0006984431091931215, "loss": 3.1513, "step": 4660 }, { "epoch": 0.27096811627839507, "grad_norm": 0.1476157009601593, "learning_rate": 0.000697931504219808, "loss": 3.1274, "step": 4670 }, { "epoch": 0.27154834779076853, "grad_norm": 0.1438508778810501, "learning_rate": 0.0006974188022314854, "loss": 3.1449, "step": 4680 }, { "epoch": 0.27212857930314194, "grad_norm": 0.1564301997423172, "learning_rate": 0.0006969050051159763, "loss": 3.1377, "step": 4690 }, { "epoch": 0.2727088108155154, "grad_norm": 0.15082314610481262, "learning_rate": 0.0006963901147651352, "loss": 3.1498, "step": 4700 }, { "epoch": 0.2732890423278888, "grad_norm": 0.15249784290790558, "learning_rate": 0.0006958741330748426, "loss": 3.1419, "step": 4710 }, { "epoch": 0.2738692738402623, "grad_norm": 0.15166673064231873, "learning_rate": 0.0006953570619449972, "loss": 3.137, "step": 4720 }, { "epoch": 0.2744495053526357, "grad_norm": 0.136736199259758, "learning_rate": 0.0006948389032795088, "loss": 3.1411, "step": 4730 }, { "epoch": 0.27502973686500914, "grad_norm": 0.15456107258796692, "learning_rate": 0.0006943196589862921, "loss": 3.1351, "step": 4740 }, { "epoch": 0.27560996837738255, "grad_norm": 0.14316433668136597, "learning_rate": 0.0006937993309772588, "loss": 3.1367, "step": 4750 }, { "epoch": 0.276190199889756, "grad_norm": 0.15227961540222168, "learning_rate": 0.0006932779211683113, "loss": 3.1382, "step": 4760 }, { "epoch": 0.2767704314021294, "grad_norm": 0.15172402560710907, "learning_rate": 0.0006927554314793352, "loss": 3.1221, "step": 4770 }, { "epoch": 0.2773506629145029, "grad_norm": 0.14476482570171356, "learning_rate": 0.0006922318638341919, "loss": 3.1379, "step": 4780 }, { "epoch": 0.2779308944268763, "grad_norm": 0.1386752724647522, "learning_rate": 0.0006917072201607127, "loss": 3.1497, "step": 4790 }, { "epoch": 0.27851112593924976, "grad_norm": 0.14624926447868347, "learning_rate": 0.0006911815023906904, "loss": 3.1426, "step": 4800 }, { "epoch": 0.2790913574516232, "grad_norm": 0.14945849776268005, "learning_rate": 0.0006906547124598729, "loss": 3.1297, "step": 4810 }, { "epoch": 0.27967158896399663, "grad_norm": 0.13318543136119843, "learning_rate": 0.000690126852307956, "loss": 3.1317, "step": 4820 }, { "epoch": 0.2802518204763701, "grad_norm": 0.16204489767551422, "learning_rate": 0.0006895979238785759, "loss": 3.1444, "step": 4830 }, { "epoch": 0.2808320519887435, "grad_norm": 0.13900279998779297, "learning_rate": 0.0006890679291193027, "loss": 3.1228, "step": 4840 }, { "epoch": 0.28141228350111697, "grad_norm": 0.14405326545238495, "learning_rate": 0.0006885368699816324, "loss": 3.1321, "step": 4850 }, { "epoch": 0.2819925150134904, "grad_norm": 0.1495261937379837, "learning_rate": 0.0006880047484209807, "loss": 3.1301, "step": 4860 }, { "epoch": 0.28257274652586384, "grad_norm": 0.13505682349205017, "learning_rate": 0.0006874715663966746, "loss": 3.1234, "step": 4870 }, { "epoch": 0.28315297803823725, "grad_norm": 0.1577468067407608, "learning_rate": 0.0006869373258719464, "loss": 3.1294, "step": 4880 }, { "epoch": 0.2837332095506107, "grad_norm": 0.1523805558681488, "learning_rate": 0.0006864020288139255, "loss": 3.1369, "step": 4890 }, { "epoch": 0.2843134410629841, "grad_norm": 0.14397644996643066, "learning_rate": 0.0006858656771936318, "loss": 3.1174, "step": 4900 }, { "epoch": 0.2848936725753576, "grad_norm": 0.14309902489185333, "learning_rate": 0.0006853282729859683, "loss": 3.1258, "step": 4910 }, { "epoch": 0.285473904087731, "grad_norm": 0.16630397737026215, "learning_rate": 0.0006847898181697133, "loss": 3.1259, "step": 4920 }, { "epoch": 0.28605413560010445, "grad_norm": 0.14610059559345245, "learning_rate": 0.0006842503147275141, "loss": 3.1289, "step": 4930 }, { "epoch": 0.28663436711247786, "grad_norm": 0.14541591703891754, "learning_rate": 0.0006837097646458787, "loss": 3.1274, "step": 4940 }, { "epoch": 0.2872145986248513, "grad_norm": 0.15029065310955048, "learning_rate": 0.0006831681699151693, "loss": 3.1232, "step": 4950 }, { "epoch": 0.28779483013722473, "grad_norm": 0.14973795413970947, "learning_rate": 0.0006826255325295943, "loss": 3.1203, "step": 4960 }, { "epoch": 0.2883750616495982, "grad_norm": 0.14177200198173523, "learning_rate": 0.0006820818544872016, "loss": 3.1376, "step": 4970 }, { "epoch": 0.2889552931619716, "grad_norm": 0.16396763920783997, "learning_rate": 0.0006815371377898704, "loss": 3.1281, "step": 4980 }, { "epoch": 0.28953552467434507, "grad_norm": 0.14975488185882568, "learning_rate": 0.0006809913844433048, "loss": 3.1312, "step": 4990 }, { "epoch": 0.2901157561867185, "grad_norm": 0.13437089323997498, "learning_rate": 0.000680444596457026, "loss": 3.1191, "step": 5000 }, { "epoch": 0.2901157561867185, "eval_loss": 3.077634334564209, "eval_runtime": 3.5007, "eval_samples_per_second": 1236.88, "eval_steps_per_second": 4.856, "step": 5000 }, { "epoch": 0.29069598769909194, "grad_norm": 0.13099373877048492, "learning_rate": 0.0006798967758443643, "loss": 3.1228, "step": 5010 }, { "epoch": 0.29127621921146535, "grad_norm": 0.15170305967330933, "learning_rate": 0.0006793479246224527, "loss": 3.1148, "step": 5020 }, { "epoch": 0.2918564507238388, "grad_norm": 0.1462208330631256, "learning_rate": 0.0006787980448122189, "loss": 3.12, "step": 5030 }, { "epoch": 0.2924366822362123, "grad_norm": 0.1365407109260559, "learning_rate": 0.0006782471384383782, "loss": 3.1229, "step": 5040 }, { "epoch": 0.2930169137485857, "grad_norm": 0.1582639366388321, "learning_rate": 0.000677695207529425, "loss": 3.1181, "step": 5050 }, { "epoch": 0.29359714526095915, "grad_norm": 0.1355699598789215, "learning_rate": 0.0006771422541176274, "loss": 3.1206, "step": 5060 }, { "epoch": 0.29417737677333256, "grad_norm": 0.13802824914455414, "learning_rate": 0.0006765882802390174, "loss": 3.1179, "step": 5070 }, { "epoch": 0.294757608285706, "grad_norm": 0.16647005081176758, "learning_rate": 0.0006760332879333851, "loss": 3.1255, "step": 5080 }, { "epoch": 0.2953378397980794, "grad_norm": 0.1353151798248291, "learning_rate": 0.0006754772792442699, "loss": 3.1162, "step": 5090 }, { "epoch": 0.2959180713104529, "grad_norm": 0.16625338792800903, "learning_rate": 0.0006749202562189542, "loss": 3.12, "step": 5100 }, { "epoch": 0.2964983028228263, "grad_norm": 0.13335494697093964, "learning_rate": 0.0006743622209084554, "loss": 3.1171, "step": 5110 }, { "epoch": 0.29707853433519976, "grad_norm": 0.16014111042022705, "learning_rate": 0.0006738031753675179, "loss": 3.1103, "step": 5120 }, { "epoch": 0.29765876584757317, "grad_norm": 0.1412811279296875, "learning_rate": 0.0006732431216546056, "loss": 3.1165, "step": 5130 }, { "epoch": 0.29823899735994663, "grad_norm": 0.18237030506134033, "learning_rate": 0.0006726820618318954, "loss": 3.1241, "step": 5140 }, { "epoch": 0.29881922887232004, "grad_norm": 0.12872178852558136, "learning_rate": 0.0006721199979652679, "loss": 3.1095, "step": 5150 }, { "epoch": 0.2993994603846935, "grad_norm": 0.13701236248016357, "learning_rate": 0.0006715569321243014, "loss": 3.1144, "step": 5160 }, { "epoch": 0.2999796918970669, "grad_norm": 0.14516842365264893, "learning_rate": 0.0006709928663822633, "loss": 3.125, "step": 5170 }, { "epoch": 0.3005599234094404, "grad_norm": 0.18508046865463257, "learning_rate": 0.0006704278028161027, "loss": 3.1114, "step": 5180 }, { "epoch": 0.3011401549218138, "grad_norm": 0.14788569509983063, "learning_rate": 0.0006698617435064429, "loss": 3.1151, "step": 5190 }, { "epoch": 0.30172038643418725, "grad_norm": 0.13983985781669617, "learning_rate": 0.0006692946905375735, "loss": 3.1093, "step": 5200 }, { "epoch": 0.30230061794656066, "grad_norm": 0.1404978185892105, "learning_rate": 0.0006687266459974431, "loss": 3.1123, "step": 5210 }, { "epoch": 0.3028808494589341, "grad_norm": 0.14832602441310883, "learning_rate": 0.0006681576119776509, "loss": 3.1142, "step": 5220 }, { "epoch": 0.30346108097130753, "grad_norm": 0.1313869059085846, "learning_rate": 0.00066758759057344, "loss": 3.1164, "step": 5230 }, { "epoch": 0.304041312483681, "grad_norm": 0.15153004229068756, "learning_rate": 0.0006670165838836889, "loss": 3.1119, "step": 5240 }, { "epoch": 0.3046215439960544, "grad_norm": 0.15079191327095032, "learning_rate": 0.0006664445940109039, "loss": 3.1105, "step": 5250 }, { "epoch": 0.30520177550842786, "grad_norm": 0.15339751541614532, "learning_rate": 0.000665871623061212, "loss": 3.1097, "step": 5260 }, { "epoch": 0.3057820070208013, "grad_norm": 0.14419248700141907, "learning_rate": 0.0006652976731443518, "loss": 3.1168, "step": 5270 }, { "epoch": 0.30636223853317474, "grad_norm": 0.139011949300766, "learning_rate": 0.0006647227463736673, "loss": 3.1016, "step": 5280 }, { "epoch": 0.3069424700455482, "grad_norm": 0.13946521282196045, "learning_rate": 0.000664146844866099, "loss": 3.1082, "step": 5290 }, { "epoch": 0.3075227015579216, "grad_norm": 0.15732550621032715, "learning_rate": 0.0006635699707421767, "loss": 3.1149, "step": 5300 }, { "epoch": 0.30810293307029507, "grad_norm": 0.1405564546585083, "learning_rate": 0.0006629921261260112, "loss": 3.1115, "step": 5310 }, { "epoch": 0.3086831645826685, "grad_norm": 0.1414571851491928, "learning_rate": 0.0006624133131452869, "loss": 3.1137, "step": 5320 }, { "epoch": 0.30926339609504194, "grad_norm": 0.16276155412197113, "learning_rate": 0.0006618335339312539, "loss": 3.1009, "step": 5330 }, { "epoch": 0.30984362760741535, "grad_norm": 0.13396096229553223, "learning_rate": 0.0006612527906187202, "loss": 3.0971, "step": 5340 }, { "epoch": 0.3104238591197888, "grad_norm": 0.1540164053440094, "learning_rate": 0.0006606710853460431, "loss": 3.0958, "step": 5350 }, { "epoch": 0.3110040906321622, "grad_norm": 0.15637041628360748, "learning_rate": 0.0006600884202551228, "loss": 3.1118, "step": 5360 }, { "epoch": 0.3115843221445357, "grad_norm": 0.1541391909122467, "learning_rate": 0.0006595047974913929, "loss": 3.1167, "step": 5370 }, { "epoch": 0.3121645536569091, "grad_norm": 0.13356366753578186, "learning_rate": 0.0006589202192038135, "loss": 3.1122, "step": 5380 }, { "epoch": 0.31274478516928256, "grad_norm": 0.15475161373615265, "learning_rate": 0.0006583346875448634, "loss": 3.0989, "step": 5390 }, { "epoch": 0.31332501668165597, "grad_norm": 0.1348615437746048, "learning_rate": 0.0006577482046705312, "loss": 3.109, "step": 5400 }, { "epoch": 0.31390524819402943, "grad_norm": 0.14969608187675476, "learning_rate": 0.0006571607727403084, "loss": 3.1052, "step": 5410 }, { "epoch": 0.31448547970640284, "grad_norm": 0.13109458982944489, "learning_rate": 0.0006565723939171809, "loss": 3.1036, "step": 5420 }, { "epoch": 0.3150657112187763, "grad_norm": 0.13276948034763336, "learning_rate": 0.0006559830703676213, "loss": 3.1126, "step": 5430 }, { "epoch": 0.3156459427311497, "grad_norm": 0.13578692078590393, "learning_rate": 0.0006553928042615805, "loss": 3.1039, "step": 5440 }, { "epoch": 0.3162261742435232, "grad_norm": 0.15386302769184113, "learning_rate": 0.0006548015977724803, "loss": 3.1037, "step": 5450 }, { "epoch": 0.3168064057558966, "grad_norm": 0.13567377626895905, "learning_rate": 0.0006542094530772044, "loss": 3.1032, "step": 5460 }, { "epoch": 0.31738663726827004, "grad_norm": 0.14540360867977142, "learning_rate": 0.0006536163723560923, "loss": 3.1036, "step": 5470 }, { "epoch": 0.31796686878064345, "grad_norm": 0.14375334978103638, "learning_rate": 0.000653022357792929, "loss": 3.1069, "step": 5480 }, { "epoch": 0.3185471002930169, "grad_norm": 0.14300775527954102, "learning_rate": 0.0006524274115749383, "loss": 3.1, "step": 5490 }, { "epoch": 0.3191273318053903, "grad_norm": 0.1372559368610382, "learning_rate": 0.0006518315358927747, "loss": 3.1002, "step": 5500 }, { "epoch": 0.3197075633177638, "grad_norm": 0.1467244029045105, "learning_rate": 0.0006512347329405146, "loss": 3.0978, "step": 5510 }, { "epoch": 0.32028779483013725, "grad_norm": 0.15439152717590332, "learning_rate": 0.0006506370049156493, "loss": 3.0921, "step": 5520 }, { "epoch": 0.32086802634251066, "grad_norm": 0.15855064988136292, "learning_rate": 0.0006500383540190758, "loss": 3.1034, "step": 5530 }, { "epoch": 0.3214482578548841, "grad_norm": 0.14545102417469025, "learning_rate": 0.0006494387824550896, "loss": 3.0981, "step": 5540 }, { "epoch": 0.32202848936725753, "grad_norm": 0.13340061902999878, "learning_rate": 0.0006488382924313757, "loss": 3.1042, "step": 5550 }, { "epoch": 0.322608720879631, "grad_norm": 0.1602497696876526, "learning_rate": 0.0006482368861590015, "loss": 3.1009, "step": 5560 }, { "epoch": 0.3231889523920044, "grad_norm": 0.13411889970302582, "learning_rate": 0.0006476345658524078, "loss": 3.0985, "step": 5570 }, { "epoch": 0.32376918390437787, "grad_norm": 0.15455657243728638, "learning_rate": 0.0006470313337294013, "loss": 3.0858, "step": 5580 }, { "epoch": 0.3243494154167513, "grad_norm": 0.1275639533996582, "learning_rate": 0.0006464271920111455, "loss": 3.0849, "step": 5590 }, { "epoch": 0.32492964692912474, "grad_norm": 0.1391342580318451, "learning_rate": 0.0006458221429221537, "loss": 3.0946, "step": 5600 }, { "epoch": 0.32550987844149815, "grad_norm": 0.14146512746810913, "learning_rate": 0.0006452161886902801, "loss": 3.0943, "step": 5610 }, { "epoch": 0.3260901099538716, "grad_norm": 0.13598129153251648, "learning_rate": 0.0006446093315467115, "loss": 3.0995, "step": 5620 }, { "epoch": 0.326670341466245, "grad_norm": 0.14798174798488617, "learning_rate": 0.0006440015737259597, "loss": 3.089, "step": 5630 }, { "epoch": 0.3272505729786185, "grad_norm": 0.15382760763168335, "learning_rate": 0.0006433929174658523, "loss": 3.093, "step": 5640 }, { "epoch": 0.3278308044909919, "grad_norm": 0.14495523273944855, "learning_rate": 0.0006427833650075258, "loss": 3.094, "step": 5650 }, { "epoch": 0.32841103600336535, "grad_norm": 0.13422837853431702, "learning_rate": 0.000642172918595416, "loss": 3.0918, "step": 5660 }, { "epoch": 0.32899126751573876, "grad_norm": 0.15112635493278503, "learning_rate": 0.0006415615804772504, "loss": 3.0823, "step": 5670 }, { "epoch": 0.3295714990281122, "grad_norm": 0.14222180843353271, "learning_rate": 0.0006409493529040403, "loss": 3.09, "step": 5680 }, { "epoch": 0.33015173054048563, "grad_norm": 0.16140936315059662, "learning_rate": 0.0006403362381300716, "loss": 3.1116, "step": 5690 }, { "epoch": 0.3307319620528591, "grad_norm": 0.15797702968120575, "learning_rate": 0.0006397222384128971, "loss": 3.0975, "step": 5700 }, { "epoch": 0.3313121935652325, "grad_norm": 0.13766621053218842, "learning_rate": 0.0006391073560133282, "loss": 3.0927, "step": 5710 }, { "epoch": 0.33189242507760597, "grad_norm": 0.14964471757411957, "learning_rate": 0.0006384915931954262, "loss": 3.0797, "step": 5720 }, { "epoch": 0.3324726565899794, "grad_norm": 0.15482941269874573, "learning_rate": 0.0006378749522264943, "loss": 3.0834, "step": 5730 }, { "epoch": 0.33305288810235284, "grad_norm": 0.16019770503044128, "learning_rate": 0.0006372574353770693, "loss": 3.091, "step": 5740 }, { "epoch": 0.3336331196147263, "grad_norm": 0.1344575136899948, "learning_rate": 0.0006366390449209126, "loss": 3.0996, "step": 5750 }, { "epoch": 0.3342133511270997, "grad_norm": 0.14848081767559052, "learning_rate": 0.0006360197831350031, "loss": 3.0895, "step": 5760 }, { "epoch": 0.3347935826394732, "grad_norm": 0.14543505012989044, "learning_rate": 0.0006353996522995271, "loss": 3.0938, "step": 5770 }, { "epoch": 0.3353738141518466, "grad_norm": 0.1381482183933258, "learning_rate": 0.0006347786546978716, "loss": 3.0901, "step": 5780 }, { "epoch": 0.33595404566422005, "grad_norm": 0.15540656447410583, "learning_rate": 0.0006341567926166147, "loss": 3.0952, "step": 5790 }, { "epoch": 0.33653427717659345, "grad_norm": 0.14097265899181366, "learning_rate": 0.0006335340683455178, "loss": 3.0846, "step": 5800 }, { "epoch": 0.3371145086889669, "grad_norm": 0.14413347840309143, "learning_rate": 0.0006329104841775169, "loss": 3.0815, "step": 5810 }, { "epoch": 0.3376947402013403, "grad_norm": 0.16168859601020813, "learning_rate": 0.0006322860424087142, "loss": 3.0808, "step": 5820 }, { "epoch": 0.3382749717137138, "grad_norm": 0.1339394748210907, "learning_rate": 0.0006316607453383696, "loss": 3.0863, "step": 5830 }, { "epoch": 0.3388552032260872, "grad_norm": 0.12748858332633972, "learning_rate": 0.0006310345952688926, "loss": 3.0794, "step": 5840 }, { "epoch": 0.33943543473846066, "grad_norm": 0.1329609751701355, "learning_rate": 0.0006304075945058334, "loss": 3.0916, "step": 5850 }, { "epoch": 0.34001566625083407, "grad_norm": 0.13105526566505432, "learning_rate": 0.0006297797453578742, "loss": 3.0853, "step": 5860 }, { "epoch": 0.34059589776320753, "grad_norm": 0.1517835110425949, "learning_rate": 0.0006291510501368214, "loss": 3.091, "step": 5870 }, { "epoch": 0.34117612927558094, "grad_norm": 0.1394689381122589, "learning_rate": 0.0006285215111575969, "loss": 3.0884, "step": 5880 }, { "epoch": 0.3417563607879544, "grad_norm": 0.13380849361419678, "learning_rate": 0.0006278911307382292, "loss": 3.0801, "step": 5890 }, { "epoch": 0.3423365923003278, "grad_norm": 0.14783403277397156, "learning_rate": 0.0006272599111998445, "loss": 3.0799, "step": 5900 }, { "epoch": 0.3429168238127013, "grad_norm": 0.14787407219409943, "learning_rate": 0.00062662785486666, "loss": 3.0828, "step": 5910 }, { "epoch": 0.3434970553250747, "grad_norm": 0.15095750987529755, "learning_rate": 0.0006259949640659729, "loss": 3.0773, "step": 5920 }, { "epoch": 0.34407728683744815, "grad_norm": 0.15267327427864075, "learning_rate": 0.0006253612411281534, "loss": 3.064, "step": 5930 }, { "epoch": 0.34465751834982156, "grad_norm": 0.1540476381778717, "learning_rate": 0.0006247266883866358, "loss": 3.0782, "step": 5940 }, { "epoch": 0.345237749862195, "grad_norm": 0.1443357765674591, "learning_rate": 0.0006240913081779099, "loss": 3.0801, "step": 5950 }, { "epoch": 0.34581798137456843, "grad_norm": 0.13168315589427948, "learning_rate": 0.0006234551028415121, "loss": 3.075, "step": 5960 }, { "epoch": 0.3463982128869419, "grad_norm": 0.15474265813827515, "learning_rate": 0.0006228180747200171, "loss": 3.0867, "step": 5970 }, { "epoch": 0.3469784443993153, "grad_norm": 0.14644061028957367, "learning_rate": 0.0006221802261590292, "loss": 3.0774, "step": 5980 }, { "epoch": 0.34755867591168876, "grad_norm": 0.13259078562259674, "learning_rate": 0.0006215415595071735, "loss": 3.0701, "step": 5990 }, { "epoch": 0.3481389074240622, "grad_norm": 0.13343210518360138, "learning_rate": 0.0006209020771160876, "loss": 3.0841, "step": 6000 }, { "epoch": 0.3481389074240622, "eval_loss": 3.0316665172576904, "eval_runtime": 3.5085, "eval_samples_per_second": 1234.147, "eval_steps_per_second": 4.845, "step": 6000 }, { "epoch": 0.34871913893643564, "grad_norm": 0.15520484745502472, "learning_rate": 0.0006202617813404128, "loss": 3.0768, "step": 6010 }, { "epoch": 0.3492993704488091, "grad_norm": 0.1423664689064026, "learning_rate": 0.0006196206745377851, "loss": 3.0814, "step": 6020 }, { "epoch": 0.3498796019611825, "grad_norm": 0.14231909811496735, "learning_rate": 0.0006189787590688267, "loss": 3.0766, "step": 6030 }, { "epoch": 0.35045983347355597, "grad_norm": 0.1438128650188446, "learning_rate": 0.000618336037297138, "loss": 3.0742, "step": 6040 }, { "epoch": 0.3510400649859294, "grad_norm": 0.13780520856380463, "learning_rate": 0.0006176925115892876, "loss": 3.0782, "step": 6050 }, { "epoch": 0.35162029649830284, "grad_norm": 0.1355493813753128, "learning_rate": 0.0006170481843148047, "loss": 3.082, "step": 6060 }, { "epoch": 0.35220052801067625, "grad_norm": 0.13003236055374146, "learning_rate": 0.0006164030578461697, "loss": 3.0693, "step": 6070 }, { "epoch": 0.3527807595230497, "grad_norm": 0.14208032190799713, "learning_rate": 0.000615757134558806, "loss": 3.0856, "step": 6080 }, { "epoch": 0.3533609910354231, "grad_norm": 0.13306844234466553, "learning_rate": 0.0006151104168310708, "loss": 3.0803, "step": 6090 }, { "epoch": 0.3539412225477966, "grad_norm": 0.1512952595949173, "learning_rate": 0.0006144629070442463, "loss": 3.0705, "step": 6100 }, { "epoch": 0.35452145406017, "grad_norm": 0.15171298384666443, "learning_rate": 0.0006138146075825316, "loss": 3.0716, "step": 6110 }, { "epoch": 0.35510168557254346, "grad_norm": 0.1334518939256668, "learning_rate": 0.0006131655208330331, "loss": 3.0726, "step": 6120 }, { "epoch": 0.35568191708491687, "grad_norm": 0.12969031929969788, "learning_rate": 0.0006125156491857564, "loss": 3.0766, "step": 6130 }, { "epoch": 0.35626214859729033, "grad_norm": 0.13735461235046387, "learning_rate": 0.0006118649950335968, "loss": 3.0758, "step": 6140 }, { "epoch": 0.35684238010966374, "grad_norm": 0.14541500806808472, "learning_rate": 0.0006112135607723309, "loss": 3.0719, "step": 6150 }, { "epoch": 0.3574226116220372, "grad_norm": 0.13016124069690704, "learning_rate": 0.0006105613488006083, "loss": 3.0733, "step": 6160 }, { "epoch": 0.3580028431344106, "grad_norm": 0.12773442268371582, "learning_rate": 0.0006099083615199415, "loss": 3.0764, "step": 6170 }, { "epoch": 0.3585830746467841, "grad_norm": 0.1433558166027069, "learning_rate": 0.0006092546013346982, "loss": 3.0745, "step": 6180 }, { "epoch": 0.3591633061591575, "grad_norm": 0.14369390904903412, "learning_rate": 0.0006086000706520919, "loss": 3.0747, "step": 6190 }, { "epoch": 0.35974353767153094, "grad_norm": 0.12322000414133072, "learning_rate": 0.000607944771882173, "loss": 3.0624, "step": 6200 }, { "epoch": 0.36032376918390435, "grad_norm": 0.13215206563472748, "learning_rate": 0.0006072887074378203, "loss": 3.0658, "step": 6210 }, { "epoch": 0.3609040006962778, "grad_norm": 0.1413458287715912, "learning_rate": 0.000606631879734732, "loss": 3.0705, "step": 6220 }, { "epoch": 0.3614842322086513, "grad_norm": 0.1452745646238327, "learning_rate": 0.0006059742911914161, "loss": 3.0651, "step": 6230 }, { "epoch": 0.3620644637210247, "grad_norm": 0.15031304955482483, "learning_rate": 0.0006053159442291827, "loss": 3.0776, "step": 6240 }, { "epoch": 0.36264469523339815, "grad_norm": 0.13241888582706451, "learning_rate": 0.0006046568412721346, "loss": 3.0682, "step": 6250 }, { "epoch": 0.36322492674577156, "grad_norm": 0.12760193645954132, "learning_rate": 0.0006039969847471574, "loss": 3.0698, "step": 6260 }, { "epoch": 0.363805158258145, "grad_norm": 0.1420195996761322, "learning_rate": 0.0006033363770839124, "loss": 3.0675, "step": 6270 }, { "epoch": 0.36438538977051843, "grad_norm": 0.14577995240688324, "learning_rate": 0.0006026750207148259, "loss": 3.0725, "step": 6280 }, { "epoch": 0.3649656212828919, "grad_norm": 0.1315937042236328, "learning_rate": 0.0006020129180750814, "loss": 3.073, "step": 6290 }, { "epoch": 0.3655458527952653, "grad_norm": 0.14048981666564941, "learning_rate": 0.0006013500716026103, "loss": 3.0685, "step": 6300 }, { "epoch": 0.36612608430763877, "grad_norm": 0.14861349761486053, "learning_rate": 0.0006006864837380827, "loss": 3.0682, "step": 6310 }, { "epoch": 0.3667063158200122, "grad_norm": 0.13127431273460388, "learning_rate": 0.0006000221569248986, "loss": 3.0618, "step": 6320 }, { "epoch": 0.36728654733238564, "grad_norm": 0.13969634473323822, "learning_rate": 0.0005993570936091789, "loss": 3.0702, "step": 6330 }, { "epoch": 0.36786677884475905, "grad_norm": 0.1190052479505539, "learning_rate": 0.0005986912962397566, "loss": 3.0738, "step": 6340 }, { "epoch": 0.3684470103571325, "grad_norm": 0.13660824298858643, "learning_rate": 0.0005980247672681671, "loss": 3.0762, "step": 6350 }, { "epoch": 0.3690272418695059, "grad_norm": 0.15142664313316345, "learning_rate": 0.0005973575091486399, "loss": 3.0642, "step": 6360 }, { "epoch": 0.3696074733818794, "grad_norm": 0.13183598220348358, "learning_rate": 0.0005966895243380894, "loss": 3.0694, "step": 6370 }, { "epoch": 0.3701877048942528, "grad_norm": 0.1251576989889145, "learning_rate": 0.0005960208152961056, "loss": 3.0638, "step": 6380 }, { "epoch": 0.37076793640662625, "grad_norm": 0.14814746379852295, "learning_rate": 0.0005953513844849453, "loss": 3.0626, "step": 6390 }, { "epoch": 0.37134816791899966, "grad_norm": 0.13559052348136902, "learning_rate": 0.000594681234369523, "loss": 3.0723, "step": 6400 }, { "epoch": 0.3719283994313731, "grad_norm": 0.143977552652359, "learning_rate": 0.0005940103674174014, "loss": 3.0636, "step": 6410 }, { "epoch": 0.37250863094374653, "grad_norm": 0.14618372917175293, "learning_rate": 0.0005933387860987829, "loss": 3.0629, "step": 6420 }, { "epoch": 0.37308886245612, "grad_norm": 0.1428544670343399, "learning_rate": 0.0005926664928865004, "loss": 3.0644, "step": 6430 }, { "epoch": 0.3736690939684934, "grad_norm": 0.14707732200622559, "learning_rate": 0.000591993490256008, "loss": 3.0601, "step": 6440 }, { "epoch": 0.37424932548086687, "grad_norm": 0.13262979686260223, "learning_rate": 0.0005913197806853717, "loss": 3.0636, "step": 6450 }, { "epoch": 0.3748295569932403, "grad_norm": 0.1417519599199295, "learning_rate": 0.0005906453666552608, "loss": 3.0621, "step": 6460 }, { "epoch": 0.37540978850561374, "grad_norm": 0.13249574601650238, "learning_rate": 0.0005899702506489382, "loss": 3.0578, "step": 6470 }, { "epoch": 0.3759900200179872, "grad_norm": 0.1464381068944931, "learning_rate": 0.0005892944351522522, "loss": 3.0642, "step": 6480 }, { "epoch": 0.3765702515303606, "grad_norm": 0.14942754805088043, "learning_rate": 0.0005886179226536259, "loss": 3.0577, "step": 6490 }, { "epoch": 0.3771504830427341, "grad_norm": 0.13688527047634125, "learning_rate": 0.0005879407156440492, "loss": 3.0588, "step": 6500 }, { "epoch": 0.3777307145551075, "grad_norm": 0.13875730335712433, "learning_rate": 0.0005872628166170696, "loss": 3.0534, "step": 6510 }, { "epoch": 0.37831094606748095, "grad_norm": 0.1475764513015747, "learning_rate": 0.0005865842280687818, "loss": 3.0606, "step": 6520 }, { "epoch": 0.37889117757985435, "grad_norm": 0.14338374137878418, "learning_rate": 0.0005859049524978204, "loss": 3.0613, "step": 6530 }, { "epoch": 0.3794714090922278, "grad_norm": 0.12258878350257874, "learning_rate": 0.0005852249924053488, "loss": 3.0575, "step": 6540 }, { "epoch": 0.3800516406046012, "grad_norm": 0.1355215460062027, "learning_rate": 0.0005845443502950517, "loss": 3.0618, "step": 6550 }, { "epoch": 0.3806318721169747, "grad_norm": 0.14356692135334015, "learning_rate": 0.0005838630286731245, "loss": 3.0579, "step": 6560 }, { "epoch": 0.3812121036293481, "grad_norm": 0.141872838139534, "learning_rate": 0.0005831810300482647, "loss": 3.0633, "step": 6570 }, { "epoch": 0.38179233514172156, "grad_norm": 0.12874390184879303, "learning_rate": 0.0005824983569316626, "loss": 3.0684, "step": 6580 }, { "epoch": 0.38237256665409497, "grad_norm": 0.15871542692184448, "learning_rate": 0.0005818150118369923, "loss": 3.0536, "step": 6590 }, { "epoch": 0.38295279816646843, "grad_norm": 0.14536400139331818, "learning_rate": 0.000581130997280402, "loss": 3.0562, "step": 6600 }, { "epoch": 0.38353302967884184, "grad_norm": 0.13203443586826324, "learning_rate": 0.0005804463157805049, "loss": 3.0481, "step": 6610 }, { "epoch": 0.3841132611912153, "grad_norm": 0.14830882847309113, "learning_rate": 0.0005797609698583702, "loss": 3.0557, "step": 6620 }, { "epoch": 0.3846934927035887, "grad_norm": 0.14107373356819153, "learning_rate": 0.000579074962037513, "loss": 3.0542, "step": 6630 }, { "epoch": 0.3852737242159622, "grad_norm": 0.1280239075422287, "learning_rate": 0.0005783882948438863, "loss": 3.0627, "step": 6640 }, { "epoch": 0.3858539557283356, "grad_norm": 0.12451250851154327, "learning_rate": 0.0005777009708058705, "loss": 3.0455, "step": 6650 }, { "epoch": 0.38643418724070905, "grad_norm": 0.1431894153356552, "learning_rate": 0.0005770129924542647, "loss": 3.0523, "step": 6660 }, { "epoch": 0.38701441875308246, "grad_norm": 0.12256111204624176, "learning_rate": 0.0005763243623222773, "loss": 3.0627, "step": 6670 }, { "epoch": 0.3875946502654559, "grad_norm": 0.1340099722146988, "learning_rate": 0.0005756350829455166, "loss": 3.0466, "step": 6680 }, { "epoch": 0.38817488177782933, "grad_norm": 0.13833726942539215, "learning_rate": 0.0005749451568619815, "loss": 3.0489, "step": 6690 }, { "epoch": 0.3887551132902028, "grad_norm": 0.14747487008571625, "learning_rate": 0.000574254586612052, "loss": 3.0484, "step": 6700 }, { "epoch": 0.38933534480257626, "grad_norm": 0.1573353111743927, "learning_rate": 0.0005735633747384802, "loss": 3.0591, "step": 6710 }, { "epoch": 0.38991557631494966, "grad_norm": 0.14901325106620789, "learning_rate": 0.0005728715237863805, "loss": 3.0551, "step": 6720 }, { "epoch": 0.3904958078273231, "grad_norm": 0.12897038459777832, "learning_rate": 0.0005721790363032207, "loss": 3.0493, "step": 6730 }, { "epoch": 0.39107603933969654, "grad_norm": 0.12195435911417007, "learning_rate": 0.0005714859148388122, "loss": 3.055, "step": 6740 }, { "epoch": 0.39165627085207, "grad_norm": 0.12533505260944366, "learning_rate": 0.0005707921619453011, "loss": 3.0513, "step": 6750 }, { "epoch": 0.3922365023644434, "grad_norm": 0.13250428438186646, "learning_rate": 0.0005700977801771578, "loss": 3.0463, "step": 6760 }, { "epoch": 0.39281673387681687, "grad_norm": 0.13766299188137054, "learning_rate": 0.000569402772091169, "loss": 3.0481, "step": 6770 }, { "epoch": 0.3933969653891903, "grad_norm": 0.15243090689182281, "learning_rate": 0.0005687071402464271, "loss": 3.0437, "step": 6780 }, { "epoch": 0.39397719690156374, "grad_norm": 0.14014096558094025, "learning_rate": 0.0005680108872043215, "loss": 3.0532, "step": 6790 }, { "epoch": 0.39455742841393715, "grad_norm": 0.1550852209329605, "learning_rate": 0.0005673140155285288, "loss": 3.0509, "step": 6800 }, { "epoch": 0.3951376599263106, "grad_norm": 0.1402769535779953, "learning_rate": 0.0005666165277850039, "loss": 3.0496, "step": 6810 }, { "epoch": 0.395717891438684, "grad_norm": 0.13453513383865356, "learning_rate": 0.0005659184265419692, "loss": 3.0495, "step": 6820 }, { "epoch": 0.3962981229510575, "grad_norm": 0.12632259726524353, "learning_rate": 0.0005652197143699068, "loss": 3.051, "step": 6830 }, { "epoch": 0.3968783544634309, "grad_norm": 0.13051046431064606, "learning_rate": 0.0005645203938415481, "loss": 3.0426, "step": 6840 }, { "epoch": 0.39745858597580436, "grad_norm": 0.14430472254753113, "learning_rate": 0.0005638204675318646, "loss": 3.0371, "step": 6850 }, { "epoch": 0.39803881748817777, "grad_norm": 0.12584824860095978, "learning_rate": 0.0005631199380180582, "loss": 3.0467, "step": 6860 }, { "epoch": 0.39861904900055123, "grad_norm": 0.12726908922195435, "learning_rate": 0.0005624188078795523, "loss": 3.0556, "step": 6870 }, { "epoch": 0.39919928051292464, "grad_norm": 0.12641461193561554, "learning_rate": 0.0005617170796979811, "loss": 3.0435, "step": 6880 }, { "epoch": 0.3997795120252981, "grad_norm": 0.1385938823223114, "learning_rate": 0.0005610147560571813, "loss": 3.0481, "step": 6890 }, { "epoch": 0.4003597435376715, "grad_norm": 0.12625518441200256, "learning_rate": 0.0005603118395431822, "loss": 3.0442, "step": 6900 }, { "epoch": 0.40093997505004497, "grad_norm": 0.14146681129932404, "learning_rate": 0.0005596083327441962, "loss": 3.0447, "step": 6910 }, { "epoch": 0.4015202065624184, "grad_norm": 0.1408119946718216, "learning_rate": 0.000558904238250609, "loss": 3.0409, "step": 6920 }, { "epoch": 0.40210043807479184, "grad_norm": 0.13616541028022766, "learning_rate": 0.00055819955865497, "loss": 3.0411, "step": 6930 }, { "epoch": 0.40268066958716525, "grad_norm": 0.1373262107372284, "learning_rate": 0.0005574942965519836, "loss": 3.043, "step": 6940 }, { "epoch": 0.4032609010995387, "grad_norm": 0.13503822684288025, "learning_rate": 0.0005567884545384986, "loss": 3.0492, "step": 6950 }, { "epoch": 0.4038411326119122, "grad_norm": 0.14063769578933716, "learning_rate": 0.0005560820352134992, "loss": 3.0395, "step": 6960 }, { "epoch": 0.4044213641242856, "grad_norm": 0.11978533864021301, "learning_rate": 0.0005553750411780955, "loss": 3.0457, "step": 6970 }, { "epoch": 0.40500159563665905, "grad_norm": 0.14779379963874817, "learning_rate": 0.0005546674750355136, "loss": 3.0421, "step": 6980 }, { "epoch": 0.40558182714903246, "grad_norm": 0.12330121546983719, "learning_rate": 0.0005539593393910859, "loss": 3.0439, "step": 6990 }, { "epoch": 0.4061620586614059, "grad_norm": 0.1356820911169052, "learning_rate": 0.0005532506368522423, "loss": 3.0417, "step": 7000 }, { "epoch": 0.4061620586614059, "eval_loss": 2.9948248863220215, "eval_runtime": 3.4925, "eval_samples_per_second": 1239.807, "eval_steps_per_second": 4.868, "step": 7000 }, { "epoch": 0.40674229017377933, "grad_norm": 0.12303619831800461, "learning_rate": 0.0005525413700284996, "loss": 3.0434, "step": 7010 }, { "epoch": 0.4073225216861528, "grad_norm": 0.1458366960287094, "learning_rate": 0.0005518315415314528, "loss": 3.0432, "step": 7020 }, { "epoch": 0.4079027531985262, "grad_norm": 0.12410583347082138, "learning_rate": 0.0005511211539747646, "loss": 3.0544, "step": 7030 }, { "epoch": 0.40848298471089967, "grad_norm": 0.13156363368034363, "learning_rate": 0.0005504102099741564, "loss": 3.0389, "step": 7040 }, { "epoch": 0.4090632162232731, "grad_norm": 0.12394770979881287, "learning_rate": 0.0005496987121473985, "loss": 3.0346, "step": 7050 }, { "epoch": 0.40964344773564654, "grad_norm": 0.1272379457950592, "learning_rate": 0.0005489866631143005, "loss": 3.0427, "step": 7060 }, { "epoch": 0.41022367924801995, "grad_norm": 0.16070450842380524, "learning_rate": 0.0005482740654967013, "loss": 3.034, "step": 7070 }, { "epoch": 0.4108039107603934, "grad_norm": 0.1489638090133667, "learning_rate": 0.0005475609219184604, "loss": 3.0479, "step": 7080 }, { "epoch": 0.4113841422727668, "grad_norm": 0.12640169262886047, "learning_rate": 0.0005468472350054468, "loss": 3.0447, "step": 7090 }, { "epoch": 0.4119643737851403, "grad_norm": 0.11892352998256683, "learning_rate": 0.0005461330073855305, "loss": 3.0404, "step": 7100 }, { "epoch": 0.4125446052975137, "grad_norm": 0.12563811242580414, "learning_rate": 0.0005454182416885728, "loss": 3.0405, "step": 7110 }, { "epoch": 0.41312483680988715, "grad_norm": 0.1330655813217163, "learning_rate": 0.0005447029405464154, "loss": 3.0428, "step": 7120 }, { "epoch": 0.41370506832226056, "grad_norm": 0.1286982297897339, "learning_rate": 0.0005439871065928722, "loss": 3.0354, "step": 7130 }, { "epoch": 0.414285299834634, "grad_norm": 0.1392429769039154, "learning_rate": 0.0005432707424637189, "loss": 3.0262, "step": 7140 }, { "epoch": 0.41486553134700743, "grad_norm": 0.1322159469127655, "learning_rate": 0.0005425538507966832, "loss": 3.0354, "step": 7150 }, { "epoch": 0.4154457628593809, "grad_norm": 0.13469074666500092, "learning_rate": 0.0005418364342314352, "loss": 3.0455, "step": 7160 }, { "epoch": 0.4160259943717543, "grad_norm": 0.1266402006149292, "learning_rate": 0.0005411184954095781, "loss": 3.0287, "step": 7170 }, { "epoch": 0.41660622588412777, "grad_norm": 0.17815560102462769, "learning_rate": 0.0005404000369746378, "loss": 3.0445, "step": 7180 }, { "epoch": 0.41718645739650123, "grad_norm": 0.12896455824375153, "learning_rate": 0.0005396810615720533, "loss": 3.0435, "step": 7190 }, { "epoch": 0.41776668890887464, "grad_norm": 0.12288496643304825, "learning_rate": 0.0005389615718491674, "loss": 3.0424, "step": 7200 }, { "epoch": 0.4183469204212481, "grad_norm": 0.12553273141384125, "learning_rate": 0.0005382415704552168, "loss": 3.0387, "step": 7210 }, { "epoch": 0.4189271519336215, "grad_norm": 0.13473576307296753, "learning_rate": 0.0005375210600413221, "loss": 3.0372, "step": 7220 }, { "epoch": 0.419507383445995, "grad_norm": 0.11927106976509094, "learning_rate": 0.0005368000432604779, "loss": 3.0245, "step": 7230 }, { "epoch": 0.4200876149583684, "grad_norm": 0.133677139878273, "learning_rate": 0.0005360785227675437, "loss": 3.0326, "step": 7240 }, { "epoch": 0.42066784647074185, "grad_norm": 0.12948152422904968, "learning_rate": 0.0005353565012192333, "loss": 3.0246, "step": 7250 }, { "epoch": 0.42124807798311525, "grad_norm": 0.12241530418395996, "learning_rate": 0.0005346339812741059, "loss": 3.043, "step": 7260 }, { "epoch": 0.4218283094954887, "grad_norm": 0.12694603204727173, "learning_rate": 0.0005339109655925556, "loss": 3.0357, "step": 7270 }, { "epoch": 0.4224085410078621, "grad_norm": 0.1323709487915039, "learning_rate": 0.000533187456836802, "loss": 3.0258, "step": 7280 }, { "epoch": 0.4229887725202356, "grad_norm": 0.13372564315795898, "learning_rate": 0.0005324634576708798, "loss": 3.0348, "step": 7290 }, { "epoch": 0.423569004032609, "grad_norm": 0.13448093831539154, "learning_rate": 0.00053173897076063, "loss": 3.0254, "step": 7300 }, { "epoch": 0.42414923554498246, "grad_norm": 0.1256895810365677, "learning_rate": 0.0005310139987736891, "loss": 3.0275, "step": 7310 }, { "epoch": 0.42472946705735587, "grad_norm": 0.11919272691011429, "learning_rate": 0.0005302885443794801, "loss": 3.0318, "step": 7320 }, { "epoch": 0.42530969856972933, "grad_norm": 0.1193266436457634, "learning_rate": 0.0005295626102492022, "loss": 3.0184, "step": 7330 }, { "epoch": 0.42588993008210274, "grad_norm": 0.13139508664608002, "learning_rate": 0.0005288361990558206, "loss": 3.031, "step": 7340 }, { "epoch": 0.4264701615944762, "grad_norm": 0.12625828385353088, "learning_rate": 0.0005281093134740576, "loss": 3.038, "step": 7350 }, { "epoch": 0.4270503931068496, "grad_norm": 0.12623044848442078, "learning_rate": 0.000527381956180382, "loss": 3.0291, "step": 7360 }, { "epoch": 0.4276306246192231, "grad_norm": 0.13411734998226166, "learning_rate": 0.0005266541298529995, "loss": 3.0327, "step": 7370 }, { "epoch": 0.4282108561315965, "grad_norm": 0.12928707897663116, "learning_rate": 0.000525925837171843, "loss": 3.0312, "step": 7380 }, { "epoch": 0.42879108764396995, "grad_norm": 0.1463450938463211, "learning_rate": 0.0005251970808185625, "loss": 3.0275, "step": 7390 }, { "epoch": 0.42937131915634336, "grad_norm": 0.13260109722614288, "learning_rate": 0.000524467863476515, "loss": 3.0286, "step": 7400 }, { "epoch": 0.4299515506687168, "grad_norm": 0.14642229676246643, "learning_rate": 0.0005237381878307553, "loss": 3.0298, "step": 7410 }, { "epoch": 0.43053178218109023, "grad_norm": 0.1342821568250656, "learning_rate": 0.0005230080565680256, "loss": 3.0308, "step": 7420 }, { "epoch": 0.4311120136934637, "grad_norm": 0.13104425370693207, "learning_rate": 0.0005222774723767455, "loss": 3.0251, "step": 7430 }, { "epoch": 0.43169224520583716, "grad_norm": 0.14158077538013458, "learning_rate": 0.0005215464379470027, "loss": 3.0289, "step": 7440 }, { "epoch": 0.43227247671821056, "grad_norm": 0.12375509738922119, "learning_rate": 0.0005208149559705426, "loss": 3.0288, "step": 7450 }, { "epoch": 0.432852708230584, "grad_norm": 0.13204160332679749, "learning_rate": 0.0005200830291407582, "loss": 3.0452, "step": 7460 }, { "epoch": 0.43343293974295743, "grad_norm": 0.12521019577980042, "learning_rate": 0.0005193506601526811, "loss": 3.0248, "step": 7470 }, { "epoch": 0.4340131712553309, "grad_norm": 0.1373249739408493, "learning_rate": 0.0005186178517029705, "loss": 3.021, "step": 7480 }, { "epoch": 0.4345934027677043, "grad_norm": 0.12585614621639252, "learning_rate": 0.0005178846064899038, "loss": 3.0363, "step": 7490 }, { "epoch": 0.43517363428007777, "grad_norm": 0.13090114295482635, "learning_rate": 0.0005171509272133666, "loss": 3.0218, "step": 7500 }, { "epoch": 0.4357538657924512, "grad_norm": 0.1382007896900177, "learning_rate": 0.0005164168165748431, "loss": 3.0313, "step": 7510 }, { "epoch": 0.43633409730482464, "grad_norm": 0.13011516630649567, "learning_rate": 0.0005156822772774054, "loss": 3.0224, "step": 7520 }, { "epoch": 0.43691432881719805, "grad_norm": 0.12334044277667999, "learning_rate": 0.0005149473120257043, "loss": 3.0319, "step": 7530 }, { "epoch": 0.4374945603295715, "grad_norm": 0.13634119927883148, "learning_rate": 0.0005142119235259587, "loss": 3.0327, "step": 7540 }, { "epoch": 0.4380747918419449, "grad_norm": 0.12633016705513, "learning_rate": 0.0005134761144859462, "loss": 3.0212, "step": 7550 }, { "epoch": 0.4386550233543184, "grad_norm": 0.14127787947654724, "learning_rate": 0.0005127398876149927, "loss": 3.0319, "step": 7560 }, { "epoch": 0.4392352548666918, "grad_norm": 0.13168762624263763, "learning_rate": 0.0005120032456239628, "loss": 3.025, "step": 7570 }, { "epoch": 0.43981548637906526, "grad_norm": 0.1314259171485901, "learning_rate": 0.0005112661912252492, "loss": 3.0199, "step": 7580 }, { "epoch": 0.44039571789143866, "grad_norm": 0.15071754157543182, "learning_rate": 0.0005105287271327636, "loss": 3.0335, "step": 7590 }, { "epoch": 0.44097594940381213, "grad_norm": 0.13888002932071686, "learning_rate": 0.000509790856061926, "loss": 3.0156, "step": 7600 }, { "epoch": 0.44155618091618554, "grad_norm": 0.1198134794831276, "learning_rate": 0.0005090525807296551, "loss": 3.014, "step": 7610 }, { "epoch": 0.442136412428559, "grad_norm": 0.13763317465782166, "learning_rate": 0.0005083139038543577, "loss": 3.0151, "step": 7620 }, { "epoch": 0.4427166439409324, "grad_norm": 0.1240348368883133, "learning_rate": 0.0005075748281559198, "loss": 3.0217, "step": 7630 }, { "epoch": 0.44329687545330587, "grad_norm": 0.12932796776294708, "learning_rate": 0.0005068353563556954, "loss": 3.0236, "step": 7640 }, { "epoch": 0.4438771069656793, "grad_norm": 0.13718535006046295, "learning_rate": 0.0005060954911764971, "loss": 3.028, "step": 7650 }, { "epoch": 0.44445733847805274, "grad_norm": 0.12380992621183395, "learning_rate": 0.000505355235342586, "loss": 3.0203, "step": 7660 }, { "epoch": 0.4450375699904262, "grad_norm": 0.12614348530769348, "learning_rate": 0.0005046145915796616, "loss": 3.021, "step": 7670 }, { "epoch": 0.4456178015027996, "grad_norm": 0.12163844704627991, "learning_rate": 0.0005038735626148519, "loss": 3.0195, "step": 7680 }, { "epoch": 0.4461980330151731, "grad_norm": 0.1386152058839798, "learning_rate": 0.0005031321511767031, "loss": 3.0221, "step": 7690 }, { "epoch": 0.4467782645275465, "grad_norm": 0.1282549351453781, "learning_rate": 0.00050239035999517, "loss": 3.0146, "step": 7700 }, { "epoch": 0.44735849603991995, "grad_norm": 0.13319770991802216, "learning_rate": 0.0005016481918016051, "loss": 3.0128, "step": 7710 }, { "epoch": 0.44793872755229336, "grad_norm": 0.13480350375175476, "learning_rate": 0.0005009056493287498, "loss": 3.0218, "step": 7720 }, { "epoch": 0.4485189590646668, "grad_norm": 0.12706001102924347, "learning_rate": 0.0005001627353107229, "loss": 3.0253, "step": 7730 }, { "epoch": 0.44909919057704023, "grad_norm": 0.12390197813510895, "learning_rate": 0.0004994194524830116, "loss": 3.023, "step": 7740 }, { "epoch": 0.4496794220894137, "grad_norm": 0.12882788479328156, "learning_rate": 0.0004986758035824617, "loss": 3.0199, "step": 7750 }, { "epoch": 0.4502596536017871, "grad_norm": 0.13646461069583893, "learning_rate": 0.000497931791347266, "loss": 3.0065, "step": 7760 }, { "epoch": 0.45083988511416057, "grad_norm": 0.121762216091156, "learning_rate": 0.0004971874185169553, "loss": 3.0338, "step": 7770 }, { "epoch": 0.451420116626534, "grad_norm": 0.12630295753479004, "learning_rate": 0.0004964426878323886, "loss": 3.0128, "step": 7780 }, { "epoch": 0.45200034813890744, "grad_norm": 0.12586043775081635, "learning_rate": 0.0004956976020357422, "loss": 3.0176, "step": 7790 }, { "epoch": 0.45258057965128085, "grad_norm": 0.1376063972711563, "learning_rate": 0.0004949521638705002, "loss": 3.0036, "step": 7800 }, { "epoch": 0.4531608111636543, "grad_norm": 0.13597889244556427, "learning_rate": 0.0004942063760814438, "loss": 3.0144, "step": 7810 }, { "epoch": 0.4537410426760277, "grad_norm": 0.1323329508304596, "learning_rate": 0.000493460241414642, "loss": 3.0119, "step": 7820 }, { "epoch": 0.4543212741884012, "grad_norm": 0.16766121983528137, "learning_rate": 0.0004927137626174405, "loss": 3.0184, "step": 7830 }, { "epoch": 0.4549015057007746, "grad_norm": 0.14581693708896637, "learning_rate": 0.0004919669424384527, "loss": 3.0162, "step": 7840 }, { "epoch": 0.45548173721314805, "grad_norm": 0.13573148846626282, "learning_rate": 0.0004912197836275487, "loss": 3.0203, "step": 7850 }, { "epoch": 0.45606196872552146, "grad_norm": 0.12805421650409698, "learning_rate": 0.0004904722889358453, "loss": 3.0195, "step": 7860 }, { "epoch": 0.4566422002378949, "grad_norm": 0.1366797238588333, "learning_rate": 0.0004897244611156966, "loss": 3.0104, "step": 7870 }, { "epoch": 0.45722243175026833, "grad_norm": 0.13198062777519226, "learning_rate": 0.0004889763029206827, "loss": 3.0215, "step": 7880 }, { "epoch": 0.4578026632626418, "grad_norm": 0.12173867225646973, "learning_rate": 0.00048822781710560045, "loss": 3.0178, "step": 7890 }, { "epoch": 0.4583828947750152, "grad_norm": 0.13213156163692474, "learning_rate": 0.00048747900642645325, "loss": 3.0095, "step": 7900 }, { "epoch": 0.45896312628738867, "grad_norm": 0.11584512144327164, "learning_rate": 0.00048672987364044025, "loss": 3.0123, "step": 7910 }, { "epoch": 0.45954335779976213, "grad_norm": 0.12746469676494598, "learning_rate": 0.00048598042150594685, "loss": 3.0112, "step": 7920 }, { "epoch": 0.46012358931213554, "grad_norm": 0.1450142115354538, "learning_rate": 0.0004852306527825346, "loss": 3.0192, "step": 7930 }, { "epoch": 0.460703820824509, "grad_norm": 0.12270225584506989, "learning_rate": 0.0004844805702309301, "loss": 3.0135, "step": 7940 }, { "epoch": 0.4612840523368824, "grad_norm": 0.12805183231830597, "learning_rate": 0.00048373017661301613, "loss": 3.009, "step": 7950 }, { "epoch": 0.4618642838492559, "grad_norm": 0.14287428557872772, "learning_rate": 0.00048297947469182054, "loss": 3.0112, "step": 7960 }, { "epoch": 0.4624445153616293, "grad_norm": 0.12612827122211456, "learning_rate": 0.00048222846723150643, "loss": 3.0066, "step": 7970 }, { "epoch": 0.46302474687400275, "grad_norm": 0.12449418008327484, "learning_rate": 0.00048147715699736185, "loss": 3.0109, "step": 7980 }, { "epoch": 0.46360497838637615, "grad_norm": 0.1274731159210205, "learning_rate": 0.00048072554675579, "loss": 3.0072, "step": 7990 }, { "epoch": 0.4641852098987496, "grad_norm": 0.1265745460987091, "learning_rate": 0.0004799736392742983, "loss": 3.0114, "step": 8000 }, { "epoch": 0.4641852098987496, "eval_loss": 2.965857744216919, "eval_runtime": 3.5063, "eval_samples_per_second": 1234.917, "eval_steps_per_second": 4.848, "step": 8000 }, { "epoch": 0.464765441411123, "grad_norm": 0.12254419177770615, "learning_rate": 0.0004792214373214891, "loss": 3.0128, "step": 8010 }, { "epoch": 0.4653456729234965, "grad_norm": 0.13092252612113953, "learning_rate": 0.0004784689436670486, "loss": 3.0118, "step": 8020 }, { "epoch": 0.4659259044358699, "grad_norm": 0.12821543216705322, "learning_rate": 0.00047771616108173735, "loss": 3.0089, "step": 8030 }, { "epoch": 0.46650613594824336, "grad_norm": 0.12405045330524445, "learning_rate": 0.00047696309233737956, "loss": 3.0055, "step": 8040 }, { "epoch": 0.46708636746061677, "grad_norm": 0.12393677979707718, "learning_rate": 0.0004762097402068534, "loss": 3.0118, "step": 8050 }, { "epoch": 0.46766659897299023, "grad_norm": 0.13137251138687134, "learning_rate": 0.00047545610746408036, "loss": 3.0157, "step": 8060 }, { "epoch": 0.46824683048536364, "grad_norm": 0.1212739571928978, "learning_rate": 0.000474702196884015, "loss": 3.0102, "step": 8070 }, { "epoch": 0.4688270619977371, "grad_norm": 0.1276984065771103, "learning_rate": 0.00047394801124263516, "loss": 3.0048, "step": 8080 }, { "epoch": 0.4694072935101105, "grad_norm": 0.12522514164447784, "learning_rate": 0.00047319355331693123, "loss": 3.0117, "step": 8090 }, { "epoch": 0.469987525022484, "grad_norm": 0.12416790425777435, "learning_rate": 0.0004724388258848965, "loss": 3.005, "step": 8100 }, { "epoch": 0.4705677565348574, "grad_norm": 0.12152735143899918, "learning_rate": 0.0004716838317255164, "loss": 3.0045, "step": 8110 }, { "epoch": 0.47114798804723085, "grad_norm": 0.13686949014663696, "learning_rate": 0.0004709285736187585, "loss": 3.0154, "step": 8120 }, { "epoch": 0.47172821955960426, "grad_norm": 0.13260766863822937, "learning_rate": 0.00047017305434556223, "loss": 3.0092, "step": 8130 }, { "epoch": 0.4723084510719777, "grad_norm": 0.13700617849826813, "learning_rate": 0.00046941727668782876, "loss": 3.0178, "step": 8140 }, { "epoch": 0.4728886825843512, "grad_norm": 0.12646503746509552, "learning_rate": 0.00046866124342841045, "loss": 3.0115, "step": 8150 }, { "epoch": 0.4734689140967246, "grad_norm": 0.1190716028213501, "learning_rate": 0.00046790495735110106, "loss": 3.0122, "step": 8160 }, { "epoch": 0.47404914560909805, "grad_norm": 0.12732787430286407, "learning_rate": 0.0004671484212406253, "loss": 3.0055, "step": 8170 }, { "epoch": 0.47462937712147146, "grad_norm": 0.12872536480426788, "learning_rate": 0.0004663916378826282, "loss": 3.0099, "step": 8180 }, { "epoch": 0.4752096086338449, "grad_norm": 0.13794904947280884, "learning_rate": 0.00046563461006366547, "loss": 3.0039, "step": 8190 }, { "epoch": 0.47578984014621833, "grad_norm": 0.1306883841753006, "learning_rate": 0.00046487734057119265, "loss": 3.0033, "step": 8200 }, { "epoch": 0.4763700716585918, "grad_norm": 0.13469251990318298, "learning_rate": 0.0004641198321935556, "loss": 3.0049, "step": 8210 }, { "epoch": 0.4769503031709652, "grad_norm": 0.12475228309631348, "learning_rate": 0.00046336208771997927, "loss": 3.002, "step": 8220 }, { "epoch": 0.47753053468333867, "grad_norm": 0.1225697323679924, "learning_rate": 0.0004626041099405584, "loss": 2.9954, "step": 8230 }, { "epoch": 0.4781107661957121, "grad_norm": 0.13275326788425446, "learning_rate": 0.0004618459016462465, "loss": 3.0072, "step": 8240 }, { "epoch": 0.47869099770808554, "grad_norm": 0.12760354578495026, "learning_rate": 0.000461087465628846, "loss": 2.9963, "step": 8250 }, { "epoch": 0.47927122922045895, "grad_norm": 0.12008246779441833, "learning_rate": 0.0004603288046809976, "loss": 2.9992, "step": 8260 }, { "epoch": 0.4798514607328324, "grad_norm": 0.1280468851327896, "learning_rate": 0.00045956992159617053, "loss": 3.0062, "step": 8270 }, { "epoch": 0.4804316922452058, "grad_norm": 0.122902050614357, "learning_rate": 0.0004588108191686517, "loss": 3.0063, "step": 8280 }, { "epoch": 0.4810119237575793, "grad_norm": 0.14071571826934814, "learning_rate": 0.0004580515001935359, "loss": 3.0031, "step": 8290 }, { "epoch": 0.4815921552699527, "grad_norm": 0.12949834764003754, "learning_rate": 0.00045729196746671505, "loss": 3.0078, "step": 8300 }, { "epoch": 0.48217238678232616, "grad_norm": 0.1227622777223587, "learning_rate": 0.00045653222378486813, "loss": 3.0058, "step": 8310 }, { "epoch": 0.48275261829469956, "grad_norm": 0.13193581998348236, "learning_rate": 0.0004557722719454512, "loss": 3.0016, "step": 8320 }, { "epoch": 0.48333284980707303, "grad_norm": 0.1256847381591797, "learning_rate": 0.00045501211474668646, "loss": 2.9901, "step": 8330 }, { "epoch": 0.48391308131944644, "grad_norm": 0.15065541863441467, "learning_rate": 0.00045425175498755216, "loss": 2.9999, "step": 8340 }, { "epoch": 0.4844933128318199, "grad_norm": 0.11859238892793655, "learning_rate": 0.0004534911954677728, "loss": 3.0006, "step": 8350 }, { "epoch": 0.4850735443441933, "grad_norm": 0.12452604621648788, "learning_rate": 0.0004527304389878081, "loss": 2.9949, "step": 8360 }, { "epoch": 0.48565377585656677, "grad_norm": 0.12375226616859436, "learning_rate": 0.0004519694883488432, "loss": 2.9968, "step": 8370 }, { "epoch": 0.4862340073689402, "grad_norm": 0.12726223468780518, "learning_rate": 0.00045120834635277814, "loss": 2.9991, "step": 8380 }, { "epoch": 0.48681423888131364, "grad_norm": 0.13420931994915009, "learning_rate": 0.00045044701580221726, "loss": 2.9998, "step": 8390 }, { "epoch": 0.4873944703936871, "grad_norm": 0.12803731858730316, "learning_rate": 0.00044968549950045946, "loss": 2.9973, "step": 8400 }, { "epoch": 0.4879747019060605, "grad_norm": 0.11515044420957565, "learning_rate": 0.0004489238002514876, "loss": 3.0004, "step": 8410 }, { "epoch": 0.488554933418434, "grad_norm": 0.12214256078004837, "learning_rate": 0.0004481619208599581, "loss": 2.9959, "step": 8420 }, { "epoch": 0.4891351649308074, "grad_norm": 0.11597707122564316, "learning_rate": 0.00044739986413119067, "loss": 2.9987, "step": 8430 }, { "epoch": 0.48971539644318085, "grad_norm": 0.12344956398010254, "learning_rate": 0.0004466376328711581, "loss": 3.0036, "step": 8440 }, { "epoch": 0.49029562795555426, "grad_norm": 0.12265517562627792, "learning_rate": 0.00044587522988647567, "loss": 2.9886, "step": 8450 }, { "epoch": 0.4908758594679277, "grad_norm": 0.12832215428352356, "learning_rate": 0.00044511265798439093, "loss": 3.001, "step": 8460 }, { "epoch": 0.49145609098030113, "grad_norm": 0.12218471616506577, "learning_rate": 0.0004443499199727736, "loss": 3.0029, "step": 8470 }, { "epoch": 0.4920363224926746, "grad_norm": 0.12205421179533005, "learning_rate": 0.0004435870186601051, "loss": 3.0001, "step": 8480 }, { "epoch": 0.492616554005048, "grad_norm": 0.1365329623222351, "learning_rate": 0.0004428239568554677, "loss": 3.0049, "step": 8490 }, { "epoch": 0.49319678551742147, "grad_norm": 0.13004398345947266, "learning_rate": 0.0004420607373685352, "loss": 2.9886, "step": 8500 }, { "epoch": 0.4937770170297949, "grad_norm": 0.12751324474811554, "learning_rate": 0.0004412973630095613, "loss": 3.002, "step": 8510 }, { "epoch": 0.49435724854216834, "grad_norm": 0.1167600229382515, "learning_rate": 0.0004405338365893708, "loss": 3.0002, "step": 8520 }, { "epoch": 0.49493748005454175, "grad_norm": 0.13961093127727509, "learning_rate": 0.0004397701609193477, "loss": 2.9941, "step": 8530 }, { "epoch": 0.4955177115669152, "grad_norm": 0.12514491379261017, "learning_rate": 0.0004390063388114259, "loss": 2.9942, "step": 8540 }, { "epoch": 0.4960979430792886, "grad_norm": 0.13159744441509247, "learning_rate": 0.0004382423730780784, "loss": 2.989, "step": 8550 }, { "epoch": 0.4966781745916621, "grad_norm": 0.1285783350467682, "learning_rate": 0.0004374782665323071, "loss": 2.9964, "step": 8560 }, { "epoch": 0.4972584061040355, "grad_norm": 0.11707151681184769, "learning_rate": 0.0004367140219876322, "loss": 2.9897, "step": 8570 }, { "epoch": 0.49783863761640895, "grad_norm": 0.13998247683048248, "learning_rate": 0.00043594964225808237, "loss": 2.9932, "step": 8580 }, { "epoch": 0.49841886912878236, "grad_norm": 0.1305503398180008, "learning_rate": 0.0004351851301581837, "loss": 2.9954, "step": 8590 }, { "epoch": 0.4989991006411558, "grad_norm": 0.13159409165382385, "learning_rate": 0.00043442048850294993, "loss": 2.9888, "step": 8600 }, { "epoch": 0.49957933215352923, "grad_norm": 0.12154188007116318, "learning_rate": 0.00043365572010787174, "loss": 2.9879, "step": 8610 }, { "epoch": 0.5001595636659026, "grad_norm": 0.12399255484342575, "learning_rate": 0.00043289082778890633, "loss": 2.9927, "step": 8620 }, { "epoch": 0.5007397951782762, "grad_norm": 0.12647025287151337, "learning_rate": 0.00043212581436246754, "loss": 2.9976, "step": 8630 }, { "epoch": 0.5013200266906496, "grad_norm": 0.1319727599620819, "learning_rate": 0.0004313606826454148, "loss": 3.0035, "step": 8640 }, { "epoch": 0.501900258203023, "grad_norm": 0.13229894638061523, "learning_rate": 0.00043059543545504326, "loss": 3.0069, "step": 8650 }, { "epoch": 0.5024804897153965, "grad_norm": 0.12166497111320496, "learning_rate": 0.00042983007560907344, "loss": 2.9866, "step": 8660 }, { "epoch": 0.5030607212277699, "grad_norm": 0.12833116948604584, "learning_rate": 0.0004290646059256402, "loss": 2.9999, "step": 8670 }, { "epoch": 0.5036409527401433, "grad_norm": 0.13493898510932922, "learning_rate": 0.0004282990292232832, "loss": 2.9954, "step": 8680 }, { "epoch": 0.5042211842525167, "grad_norm": 0.12510940432548523, "learning_rate": 0.00042753334832093613, "loss": 2.9891, "step": 8690 }, { "epoch": 0.5048014157648902, "grad_norm": 0.11504089832305908, "learning_rate": 0.0004267675660379162, "loss": 2.9972, "step": 8700 }, { "epoch": 0.5053816472772636, "grad_norm": 0.11173667013645172, "learning_rate": 0.00042600168519391394, "loss": 2.9921, "step": 8710 }, { "epoch": 0.505961878789637, "grad_norm": 0.1324940323829651, "learning_rate": 0.00042523570860898306, "loss": 2.9939, "step": 8720 }, { "epoch": 0.5065421103020105, "grad_norm": 0.11683640629053116, "learning_rate": 0.00042446963910352934, "loss": 2.9875, "step": 8730 }, { "epoch": 0.507122341814384, "grad_norm": 0.1292978972196579, "learning_rate": 0.00042370347949830114, "loss": 2.9882, "step": 8740 }, { "epoch": 0.5077025733267574, "grad_norm": 0.12668271362781525, "learning_rate": 0.0004229372326143784, "loss": 2.9962, "step": 8750 }, { "epoch": 0.5082828048391308, "grad_norm": 0.1264737844467163, "learning_rate": 0.00042217090127316225, "loss": 2.9945, "step": 8760 }, { "epoch": 0.5088630363515042, "grad_norm": 0.1287977248430252, "learning_rate": 0.00042140448829636494, "loss": 2.975, "step": 8770 }, { "epoch": 0.5094432678638777, "grad_norm": 0.12344055622816086, "learning_rate": 0.0004206379965059995, "loss": 2.9904, "step": 8780 }, { "epoch": 0.5100234993762511, "grad_norm": 0.1377340406179428, "learning_rate": 0.00041987142872436904, "loss": 2.9883, "step": 8790 }, { "epoch": 0.5106037308886245, "grad_norm": 0.12725047767162323, "learning_rate": 0.00041910478777405633, "loss": 2.9901, "step": 8800 }, { "epoch": 0.511183962400998, "grad_norm": 0.11563998460769653, "learning_rate": 0.00041833807647791366, "loss": 2.9863, "step": 8810 }, { "epoch": 0.5117641939133715, "grad_norm": 0.11704185605049133, "learning_rate": 0.0004175712976590524, "loss": 2.9859, "step": 8820 }, { "epoch": 0.5123444254257449, "grad_norm": 0.1327212154865265, "learning_rate": 0.0004168044541408323, "loss": 2.994, "step": 8830 }, { "epoch": 0.5129246569381183, "grad_norm": 0.1343165785074234, "learning_rate": 0.0004160375487468517, "loss": 2.9882, "step": 8840 }, { "epoch": 0.5135048884504917, "grad_norm": 0.13178564608097076, "learning_rate": 0.00041527058430093655, "loss": 2.9945, "step": 8850 }, { "epoch": 0.5140851199628652, "grad_norm": 0.13252697885036469, "learning_rate": 0.0004145035636271304, "loss": 2.9933, "step": 8860 }, { "epoch": 0.5146653514752386, "grad_norm": 0.12008585780858994, "learning_rate": 0.00041373648954968356, "loss": 2.991, "step": 8870 }, { "epoch": 0.515245582987612, "grad_norm": 0.11822432279586792, "learning_rate": 0.0004129693648930432, "loss": 2.9881, "step": 8880 }, { "epoch": 0.5158258144999855, "grad_norm": 0.12050991505384445, "learning_rate": 0.00041220219248184266, "loss": 2.991, "step": 8890 }, { "epoch": 0.516406046012359, "grad_norm": 0.1276601105928421, "learning_rate": 0.0004114349751408911, "loss": 2.9849, "step": 8900 }, { "epoch": 0.5169862775247324, "grad_norm": 0.12049540132284164, "learning_rate": 0.00041066771569516327, "loss": 2.9921, "step": 8910 }, { "epoch": 0.5175665090371058, "grad_norm": 0.12151946127414703, "learning_rate": 0.00040990041696978876, "loss": 2.9792, "step": 8920 }, { "epoch": 0.5181467405494793, "grad_norm": 0.13656169176101685, "learning_rate": 0.0004091330817900417, "loss": 2.9909, "step": 8930 }, { "epoch": 0.5187269720618527, "grad_norm": 0.14420534670352936, "learning_rate": 0.0004083657129813308, "loss": 2.985, "step": 8940 }, { "epoch": 0.5193072035742261, "grad_norm": 0.13805179297924042, "learning_rate": 0.00040759831336918825, "loss": 2.985, "step": 8950 }, { "epoch": 0.5198874350865995, "grad_norm": 0.11674559861421585, "learning_rate": 0.00040683088577926, "loss": 2.9836, "step": 8960 }, { "epoch": 0.520467666598973, "grad_norm": 0.1216716542840004, "learning_rate": 0.00040606343303729473, "loss": 2.985, "step": 8970 }, { "epoch": 0.5210478981113464, "grad_norm": 0.11664440482854843, "learning_rate": 0.00040529595796913383, "loss": 2.9885, "step": 8980 }, { "epoch": 0.5216281296237198, "grad_norm": 0.11117060482501984, "learning_rate": 0.00040452846340070087, "loss": 2.9808, "step": 8990 }, { "epoch": 0.5222083611360933, "grad_norm": 0.11910637468099594, "learning_rate": 0.00040376095215799126, "loss": 2.9864, "step": 9000 }, { "epoch": 0.5222083611360933, "eval_loss": 2.9385592937469482, "eval_runtime": 3.5031, "eval_samples_per_second": 1236.034, "eval_steps_per_second": 4.853, "step": 9000 }, { "epoch": 0.5227885926484668, "grad_norm": 0.12593922019004822, "learning_rate": 0.00040299342706706176, "loss": 2.9832, "step": 9010 }, { "epoch": 0.5233688241608402, "grad_norm": 0.12040293216705322, "learning_rate": 0.00040222589095402034, "loss": 2.9827, "step": 9020 }, { "epoch": 0.5239490556732136, "grad_norm": 0.12265130132436752, "learning_rate": 0.00040145834664501515, "loss": 2.9852, "step": 9030 }, { "epoch": 0.524529287185587, "grad_norm": 0.12079159915447235, "learning_rate": 0.0004006907969662248, "loss": 2.9901, "step": 9040 }, { "epoch": 0.5251095186979605, "grad_norm": 0.11104680597782135, "learning_rate": 0.0003999232447438478, "loss": 2.9863, "step": 9050 }, { "epoch": 0.5256897502103339, "grad_norm": 0.11193856596946716, "learning_rate": 0.0003991556928040916, "loss": 2.993, "step": 9060 }, { "epoch": 0.5262699817227073, "grad_norm": 0.12274361401796341, "learning_rate": 0.0003983881439731631, "loss": 2.9729, "step": 9070 }, { "epoch": 0.5268502132350807, "grad_norm": 0.12874899804592133, "learning_rate": 0.00039762060107725715, "loss": 2.9803, "step": 9080 }, { "epoch": 0.5274304447474543, "grad_norm": 0.1328824758529663, "learning_rate": 0.0003968530669425474, "loss": 2.9769, "step": 9090 }, { "epoch": 0.5280106762598277, "grad_norm": 0.13120834529399872, "learning_rate": 0.00039608554439517473, "loss": 2.9812, "step": 9100 }, { "epoch": 0.5285909077722011, "grad_norm": 0.11995182186365128, "learning_rate": 0.00039531803626123764, "loss": 2.9802, "step": 9110 }, { "epoch": 0.5291711392845746, "grad_norm": 0.1455584317445755, "learning_rate": 0.00039455054536678176, "loss": 2.9865, "step": 9120 }, { "epoch": 0.529751370796948, "grad_norm": 0.12229720503091812, "learning_rate": 0.00039378307453778847, "loss": 2.9804, "step": 9130 }, { "epoch": 0.5303316023093214, "grad_norm": 0.13109011948108673, "learning_rate": 0.0003930156266001661, "loss": 2.9817, "step": 9140 }, { "epoch": 0.5309118338216948, "grad_norm": 0.12344162911176682, "learning_rate": 0.00039224820437973817, "loss": 2.9871, "step": 9150 }, { "epoch": 0.5314920653340683, "grad_norm": 0.14061598479747772, "learning_rate": 0.0003914808107022338, "loss": 2.976, "step": 9160 }, { "epoch": 0.5320722968464418, "grad_norm": 0.12224850058555603, "learning_rate": 0.000390713448393277, "loss": 2.9794, "step": 9170 }, { "epoch": 0.5326525283588152, "grad_norm": 0.13019710779190063, "learning_rate": 0.00038994612027837587, "loss": 2.9843, "step": 9180 }, { "epoch": 0.5332327598711886, "grad_norm": 0.1308557242155075, "learning_rate": 0.00038917882918291315, "loss": 2.9858, "step": 9190 }, { "epoch": 0.5338129913835621, "grad_norm": 0.12456322461366653, "learning_rate": 0.00038841157793213495, "loss": 2.9811, "step": 9200 }, { "epoch": 0.5343932228959355, "grad_norm": 0.11514720320701599, "learning_rate": 0.00038764436935114096, "loss": 2.9903, "step": 9210 }, { "epoch": 0.5349734544083089, "grad_norm": 0.12431762367486954, "learning_rate": 0.00038687720626487317, "loss": 2.9726, "step": 9220 }, { "epoch": 0.5355536859206823, "grad_norm": 0.12682022154331207, "learning_rate": 0.00038611009149810656, "loss": 2.988, "step": 9230 }, { "epoch": 0.5361339174330558, "grad_norm": 0.11372144520282745, "learning_rate": 0.00038534302787543835, "loss": 2.9835, "step": 9240 }, { "epoch": 0.5367141489454292, "grad_norm": 0.11397378146648407, "learning_rate": 0.0003845760182212768, "loss": 2.9818, "step": 9250 }, { "epoch": 0.5372943804578026, "grad_norm": 0.11378198862075806, "learning_rate": 0.00038380906535983195, "loss": 2.9773, "step": 9260 }, { "epoch": 0.537874611970176, "grad_norm": 0.11755436658859253, "learning_rate": 0.00038304217211510445, "loss": 2.9744, "step": 9270 }, { "epoch": 0.5384548434825496, "grad_norm": 0.12496274709701538, "learning_rate": 0.0003822753413108758, "loss": 2.9746, "step": 9280 }, { "epoch": 0.539035074994923, "grad_norm": 0.12139610201120377, "learning_rate": 0.00038150857577069725, "loss": 2.9729, "step": 9290 }, { "epoch": 0.5396153065072964, "grad_norm": 0.11439456790685654, "learning_rate": 0.0003807418783178797, "loss": 2.974, "step": 9300 }, { "epoch": 0.5401955380196698, "grad_norm": 0.12058482319116592, "learning_rate": 0.00037997525177548377, "loss": 2.9741, "step": 9310 }, { "epoch": 0.5407757695320433, "grad_norm": 0.12571559846401215, "learning_rate": 0.0003792086989663084, "loss": 2.9743, "step": 9320 }, { "epoch": 0.5413560010444167, "grad_norm": 0.1070980429649353, "learning_rate": 0.00037844222271288166, "loss": 2.9767, "step": 9330 }, { "epoch": 0.5419362325567901, "grad_norm": 0.10775572061538696, "learning_rate": 0.00037767582583744905, "loss": 2.9747, "step": 9340 }, { "epoch": 0.5425164640691637, "grad_norm": 0.131280779838562, "learning_rate": 0.0003769095111619643, "loss": 2.9699, "step": 9350 }, { "epoch": 0.5430966955815371, "grad_norm": 0.11556284129619598, "learning_rate": 0.00037614328150807847, "loss": 2.976, "step": 9360 }, { "epoch": 0.5436769270939105, "grad_norm": 0.12227209657430649, "learning_rate": 0.0003753771396971292, "loss": 2.985, "step": 9370 }, { "epoch": 0.5442571586062839, "grad_norm": 0.11332014948129654, "learning_rate": 0.00037461108855013116, "loss": 2.9697, "step": 9380 }, { "epoch": 0.5448373901186574, "grad_norm": 0.13393546640872955, "learning_rate": 0.00037384513088776453, "loss": 2.9674, "step": 9390 }, { "epoch": 0.5454176216310308, "grad_norm": 0.1244276612997055, "learning_rate": 0.0003730792695303659, "loss": 2.9788, "step": 9400 }, { "epoch": 0.5459978531434042, "grad_norm": 0.1271297037601471, "learning_rate": 0.00037231350729791715, "loss": 2.9756, "step": 9410 }, { "epoch": 0.5465780846557776, "grad_norm": 0.12968561053276062, "learning_rate": 0.0003715478470100347, "loss": 2.9799, "step": 9420 }, { "epoch": 0.5471583161681511, "grad_norm": 0.11900387704372406, "learning_rate": 0.0003707822914859604, "loss": 2.9635, "step": 9430 }, { "epoch": 0.5477385476805245, "grad_norm": 0.1169251874089241, "learning_rate": 0.0003700168435445494, "loss": 2.9801, "step": 9440 }, { "epoch": 0.548318779192898, "grad_norm": 0.10975334793329239, "learning_rate": 0.0003692515060042615, "loss": 2.9784, "step": 9450 }, { "epoch": 0.5488990107052714, "grad_norm": 0.11917108297348022, "learning_rate": 0.0003684862816831496, "loss": 2.9623, "step": 9460 }, { "epoch": 0.5494792422176449, "grad_norm": 0.11328047513961792, "learning_rate": 0.00036772117339884975, "loss": 2.9612, "step": 9470 }, { "epoch": 0.5500594737300183, "grad_norm": 0.11446483433246613, "learning_rate": 0.0003669561839685711, "loss": 2.972, "step": 9480 }, { "epoch": 0.5506397052423917, "grad_norm": 0.11559679359197617, "learning_rate": 0.00036619131620908445, "loss": 2.9725, "step": 9490 }, { "epoch": 0.5512199367547651, "grad_norm": 0.12147093564271927, "learning_rate": 0.00036542657293671345, "loss": 2.9783, "step": 9500 }, { "epoch": 0.5518001682671386, "grad_norm": 0.12409328669309616, "learning_rate": 0.0003646619569673225, "loss": 2.9743, "step": 9510 }, { "epoch": 0.552380399779512, "grad_norm": 0.1250089555978775, "learning_rate": 0.0003638974711163079, "loss": 2.9779, "step": 9520 }, { "epoch": 0.5529606312918854, "grad_norm": 0.12481830269098282, "learning_rate": 0.0003631331181985868, "loss": 2.9695, "step": 9530 }, { "epoch": 0.5535408628042588, "grad_norm": 0.11836519837379456, "learning_rate": 0.0003623689010285864, "loss": 2.9713, "step": 9540 }, { "epoch": 0.5541210943166324, "grad_norm": 0.11716274172067642, "learning_rate": 0.0003616048224202348, "loss": 2.9732, "step": 9550 }, { "epoch": 0.5547013258290058, "grad_norm": 0.11922826617956161, "learning_rate": 0.000360840885186949, "loss": 2.966, "step": 9560 }, { "epoch": 0.5552815573413792, "grad_norm": 0.11280693113803864, "learning_rate": 0.0003600770921416262, "loss": 2.9779, "step": 9570 }, { "epoch": 0.5558617888537526, "grad_norm": 0.1292978674173355, "learning_rate": 0.0003593134460966323, "loss": 2.974, "step": 9580 }, { "epoch": 0.5564420203661261, "grad_norm": 0.122468963265419, "learning_rate": 0.00035854994986379206, "loss": 2.9558, "step": 9590 }, { "epoch": 0.5570222518784995, "grad_norm": 0.12729544937610626, "learning_rate": 0.0003577866062543788, "loss": 2.9748, "step": 9600 }, { "epoch": 0.5576024833908729, "grad_norm": 0.11830838024616241, "learning_rate": 0.00035702341807910333, "loss": 2.9618, "step": 9610 }, { "epoch": 0.5581827149032464, "grad_norm": 0.1238885223865509, "learning_rate": 0.00035626038814810465, "loss": 2.9743, "step": 9620 }, { "epoch": 0.5587629464156199, "grad_norm": 0.1125752255320549, "learning_rate": 0.0003554975192709388, "loss": 2.9689, "step": 9630 }, { "epoch": 0.5593431779279933, "grad_norm": 0.1268298327922821, "learning_rate": 0.00035473481425656885, "loss": 2.9753, "step": 9640 }, { "epoch": 0.5599234094403667, "grad_norm": 0.12624050676822662, "learning_rate": 0.0003539722759133549, "loss": 2.9744, "step": 9650 }, { "epoch": 0.5605036409527402, "grad_norm": 0.13024677336215973, "learning_rate": 0.00035320990704904265, "loss": 2.9713, "step": 9660 }, { "epoch": 0.5610838724651136, "grad_norm": 0.12166307121515274, "learning_rate": 0.00035244771047075426, "loss": 2.9655, "step": 9670 }, { "epoch": 0.561664103977487, "grad_norm": 0.1329546421766281, "learning_rate": 0.00035168568898497727, "loss": 2.9713, "step": 9680 }, { "epoch": 0.5622443354898604, "grad_norm": 0.12308143824338913, "learning_rate": 0.00035092384539755497, "loss": 2.9676, "step": 9690 }, { "epoch": 0.5628245670022339, "grad_norm": 0.12322785705327988, "learning_rate": 0.00035016218251367475, "loss": 2.9636, "step": 9700 }, { "epoch": 0.5634047985146073, "grad_norm": 0.11341747641563416, "learning_rate": 0.00034940070313785944, "loss": 2.9734, "step": 9710 }, { "epoch": 0.5639850300269807, "grad_norm": 0.11488571017980576, "learning_rate": 0.0003486394100739559, "loss": 2.968, "step": 9720 }, { "epoch": 0.5645652615393542, "grad_norm": 0.11449705064296722, "learning_rate": 0.00034787830612512474, "loss": 2.9653, "step": 9730 }, { "epoch": 0.5651454930517277, "grad_norm": 0.12567433714866638, "learning_rate": 0.00034711739409383073, "loss": 2.9573, "step": 9740 }, { "epoch": 0.5657257245641011, "grad_norm": 0.11687970161437988, "learning_rate": 0.00034635667678183134, "loss": 2.9661, "step": 9750 }, { "epoch": 0.5663059560764745, "grad_norm": 0.11766985058784485, "learning_rate": 0.00034559615699016755, "loss": 2.9755, "step": 9760 }, { "epoch": 0.5668861875888479, "grad_norm": 0.11615738272666931, "learning_rate": 0.00034483583751915314, "loss": 2.9732, "step": 9770 }, { "epoch": 0.5674664191012214, "grad_norm": 0.12588836252689362, "learning_rate": 0.0003440757211683636, "loss": 2.9679, "step": 9780 }, { "epoch": 0.5680466506135948, "grad_norm": 0.1253577172756195, "learning_rate": 0.0003433158107366273, "loss": 2.9634, "step": 9790 }, { "epoch": 0.5686268821259682, "grad_norm": 0.13995492458343506, "learning_rate": 0.00034255610902201385, "loss": 2.9652, "step": 9800 }, { "epoch": 0.5692071136383416, "grad_norm": 0.11616558581590652, "learning_rate": 0.0003417966188218248, "loss": 2.9661, "step": 9810 }, { "epoch": 0.5697873451507152, "grad_norm": 0.11519207060337067, "learning_rate": 0.0003410373429325823, "loss": 2.9652, "step": 9820 }, { "epoch": 0.5703675766630886, "grad_norm": 0.11953810602426529, "learning_rate": 0.00034027828415002, "loss": 2.9692, "step": 9830 }, { "epoch": 0.570947808175462, "grad_norm": 0.1192983016371727, "learning_rate": 0.0003395194452690717, "loss": 2.9698, "step": 9840 }, { "epoch": 0.5715280396878355, "grad_norm": 0.11357498914003372, "learning_rate": 0.00033876082908386164, "loss": 2.9634, "step": 9850 }, { "epoch": 0.5721082712002089, "grad_norm": 0.12101314216852188, "learning_rate": 0.0003380024383876943, "loss": 2.9667, "step": 9860 }, { "epoch": 0.5726885027125823, "grad_norm": 0.12401413917541504, "learning_rate": 0.0003372442759730434, "loss": 2.9596, "step": 9870 }, { "epoch": 0.5732687342249557, "grad_norm": 0.10911750048398972, "learning_rate": 0.0003364863446315424, "loss": 2.9765, "step": 9880 }, { "epoch": 0.5738489657373292, "grad_norm": 0.1123969703912735, "learning_rate": 0.00033572864715397416, "loss": 2.9591, "step": 9890 }, { "epoch": 0.5744291972497026, "grad_norm": 0.13271242380142212, "learning_rate": 0.00033497118633025986, "loss": 2.9592, "step": 9900 }, { "epoch": 0.5750094287620761, "grad_norm": 0.12631261348724365, "learning_rate": 0.00033421396494944986, "loss": 2.9565, "step": 9910 }, { "epoch": 0.5755896602744495, "grad_norm": 0.13029712438583374, "learning_rate": 0.0003334569857997124, "loss": 2.9659, "step": 9920 }, { "epoch": 0.576169891786823, "grad_norm": 0.11864218860864639, "learning_rate": 0.0003327002516683241, "loss": 2.9637, "step": 9930 }, { "epoch": 0.5767501232991964, "grad_norm": 0.12651433050632477, "learning_rate": 0.0003319437653416592, "loss": 2.9658, "step": 9940 }, { "epoch": 0.5773303548115698, "grad_norm": 0.11786976456642151, "learning_rate": 0.00033118752960517974, "loss": 2.9598, "step": 9950 }, { "epoch": 0.5779105863239432, "grad_norm": 0.12406033277511597, "learning_rate": 0.00033043154724342506, "loss": 2.9629, "step": 9960 }, { "epoch": 0.5784908178363167, "grad_norm": 0.11716942489147186, "learning_rate": 0.000329675821040001, "loss": 2.9628, "step": 9970 }, { "epoch": 0.5790710493486901, "grad_norm": 0.13326691091060638, "learning_rate": 0.00032892035377757095, "loss": 2.9665, "step": 9980 }, { "epoch": 0.5796512808610635, "grad_norm": 0.12042268365621567, "learning_rate": 0.0003281651482378443, "loss": 2.9553, "step": 9990 }, { "epoch": 0.580231512373437, "grad_norm": 0.11940687894821167, "learning_rate": 0.0003274102072015671, "loss": 2.9603, "step": 10000 }, { "epoch": 0.580231512373437, "eval_loss": 2.9137232303619385, "eval_runtime": 3.4961, "eval_samples_per_second": 1238.535, "eval_steps_per_second": 4.863, "step": 10000 }, { "epoch": 0.5808117438858105, "grad_norm": 0.12755872309207916, "learning_rate": 0.0003266555334485114, "loss": 2.9599, "step": 10010 }, { "epoch": 0.5813919753981839, "grad_norm": 0.12234493345022202, "learning_rate": 0.00032590112975746483, "loss": 2.9687, "step": 10020 }, { "epoch": 0.5819722069105573, "grad_norm": 0.12706685066223145, "learning_rate": 0.0003251469989062209, "loss": 2.9518, "step": 10030 }, { "epoch": 0.5825524384229307, "grad_norm": 0.12501336634159088, "learning_rate": 0.0003243931436715683, "loss": 2.957, "step": 10040 }, { "epoch": 0.5831326699353042, "grad_norm": 0.10916994512081146, "learning_rate": 0.0003236395668292813, "loss": 2.9641, "step": 10050 }, { "epoch": 0.5837129014476776, "grad_norm": 0.11000606417655945, "learning_rate": 0.0003228862711541083, "loss": 2.957, "step": 10060 }, { "epoch": 0.584293132960051, "grad_norm": 0.13003936409950256, "learning_rate": 0.0003221332594197632, "loss": 2.9594, "step": 10070 }, { "epoch": 0.5848733644724246, "grad_norm": 0.1223490834236145, "learning_rate": 0.00032138053439891413, "loss": 2.9581, "step": 10080 }, { "epoch": 0.585453595984798, "grad_norm": 0.13121525943279266, "learning_rate": 0.0003206280988631732, "loss": 2.9625, "step": 10090 }, { "epoch": 0.5860338274971714, "grad_norm": 0.11926265060901642, "learning_rate": 0.0003198759555830871, "loss": 2.9614, "step": 10100 }, { "epoch": 0.5866140590095448, "grad_norm": 0.1077842116355896, "learning_rate": 0.0003191241073281261, "loss": 2.9509, "step": 10110 }, { "epoch": 0.5871942905219183, "grad_norm": 0.12171042710542679, "learning_rate": 0.0003183725568666742, "loss": 2.9597, "step": 10120 }, { "epoch": 0.5877745220342917, "grad_norm": 0.11349521577358246, "learning_rate": 0.00031762130696601915, "loss": 2.9612, "step": 10130 }, { "epoch": 0.5883547535466651, "grad_norm": 0.11763352900743484, "learning_rate": 0.0003168703603923415, "loss": 2.9576, "step": 10140 }, { "epoch": 0.5889349850590385, "grad_norm": 0.1241789311170578, "learning_rate": 0.0003161197199107054, "loss": 2.954, "step": 10150 }, { "epoch": 0.589515216571412, "grad_norm": 0.12139084935188293, "learning_rate": 0.00031536938828504753, "loss": 2.9625, "step": 10160 }, { "epoch": 0.5900954480837854, "grad_norm": 0.1117582768201828, "learning_rate": 0.0003146193682781678, "loss": 2.9567, "step": 10170 }, { "epoch": 0.5906756795961589, "grad_norm": 0.12021225690841675, "learning_rate": 0.00031386966265171815, "loss": 2.956, "step": 10180 }, { "epoch": 0.5912559111085323, "grad_norm": 0.11641380935907364, "learning_rate": 0.00031312027416619345, "loss": 2.9596, "step": 10190 }, { "epoch": 0.5918361426209058, "grad_norm": 0.12370184808969498, "learning_rate": 0.0003123712055809206, "loss": 2.9614, "step": 10200 }, { "epoch": 0.5924163741332792, "grad_norm": 0.11585281789302826, "learning_rate": 0.0003116224596540485, "loss": 2.96, "step": 10210 }, { "epoch": 0.5929966056456526, "grad_norm": 0.11630109697580338, "learning_rate": 0.0003108740391425383, "loss": 2.9608, "step": 10220 }, { "epoch": 0.593576837158026, "grad_norm": 0.12009628862142563, "learning_rate": 0.0003101259468021524, "loss": 2.9523, "step": 10230 }, { "epoch": 0.5941570686703995, "grad_norm": 0.11749378591775894, "learning_rate": 0.0003093781853874453, "loss": 2.9538, "step": 10240 }, { "epoch": 0.5947373001827729, "grad_norm": 0.12220752239227295, "learning_rate": 0.00030863075765175297, "loss": 2.9562, "step": 10250 }, { "epoch": 0.5953175316951463, "grad_norm": 0.11156833171844482, "learning_rate": 0.0003078836663471825, "loss": 2.9596, "step": 10260 }, { "epoch": 0.5958977632075197, "grad_norm": 0.10898885875940323, "learning_rate": 0.00030713691422460255, "loss": 2.951, "step": 10270 }, { "epoch": 0.5964779947198933, "grad_norm": 0.11735124886035919, "learning_rate": 0.00030639050403363235, "loss": 2.9542, "step": 10280 }, { "epoch": 0.5970582262322667, "grad_norm": 0.11669401824474335, "learning_rate": 0.00030564443852263264, "loss": 2.9468, "step": 10290 }, { "epoch": 0.5976384577446401, "grad_norm": 0.11977334320545197, "learning_rate": 0.00030489872043869466, "loss": 2.9537, "step": 10300 }, { "epoch": 0.5982186892570136, "grad_norm": 0.12281852215528488, "learning_rate": 0.00030415335252763055, "loss": 2.9533, "step": 10310 }, { "epoch": 0.598798920769387, "grad_norm": 0.11608687788248062, "learning_rate": 0.0003034083375339632, "loss": 2.9467, "step": 10320 }, { "epoch": 0.5993791522817604, "grad_norm": 0.11336616426706314, "learning_rate": 0.0003026636782009156, "loss": 2.9555, "step": 10330 }, { "epoch": 0.5999593837941338, "grad_norm": 0.11283810436725616, "learning_rate": 0.00030191937727040176, "loss": 2.9604, "step": 10340 }, { "epoch": 0.6005396153065073, "grad_norm": 0.12219896167516708, "learning_rate": 0.00030117543748301524, "loss": 2.9568, "step": 10350 }, { "epoch": 0.6011198468188808, "grad_norm": 0.12714378535747528, "learning_rate": 0.0003004318615780205, "loss": 2.9492, "step": 10360 }, { "epoch": 0.6017000783312542, "grad_norm": 0.1103973239660263, "learning_rate": 0.000299688652293342, "loss": 2.9587, "step": 10370 }, { "epoch": 0.6022803098436276, "grad_norm": 0.11297036707401276, "learning_rate": 0.000298945812365554, "loss": 2.9609, "step": 10380 }, { "epoch": 0.6028605413560011, "grad_norm": 0.11668708920478821, "learning_rate": 0.00029820334452987124, "loss": 2.9461, "step": 10390 }, { "epoch": 0.6034407728683745, "grad_norm": 0.1264335960149765, "learning_rate": 0.0002974612515201376, "loss": 2.9521, "step": 10400 }, { "epoch": 0.6040210043807479, "grad_norm": 0.11221937835216522, "learning_rate": 0.00029671953606881773, "loss": 2.9453, "step": 10410 }, { "epoch": 0.6046012358931213, "grad_norm": 0.11206606775522232, "learning_rate": 0.0002959782009069853, "loss": 2.9488, "step": 10420 }, { "epoch": 0.6051814674054948, "grad_norm": 0.12074000388383865, "learning_rate": 0.00029523724876431417, "loss": 2.9458, "step": 10430 }, { "epoch": 0.6057616989178682, "grad_norm": 0.12387416511774063, "learning_rate": 0.0002944966823690678, "loss": 2.9576, "step": 10440 }, { "epoch": 0.6063419304302416, "grad_norm": 0.13284623622894287, "learning_rate": 0.0002937565044480892, "loss": 2.9562, "step": 10450 }, { "epoch": 0.6069221619426151, "grad_norm": 0.11335093528032303, "learning_rate": 0.000293016717726791, "loss": 2.9471, "step": 10460 }, { "epoch": 0.6075023934549886, "grad_norm": 0.10745152086019516, "learning_rate": 0.0002922773249291453, "loss": 2.9547, "step": 10470 }, { "epoch": 0.608082624967362, "grad_norm": 0.11629511415958405, "learning_rate": 0.00029153832877767396, "loss": 2.9476, "step": 10480 }, { "epoch": 0.6086628564797354, "grad_norm": 0.11118092387914658, "learning_rate": 0.0002907997319934384, "loss": 2.9592, "step": 10490 }, { "epoch": 0.6092430879921088, "grad_norm": 0.10906965285539627, "learning_rate": 0.00029006153729602894, "loss": 2.9491, "step": 10500 }, { "epoch": 0.6098233195044823, "grad_norm": 0.11346712708473206, "learning_rate": 0.0002893237474035561, "loss": 2.9474, "step": 10510 }, { "epoch": 0.6104035510168557, "grad_norm": 0.11595900356769562, "learning_rate": 0.00028858636503263944, "loss": 2.9396, "step": 10520 }, { "epoch": 0.6109837825292291, "grad_norm": 0.10991494357585907, "learning_rate": 0.0002878493928983982, "loss": 2.9542, "step": 10530 }, { "epoch": 0.6115640140416027, "grad_norm": 0.12836730480194092, "learning_rate": 0.0002871128337144408, "loss": 2.9466, "step": 10540 }, { "epoch": 0.6121442455539761, "grad_norm": 0.12034939229488373, "learning_rate": 0.0002863766901928553, "loss": 2.9545, "step": 10550 }, { "epoch": 0.6127244770663495, "grad_norm": 0.11163250356912613, "learning_rate": 0.0002856409650441996, "loss": 2.9475, "step": 10560 }, { "epoch": 0.6133047085787229, "grad_norm": 0.13410978019237518, "learning_rate": 0.00028490566097749036, "loss": 2.9473, "step": 10570 }, { "epoch": 0.6138849400910964, "grad_norm": 0.12076187878847122, "learning_rate": 0.0002841707807001946, "loss": 2.9488, "step": 10580 }, { "epoch": 0.6144651716034698, "grad_norm": 0.12877877056598663, "learning_rate": 0.00028343632691821806, "loss": 2.9458, "step": 10590 }, { "epoch": 0.6150454031158432, "grad_norm": 0.12139870971441269, "learning_rate": 0.0002827023023358967, "loss": 2.9481, "step": 10600 }, { "epoch": 0.6156256346282166, "grad_norm": 0.11746102571487427, "learning_rate": 0.00028196870965598606, "loss": 2.949, "step": 10610 }, { "epoch": 0.6162058661405901, "grad_norm": 0.11976625770330429, "learning_rate": 0.00028123555157965095, "loss": 2.9453, "step": 10620 }, { "epoch": 0.6167860976529635, "grad_norm": 0.10492799431085587, "learning_rate": 0.0002805028308064564, "loss": 2.9439, "step": 10630 }, { "epoch": 0.617366329165337, "grad_norm": 0.11106211692094803, "learning_rate": 0.00027977055003435697, "loss": 2.9489, "step": 10640 }, { "epoch": 0.6179465606777104, "grad_norm": 0.11078832298517227, "learning_rate": 0.0002790387119596874, "loss": 2.963, "step": 10650 }, { "epoch": 0.6185267921900839, "grad_norm": 0.1114284098148346, "learning_rate": 0.0002783073192771518, "loss": 2.9423, "step": 10660 }, { "epoch": 0.6191070237024573, "grad_norm": 0.11566140502691269, "learning_rate": 0.00027757637467981476, "loss": 2.9556, "step": 10670 }, { "epoch": 0.6196872552148307, "grad_norm": 0.11549056321382523, "learning_rate": 0.0002768458808590911, "loss": 2.9462, "step": 10680 }, { "epoch": 0.6202674867272041, "grad_norm": 0.12600113451480865, "learning_rate": 0.0002761158405047352, "loss": 2.9554, "step": 10690 }, { "epoch": 0.6208477182395776, "grad_norm": 0.11889569461345673, "learning_rate": 0.0002753862563048325, "loss": 2.9556, "step": 10700 }, { "epoch": 0.621427949751951, "grad_norm": 0.11570101231336594, "learning_rate": 0.0002746571309457882, "loss": 2.9382, "step": 10710 }, { "epoch": 0.6220081812643244, "grad_norm": 0.1091545969247818, "learning_rate": 0.0002739284671123183, "loss": 2.9373, "step": 10720 }, { "epoch": 0.6225884127766979, "grad_norm": 0.11900783330202103, "learning_rate": 0.00027320026748743945, "loss": 2.9468, "step": 10730 }, { "epoch": 0.6231686442890714, "grad_norm": 0.11762910336256027, "learning_rate": 0.00027247253475245884, "loss": 2.9531, "step": 10740 }, { "epoch": 0.6237488758014448, "grad_norm": 0.1298149675130844, "learning_rate": 0.00027174527158696477, "loss": 2.9437, "step": 10750 }, { "epoch": 0.6243291073138182, "grad_norm": 0.13842837512493134, "learning_rate": 0.00027101848066881626, "loss": 2.9511, "step": 10760 }, { "epoch": 0.6249093388261916, "grad_norm": 0.11322519183158875, "learning_rate": 0.0002702921646741338, "loss": 2.955, "step": 10770 }, { "epoch": 0.6254895703385651, "grad_norm": 0.11447368562221527, "learning_rate": 0.00026956632627728866, "loss": 2.9488, "step": 10780 }, { "epoch": 0.6260698018509385, "grad_norm": 0.1215549036860466, "learning_rate": 0.0002688409681508941, "loss": 2.9416, "step": 10790 }, { "epoch": 0.6266500333633119, "grad_norm": 0.11927559226751328, "learning_rate": 0.0002681160929657948, "loss": 2.9475, "step": 10800 }, { "epoch": 0.6272302648756855, "grad_norm": 0.11237683892250061, "learning_rate": 0.0002673917033910568, "loss": 2.9416, "step": 10810 }, { "epoch": 0.6278104963880589, "grad_norm": 0.12506188452243805, "learning_rate": 0.00026666780209395874, "loss": 2.9411, "step": 10820 }, { "epoch": 0.6283907279004323, "grad_norm": 0.12645015120506287, "learning_rate": 0.0002659443917399808, "loss": 2.9443, "step": 10830 }, { "epoch": 0.6289709594128057, "grad_norm": 0.12712042033672333, "learning_rate": 0.0002652214749927957, "loss": 2.9504, "step": 10840 }, { "epoch": 0.6295511909251792, "grad_norm": 0.12220533192157745, "learning_rate": 0.00026449905451425895, "loss": 2.9363, "step": 10850 }, { "epoch": 0.6301314224375526, "grad_norm": 0.11817632615566254, "learning_rate": 0.000263777132964398, "loss": 2.9379, "step": 10860 }, { "epoch": 0.630711653949926, "grad_norm": 0.11719761788845062, "learning_rate": 0.00026305571300140395, "loss": 2.9452, "step": 10870 }, { "epoch": 0.6312918854622994, "grad_norm": 0.11095809191465378, "learning_rate": 0.00026233479728162036, "loss": 2.9387, "step": 10880 }, { "epoch": 0.6318721169746729, "grad_norm": 0.11345186084508896, "learning_rate": 0.00026161438845953463, "loss": 2.9372, "step": 10890 }, { "epoch": 0.6324523484870463, "grad_norm": 0.1197100281715393, "learning_rate": 0.0002608944891877675, "loss": 2.9366, "step": 10900 }, { "epoch": 0.6330325799994198, "grad_norm": 0.1329440474510193, "learning_rate": 0.00026017510211706347, "loss": 2.9455, "step": 10910 }, { "epoch": 0.6336128115117932, "grad_norm": 0.11800654977560043, "learning_rate": 0.00025945622989628124, "loss": 2.9461, "step": 10920 }, { "epoch": 0.6341930430241667, "grad_norm": 0.12710924446582794, "learning_rate": 0.00025873787517238344, "loss": 2.9511, "step": 10930 }, { "epoch": 0.6347732745365401, "grad_norm": 0.11945926398038864, "learning_rate": 0.00025802004059042753, "loss": 2.9388, "step": 10940 }, { "epoch": 0.6353535060489135, "grad_norm": 0.12105191498994827, "learning_rate": 0.00025730272879355545, "loss": 2.9367, "step": 10950 }, { "epoch": 0.6359337375612869, "grad_norm": 0.1130596250295639, "learning_rate": 0.0002565859424229846, "loss": 2.9409, "step": 10960 }, { "epoch": 0.6365139690736604, "grad_norm": 0.11161385476589203, "learning_rate": 0.0002558696841179975, "loss": 2.9438, "step": 10970 }, { "epoch": 0.6370942005860338, "grad_norm": 0.10884016752243042, "learning_rate": 0.00025515395651593204, "loss": 2.945, "step": 10980 }, { "epoch": 0.6376744320984072, "grad_norm": 0.11017812043428421, "learning_rate": 0.00025443876225217247, "loss": 2.9426, "step": 10990 }, { "epoch": 0.6382546636107806, "grad_norm": 0.11533761769533157, "learning_rate": 0.00025372410396013877, "loss": 2.9408, "step": 11000 }, { "epoch": 0.6382546636107806, "eval_loss": 2.8931117057800293, "eval_runtime": 3.5003, "eval_samples_per_second": 1237.04, "eval_steps_per_second": 4.857, "step": 11000 }, { "epoch": 0.6388348951231542, "grad_norm": 0.11217015981674194, "learning_rate": 0.0002530099842712779, "loss": 2.952, "step": 11010 }, { "epoch": 0.6394151266355276, "grad_norm": 0.12614357471466064, "learning_rate": 0.00025229640581505306, "loss": 2.9377, "step": 11020 }, { "epoch": 0.639995358147901, "grad_norm": 0.11071863025426865, "learning_rate": 0.000251583371218935, "loss": 2.9358, "step": 11030 }, { "epoch": 0.6405755896602745, "grad_norm": 0.1202605664730072, "learning_rate": 0.00025087088310839196, "loss": 2.9343, "step": 11040 }, { "epoch": 0.6411558211726479, "grad_norm": 0.11185882985591888, "learning_rate": 0.0002501589441068796, "loss": 2.9425, "step": 11050 }, { "epoch": 0.6417360526850213, "grad_norm": 0.11718738824129105, "learning_rate": 0.0002494475568358321, "loss": 2.9373, "step": 11060 }, { "epoch": 0.6423162841973947, "grad_norm": 0.12277991324663162, "learning_rate": 0.0002487367239146516, "loss": 2.9263, "step": 11070 }, { "epoch": 0.6428965157097682, "grad_norm": 0.12239264696836472, "learning_rate": 0.0002480264479606996, "loss": 2.9378, "step": 11080 }, { "epoch": 0.6434767472221417, "grad_norm": 0.10394129157066345, "learning_rate": 0.00024731673158928663, "loss": 2.9394, "step": 11090 }, { "epoch": 0.6440569787345151, "grad_norm": 0.11357410252094269, "learning_rate": 0.0002466075774136625, "loss": 2.94, "step": 11100 }, { "epoch": 0.6446372102468885, "grad_norm": 0.1290815770626068, "learning_rate": 0.00024589898804500756, "loss": 2.9291, "step": 11110 }, { "epoch": 0.645217441759262, "grad_norm": 0.11551100015640259, "learning_rate": 0.00024519096609242156, "loss": 2.9394, "step": 11120 }, { "epoch": 0.6457976732716354, "grad_norm": 0.11799687147140503, "learning_rate": 0.00024448351416291595, "loss": 2.9327, "step": 11130 }, { "epoch": 0.6463779047840088, "grad_norm": 0.1109696552157402, "learning_rate": 0.0002437766348614025, "loss": 2.9422, "step": 11140 }, { "epoch": 0.6469581362963822, "grad_norm": 0.11457831412553787, "learning_rate": 0.00024307033079068506, "loss": 2.9357, "step": 11150 }, { "epoch": 0.6475383678087557, "grad_norm": 0.12238124758005142, "learning_rate": 0.00024236460455144933, "loss": 2.9327, "step": 11160 }, { "epoch": 0.6481185993211291, "grad_norm": 0.1273690164089203, "learning_rate": 0.00024165945874225292, "loss": 2.9306, "step": 11170 }, { "epoch": 0.6486988308335025, "grad_norm": 0.11272764950990677, "learning_rate": 0.00024095489595951696, "loss": 2.9384, "step": 11180 }, { "epoch": 0.649279062345876, "grad_norm": 0.12116546183824539, "learning_rate": 0.00024025091879751494, "loss": 2.9405, "step": 11190 }, { "epoch": 0.6498592938582495, "grad_norm": 0.11677329242229462, "learning_rate": 0.0002395475298483649, "loss": 2.9327, "step": 11200 }, { "epoch": 0.6504395253706229, "grad_norm": 0.11187145859003067, "learning_rate": 0.00023884473170201852, "loss": 2.9311, "step": 11210 }, { "epoch": 0.6510197568829963, "grad_norm": 0.11290306597948074, "learning_rate": 0.0002381425269462521, "loss": 2.944, "step": 11220 }, { "epoch": 0.6515999883953697, "grad_norm": 0.1148315817117691, "learning_rate": 0.0002374409181666574, "loss": 2.9226, "step": 11230 }, { "epoch": 0.6521802199077432, "grad_norm": 0.11893568933010101, "learning_rate": 0.00023673990794663107, "loss": 2.9371, "step": 11240 }, { "epoch": 0.6527604514201166, "grad_norm": 0.10734174400568008, "learning_rate": 0.00023603949886736642, "loss": 2.9362, "step": 11250 }, { "epoch": 0.65334068293249, "grad_norm": 0.1184125542640686, "learning_rate": 0.00023533969350784294, "loss": 2.9409, "step": 11260 }, { "epoch": 0.6539209144448636, "grad_norm": 0.11796090751886368, "learning_rate": 0.00023464049444481738, "loss": 2.9229, "step": 11270 }, { "epoch": 0.654501145957237, "grad_norm": 0.10981497913599014, "learning_rate": 0.00023394190425281397, "loss": 2.9329, "step": 11280 }, { "epoch": 0.6550813774696104, "grad_norm": 0.11429855972528458, "learning_rate": 0.00023324392550411473, "loss": 2.9352, "step": 11290 }, { "epoch": 0.6556616089819838, "grad_norm": 0.11114507168531418, "learning_rate": 0.00023254656076875075, "loss": 2.9365, "step": 11300 }, { "epoch": 0.6562418404943573, "grad_norm": 0.13142482936382294, "learning_rate": 0.00023184981261449204, "loss": 2.9284, "step": 11310 }, { "epoch": 0.6568220720067307, "grad_norm": 0.115805484354496, "learning_rate": 0.00023115368360683802, "loss": 2.9387, "step": 11320 }, { "epoch": 0.6574023035191041, "grad_norm": 0.1117892935872078, "learning_rate": 0.00023045817630900896, "loss": 2.937, "step": 11330 }, { "epoch": 0.6579825350314775, "grad_norm": 0.11008153110742569, "learning_rate": 0.00022976329328193554, "loss": 2.9403, "step": 11340 }, { "epoch": 0.658562766543851, "grad_norm": 0.10744712501764297, "learning_rate": 0.00022906903708424977, "loss": 2.9389, "step": 11350 }, { "epoch": 0.6591429980562244, "grad_norm": 0.10864759236574173, "learning_rate": 0.00022837541027227566, "loss": 2.9355, "step": 11360 }, { "epoch": 0.6597232295685979, "grad_norm": 0.12328560650348663, "learning_rate": 0.00022768241540001993, "loss": 2.9335, "step": 11370 }, { "epoch": 0.6603034610809713, "grad_norm": 0.11607347428798676, "learning_rate": 0.00022699005501916202, "loss": 2.9299, "step": 11380 }, { "epoch": 0.6608836925933448, "grad_norm": 0.10942059010267258, "learning_rate": 0.00022629833167904574, "loss": 2.9422, "step": 11390 }, { "epoch": 0.6614639241057182, "grad_norm": 0.11262045800685883, "learning_rate": 0.00022560724792666865, "loss": 2.9296, "step": 11400 }, { "epoch": 0.6620441556180916, "grad_norm": 0.11177600920200348, "learning_rate": 0.0002249168063066737, "loss": 2.9375, "step": 11410 }, { "epoch": 0.662624387130465, "grad_norm": 0.1066143810749054, "learning_rate": 0.0002242270093613391, "loss": 2.9342, "step": 11420 }, { "epoch": 0.6632046186428385, "grad_norm": 0.11729339510202408, "learning_rate": 0.00022353785963056955, "loss": 2.9367, "step": 11430 }, { "epoch": 0.6637848501552119, "grad_norm": 0.12385804951190948, "learning_rate": 0.00022284935965188632, "loss": 2.9486, "step": 11440 }, { "epoch": 0.6643650816675853, "grad_norm": 0.1270238608121872, "learning_rate": 0.00022216151196041887, "loss": 2.928, "step": 11450 }, { "epoch": 0.6649453131799588, "grad_norm": 0.11537665873765945, "learning_rate": 0.00022147431908889424, "loss": 2.9415, "step": 11460 }, { "epoch": 0.6655255446923323, "grad_norm": 0.1123494878411293, "learning_rate": 0.00022078778356762855, "loss": 2.9301, "step": 11470 }, { "epoch": 0.6661057762047057, "grad_norm": 0.11509925872087479, "learning_rate": 0.0002201019079245176, "loss": 2.9285, "step": 11480 }, { "epoch": 0.6666860077170791, "grad_norm": 0.10929159075021744, "learning_rate": 0.00021941669468502725, "loss": 2.941, "step": 11490 }, { "epoch": 0.6672662392294526, "grad_norm": 0.1147988960146904, "learning_rate": 0.00021873214637218436, "loss": 2.9271, "step": 11500 }, { "epoch": 0.667846470741826, "grad_norm": 0.121480293571949, "learning_rate": 0.00021804826550656778, "loss": 2.9271, "step": 11510 }, { "epoch": 0.6684267022541994, "grad_norm": 0.10935940593481064, "learning_rate": 0.00021736505460629837, "loss": 2.9299, "step": 11520 }, { "epoch": 0.6690069337665728, "grad_norm": 0.12035897374153137, "learning_rate": 0.00021668251618703024, "loss": 2.9403, "step": 11530 }, { "epoch": 0.6695871652789464, "grad_norm": 0.12228564918041229, "learning_rate": 0.00021600065276194133, "loss": 2.9285, "step": 11540 }, { "epoch": 0.6701673967913198, "grad_norm": 0.1131313145160675, "learning_rate": 0.00021531946684172408, "loss": 2.9247, "step": 11550 }, { "epoch": 0.6707476283036932, "grad_norm": 0.1156296506524086, "learning_rate": 0.0002146389609345767, "loss": 2.9311, "step": 11560 }, { "epoch": 0.6713278598160666, "grad_norm": 0.10842841863632202, "learning_rate": 0.00021395913754619303, "loss": 2.9269, "step": 11570 }, { "epoch": 0.6719080913284401, "grad_norm": 0.12424537539482117, "learning_rate": 0.00021327999917975405, "loss": 2.9285, "step": 11580 }, { "epoch": 0.6724883228408135, "grad_norm": 0.10670441389083862, "learning_rate": 0.00021260154833591837, "loss": 2.9269, "step": 11590 }, { "epoch": 0.6730685543531869, "grad_norm": 0.10753663629293442, "learning_rate": 0.00021192378751281292, "loss": 2.9268, "step": 11600 }, { "epoch": 0.6736487858655603, "grad_norm": 0.10578759759664536, "learning_rate": 0.0002112467192060245, "loss": 2.9129, "step": 11610 }, { "epoch": 0.6742290173779338, "grad_norm": 0.12178914248943329, "learning_rate": 0.0002105703459085889, "loss": 2.927, "step": 11620 }, { "epoch": 0.6748092488903072, "grad_norm": 0.11225739121437073, "learning_rate": 0.00020989467011098395, "loss": 2.9339, "step": 11630 }, { "epoch": 0.6753894804026807, "grad_norm": 0.111176498234272, "learning_rate": 0.0002092196943011186, "loss": 2.9313, "step": 11640 }, { "epoch": 0.6759697119150541, "grad_norm": 0.10836298763751984, "learning_rate": 0.00020854542096432427, "loss": 2.9346, "step": 11650 }, { "epoch": 0.6765499434274276, "grad_norm": 0.1073436364531517, "learning_rate": 0.00020787185258334652, "loss": 2.9355, "step": 11660 }, { "epoch": 0.677130174939801, "grad_norm": 0.12154415994882584, "learning_rate": 0.00020719899163833402, "loss": 2.9277, "step": 11670 }, { "epoch": 0.6777104064521744, "grad_norm": 0.12424246221780777, "learning_rate": 0.00020652684060683182, "loss": 2.926, "step": 11680 }, { "epoch": 0.6782906379645478, "grad_norm": 0.11322687566280365, "learning_rate": 0.00020585540196377023, "loss": 2.924, "step": 11690 }, { "epoch": 0.6788708694769213, "grad_norm": 0.11220389604568481, "learning_rate": 0.00020518467818145646, "loss": 2.9291, "step": 11700 }, { "epoch": 0.6794511009892947, "grad_norm": 0.11465161293745041, "learning_rate": 0.0002045146717295663, "loss": 2.9291, "step": 11710 }, { "epoch": 0.6800313325016681, "grad_norm": 0.11220245808362961, "learning_rate": 0.00020384538507513296, "loss": 2.9266, "step": 11720 }, { "epoch": 0.6806115640140415, "grad_norm": 0.11767267435789108, "learning_rate": 0.0002031768206825407, "loss": 2.9274, "step": 11730 }, { "epoch": 0.6811917955264151, "grad_norm": 0.10624901950359344, "learning_rate": 0.0002025089810135131, "loss": 2.9278, "step": 11740 }, { "epoch": 0.6817720270387885, "grad_norm": 0.1104850023984909, "learning_rate": 0.00020184186852710613, "loss": 2.9279, "step": 11750 }, { "epoch": 0.6823522585511619, "grad_norm": 0.1173277422785759, "learning_rate": 0.00020117548567969774, "loss": 2.9292, "step": 11760 }, { "epoch": 0.6829324900635354, "grad_norm": 0.10938852280378342, "learning_rate": 0.00020050983492497925, "loss": 2.928, "step": 11770 }, { "epoch": 0.6835127215759088, "grad_norm": 0.10957374423742294, "learning_rate": 0.00019984491871394692, "loss": 2.9124, "step": 11780 }, { "epoch": 0.6840929530882822, "grad_norm": 0.10347133874893188, "learning_rate": 0.00019918073949489124, "loss": 2.9223, "step": 11790 }, { "epoch": 0.6846731846006556, "grad_norm": 0.10665714740753174, "learning_rate": 0.00019851729971339017, "loss": 2.9193, "step": 11800 }, { "epoch": 0.6852534161130291, "grad_norm": 0.11447092890739441, "learning_rate": 0.00019785460181229834, "loss": 2.9384, "step": 11810 }, { "epoch": 0.6858336476254026, "grad_norm": 0.11855462938547134, "learning_rate": 0.0001971926482317387, "loss": 2.9181, "step": 11820 }, { "epoch": 0.686413879137776, "grad_norm": 0.11115296930074692, "learning_rate": 0.00019653144140909404, "loss": 2.9256, "step": 11830 }, { "epoch": 0.6869941106501494, "grad_norm": 0.11566124111413956, "learning_rate": 0.0001958709837789967, "loss": 2.9249, "step": 11840 }, { "epoch": 0.6875743421625229, "grad_norm": 0.11026368290185928, "learning_rate": 0.00019521127777332117, "loss": 2.9162, "step": 11850 }, { "epoch": 0.6881545736748963, "grad_norm": 0.11913493275642395, "learning_rate": 0.000194552325821174, "loss": 2.9199, "step": 11860 }, { "epoch": 0.6887348051872697, "grad_norm": 0.10717441141605377, "learning_rate": 0.00019389413034888513, "loss": 2.9175, "step": 11870 }, { "epoch": 0.6893150366996431, "grad_norm": 0.11114499717950821, "learning_rate": 0.00019323669377999974, "loss": 2.9212, "step": 11880 }, { "epoch": 0.6898952682120166, "grad_norm": 0.11325642466545105, "learning_rate": 0.00019258001853526743, "loss": 2.9176, "step": 11890 }, { "epoch": 0.69047549972439, "grad_norm": 0.11239134520292282, "learning_rate": 0.00019192410703263582, "loss": 2.9298, "step": 11900 }, { "epoch": 0.6910557312367634, "grad_norm": 0.10796649754047394, "learning_rate": 0.00019126896168723958, "loss": 2.9196, "step": 11910 }, { "epoch": 0.6916359627491369, "grad_norm": 0.11409149318933487, "learning_rate": 0.00019061458491139227, "loss": 2.9192, "step": 11920 }, { "epoch": 0.6922161942615104, "grad_norm": 0.1147644892334938, "learning_rate": 0.00018996097911457835, "loss": 2.9262, "step": 11930 }, { "epoch": 0.6927964257738838, "grad_norm": 0.11096258461475372, "learning_rate": 0.00018930814670344205, "loss": 2.9321, "step": 11940 }, { "epoch": 0.6933766572862572, "grad_norm": 0.1178256943821907, "learning_rate": 0.0001886560900817813, "loss": 2.9219, "step": 11950 }, { "epoch": 0.6939568887986306, "grad_norm": 0.11152894794940948, "learning_rate": 0.00018800481165053668, "loss": 2.926, "step": 11960 }, { "epoch": 0.6945371203110041, "grad_norm": 0.12084365636110306, "learning_rate": 0.00018735431380778356, "loss": 2.9213, "step": 11970 }, { "epoch": 0.6951173518233775, "grad_norm": 0.11856279522180557, "learning_rate": 0.00018670459894872313, "loss": 2.9278, "step": 11980 }, { "epoch": 0.6956975833357509, "grad_norm": 0.11058712005615234, "learning_rate": 0.0001860556694656735, "loss": 2.9215, "step": 11990 }, { "epoch": 0.6962778148481245, "grad_norm": 0.11684763431549072, "learning_rate": 0.00018540752774806146, "loss": 2.926, "step": 12000 }, { "epoch": 0.6962778148481245, "eval_loss": 2.8761942386627197, "eval_runtime": 3.5013, "eval_samples_per_second": 1236.666, "eval_steps_per_second": 4.855, "step": 12000 }, { "epoch": 0.6968580463604979, "grad_norm": 0.11740781366825104, "learning_rate": 0.00018476017618241204, "loss": 2.9293, "step": 12010 }, { "epoch": 0.6974382778728713, "grad_norm": 0.10910940170288086, "learning_rate": 0.00018411361715234197, "loss": 2.9304, "step": 12020 }, { "epoch": 0.6980185093852447, "grad_norm": 0.10588248074054718, "learning_rate": 0.00018346785303854923, "loss": 2.9249, "step": 12030 }, { "epoch": 0.6985987408976182, "grad_norm": 0.11413895338773727, "learning_rate": 0.00018282288621880474, "loss": 2.9202, "step": 12040 }, { "epoch": 0.6991789724099916, "grad_norm": 0.10688214004039764, "learning_rate": 0.0001821787190679444, "loss": 2.9241, "step": 12050 }, { "epoch": 0.699759203922365, "grad_norm": 0.11426564306020737, "learning_rate": 0.00018153535395785847, "loss": 2.9205, "step": 12060 }, { "epoch": 0.7003394354347384, "grad_norm": 0.12192817777395248, "learning_rate": 0.0001808927932574852, "loss": 2.915, "step": 12070 }, { "epoch": 0.7009196669471119, "grad_norm": 0.10396554321050644, "learning_rate": 0.0001802510393328002, "loss": 2.9338, "step": 12080 }, { "epoch": 0.7014998984594853, "grad_norm": 0.12143275886774063, "learning_rate": 0.00017961009454680867, "loss": 2.924, "step": 12090 }, { "epoch": 0.7020801299718588, "grad_norm": 0.12236862629652023, "learning_rate": 0.0001789699612595365, "loss": 2.9232, "step": 12100 }, { "epoch": 0.7026603614842322, "grad_norm": 0.10729341953992844, "learning_rate": 0.0001783306418280213, "loss": 2.9292, "step": 12110 }, { "epoch": 0.7032405929966057, "grad_norm": 0.11987815797328949, "learning_rate": 0.00017769213860630446, "loss": 2.9176, "step": 12120 }, { "epoch": 0.7038208245089791, "grad_norm": 0.10967355221509933, "learning_rate": 0.00017705445394542167, "loss": 2.9177, "step": 12130 }, { "epoch": 0.7044010560213525, "grad_norm": 0.10932415723800659, "learning_rate": 0.00017641759019339462, "loss": 2.928, "step": 12140 }, { "epoch": 0.7049812875337259, "grad_norm": 0.10741781443357468, "learning_rate": 0.00017578154969522233, "loss": 2.924, "step": 12150 }, { "epoch": 0.7055615190460994, "grad_norm": 0.10531704872846603, "learning_rate": 0.00017514633479287243, "loss": 2.9202, "step": 12160 }, { "epoch": 0.7061417505584728, "grad_norm": 0.10841057449579239, "learning_rate": 0.00017451194782527302, "loss": 2.9224, "step": 12170 }, { "epoch": 0.7067219820708462, "grad_norm": 0.107717365026474, "learning_rate": 0.0001738783911283032, "loss": 2.9194, "step": 12180 }, { "epoch": 0.7073022135832197, "grad_norm": 0.10722067207098007, "learning_rate": 0.00017324566703478514, "loss": 2.9306, "step": 12190 }, { "epoch": 0.7078824450955932, "grad_norm": 0.10724281519651413, "learning_rate": 0.00017261377787447524, "loss": 2.9146, "step": 12200 }, { "epoch": 0.7084626766079666, "grad_norm": 0.10672838985919952, "learning_rate": 0.00017198272597405552, "loss": 2.9121, "step": 12210 }, { "epoch": 0.70904290812034, "grad_norm": 0.11436674743890762, "learning_rate": 0.0001713525136571251, "loss": 2.9186, "step": 12220 }, { "epoch": 0.7096231396327135, "grad_norm": 0.10830719023942947, "learning_rate": 0.00017072314324419202, "loss": 2.9162, "step": 12230 }, { "epoch": 0.7102033711450869, "grad_norm": 0.11299096047878265, "learning_rate": 0.00017009461705266393, "loss": 2.9256, "step": 12240 }, { "epoch": 0.7107836026574603, "grad_norm": 0.10853977501392365, "learning_rate": 0.00016946693739684006, "loss": 2.9151, "step": 12250 }, { "epoch": 0.7113638341698337, "grad_norm": 0.11300002038478851, "learning_rate": 0.00016884010658790275, "loss": 2.9158, "step": 12260 }, { "epoch": 0.7119440656822072, "grad_norm": 0.11605297774076462, "learning_rate": 0.00016821412693390864, "loss": 2.9203, "step": 12270 }, { "epoch": 0.7125242971945807, "grad_norm": 0.11005743592977524, "learning_rate": 0.00016758900073978028, "loss": 2.9211, "step": 12280 }, { "epoch": 0.7131045287069541, "grad_norm": 0.11661600321531296, "learning_rate": 0.00016696473030729806, "loss": 2.9217, "step": 12290 }, { "epoch": 0.7136847602193275, "grad_norm": 0.11751366406679153, "learning_rate": 0.00016634131793509096, "loss": 2.9144, "step": 12300 }, { "epoch": 0.714264991731701, "grad_norm": 0.10463780164718628, "learning_rate": 0.0001657187659186287, "loss": 2.9256, "step": 12310 }, { "epoch": 0.7148452232440744, "grad_norm": 0.10872520506381989, "learning_rate": 0.00016509707655021293, "loss": 2.9196, "step": 12320 }, { "epoch": 0.7154254547564478, "grad_norm": 0.10005003213882446, "learning_rate": 0.00016447625211896916, "loss": 2.9161, "step": 12330 }, { "epoch": 0.7160056862688212, "grad_norm": 0.10709571093320847, "learning_rate": 0.00016385629491083768, "loss": 2.9146, "step": 12340 }, { "epoch": 0.7165859177811947, "grad_norm": 0.10586857050657272, "learning_rate": 0.0001632372072085663, "loss": 2.9133, "step": 12350 }, { "epoch": 0.7171661492935681, "grad_norm": 0.10271668434143066, "learning_rate": 0.00016261899129170056, "loss": 2.9133, "step": 12360 }, { "epoch": 0.7177463808059416, "grad_norm": 0.10729619115591049, "learning_rate": 0.00016200164943657626, "loss": 2.9221, "step": 12370 }, { "epoch": 0.718326612318315, "grad_norm": 0.11195025593042374, "learning_rate": 0.00016138518391631084, "loss": 2.9256, "step": 12380 }, { "epoch": 0.7189068438306885, "grad_norm": 0.10739976167678833, "learning_rate": 0.0001607695970007947, "loss": 2.9097, "step": 12390 }, { "epoch": 0.7194870753430619, "grad_norm": 0.11268290132284164, "learning_rate": 0.00016015489095668365, "loss": 2.9145, "step": 12400 }, { "epoch": 0.7200673068554353, "grad_norm": 0.11270696669816971, "learning_rate": 0.00015954106804738963, "loss": 2.9174, "step": 12410 }, { "epoch": 0.7206475383678087, "grad_norm": 0.11618941277265549, "learning_rate": 0.00015892813053307282, "loss": 2.9074, "step": 12420 }, { "epoch": 0.7212277698801822, "grad_norm": 0.10467299818992615, "learning_rate": 0.00015831608067063346, "loss": 2.9182, "step": 12430 }, { "epoch": 0.7218080013925556, "grad_norm": 0.11843869090080261, "learning_rate": 0.00015770492071370305, "loss": 2.9196, "step": 12440 }, { "epoch": 0.722388232904929, "grad_norm": 0.11030830442905426, "learning_rate": 0.000157094652912637, "loss": 2.9109, "step": 12450 }, { "epoch": 0.7229684644173026, "grad_norm": 0.10489300638437271, "learning_rate": 0.00015648527951450477, "loss": 2.9198, "step": 12460 }, { "epoch": 0.723548695929676, "grad_norm": 0.10262126475572586, "learning_rate": 0.00015587680276308343, "loss": 2.9144, "step": 12470 }, { "epoch": 0.7241289274420494, "grad_norm": 0.11257569491863251, "learning_rate": 0.00015526922489884795, "loss": 2.9164, "step": 12480 }, { "epoch": 0.7247091589544228, "grad_norm": 0.10799070447683334, "learning_rate": 0.00015466254815896363, "loss": 2.9126, "step": 12490 }, { "epoch": 0.7252893904667963, "grad_norm": 0.10695856809616089, "learning_rate": 0.00015405677477727813, "loss": 2.9041, "step": 12500 }, { "epoch": 0.7258696219791697, "grad_norm": 0.11197816580533981, "learning_rate": 0.00015345190698431193, "loss": 2.907, "step": 12510 }, { "epoch": 0.7264498534915431, "grad_norm": 0.10836388170719147, "learning_rate": 0.00015284794700725209, "loss": 2.9157, "step": 12520 }, { "epoch": 0.7270300850039165, "grad_norm": 0.11295382678508759, "learning_rate": 0.00015224489706994228, "loss": 2.9121, "step": 12530 }, { "epoch": 0.72761031651629, "grad_norm": 0.10819791257381439, "learning_rate": 0.0001516427593928755, "loss": 2.9204, "step": 12540 }, { "epoch": 0.7281905480286635, "grad_norm": 0.10760724544525146, "learning_rate": 0.00015104153619318616, "loss": 2.9211, "step": 12550 }, { "epoch": 0.7287707795410369, "grad_norm": 0.11190642416477203, "learning_rate": 0.00015044122968464044, "loss": 2.9207, "step": 12560 }, { "epoch": 0.7293510110534103, "grad_norm": 0.10582192242145538, "learning_rate": 0.00014984184207763037, "loss": 2.9084, "step": 12570 }, { "epoch": 0.7299312425657838, "grad_norm": 0.10466651618480682, "learning_rate": 0.00014924337557916339, "loss": 2.9186, "step": 12580 }, { "epoch": 0.7305114740781572, "grad_norm": 0.11605332791805267, "learning_rate": 0.0001486458323928562, "loss": 2.9114, "step": 12590 }, { "epoch": 0.7310917055905306, "grad_norm": 0.10586009919643402, "learning_rate": 0.00014804921471892537, "loss": 2.9148, "step": 12600 }, { "epoch": 0.731671937102904, "grad_norm": 0.10544715821743011, "learning_rate": 0.00014745352475417945, "loss": 2.9106, "step": 12610 }, { "epoch": 0.7322521686152775, "grad_norm": 0.11124227941036224, "learning_rate": 0.00014685876469201173, "loss": 2.9187, "step": 12620 }, { "epoch": 0.7328324001276509, "grad_norm": 0.10506170988082886, "learning_rate": 0.0001462649367223905, "loss": 2.9173, "step": 12630 }, { "epoch": 0.7334126316400243, "grad_norm": 0.10476493090391159, "learning_rate": 0.000145672043031853, "loss": 2.9266, "step": 12640 }, { "epoch": 0.7339928631523978, "grad_norm": 0.1084727793931961, "learning_rate": 0.00014508008580349575, "loss": 2.9161, "step": 12650 }, { "epoch": 0.7345730946647713, "grad_norm": 0.10554786771535873, "learning_rate": 0.00014448906721696716, "loss": 2.908, "step": 12660 }, { "epoch": 0.7351533261771447, "grad_norm": 0.10359372198581696, "learning_rate": 0.00014389898944845996, "loss": 2.9197, "step": 12670 }, { "epoch": 0.7357335576895181, "grad_norm": 0.10880708694458008, "learning_rate": 0.00014330985467070172, "loss": 2.9051, "step": 12680 }, { "epoch": 0.7363137892018915, "grad_norm": 0.10292264074087143, "learning_rate": 0.00014272166505294894, "loss": 2.9139, "step": 12690 }, { "epoch": 0.736894020714265, "grad_norm": 0.10571425408124924, "learning_rate": 0.0001421344227609772, "loss": 2.9098, "step": 12700 }, { "epoch": 0.7374742522266384, "grad_norm": 0.1137324869632721, "learning_rate": 0.00014154812995707396, "loss": 2.8991, "step": 12710 }, { "epoch": 0.7380544837390118, "grad_norm": 0.10648997873067856, "learning_rate": 0.0001409627888000312, "loss": 2.9093, "step": 12720 }, { "epoch": 0.7386347152513854, "grad_norm": 0.11195819079875946, "learning_rate": 0.00014037840144513584, "loss": 2.9159, "step": 12730 }, { "epoch": 0.7392149467637588, "grad_norm": 0.11138200759887695, "learning_rate": 0.00013979497004416373, "loss": 2.9053, "step": 12740 }, { "epoch": 0.7397951782761322, "grad_norm": 0.10920187830924988, "learning_rate": 0.00013921249674537028, "loss": 2.9155, "step": 12750 }, { "epoch": 0.7403754097885056, "grad_norm": 0.10279792547225952, "learning_rate": 0.00013863098369348305, "loss": 2.9127, "step": 12760 }, { "epoch": 0.7409556413008791, "grad_norm": 0.10274110734462738, "learning_rate": 0.00013805043302969448, "loss": 2.9156, "step": 12770 }, { "epoch": 0.7415358728132525, "grad_norm": 0.1055116280913353, "learning_rate": 0.00013747084689165232, "loss": 2.9104, "step": 12780 }, { "epoch": 0.7421161043256259, "grad_norm": 0.11023949831724167, "learning_rate": 0.00013689222741345382, "loss": 2.9072, "step": 12790 }, { "epoch": 0.7426963358379993, "grad_norm": 0.10738439857959747, "learning_rate": 0.00013631457672563647, "loss": 2.9073, "step": 12800 }, { "epoch": 0.7432765673503728, "grad_norm": 0.12367619574069977, "learning_rate": 0.00013573789695517054, "loss": 2.9186, "step": 12810 }, { "epoch": 0.7438567988627462, "grad_norm": 0.1205858662724495, "learning_rate": 0.0001351621902254514, "loss": 2.9151, "step": 12820 }, { "epoch": 0.7444370303751197, "grad_norm": 0.1009015142917633, "learning_rate": 0.00013458745865629142, "loss": 2.9057, "step": 12830 }, { "epoch": 0.7450172618874931, "grad_norm": 0.1065724790096283, "learning_rate": 0.00013401370436391278, "loss": 2.9132, "step": 12840 }, { "epoch": 0.7455974933998666, "grad_norm": 0.10562238842248917, "learning_rate": 0.00013344092946093841, "loss": 2.9164, "step": 12850 }, { "epoch": 0.74617772491224, "grad_norm": 0.12002112716436386, "learning_rate": 0.0001328691360563859, "loss": 2.9166, "step": 12860 }, { "epoch": 0.7467579564246134, "grad_norm": 0.11531555652618408, "learning_rate": 0.0001322983262556583, "loss": 2.9009, "step": 12870 }, { "epoch": 0.7473381879369868, "grad_norm": 0.10654497146606445, "learning_rate": 0.000131728502160537, "loss": 2.9131, "step": 12880 }, { "epoch": 0.7479184194493603, "grad_norm": 0.10612619668245316, "learning_rate": 0.0001311596658691744, "loss": 2.9218, "step": 12890 }, { "epoch": 0.7484986509617337, "grad_norm": 0.1150059774518013, "learning_rate": 0.00013059181947608475, "loss": 2.9106, "step": 12900 }, { "epoch": 0.7490788824741071, "grad_norm": 0.10354486852884293, "learning_rate": 0.0001300249650721384, "loss": 2.9108, "step": 12910 }, { "epoch": 0.7496591139864806, "grad_norm": 0.10963668674230576, "learning_rate": 0.0001294591047445525, "loss": 2.9075, "step": 12920 }, { "epoch": 0.7502393454988541, "grad_norm": 0.11041709035634995, "learning_rate": 0.00012889424057688405, "loss": 2.9077, "step": 12930 }, { "epoch": 0.7508195770112275, "grad_norm": 0.11710195988416672, "learning_rate": 0.00012833037464902205, "loss": 2.9129, "step": 12940 }, { "epoch": 0.7513998085236009, "grad_norm": 0.10554017871618271, "learning_rate": 0.00012776750903717985, "loss": 2.9091, "step": 12950 }, { "epoch": 0.7519800400359744, "grad_norm": 0.10230179131031036, "learning_rate": 0.0001272056458138878, "loss": 2.906, "step": 12960 }, { "epoch": 0.7525602715483478, "grad_norm": 0.10623400658369064, "learning_rate": 0.00012664478704798508, "loss": 2.8985, "step": 12970 }, { "epoch": 0.7531405030607212, "grad_norm": 0.11346332728862762, "learning_rate": 0.00012608493480461237, "loss": 2.9109, "step": 12980 }, { "epoch": 0.7537207345730946, "grad_norm": 0.11194764822721481, "learning_rate": 0.0001255260911452043, "loss": 2.9072, "step": 12990 }, { "epoch": 0.7543009660854681, "grad_norm": 0.10929550230503082, "learning_rate": 0.00012496825812748164, "loss": 2.9121, "step": 13000 }, { "epoch": 0.7543009660854681, "eval_loss": 2.8602888584136963, "eval_runtime": 3.5007, "eval_samples_per_second": 1236.892, "eval_steps_per_second": 4.856, "step": 13000 }, { "epoch": 0.7548811975978416, "grad_norm": 0.10286314785480499, "learning_rate": 0.00012441143780544427, "loss": 2.9068, "step": 13010 }, { "epoch": 0.755461429110215, "grad_norm": 0.10113418847322464, "learning_rate": 0.00012385563222936275, "loss": 2.9105, "step": 13020 }, { "epoch": 0.7560416606225884, "grad_norm": 0.10965818911790848, "learning_rate": 0.00012330084344577152, "loss": 2.9057, "step": 13030 }, { "epoch": 0.7566218921349619, "grad_norm": 0.11345063894987106, "learning_rate": 0.00012274707349746092, "loss": 2.9055, "step": 13040 }, { "epoch": 0.7572021236473353, "grad_norm": 0.1040196418762207, "learning_rate": 0.00012219432442346997, "loss": 2.9062, "step": 13050 }, { "epoch": 0.7577823551597087, "grad_norm": 0.10488880425691605, "learning_rate": 0.00012164259825907853, "loss": 2.903, "step": 13060 }, { "epoch": 0.7583625866720821, "grad_norm": 0.09662194550037384, "learning_rate": 0.00012109189703580037, "loss": 2.9038, "step": 13070 }, { "epoch": 0.7589428181844556, "grad_norm": 0.10708185285329819, "learning_rate": 0.00012054222278137501, "loss": 2.9068, "step": 13080 }, { "epoch": 0.759523049696829, "grad_norm": 0.10164474695920944, "learning_rate": 0.00011999357751976053, "loss": 2.9057, "step": 13090 }, { "epoch": 0.7601032812092025, "grad_norm": 0.10414143651723862, "learning_rate": 0.00011944596327112636, "loss": 2.9032, "step": 13100 }, { "epoch": 0.7606835127215759, "grad_norm": 0.10585027188062668, "learning_rate": 0.0001188993820518455, "loss": 2.9114, "step": 13110 }, { "epoch": 0.7612637442339494, "grad_norm": 0.10768268257379532, "learning_rate": 0.00011835383587448707, "loss": 2.902, "step": 13120 }, { "epoch": 0.7618439757463228, "grad_norm": 0.11847629398107529, "learning_rate": 0.00011780932674780962, "loss": 2.9102, "step": 13130 }, { "epoch": 0.7624242072586962, "grad_norm": 0.10435007512569427, "learning_rate": 0.00011726585667675256, "loss": 2.902, "step": 13140 }, { "epoch": 0.7630044387710696, "grad_norm": 0.10464408993721008, "learning_rate": 0.00011672342766242965, "loss": 2.9009, "step": 13150 }, { "epoch": 0.7635846702834431, "grad_norm": 0.11136206239461899, "learning_rate": 0.00011618204170212142, "loss": 2.9062, "step": 13160 }, { "epoch": 0.7641649017958165, "grad_norm": 0.10378240793943405, "learning_rate": 0.00011564170078926757, "loss": 2.9099, "step": 13170 }, { "epoch": 0.7647451333081899, "grad_norm": 0.10681048780679703, "learning_rate": 0.0001151024069134599, "loss": 2.8968, "step": 13180 }, { "epoch": 0.7653253648205635, "grad_norm": 0.11095359921455383, "learning_rate": 0.00011456416206043519, "loss": 2.9017, "step": 13190 }, { "epoch": 0.7659055963329369, "grad_norm": 0.10336118191480637, "learning_rate": 0.0001140269682120672, "loss": 2.9083, "step": 13200 }, { "epoch": 0.7664858278453103, "grad_norm": 0.11158061772584915, "learning_rate": 0.00011349082734636005, "loss": 2.9072, "step": 13210 }, { "epoch": 0.7670660593576837, "grad_norm": 0.10609913617372513, "learning_rate": 0.00011295574143744047, "loss": 2.9094, "step": 13220 }, { "epoch": 0.7676462908700572, "grad_norm": 0.1065196841955185, "learning_rate": 0.0001124217124555508, "loss": 2.9079, "step": 13230 }, { "epoch": 0.7682265223824306, "grad_norm": 0.10469912737607956, "learning_rate": 0.00011188874236704193, "loss": 2.9033, "step": 13240 }, { "epoch": 0.768806753894804, "grad_norm": 0.10259145498275757, "learning_rate": 0.00011135683313436534, "loss": 2.9103, "step": 13250 }, { "epoch": 0.7693869854071774, "grad_norm": 0.10454609990119934, "learning_rate": 0.00011082598671606664, "loss": 2.9043, "step": 13260 }, { "epoch": 0.769967216919551, "grad_norm": 0.10401228070259094, "learning_rate": 0.00011029620506677792, "loss": 2.9074, "step": 13270 }, { "epoch": 0.7705474484319244, "grad_norm": 0.10602720081806183, "learning_rate": 0.0001097674901372106, "loss": 2.9087, "step": 13280 }, { "epoch": 0.7711276799442978, "grad_norm": 0.11147190630435944, "learning_rate": 0.00010923984387414878, "loss": 2.9044, "step": 13290 }, { "epoch": 0.7717079114566712, "grad_norm": 0.10660642385482788, "learning_rate": 0.00010871326822044077, "loss": 2.8997, "step": 13300 }, { "epoch": 0.7722881429690447, "grad_norm": 0.10255785286426544, "learning_rate": 0.00010818776511499375, "loss": 2.9029, "step": 13310 }, { "epoch": 0.7728683744814181, "grad_norm": 0.10959793627262115, "learning_rate": 0.00010766333649276497, "loss": 2.9017, "step": 13320 }, { "epoch": 0.7734486059937915, "grad_norm": 0.10262299329042435, "learning_rate": 0.00010713998428475554, "loss": 2.909, "step": 13330 }, { "epoch": 0.7740288375061649, "grad_norm": 0.09683836251497269, "learning_rate": 0.00010661771041800337, "loss": 2.9072, "step": 13340 }, { "epoch": 0.7746090690185384, "grad_norm": 0.10676641017198563, "learning_rate": 0.00010609651681557515, "loss": 2.8949, "step": 13350 }, { "epoch": 0.7751893005309118, "grad_norm": 0.11433670669794083, "learning_rate": 0.00010557640539656062, "loss": 2.9033, "step": 13360 }, { "epoch": 0.7757695320432852, "grad_norm": 0.09931094199419022, "learning_rate": 0.00010505737807606445, "loss": 2.9131, "step": 13370 }, { "epoch": 0.7763497635556587, "grad_norm": 0.10620000958442688, "learning_rate": 0.00010453943676519959, "loss": 2.9119, "step": 13380 }, { "epoch": 0.7769299950680322, "grad_norm": 0.10597820580005646, "learning_rate": 0.00010402258337108018, "loss": 2.9025, "step": 13390 }, { "epoch": 0.7775102265804056, "grad_norm": 0.1097063198685646, "learning_rate": 0.00010350681979681445, "loss": 2.9043, "step": 13400 }, { "epoch": 0.778090458092779, "grad_norm": 0.10782230645418167, "learning_rate": 0.00010299214794149822, "loss": 2.9106, "step": 13410 }, { "epoch": 0.7786706896051525, "grad_norm": 0.11034680157899857, "learning_rate": 0.00010247856970020681, "loss": 2.9056, "step": 13420 }, { "epoch": 0.7792509211175259, "grad_norm": 0.10253550857305527, "learning_rate": 0.00010196608696398944, "loss": 2.898, "step": 13430 }, { "epoch": 0.7798311526298993, "grad_norm": 0.10425124317407608, "learning_rate": 0.00010145470161986104, "loss": 2.9013, "step": 13440 }, { "epoch": 0.7804113841422727, "grad_norm": 0.10316097736358643, "learning_rate": 0.00010094441555079592, "loss": 2.8969, "step": 13450 }, { "epoch": 0.7809916156546463, "grad_norm": 0.10671611875295639, "learning_rate": 0.00010043523063572124, "loss": 2.9057, "step": 13460 }, { "epoch": 0.7815718471670197, "grad_norm": 0.10064894706010818, "learning_rate": 9.992714874950868e-05, "loss": 2.8981, "step": 13470 }, { "epoch": 0.7821520786793931, "grad_norm": 0.10153688490390778, "learning_rate": 9.942017176296939e-05, "loss": 2.8995, "step": 13480 }, { "epoch": 0.7827323101917665, "grad_norm": 0.10427162051200867, "learning_rate": 9.891430154284554e-05, "loss": 2.9007, "step": 13490 }, { "epoch": 0.78331254170414, "grad_norm": 0.11092238873243332, "learning_rate": 9.840953995180421e-05, "loss": 2.9013, "step": 13500 }, { "epoch": 0.7838927732165134, "grad_norm": 0.10155448317527771, "learning_rate": 9.790588884843082e-05, "loss": 2.905, "step": 13510 }, { "epoch": 0.7844730047288868, "grad_norm": 0.10765383392572403, "learning_rate": 9.740335008722095e-05, "loss": 2.8974, "step": 13520 }, { "epoch": 0.7850532362412602, "grad_norm": 0.10454422235488892, "learning_rate": 9.690192551857538e-05, "loss": 2.9065, "step": 13530 }, { "epoch": 0.7856334677536337, "grad_norm": 0.10686997324228287, "learning_rate": 9.64016169887918e-05, "loss": 2.9022, "step": 13540 }, { "epoch": 0.7862136992660071, "grad_norm": 0.09971166402101517, "learning_rate": 9.590242634005844e-05, "loss": 2.9017, "step": 13550 }, { "epoch": 0.7867939307783806, "grad_norm": 0.10111749917268753, "learning_rate": 9.540435541044796e-05, "loss": 2.8992, "step": 13560 }, { "epoch": 0.787374162290754, "grad_norm": 0.10360756516456604, "learning_rate": 9.490740603390924e-05, "loss": 2.9035, "step": 13570 }, { "epoch": 0.7879543938031275, "grad_norm": 0.10605431348085403, "learning_rate": 9.44115800402623e-05, "loss": 2.896, "step": 13580 }, { "epoch": 0.7885346253155009, "grad_norm": 0.10242411494255066, "learning_rate": 9.39168792551902e-05, "loss": 2.8927, "step": 13590 }, { "epoch": 0.7891148568278743, "grad_norm": 0.11159484088420868, "learning_rate": 9.342330550023301e-05, "loss": 2.903, "step": 13600 }, { "epoch": 0.7896950883402477, "grad_norm": 0.1059231385588646, "learning_rate": 9.293086059278136e-05, "loss": 2.8983, "step": 13610 }, { "epoch": 0.7902753198526212, "grad_norm": 0.10109903663396835, "learning_rate": 9.243954634606846e-05, "loss": 2.9015, "step": 13620 }, { "epoch": 0.7908555513649946, "grad_norm": 0.10412076115608215, "learning_rate": 9.194936456916531e-05, "loss": 2.8993, "step": 13630 }, { "epoch": 0.791435782877368, "grad_norm": 0.10232435911893845, "learning_rate": 9.146031706697216e-05, "loss": 2.9023, "step": 13640 }, { "epoch": 0.7920160143897415, "grad_norm": 0.10621204227209091, "learning_rate": 9.097240564021339e-05, "loss": 2.9064, "step": 13650 }, { "epoch": 0.792596245902115, "grad_norm": 0.10677001625299454, "learning_rate": 9.048563208542998e-05, "loss": 2.8884, "step": 13660 }, { "epoch": 0.7931764774144884, "grad_norm": 0.10444250702857971, "learning_rate": 8.999999819497298e-05, "loss": 2.8892, "step": 13670 }, { "epoch": 0.7937567089268618, "grad_norm": 0.09775687009096146, "learning_rate": 8.951550575699759e-05, "loss": 2.9041, "step": 13680 }, { "epoch": 0.7943369404392353, "grad_norm": 0.10191529989242554, "learning_rate": 8.903215655545527e-05, "loss": 2.8984, "step": 13690 }, { "epoch": 0.7949171719516087, "grad_norm": 0.10648460686206818, "learning_rate": 8.854995237008878e-05, "loss": 2.9046, "step": 13700 }, { "epoch": 0.7954974034639821, "grad_norm": 0.10379672795534134, "learning_rate": 8.80688949764243e-05, "loss": 2.8953, "step": 13710 }, { "epoch": 0.7960776349763555, "grad_norm": 0.10597063601016998, "learning_rate": 8.75889861457654e-05, "loss": 2.8951, "step": 13720 }, { "epoch": 0.796657866488729, "grad_norm": 0.10298217833042145, "learning_rate": 8.711022764518712e-05, "loss": 2.8972, "step": 13730 }, { "epoch": 0.7972380980011025, "grad_norm": 0.10399536043405533, "learning_rate": 8.663262123752787e-05, "loss": 2.9053, "step": 13740 }, { "epoch": 0.7978183295134759, "grad_norm": 0.1015402153134346, "learning_rate": 8.615616868138481e-05, "loss": 2.9028, "step": 13750 }, { "epoch": 0.7983985610258493, "grad_norm": 0.10023748874664307, "learning_rate": 8.568087173110613e-05, "loss": 2.9014, "step": 13760 }, { "epoch": 0.7989787925382228, "grad_norm": 0.10471780598163605, "learning_rate": 8.520673213678491e-05, "loss": 2.9056, "step": 13770 }, { "epoch": 0.7995590240505962, "grad_norm": 0.10297100991010666, "learning_rate": 8.473375164425284e-05, "loss": 2.9023, "step": 13780 }, { "epoch": 0.8001392555629696, "grad_norm": 0.10079214721918106, "learning_rate": 8.426193199507352e-05, "loss": 2.9023, "step": 13790 }, { "epoch": 0.800719487075343, "grad_norm": 0.10564671456813812, "learning_rate": 8.379127492653656e-05, "loss": 2.899, "step": 13800 }, { "epoch": 0.8012997185877165, "grad_norm": 0.10113906860351562, "learning_rate": 8.332178217165046e-05, "loss": 2.909, "step": 13810 }, { "epoch": 0.8018799501000899, "grad_norm": 0.10526243597269058, "learning_rate": 8.285345545913674e-05, "loss": 2.8943, "step": 13820 }, { "epoch": 0.8024601816124634, "grad_norm": 0.105432890355587, "learning_rate": 8.238629651342344e-05, "loss": 2.9026, "step": 13830 }, { "epoch": 0.8030404131248368, "grad_norm": 0.10233651846647263, "learning_rate": 8.192030705463855e-05, "loss": 2.8935, "step": 13840 }, { "epoch": 0.8036206446372103, "grad_norm": 0.10111652314662933, "learning_rate": 8.14554887986044e-05, "loss": 2.8984, "step": 13850 }, { "epoch": 0.8042008761495837, "grad_norm": 0.10380090028047562, "learning_rate": 8.099184345683032e-05, "loss": 2.8953, "step": 13860 }, { "epoch": 0.8047811076619571, "grad_norm": 0.10351978987455368, "learning_rate": 8.052937273650702e-05, "loss": 2.8874, "step": 13870 }, { "epoch": 0.8053613391743305, "grad_norm": 0.10604514926671982, "learning_rate": 8.00680783405002e-05, "loss": 2.8943, "step": 13880 }, { "epoch": 0.805941570686704, "grad_norm": 0.10248877108097076, "learning_rate": 7.960796196734412e-05, "loss": 2.9084, "step": 13890 }, { "epoch": 0.8065218021990774, "grad_norm": 0.09987115114927292, "learning_rate": 7.914902531123534e-05, "loss": 2.9014, "step": 13900 }, { "epoch": 0.8071020337114508, "grad_norm": 0.10237400978803635, "learning_rate": 7.86912700620269e-05, "loss": 2.9049, "step": 13910 }, { "epoch": 0.8076822652238244, "grad_norm": 0.10180395096540451, "learning_rate": 7.823469790522158e-05, "loss": 2.9058, "step": 13920 }, { "epoch": 0.8082624967361978, "grad_norm": 0.10299412906169891, "learning_rate": 7.777931052196574e-05, "loss": 2.9005, "step": 13930 }, { "epoch": 0.8088427282485712, "grad_norm": 0.10630860179662704, "learning_rate": 7.732510958904354e-05, "loss": 2.9001, "step": 13940 }, { "epoch": 0.8094229597609446, "grad_norm": 0.10103683173656464, "learning_rate": 7.687209677887039e-05, "loss": 2.8964, "step": 13950 }, { "epoch": 0.8100031912733181, "grad_norm": 0.10312862694263458, "learning_rate": 7.642027375948675e-05, "loss": 2.9036, "step": 13960 }, { "epoch": 0.8105834227856915, "grad_norm": 0.0994095429778099, "learning_rate": 7.596964219455256e-05, "loss": 2.9005, "step": 13970 }, { "epoch": 0.8111636542980649, "grad_norm": 0.10047020018100739, "learning_rate": 7.552020374334032e-05, "loss": 2.9009, "step": 13980 }, { "epoch": 0.8117438858104383, "grad_norm": 0.1043546199798584, "learning_rate": 7.507196006072952e-05, "loss": 2.8984, "step": 13990 }, { "epoch": 0.8123241173228118, "grad_norm": 0.10159917175769806, "learning_rate": 7.462491279720034e-05, "loss": 2.8928, "step": 14000 }, { "epoch": 0.8123241173228118, "eval_loss": 2.8492555618286133, "eval_runtime": 3.5014, "eval_samples_per_second": 1236.645, "eval_steps_per_second": 4.855, "step": 14000 }, { "epoch": 0.8129043488351853, "grad_norm": 0.10643550753593445, "learning_rate": 7.417906359882763e-05, "loss": 2.8986, "step": 14010 }, { "epoch": 0.8134845803475587, "grad_norm": 0.09880738705396652, "learning_rate": 7.373441410727475e-05, "loss": 2.8969, "step": 14020 }, { "epoch": 0.8140648118599321, "grad_norm": 0.10168132185935974, "learning_rate": 7.329096595978793e-05, "loss": 2.8868, "step": 14030 }, { "epoch": 0.8146450433723056, "grad_norm": 0.09928074479103088, "learning_rate": 7.284872078918956e-05, "loss": 2.8981, "step": 14040 }, { "epoch": 0.815225274884679, "grad_norm": 0.0986517071723938, "learning_rate": 7.240768022387273e-05, "loss": 2.898, "step": 14050 }, { "epoch": 0.8158055063970524, "grad_norm": 0.09639947861433029, "learning_rate": 7.196784588779495e-05, "loss": 2.8978, "step": 14060 }, { "epoch": 0.8163857379094258, "grad_norm": 0.10378675162792206, "learning_rate": 7.152921940047228e-05, "loss": 2.9012, "step": 14070 }, { "epoch": 0.8169659694217993, "grad_norm": 0.10355675965547562, "learning_rate": 7.109180237697355e-05, "loss": 2.8885, "step": 14080 }, { "epoch": 0.8175462009341727, "grad_norm": 0.09884154051542282, "learning_rate": 7.0655596427914e-05, "loss": 2.9023, "step": 14090 }, { "epoch": 0.8181264324465461, "grad_norm": 0.09735111147165298, "learning_rate": 7.022060315944958e-05, "loss": 2.9015, "step": 14100 }, { "epoch": 0.8187066639589196, "grad_norm": 0.10175276547670364, "learning_rate": 6.978682417327113e-05, "loss": 2.8925, "step": 14110 }, { "epoch": 0.8192868954712931, "grad_norm": 0.11503997445106506, "learning_rate": 6.935426106659822e-05, "loss": 2.89, "step": 14120 }, { "epoch": 0.8198671269836665, "grad_norm": 0.10373996198177338, "learning_rate": 6.892291543217378e-05, "loss": 2.8921, "step": 14130 }, { "epoch": 0.8204473584960399, "grad_norm": 0.11327492445707321, "learning_rate": 6.849278885825721e-05, "loss": 2.893, "step": 14140 }, { "epoch": 0.8210275900084134, "grad_norm": 0.10706917196512222, "learning_rate": 6.806388292861993e-05, "loss": 2.8934, "step": 14150 }, { "epoch": 0.8216078215207868, "grad_norm": 0.10297518968582153, "learning_rate": 6.76361992225385e-05, "loss": 2.8973, "step": 14160 }, { "epoch": 0.8221880530331602, "grad_norm": 0.11366488039493561, "learning_rate": 6.720973931478889e-05, "loss": 2.8825, "step": 14170 }, { "epoch": 0.8227682845455336, "grad_norm": 0.10098817944526672, "learning_rate": 6.678450477564151e-05, "loss": 2.8958, "step": 14180 }, { "epoch": 0.8233485160579072, "grad_norm": 0.09862834960222244, "learning_rate": 6.636049717085411e-05, "loss": 2.9015, "step": 14190 }, { "epoch": 0.8239287475702806, "grad_norm": 0.0998816043138504, "learning_rate": 6.593771806166733e-05, "loss": 2.8926, "step": 14200 }, { "epoch": 0.824508979082654, "grad_norm": 0.10064134001731873, "learning_rate": 6.551616900479815e-05, "loss": 2.8991, "step": 14210 }, { "epoch": 0.8250892105950274, "grad_norm": 0.10604777187108994, "learning_rate": 6.509585155243425e-05, "loss": 2.8909, "step": 14220 }, { "epoch": 0.8256694421074009, "grad_norm": 0.1057165339589119, "learning_rate": 6.46767672522286e-05, "loss": 2.8904, "step": 14230 }, { "epoch": 0.8262496736197743, "grad_norm": 0.10482773929834366, "learning_rate": 6.425891764729332e-05, "loss": 2.8889, "step": 14240 }, { "epoch": 0.8268299051321477, "grad_norm": 0.09676790237426758, "learning_rate": 6.384230427619482e-05, "loss": 2.8993, "step": 14250 }, { "epoch": 0.8274101366445211, "grad_norm": 0.10526445508003235, "learning_rate": 6.34269286729467e-05, "loss": 2.8899, "step": 14260 }, { "epoch": 0.8279903681568946, "grad_norm": 0.0997476875782013, "learning_rate": 6.301279236700578e-05, "loss": 2.9088, "step": 14270 }, { "epoch": 0.828570599669268, "grad_norm": 0.10601063072681427, "learning_rate": 6.25998968832652e-05, "loss": 2.8971, "step": 14280 }, { "epoch": 0.8291508311816415, "grad_norm": 0.10106954723596573, "learning_rate": 6.218824374204935e-05, "loss": 2.8871, "step": 14290 }, { "epoch": 0.8297310626940149, "grad_norm": 0.09940861165523529, "learning_rate": 6.177783445910841e-05, "loss": 2.8943, "step": 14300 }, { "epoch": 0.8303112942063884, "grad_norm": 0.10276630520820618, "learning_rate": 6.136867054561202e-05, "loss": 2.8958, "step": 14310 }, { "epoch": 0.8308915257187618, "grad_norm": 0.10001721978187561, "learning_rate": 6.0960753508144896e-05, "loss": 2.8826, "step": 14320 }, { "epoch": 0.8314717572311352, "grad_norm": 0.09902679920196533, "learning_rate": 6.055408484870015e-05, "loss": 2.8841, "step": 14330 }, { "epoch": 0.8320519887435086, "grad_norm": 0.1033594161272049, "learning_rate": 6.014866606467431e-05, "loss": 2.8952, "step": 14340 }, { "epoch": 0.8326322202558821, "grad_norm": 0.1051039770245552, "learning_rate": 5.974449864886209e-05, "loss": 2.8998, "step": 14350 }, { "epoch": 0.8332124517682555, "grad_norm": 0.09860280156135559, "learning_rate": 5.9341584089449875e-05, "loss": 2.8873, "step": 14360 }, { "epoch": 0.8337926832806289, "grad_norm": 0.10190751403570175, "learning_rate": 5.893992387001159e-05, "loss": 2.8877, "step": 14370 }, { "epoch": 0.8343729147930025, "grad_norm": 0.09607376903295517, "learning_rate": 5.853951946950211e-05, "loss": 2.8906, "step": 14380 }, { "epoch": 0.8349531463053759, "grad_norm": 0.09999407827854156, "learning_rate": 5.814037236225236e-05, "loss": 2.8983, "step": 14390 }, { "epoch": 0.8355333778177493, "grad_norm": 0.10061750560998917, "learning_rate": 5.7742484017963984e-05, "loss": 2.8968, "step": 14400 }, { "epoch": 0.8361136093301227, "grad_norm": 0.10094620287418365, "learning_rate": 5.7345855901703225e-05, "loss": 2.9035, "step": 14410 }, { "epoch": 0.8366938408424962, "grad_norm": 0.09896020591259003, "learning_rate": 5.695048947389663e-05, "loss": 2.8908, "step": 14420 }, { "epoch": 0.8372740723548696, "grad_norm": 0.10651181638240814, "learning_rate": 5.6556386190324706e-05, "loss": 2.8887, "step": 14430 }, { "epoch": 0.837854303867243, "grad_norm": 0.10073324292898178, "learning_rate": 5.616354750211703e-05, "loss": 2.8932, "step": 14440 }, { "epoch": 0.8384345353796164, "grad_norm": 0.09969612210988998, "learning_rate": 5.577197485574681e-05, "loss": 2.8929, "step": 14450 }, { "epoch": 0.83901476689199, "grad_norm": 0.10277923941612244, "learning_rate": 5.538166969302547e-05, "loss": 2.9037, "step": 14460 }, { "epoch": 0.8395949984043634, "grad_norm": 0.0997147262096405, "learning_rate": 5.4992633451097773e-05, "loss": 2.8858, "step": 14470 }, { "epoch": 0.8401752299167368, "grad_norm": 0.09541675448417664, "learning_rate": 5.460486756243555e-05, "loss": 2.8937, "step": 14480 }, { "epoch": 0.8407554614291102, "grad_norm": 0.09750719368457794, "learning_rate": 5.421837345483365e-05, "loss": 2.8914, "step": 14490 }, { "epoch": 0.8413356929414837, "grad_norm": 0.09816502779722214, "learning_rate": 5.383315255140384e-05, "loss": 2.8955, "step": 14500 }, { "epoch": 0.8419159244538571, "grad_norm": 0.10594295710325241, "learning_rate": 5.3449206270569774e-05, "loss": 2.8865, "step": 14510 }, { "epoch": 0.8424961559662305, "grad_norm": 0.0968475341796875, "learning_rate": 5.306653602606204e-05, "loss": 2.8942, "step": 14520 }, { "epoch": 0.8430763874786039, "grad_norm": 0.09571373462677002, "learning_rate": 5.268514322691229e-05, "loss": 2.8943, "step": 14530 }, { "epoch": 0.8436566189909774, "grad_norm": 0.0948018804192543, "learning_rate": 5.2305029277449044e-05, "loss": 2.9002, "step": 14540 }, { "epoch": 0.8442368505033508, "grad_norm": 0.1066710576415062, "learning_rate": 5.192619557729153e-05, "loss": 2.8958, "step": 14550 }, { "epoch": 0.8448170820157243, "grad_norm": 0.10297758132219315, "learning_rate": 5.154864352134516e-05, "loss": 2.9004, "step": 14560 }, { "epoch": 0.8453973135280977, "grad_norm": 0.10075704008340836, "learning_rate": 5.117237449979615e-05, "loss": 2.8873, "step": 14570 }, { "epoch": 0.8459775450404712, "grad_norm": 0.09905263036489487, "learning_rate": 5.079738989810636e-05, "loss": 2.8874, "step": 14580 }, { "epoch": 0.8465577765528446, "grad_norm": 0.0961359441280365, "learning_rate": 5.042369109700853e-05, "loss": 2.8909, "step": 14590 }, { "epoch": 0.847138008065218, "grad_norm": 0.09857707470655441, "learning_rate": 5.0051279472500764e-05, "loss": 2.8887, "step": 14600 }, { "epoch": 0.8477182395775914, "grad_norm": 0.10070210695266724, "learning_rate": 4.96801563958416e-05, "loss": 2.8881, "step": 14610 }, { "epoch": 0.8482984710899649, "grad_norm": 0.0973038449883461, "learning_rate": 4.931032323354519e-05, "loss": 2.8962, "step": 14620 }, { "epoch": 0.8488787026023383, "grad_norm": 0.09583109617233276, "learning_rate": 4.894178134737577e-05, "loss": 2.8949, "step": 14630 }, { "epoch": 0.8494589341147117, "grad_norm": 0.09954696893692017, "learning_rate": 4.857453209434346e-05, "loss": 2.8993, "step": 14640 }, { "epoch": 0.8500391656270853, "grad_norm": 0.10481850057840347, "learning_rate": 4.82085768266983e-05, "loss": 2.8839, "step": 14650 }, { "epoch": 0.8506193971394587, "grad_norm": 0.10066717118024826, "learning_rate": 4.784391689192598e-05, "loss": 2.8825, "step": 14660 }, { "epoch": 0.8511996286518321, "grad_norm": 0.09504562616348267, "learning_rate": 4.7480553632742555e-05, "loss": 2.8992, "step": 14670 }, { "epoch": 0.8517798601642055, "grad_norm": 0.09874894469976425, "learning_rate": 4.711848838708961e-05, "loss": 2.8875, "step": 14680 }, { "epoch": 0.852360091676579, "grad_norm": 0.09990754723548889, "learning_rate": 4.675772248812922e-05, "loss": 2.8894, "step": 14690 }, { "epoch": 0.8529403231889524, "grad_norm": 0.09535972774028778, "learning_rate": 4.639825726423941e-05, "loss": 2.8891, "step": 14700 }, { "epoch": 0.8535205547013258, "grad_norm": 0.10271086543798447, "learning_rate": 4.604009403900871e-05, "loss": 2.8865, "step": 14710 }, { "epoch": 0.8541007862136992, "grad_norm": 0.09617126733064651, "learning_rate": 4.568323413123161e-05, "loss": 2.8946, "step": 14720 }, { "epoch": 0.8546810177260727, "grad_norm": 0.0963728278875351, "learning_rate": 4.53276788549037e-05, "loss": 2.8931, "step": 14730 }, { "epoch": 0.8552612492384462, "grad_norm": 0.09683659672737122, "learning_rate": 4.497342951921661e-05, "loss": 2.8988, "step": 14740 }, { "epoch": 0.8558414807508196, "grad_norm": 0.09892312437295914, "learning_rate": 4.462048742855376e-05, "loss": 2.8848, "step": 14750 }, { "epoch": 0.856421712263193, "grad_norm": 0.09641166776418686, "learning_rate": 4.426885388248478e-05, "loss": 2.8796, "step": 14760 }, { "epoch": 0.8570019437755665, "grad_norm": 0.09844641387462616, "learning_rate": 4.39185301757612e-05, "loss": 2.8976, "step": 14770 }, { "epoch": 0.8575821752879399, "grad_norm": 0.09970907121896744, "learning_rate": 4.3569517598311605e-05, "loss": 2.8966, "step": 14780 }, { "epoch": 0.8581624068003133, "grad_norm": 0.09571300446987152, "learning_rate": 4.322181743523692e-05, "loss": 2.8946, "step": 14790 }, { "epoch": 0.8587426383126867, "grad_norm": 0.09592873603105545, "learning_rate": 4.287543096680553e-05, "loss": 2.8939, "step": 14800 }, { "epoch": 0.8593228698250602, "grad_norm": 0.0989769920706749, "learning_rate": 4.2530359468448613e-05, "loss": 2.8917, "step": 14810 }, { "epoch": 0.8599031013374336, "grad_norm": 0.10042741894721985, "learning_rate": 4.2186604210755754e-05, "loss": 2.8796, "step": 14820 }, { "epoch": 0.860483332849807, "grad_norm": 0.09428322315216064, "learning_rate": 4.184416645946971e-05, "loss": 2.8817, "step": 14830 }, { "epoch": 0.8610635643621805, "grad_norm": 0.10060140490531921, "learning_rate": 4.150304747548219e-05, "loss": 2.89, "step": 14840 }, { "epoch": 0.861643795874554, "grad_norm": 0.09661011397838593, "learning_rate": 4.116324851482904e-05, "loss": 2.8878, "step": 14850 }, { "epoch": 0.8622240273869274, "grad_norm": 0.10304513573646545, "learning_rate": 4.082477082868552e-05, "loss": 2.8877, "step": 14860 }, { "epoch": 0.8628042588993008, "grad_norm": 0.09637437015771866, "learning_rate": 4.0487615663362055e-05, "loss": 2.8891, "step": 14870 }, { "epoch": 0.8633844904116743, "grad_norm": 0.10265222191810608, "learning_rate": 4.015178426029928e-05, "loss": 2.8935, "step": 14880 }, { "epoch": 0.8639647219240477, "grad_norm": 0.0949605405330658, "learning_rate": 3.981727785606353e-05, "loss": 2.8965, "step": 14890 }, { "epoch": 0.8645449534364211, "grad_norm": 0.10104341804981232, "learning_rate": 3.948409768234251e-05, "loss": 2.8846, "step": 14900 }, { "epoch": 0.8651251849487945, "grad_norm": 0.09906849265098572, "learning_rate": 3.915224496594037e-05, "loss": 2.8886, "step": 14910 }, { "epoch": 0.865705416461168, "grad_norm": 0.09711761027574539, "learning_rate": 3.882172092877383e-05, "loss": 2.8882, "step": 14920 }, { "epoch": 0.8662856479735415, "grad_norm": 0.09567418694496155, "learning_rate": 3.849252678786672e-05, "loss": 2.8914, "step": 14930 }, { "epoch": 0.8668658794859149, "grad_norm": 0.0991610512137413, "learning_rate": 3.816466375534655e-05, "loss": 2.8944, "step": 14940 }, { "epoch": 0.8674461109982883, "grad_norm": 0.09431017935276031, "learning_rate": 3.783813303843933e-05, "loss": 2.8896, "step": 14950 }, { "epoch": 0.8680263425106618, "grad_norm": 0.10229049623012543, "learning_rate": 3.751293583946529e-05, "loss": 2.9041, "step": 14960 }, { "epoch": 0.8686065740230352, "grad_norm": 0.09249807894229889, "learning_rate": 3.7189073355834745e-05, "loss": 2.8951, "step": 14970 }, { "epoch": 0.8691868055354086, "grad_norm": 0.09961903095245361, "learning_rate": 3.6866546780043045e-05, "loss": 2.8952, "step": 14980 }, { "epoch": 0.869767037047782, "grad_norm": 0.0955023393034935, "learning_rate": 3.6545357299667016e-05, "loss": 2.8797, "step": 14990 }, { "epoch": 0.8703472685601555, "grad_norm": 0.09926097095012665, "learning_rate": 3.622550609735993e-05, "loss": 2.8938, "step": 15000 }, { "epoch": 0.8703472685601555, "eval_loss": 2.8413448333740234, "eval_runtime": 3.4993, "eval_samples_per_second": 1237.391, "eval_steps_per_second": 4.858, "step": 15000 }, { "epoch": 0.870927500072529, "grad_norm": 0.09487929195165634, "learning_rate": 3.5906994350847346e-05, "loss": 2.89, "step": 15010 }, { "epoch": 0.8715077315849024, "grad_norm": 0.09622934460639954, "learning_rate": 3.5589823232923034e-05, "loss": 2.8975, "step": 15020 }, { "epoch": 0.8720879630972758, "grad_norm": 0.09740899503231049, "learning_rate": 3.527399391144406e-05, "loss": 2.883, "step": 15030 }, { "epoch": 0.8726681946096493, "grad_norm": 0.09661401808261871, "learning_rate": 3.495950754932738e-05, "loss": 2.8927, "step": 15040 }, { "epoch": 0.8732484261220227, "grad_norm": 0.09757443517446518, "learning_rate": 3.4646365304544345e-05, "loss": 2.8923, "step": 15050 }, { "epoch": 0.8738286576343961, "grad_norm": 0.0959380567073822, "learning_rate": 3.433456833011777e-05, "loss": 2.879, "step": 15060 }, { "epoch": 0.8744088891467695, "grad_norm": 0.09999074041843414, "learning_rate": 3.402411777411674e-05, "loss": 2.8996, "step": 15070 }, { "epoch": 0.874989120659143, "grad_norm": 0.09846296906471252, "learning_rate": 3.37150147796526e-05, "loss": 2.8946, "step": 15080 }, { "epoch": 0.8755693521715164, "grad_norm": 0.09447459876537323, "learning_rate": 3.3407260484875146e-05, "loss": 2.9029, "step": 15090 }, { "epoch": 0.8761495836838898, "grad_norm": 0.09683697670698166, "learning_rate": 3.310085602296775e-05, "loss": 2.8993, "step": 15100 }, { "epoch": 0.8767298151962634, "grad_norm": 0.09494055062532425, "learning_rate": 3.2795802522143846e-05, "loss": 2.8958, "step": 15110 }, { "epoch": 0.8773100467086368, "grad_norm": 0.09578869491815567, "learning_rate": 3.249210110564236e-05, "loss": 2.8914, "step": 15120 }, { "epoch": 0.8778902782210102, "grad_norm": 0.09415120631456375, "learning_rate": 3.2189752891723656e-05, "loss": 2.887, "step": 15130 }, { "epoch": 0.8784705097333836, "grad_norm": 0.0967675969004631, "learning_rate": 3.1888758993665745e-05, "loss": 2.8973, "step": 15140 }, { "epoch": 0.8790507412457571, "grad_norm": 0.09370094537734985, "learning_rate": 3.158912051975937e-05, "loss": 2.8794, "step": 15150 }, { "epoch": 0.8796309727581305, "grad_norm": 0.09639986604452133, "learning_rate": 3.1290838573305014e-05, "loss": 2.8895, "step": 15160 }, { "epoch": 0.8802112042705039, "grad_norm": 0.10118415951728821, "learning_rate": 3.099391425260803e-05, "loss": 2.8798, "step": 15170 }, { "epoch": 0.8807914357828773, "grad_norm": 0.09219887107610703, "learning_rate": 3.06983486509747e-05, "loss": 2.8864, "step": 15180 }, { "epoch": 0.8813716672952508, "grad_norm": 0.09481830894947052, "learning_rate": 3.040414285670883e-05, "loss": 2.8912, "step": 15190 }, { "epoch": 0.8819518988076243, "grad_norm": 0.09875360131263733, "learning_rate": 3.0111297953106676e-05, "loss": 2.894, "step": 15200 }, { "epoch": 0.8825321303199977, "grad_norm": 0.09495782852172852, "learning_rate": 2.9819815018454012e-05, "loss": 2.8965, "step": 15210 }, { "epoch": 0.8831123618323711, "grad_norm": 0.09452831000089645, "learning_rate": 2.9529695126021507e-05, "loss": 2.8941, "step": 15220 }, { "epoch": 0.8836925933447446, "grad_norm": 0.09670066088438034, "learning_rate": 2.9240939344060958e-05, "loss": 2.8907, "step": 15230 }, { "epoch": 0.884272824857118, "grad_norm": 0.09641928970813751, "learning_rate": 2.895354873580156e-05, "loss": 2.8969, "step": 15240 }, { "epoch": 0.8848530563694914, "grad_norm": 0.09530998766422272, "learning_rate": 2.8667524359445375e-05, "loss": 2.8853, "step": 15250 }, { "epoch": 0.8854332878818648, "grad_norm": 0.10135927051305771, "learning_rate": 2.8382867268164304e-05, "loss": 2.8859, "step": 15260 }, { "epoch": 0.8860135193942383, "grad_norm": 0.09841107577085495, "learning_rate": 2.809957851009557e-05, "loss": 2.8848, "step": 15270 }, { "epoch": 0.8865937509066117, "grad_norm": 0.09506677091121674, "learning_rate": 2.7817659128338025e-05, "loss": 2.8836, "step": 15280 }, { "epoch": 0.8871739824189852, "grad_norm": 0.09353721141815186, "learning_rate": 2.7537110160948378e-05, "loss": 2.8892, "step": 15290 }, { "epoch": 0.8877542139313586, "grad_norm": 0.09569704532623291, "learning_rate": 2.725793264093719e-05, "loss": 2.884, "step": 15300 }, { "epoch": 0.8883344454437321, "grad_norm": 0.09320175647735596, "learning_rate": 2.6980127596265604e-05, "loss": 2.8834, "step": 15310 }, { "epoch": 0.8889146769561055, "grad_norm": 0.09571266919374466, "learning_rate": 2.670369604984062e-05, "loss": 2.8837, "step": 15320 }, { "epoch": 0.8894949084684789, "grad_norm": 0.09907983243465424, "learning_rate": 2.6428639019512315e-05, "loss": 2.8914, "step": 15330 }, { "epoch": 0.8900751399808524, "grad_norm": 0.0950406938791275, "learning_rate": 2.6154957518069466e-05, "loss": 2.8918, "step": 15340 }, { "epoch": 0.8906553714932258, "grad_norm": 0.09680736809968948, "learning_rate": 2.588265255323594e-05, "loss": 2.8856, "step": 15350 }, { "epoch": 0.8912356030055992, "grad_norm": 0.09411243349313736, "learning_rate": 2.561172512766734e-05, "loss": 2.8926, "step": 15360 }, { "epoch": 0.8918158345179726, "grad_norm": 0.09880263358354568, "learning_rate": 2.534217623894657e-05, "loss": 2.894, "step": 15370 }, { "epoch": 0.8923960660303462, "grad_norm": 0.1019010841846466, "learning_rate": 2.507400687958108e-05, "loss": 2.8848, "step": 15380 }, { "epoch": 0.8929762975427196, "grad_norm": 0.09457585960626602, "learning_rate": 2.4807218036998438e-05, "loss": 2.8908, "step": 15390 }, { "epoch": 0.893556529055093, "grad_norm": 0.09536702930927277, "learning_rate": 2.4541810693543154e-05, "loss": 2.8811, "step": 15400 }, { "epoch": 0.8941367605674664, "grad_norm": 0.09617622196674347, "learning_rate": 2.4277785826472888e-05, "loss": 2.8924, "step": 15410 }, { "epoch": 0.8947169920798399, "grad_norm": 0.09118277579545975, "learning_rate": 2.4015144407954737e-05, "loss": 2.8766, "step": 15420 }, { "epoch": 0.8952972235922133, "grad_norm": 0.09631020575761795, "learning_rate": 2.3753887405062103e-05, "loss": 2.8847, "step": 15430 }, { "epoch": 0.8958774551045867, "grad_norm": 0.09699005633592606, "learning_rate": 2.3494015779770505e-05, "loss": 2.8828, "step": 15440 }, { "epoch": 0.8964576866169601, "grad_norm": 0.09585963934659958, "learning_rate": 2.323553048895457e-05, "loss": 2.8821, "step": 15450 }, { "epoch": 0.8970379181293336, "grad_norm": 0.09346633404493332, "learning_rate": 2.297843248438416e-05, "loss": 2.8865, "step": 15460 }, { "epoch": 0.897618149641707, "grad_norm": 0.09547195583581924, "learning_rate": 2.2722722712721046e-05, "loss": 2.8874, "step": 15470 }, { "epoch": 0.8981983811540805, "grad_norm": 0.09408808499574661, "learning_rate": 2.2468402115515486e-05, "loss": 2.8909, "step": 15480 }, { "epoch": 0.8987786126664539, "grad_norm": 0.09334868937730789, "learning_rate": 2.22154716292025e-05, "loss": 2.8947, "step": 15490 }, { "epoch": 0.8993588441788274, "grad_norm": 0.09379076957702637, "learning_rate": 2.1963932185098623e-05, "loss": 2.8989, "step": 15500 }, { "epoch": 0.8999390756912008, "grad_norm": 0.09205812215805054, "learning_rate": 2.1713784709398443e-05, "loss": 2.886, "step": 15510 }, { "epoch": 0.9005193072035742, "grad_norm": 0.09728465229272842, "learning_rate": 2.1465030123171226e-05, "loss": 2.882, "step": 15520 }, { "epoch": 0.9010995387159476, "grad_norm": 0.09351946413516998, "learning_rate": 2.121766934235727e-05, "loss": 2.8854, "step": 15530 }, { "epoch": 0.9016797702283211, "grad_norm": 0.0997409000992775, "learning_rate": 2.0971703277765074e-05, "loss": 2.8915, "step": 15540 }, { "epoch": 0.9022600017406945, "grad_norm": 0.09500865638256073, "learning_rate": 2.0727132835067375e-05, "loss": 2.8925, "step": 15550 }, { "epoch": 0.902840233253068, "grad_norm": 0.09159790724515915, "learning_rate": 2.048395891479813e-05, "loss": 2.8797, "step": 15560 }, { "epoch": 0.9034204647654415, "grad_norm": 0.09279274940490723, "learning_rate": 2.024218241234923e-05, "loss": 2.8899, "step": 15570 }, { "epoch": 0.9040006962778149, "grad_norm": 0.09316090494394302, "learning_rate": 2.0001804217967046e-05, "loss": 2.882, "step": 15580 }, { "epoch": 0.9045809277901883, "grad_norm": 0.09024138003587723, "learning_rate": 1.9762825216749258e-05, "loss": 2.8763, "step": 15590 }, { "epoch": 0.9051611593025617, "grad_norm": 0.09573855251073837, "learning_rate": 1.9525246288641676e-05, "loss": 2.8785, "step": 15600 }, { "epoch": 0.9057413908149352, "grad_norm": 0.09761449694633484, "learning_rate": 1.928906830843471e-05, "loss": 2.8844, "step": 15610 }, { "epoch": 0.9063216223273086, "grad_norm": 0.09465468674898148, "learning_rate": 1.905429214576051e-05, "loss": 2.8836, "step": 15620 }, { "epoch": 0.906901853839682, "grad_norm": 0.09344152361154556, "learning_rate": 1.8820918665089393e-05, "loss": 2.8859, "step": 15630 }, { "epoch": 0.9074820853520554, "grad_norm": 0.09277279675006866, "learning_rate": 1.858894872572705e-05, "loss": 2.8805, "step": 15640 }, { "epoch": 0.908062316864429, "grad_norm": 0.09771661460399628, "learning_rate": 1.8358383181811045e-05, "loss": 2.8873, "step": 15650 }, { "epoch": 0.9086425483768024, "grad_norm": 0.09525156766176224, "learning_rate": 1.812922288230787e-05, "loss": 2.8804, "step": 15660 }, { "epoch": 0.9092227798891758, "grad_norm": 0.09209754317998886, "learning_rate": 1.7901468671009815e-05, "loss": 2.876, "step": 15670 }, { "epoch": 0.9098030114015492, "grad_norm": 0.09147682785987854, "learning_rate": 1.767512138653169e-05, "loss": 2.8858, "step": 15680 }, { "epoch": 0.9103832429139227, "grad_norm": 0.0912102535367012, "learning_rate": 1.7450181862307935e-05, "loss": 2.8837, "step": 15690 }, { "epoch": 0.9109634744262961, "grad_norm": 0.09235077351331711, "learning_rate": 1.722665092658935e-05, "loss": 2.8851, "step": 15700 }, { "epoch": 0.9115437059386695, "grad_norm": 0.09295492619276047, "learning_rate": 1.7004529402440395e-05, "loss": 2.8868, "step": 15710 }, { "epoch": 0.9121239374510429, "grad_norm": 0.09576184302568436, "learning_rate": 1.6783818107735726e-05, "loss": 2.8845, "step": 15720 }, { "epoch": 0.9127041689634164, "grad_norm": 0.09288014471530914, "learning_rate": 1.6564517855157447e-05, "loss": 2.8844, "step": 15730 }, { "epoch": 0.9132844004757898, "grad_norm": 0.09252408146858215, "learning_rate": 1.6346629452192076e-05, "loss": 2.8931, "step": 15740 }, { "epoch": 0.9138646319881633, "grad_norm": 0.09247728437185287, "learning_rate": 1.6130153701127492e-05, "loss": 2.8783, "step": 15750 }, { "epoch": 0.9144448635005367, "grad_norm": 0.09248922020196915, "learning_rate": 1.5915091399050186e-05, "loss": 2.8894, "step": 15760 }, { "epoch": 0.9150250950129102, "grad_norm": 0.09153193235397339, "learning_rate": 1.570144333784196e-05, "loss": 2.8912, "step": 15770 }, { "epoch": 0.9156053265252836, "grad_norm": 0.09224896878004074, "learning_rate": 1.5489210304177447e-05, "loss": 2.8935, "step": 15780 }, { "epoch": 0.916185558037657, "grad_norm": 0.09364725649356842, "learning_rate": 1.5278393079520924e-05, "loss": 2.8887, "step": 15790 }, { "epoch": 0.9167657895500304, "grad_norm": 0.09538205713033676, "learning_rate": 1.5068992440123363e-05, "loss": 2.887, "step": 15800 }, { "epoch": 0.9173460210624039, "grad_norm": 0.09308181703090668, "learning_rate": 1.4861009157020046e-05, "loss": 2.8863, "step": 15810 }, { "epoch": 0.9179262525747773, "grad_norm": 0.09197646379470825, "learning_rate": 1.4654443996027045e-05, "loss": 2.8824, "step": 15820 }, { "epoch": 0.9185064840871507, "grad_norm": 0.08937271684408188, "learning_rate": 1.4449297717738974e-05, "loss": 2.8923, "step": 15830 }, { "epoch": 0.9190867155995243, "grad_norm": 0.09317086637020111, "learning_rate": 1.4245571077525999e-05, "loss": 2.8839, "step": 15840 }, { "epoch": 0.9196669471118977, "grad_norm": 0.09397581964731216, "learning_rate": 1.4043264825530822e-05, "loss": 2.8892, "step": 15850 }, { "epoch": 0.9202471786242711, "grad_norm": 0.09369346499443054, "learning_rate": 1.3842379706666375e-05, "loss": 2.8846, "step": 15860 }, { "epoch": 0.9208274101366445, "grad_norm": 0.0952911525964737, "learning_rate": 1.3642916460612576e-05, "loss": 2.8824, "step": 15870 }, { "epoch": 0.921407641649018, "grad_norm": 0.09475695341825485, "learning_rate": 1.3444875821814195e-05, "loss": 2.8777, "step": 15880 }, { "epoch": 0.9219878731613914, "grad_norm": 0.0953005999326706, "learning_rate": 1.3248258519477353e-05, "loss": 2.8829, "step": 15890 }, { "epoch": 0.9225681046737648, "grad_norm": 0.0925806388258934, "learning_rate": 1.3053065277567777e-05, "loss": 2.8819, "step": 15900 }, { "epoch": 0.9231483361861382, "grad_norm": 0.09104064106941223, "learning_rate": 1.285929681480731e-05, "loss": 2.8839, "step": 15910 }, { "epoch": 0.9237285676985117, "grad_norm": 0.09129658341407776, "learning_rate": 1.2666953844671715e-05, "loss": 2.8828, "step": 15920 }, { "epoch": 0.9243087992108852, "grad_norm": 0.0935157984495163, "learning_rate": 1.2476037075388114e-05, "loss": 2.8901, "step": 15930 }, { "epoch": 0.9248890307232586, "grad_norm": 0.09558272361755371, "learning_rate": 1.228654720993183e-05, "loss": 2.8981, "step": 15940 }, { "epoch": 0.925469262235632, "grad_norm": 0.09126738458871841, "learning_rate": 1.2098484946024569e-05, "loss": 2.8838, "step": 15950 }, { "epoch": 0.9260494937480055, "grad_norm": 0.09226221591234207, "learning_rate": 1.191185097613121e-05, "loss": 2.8862, "step": 15960 }, { "epoch": 0.9266297252603789, "grad_norm": 0.09146758913993835, "learning_rate": 1.1726645987457563e-05, "loss": 2.889, "step": 15970 }, { "epoch": 0.9272099567727523, "grad_norm": 0.09202836453914642, "learning_rate": 1.1542870661947946e-05, "loss": 2.8812, "step": 15980 }, { "epoch": 0.9277901882851257, "grad_norm": 0.09329676628112793, "learning_rate": 1.1360525676282141e-05, "loss": 2.8842, "step": 15990 }, { "epoch": 0.9283704197974992, "grad_norm": 0.09255045652389526, "learning_rate": 1.1179611701873649e-05, "loss": 2.8893, "step": 16000 }, { "epoch": 0.9283704197974992, "eval_loss": 2.8371694087982178, "eval_runtime": 3.5003, "eval_samples_per_second": 1237.035, "eval_steps_per_second": 4.857, "step": 16000 }, { "epoch": 0.9289506513098726, "grad_norm": 0.09306171536445618, "learning_rate": 1.1000129404866588e-05, "loss": 2.8899, "step": 16010 }, { "epoch": 0.929530882822246, "grad_norm": 0.09088173508644104, "learning_rate": 1.0822079446133604e-05, "loss": 2.884, "step": 16020 }, { "epoch": 0.9301111143346195, "grad_norm": 0.09387584775686264, "learning_rate": 1.0645462481273383e-05, "loss": 2.8895, "step": 16030 }, { "epoch": 0.930691345846993, "grad_norm": 0.0902022272348404, "learning_rate": 1.0470279160607899e-05, "loss": 2.8838, "step": 16040 }, { "epoch": 0.9312715773593664, "grad_norm": 0.09107448905706406, "learning_rate": 1.0296530129180682e-05, "loss": 2.883, "step": 16050 }, { "epoch": 0.9318518088717398, "grad_norm": 0.08909423649311066, "learning_rate": 1.0124216026753752e-05, "loss": 2.8883, "step": 16060 }, { "epoch": 0.9324320403841133, "grad_norm": 0.09280171245336533, "learning_rate": 9.953337487805713e-06, "loss": 2.887, "step": 16070 }, { "epoch": 0.9330122718964867, "grad_norm": 0.0916966050863266, "learning_rate": 9.78389514152931e-06, "loss": 2.8816, "step": 16080 }, { "epoch": 0.9335925034088601, "grad_norm": 0.09001501649618149, "learning_rate": 9.615889611828888e-06, "loss": 2.8831, "step": 16090 }, { "epoch": 0.9341727349212335, "grad_norm": 0.09480106085538864, "learning_rate": 9.449321517318544e-06, "loss": 2.872, "step": 16100 }, { "epoch": 0.9347529664336071, "grad_norm": 0.0953536182641983, "learning_rate": 9.28419147131936e-06, "loss": 2.8778, "step": 16110 }, { "epoch": 0.9353331979459805, "grad_norm": 0.09141107648611069, "learning_rate": 9.120500081857496e-06, "loss": 2.8767, "step": 16120 }, { "epoch": 0.9359134294583539, "grad_norm": 0.0902305468916893, "learning_rate": 8.958247951661757e-06, "loss": 2.882, "step": 16130 }, { "epoch": 0.9364936609707273, "grad_norm": 0.09362493455410004, "learning_rate": 8.797435678161492e-06, "loss": 2.8924, "step": 16140 }, { "epoch": 0.9370738924831008, "grad_norm": 0.09281910210847855, "learning_rate": 8.638063853484469e-06, "loss": 2.8869, "step": 16150 }, { "epoch": 0.9376541239954742, "grad_norm": 0.09474115073680878, "learning_rate": 8.48013306445421e-06, "loss": 2.8792, "step": 16160 }, { "epoch": 0.9382343555078476, "grad_norm": 0.09194543957710266, "learning_rate": 8.323643892588617e-06, "loss": 2.8907, "step": 16170 }, { "epoch": 0.938814587020221, "grad_norm": 0.09240542352199554, "learning_rate": 8.168596914097126e-06, "loss": 2.8894, "step": 16180 }, { "epoch": 0.9393948185325945, "grad_norm": 0.09344927966594696, "learning_rate": 8.014992699878932e-06, "loss": 2.8959, "step": 16190 }, { "epoch": 0.939975050044968, "grad_norm": 0.09461803734302521, "learning_rate": 7.862831815520987e-06, "loss": 2.8808, "step": 16200 }, { "epoch": 0.9405552815573414, "grad_norm": 0.09065861999988556, "learning_rate": 7.71211482129539e-06, "loss": 2.8929, "step": 16210 }, { "epoch": 0.9411355130697148, "grad_norm": 0.09475411474704742, "learning_rate": 7.562842272158088e-06, "loss": 2.8859, "step": 16220 }, { "epoch": 0.9417157445820883, "grad_norm": 0.0908050462603569, "learning_rate": 7.415014717746172e-06, "loss": 2.8867, "step": 16230 }, { "epoch": 0.9422959760944617, "grad_norm": 0.09298322349786758, "learning_rate": 7.268632702376188e-06, "loss": 2.8825, "step": 16240 }, { "epoch": 0.9428762076068351, "grad_norm": 0.08874692022800446, "learning_rate": 7.1236967650420095e-06, "loss": 2.8716, "step": 16250 }, { "epoch": 0.9434564391192085, "grad_norm": 0.09308428317308426, "learning_rate": 6.980207439412967e-06, "loss": 2.8845, "step": 16260 }, { "epoch": 0.944036670631582, "grad_norm": 0.09070302546024323, "learning_rate": 6.838165253831852e-06, "loss": 2.8794, "step": 16270 }, { "epoch": 0.9446169021439554, "grad_norm": 0.09134573489427567, "learning_rate": 6.697570731312785e-06, "loss": 2.883, "step": 16280 }, { "epoch": 0.9451971336563288, "grad_norm": 0.09180360287427902, "learning_rate": 6.558424389539574e-06, "loss": 2.8904, "step": 16290 }, { "epoch": 0.9457773651687024, "grad_norm": 0.09166044741868973, "learning_rate": 6.420726740863625e-06, "loss": 2.8859, "step": 16300 }, { "epoch": 0.9463575966810758, "grad_norm": 0.0912829264998436, "learning_rate": 6.284478292302032e-06, "loss": 2.8907, "step": 16310 }, { "epoch": 0.9469378281934492, "grad_norm": 0.09044892340898514, "learning_rate": 6.14967954553598e-06, "loss": 2.8813, "step": 16320 }, { "epoch": 0.9475180597058226, "grad_norm": 0.09006070345640182, "learning_rate": 6.0163309969085255e-06, "loss": 2.8907, "step": 16330 }, { "epoch": 0.9480982912181961, "grad_norm": 0.08975432068109512, "learning_rate": 5.884433137422951e-06, "loss": 2.8885, "step": 16340 }, { "epoch": 0.9486785227305695, "grad_norm": 0.08983542770147324, "learning_rate": 5.753986452741033e-06, "loss": 2.8914, "step": 16350 }, { "epoch": 0.9492587542429429, "grad_norm": 0.08931635320186615, "learning_rate": 5.6249914231811366e-06, "loss": 2.888, "step": 16360 }, { "epoch": 0.9498389857553163, "grad_norm": 0.08885401487350464, "learning_rate": 5.497448523716387e-06, "loss": 2.881, "step": 16370 }, { "epoch": 0.9504192172676899, "grad_norm": 0.09034562110900879, "learning_rate": 5.371358223973255e-06, "loss": 2.879, "step": 16380 }, { "epoch": 0.9509994487800633, "grad_norm": 0.0914997011423111, "learning_rate": 5.246720988229292e-06, "loss": 2.8747, "step": 16390 }, { "epoch": 0.9515796802924367, "grad_norm": 0.0920967236161232, "learning_rate": 5.123537275411927e-06, "loss": 2.8817, "step": 16400 }, { "epoch": 0.9521599118048101, "grad_norm": 0.09154117107391357, "learning_rate": 5.001807539096515e-06, "loss": 2.8927, "step": 16410 }, { "epoch": 0.9527401433171836, "grad_norm": 0.09099967032670975, "learning_rate": 4.8815322275046486e-06, "loss": 2.8867, "step": 16420 }, { "epoch": 0.953320374829557, "grad_norm": 0.09072386473417282, "learning_rate": 4.762711783502605e-06, "loss": 2.8984, "step": 16430 }, { "epoch": 0.9539006063419304, "grad_norm": 0.09255844354629517, "learning_rate": 4.645346644599746e-06, "loss": 2.8955, "step": 16440 }, { "epoch": 0.9544808378543038, "grad_norm": 0.08942582458257675, "learning_rate": 4.5294372429468325e-06, "loss": 2.8878, "step": 16450 }, { "epoch": 0.9550610693666773, "grad_norm": 0.08873233199119568, "learning_rate": 4.414984005334422e-06, "loss": 2.8895, "step": 16460 }, { "epoch": 0.9556413008790507, "grad_norm": 0.09123833477497101, "learning_rate": 4.301987353191228e-06, "loss": 2.8894, "step": 16470 }, { "epoch": 0.9562215323914242, "grad_norm": 0.0893755629658699, "learning_rate": 4.190447702582878e-06, "loss": 2.8793, "step": 16480 }, { "epoch": 0.9568017639037976, "grad_norm": 0.08934894949197769, "learning_rate": 4.080365464209957e-06, "loss": 2.8832, "step": 16490 }, { "epoch": 0.9573819954161711, "grad_norm": 0.08964931964874268, "learning_rate": 3.9717410434068955e-06, "loss": 2.8868, "step": 16500 }, { "epoch": 0.9579622269285445, "grad_norm": 0.0902102142572403, "learning_rate": 3.8645748401401116e-06, "loss": 2.8953, "step": 16510 }, { "epoch": 0.9585424584409179, "grad_norm": 0.09049762040376663, "learning_rate": 3.7588672490068476e-06, "loss": 2.8827, "step": 16520 }, { "epoch": 0.9591226899532914, "grad_norm": 0.08996568620204926, "learning_rate": 3.6546186592334886e-06, "loss": 2.8815, "step": 16530 }, { "epoch": 0.9597029214656648, "grad_norm": 0.09003441035747528, "learning_rate": 3.551829454674227e-06, "loss": 2.8869, "step": 16540 }, { "epoch": 0.9602831529780382, "grad_norm": 0.08883132040500641, "learning_rate": 3.4505000138097764e-06, "loss": 2.884, "step": 16550 }, { "epoch": 0.9608633844904116, "grad_norm": 0.08925202488899231, "learning_rate": 3.3506307097456834e-06, "loss": 2.8834, "step": 16560 }, { "epoch": 0.9614436160027852, "grad_norm": 0.09335799515247345, "learning_rate": 3.2522219102112615e-06, "loss": 2.8823, "step": 16570 }, { "epoch": 0.9620238475151586, "grad_norm": 0.09578206390142441, "learning_rate": 3.1552739775579042e-06, "loss": 2.879, "step": 16580 }, { "epoch": 0.962604079027532, "grad_norm": 0.08872062712907791, "learning_rate": 3.059787268758152e-06, "loss": 2.879, "step": 16590 }, { "epoch": 0.9631843105399054, "grad_norm": 0.09045431762933731, "learning_rate": 2.9657621354040487e-06, "loss": 2.8851, "step": 16600 }, { "epoch": 0.9637645420522789, "grad_norm": 0.08912616223096848, "learning_rate": 2.8731989237058998e-06, "loss": 2.8834, "step": 16610 }, { "epoch": 0.9643447735646523, "grad_norm": 0.08925558626651764, "learning_rate": 2.7820979744912936e-06, "loss": 2.8851, "step": 16620 }, { "epoch": 0.9649250050770257, "grad_norm": 0.08986404538154602, "learning_rate": 2.6924596232033695e-06, "loss": 2.8849, "step": 16630 }, { "epoch": 0.9655052365893991, "grad_norm": 0.09278219938278198, "learning_rate": 2.604284199899931e-06, "loss": 2.8947, "step": 16640 }, { "epoch": 0.9660854681017726, "grad_norm": 0.09188593178987503, "learning_rate": 2.517572029252202e-06, "loss": 2.8787, "step": 16650 }, { "epoch": 0.9666656996141461, "grad_norm": 0.08939186483621597, "learning_rate": 2.432323430543404e-06, "loss": 2.8749, "step": 16660 }, { "epoch": 0.9672459311265195, "grad_norm": 0.09052099287509918, "learning_rate": 2.348538717667825e-06, "loss": 2.8826, "step": 16670 }, { "epoch": 0.9678261626388929, "grad_norm": 0.08904402703046799, "learning_rate": 2.266218199129533e-06, "loss": 2.8837, "step": 16680 }, { "epoch": 0.9684063941512664, "grad_norm": 0.08961648494005203, "learning_rate": 2.1853621780413057e-06, "loss": 2.8816, "step": 16690 }, { "epoch": 0.9689866256636398, "grad_norm": 0.08842738717794418, "learning_rate": 2.1059709521235706e-06, "loss": 2.8876, "step": 16700 }, { "epoch": 0.9695668571760132, "grad_norm": 0.08900275081396103, "learning_rate": 2.0280448137030227e-06, "loss": 2.8842, "step": 16710 }, { "epoch": 0.9701470886883866, "grad_norm": 0.0904114842414856, "learning_rate": 1.951584049711963e-06, "loss": 2.8761, "step": 16720 }, { "epoch": 0.9707273202007601, "grad_norm": 0.09104707837104797, "learning_rate": 1.8765889416868298e-06, "loss": 2.8985, "step": 16730 }, { "epoch": 0.9713075517131335, "grad_norm": 0.08826414495706558, "learning_rate": 1.803059765767534e-06, "loss": 2.8873, "step": 16740 }, { "epoch": 0.971887783225507, "grad_norm": 0.09161245822906494, "learning_rate": 1.7309967926962156e-06, "loss": 2.8873, "step": 16750 }, { "epoch": 0.9724680147378804, "grad_norm": 0.08984562754631042, "learning_rate": 1.6604002878162216e-06, "loss": 2.8915, "step": 16760 }, { "epoch": 0.9730482462502539, "grad_norm": 0.088889941573143, "learning_rate": 1.5912705110713521e-06, "loss": 2.8852, "step": 16770 }, { "epoch": 0.9736284777626273, "grad_norm": 0.08902832120656967, "learning_rate": 1.5236077170046602e-06, "loss": 2.8841, "step": 16780 }, { "epoch": 0.9742087092750007, "grad_norm": 0.08947611600160599, "learning_rate": 1.4574121547576537e-06, "loss": 2.8842, "step": 16790 }, { "epoch": 0.9747889407873742, "grad_norm": 0.0889580100774765, "learning_rate": 1.392684068069272e-06, "loss": 2.8791, "step": 16800 }, { "epoch": 0.9753691722997476, "grad_norm": 0.08892838656902313, "learning_rate": 1.3294236952751781e-06, "loss": 2.8882, "step": 16810 }, { "epoch": 0.975949403812121, "grad_norm": 0.08976549655199051, "learning_rate": 1.2676312693066905e-06, "loss": 2.8805, "step": 16820 }, { "epoch": 0.9765296353244944, "grad_norm": 0.0896722599864006, "learning_rate": 1.2073070176899403e-06, "loss": 2.8809, "step": 16830 }, { "epoch": 0.977109866836868, "grad_norm": 0.08809029310941696, "learning_rate": 1.1484511625452054e-06, "loss": 2.8905, "step": 16840 }, { "epoch": 0.9776900983492414, "grad_norm": 0.08948676288127899, "learning_rate": 1.0910639205858442e-06, "loss": 2.8876, "step": 16850 }, { "epoch": 0.9782703298616148, "grad_norm": 0.09113940596580505, "learning_rate": 1.0351455031177181e-06, "loss": 2.8851, "step": 16860 }, { "epoch": 0.9788505613739882, "grad_norm": 0.08962107449769974, "learning_rate": 9.806961160383489e-07, "loss": 2.8736, "step": 16870 }, { "epoch": 0.9794307928863617, "grad_norm": 0.08911692351102829, "learning_rate": 9.277159598359841e-07, "loss": 2.8752, "step": 16880 }, { "epoch": 0.9800110243987351, "grad_norm": 0.08882761746644974, "learning_rate": 8.762052295891998e-07, "loss": 2.8873, "step": 16890 }, { "epoch": 0.9805912559111085, "grad_norm": 0.0880652666091919, "learning_rate": 8.261641149658772e-07, "loss": 2.8925, "step": 16900 }, { "epoch": 0.9811714874234819, "grad_norm": 0.08926094323396683, "learning_rate": 7.775928002226262e-07, "loss": 2.8888, "step": 16910 }, { "epoch": 0.9817517189358554, "grad_norm": 0.08992500603199005, "learning_rate": 7.304914642041194e-07, "loss": 2.8884, "step": 16920 }, { "epoch": 0.9823319504482289, "grad_norm": 0.08871559053659439, "learning_rate": 6.848602803423809e-07, "loss": 2.8871, "step": 16930 }, { "epoch": 0.9829121819606023, "grad_norm": 0.08762365579605103, "learning_rate": 6.406994166562986e-07, "loss": 2.8906, "step": 16940 }, { "epoch": 0.9834924134729757, "grad_norm": 0.08928296715021133, "learning_rate": 5.980090357506907e-07, "loss": 2.8819, "step": 16950 }, { "epoch": 0.9840726449853492, "grad_norm": 0.08903508633375168, "learning_rate": 5.567892948160846e-07, "loss": 2.8793, "step": 16960 }, { "epoch": 0.9846528764977226, "grad_norm": 0.09072455763816833, "learning_rate": 5.170403456278727e-07, "loss": 2.887, "step": 16970 }, { "epoch": 0.985233108010096, "grad_norm": 0.09042411297559738, "learning_rate": 4.787623345458236e-07, "loss": 2.8902, "step": 16980 }, { "epoch": 0.9858133395224694, "grad_norm": 0.08781596273183823, "learning_rate": 4.419554025135941e-07, "loss": 2.884, "step": 16990 }, { "epoch": 0.9863935710348429, "grad_norm": 0.08884679526090622, "learning_rate": 4.0661968505819603e-07, "loss": 2.8822, "step": 17000 }, { "epoch": 0.9863935710348429, "eval_loss": 2.835911273956299, "eval_runtime": 3.4963, "eval_samples_per_second": 1238.466, "eval_steps_per_second": 4.862, "step": 17000 }, { "epoch": 0.9869738025472163, "grad_norm": 0.08938398212194443, "learning_rate": 3.7275531228937456e-07, "loss": 2.889, "step": 17010 }, { "epoch": 0.9875540340595897, "grad_norm": 0.08916931599378586, "learning_rate": 3.403624088993862e-07, "loss": 2.8791, "step": 17020 }, { "epoch": 0.9881342655719633, "grad_norm": 0.08879252523183823, "learning_rate": 3.0944109416224386e-07, "loss": 2.888, "step": 17030 }, { "epoch": 0.9887144970843367, "grad_norm": 0.08880352228879929, "learning_rate": 2.7999148193345035e-07, "loss": 2.8788, "step": 17040 }, { "epoch": 0.9892947285967101, "grad_norm": 0.09070923179388046, "learning_rate": 2.520136806495987e-07, "loss": 2.8809, "step": 17050 }, { "epoch": 0.9898749601090835, "grad_norm": 0.08977285772562027, "learning_rate": 2.2550779332783934e-07, "loss": 2.8926, "step": 17060 }, { "epoch": 0.990455191621457, "grad_norm": 0.08938348293304443, "learning_rate": 2.0047391756570222e-07, "loss": 2.8703, "step": 17070 }, { "epoch": 0.9910354231338304, "grad_norm": 0.08904155343770981, "learning_rate": 1.7691214554043101e-07, "loss": 2.8896, "step": 17080 }, { "epoch": 0.9916156546462038, "grad_norm": 0.0895574688911438, "learning_rate": 1.5482256400907169e-07, "loss": 2.8881, "step": 17090 }, { "epoch": 0.9921958861585772, "grad_norm": 0.08951245993375778, "learning_rate": 1.3420525430767328e-07, "loss": 2.8944, "step": 17100 }, { "epoch": 0.9927761176709508, "grad_norm": 0.08813910186290741, "learning_rate": 1.1506029235137661e-07, "loss": 2.8786, "step": 17110 }, { "epoch": 0.9933563491833242, "grad_norm": 0.08863378316164017, "learning_rate": 9.738774863392586e-08, "loss": 2.8952, "step": 17120 }, { "epoch": 0.9939365806956976, "grad_norm": 0.08993889391422272, "learning_rate": 8.11876882274465e-08, "loss": 2.8826, "step": 17130 }, { "epoch": 0.994516812208071, "grad_norm": 0.08892907947301865, "learning_rate": 6.646017078231204e-08, "loss": 2.8854, "step": 17140 }, { "epoch": 0.9950970437204445, "grad_norm": 0.09042982757091522, "learning_rate": 5.320525052678882e-08, "loss": 2.883, "step": 17150 }, { "epoch": 0.9956772752328179, "grad_norm": 0.09041278064250946, "learning_rate": 4.142297626681391e-08, "loss": 2.8831, "step": 17160 }, { "epoch": 0.9962575067451913, "grad_norm": 0.08876681327819824, "learning_rate": 3.1113391386039525e-08, "loss": 2.8868, "step": 17170 }, { "epoch": 0.9968377382575647, "grad_norm": 0.09024324268102646, "learning_rate": 2.227653384534456e-08, "loss": 2.8812, "step": 17180 }, { "epoch": 0.9974179697699382, "grad_norm": 0.0894467830657959, "learning_rate": 1.49124361830566e-08, "loss": 2.8866, "step": 17190 }, { "epoch": 0.9979982012823116, "grad_norm": 0.08690998703241348, "learning_rate": 9.021125514463435e-09, "loss": 2.8836, "step": 17200 }, { "epoch": 0.9985784327946851, "grad_norm": 0.08950953185558319, "learning_rate": 4.60262353207952e-09, "loss": 2.892, "step": 17210 }, { "epoch": 0.9991586643070585, "grad_norm": 0.08751031756401062, "learning_rate": 1.6569465052462819e-09, "loss": 2.8827, "step": 17220 }, { "epoch": 0.999738895819432, "grad_norm": 0.09000025689601898, "learning_rate": 1.8410528022094753e-10, "loss": 2.8862, "step": 17230 }, { "epoch": 0.9999709884243814, "step": 17234, "total_flos": 2.054512066122744e+19, "train_loss": 3.1740073927513777, "train_runtime": 19579.6117, "train_samples_per_second": 450.663, "train_steps_per_second": 0.88 } ], "logging_steps": 10, "max_steps": 17234, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.054512066122744e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }