{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.020705694730546, "eval_steps": 500, "global_step": 47943, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002080867721840007, "grad_norm": 2.3411014080047607, "learning_rate": 2.773925104022192e-07, "loss": 0.84, "step": 10 }, { "epoch": 0.004161735443680014, "grad_norm": 2.1033313274383545, "learning_rate": 5.547850208044384e-07, "loss": 0.8333, "step": 20 }, { "epoch": 0.0062426031655200215, "grad_norm": 1.6868839263916016, "learning_rate": 8.321775312066576e-07, "loss": 0.8291, "step": 30 }, { "epoch": 0.008323470887360029, "grad_norm": 1.1484296321868896, "learning_rate": 1.1095700416088767e-06, "loss": 0.7712, "step": 40 }, { "epoch": 0.010404338609200037, "grad_norm": 0.5826171636581421, "learning_rate": 1.3869625520110958e-06, "loss": 0.7386, "step": 50 }, { "epoch": 0.012485206331040043, "grad_norm": 0.5573828220367432, "learning_rate": 1.6643550624133152e-06, "loss": 0.697, "step": 60 }, { "epoch": 0.014566074052880051, "grad_norm": 0.5225166082382202, "learning_rate": 1.941747572815534e-06, "loss": 0.6847, "step": 70 }, { "epoch": 0.016646941774720057, "grad_norm": 0.48244795203208923, "learning_rate": 2.2191400832177534e-06, "loss": 0.6823, "step": 80 }, { "epoch": 0.018727809496560065, "grad_norm": 0.460706502199173, "learning_rate": 2.4965325936199723e-06, "loss": 0.6485, "step": 90 }, { "epoch": 0.020808677218400073, "grad_norm": 0.4384317696094513, "learning_rate": 2.7739251040221917e-06, "loss": 0.646, "step": 100 }, { "epoch": 0.02288954494024008, "grad_norm": 0.40923649072647095, "learning_rate": 3.0513176144244106e-06, "loss": 0.6367, "step": 110 }, { "epoch": 0.024970412662080086, "grad_norm": 0.43868789076805115, "learning_rate": 3.3287101248266303e-06, "loss": 0.6349, "step": 120 }, { "epoch": 0.027051280383920094, "grad_norm": 0.39270323514938354, "learning_rate": 3.6061026352288493e-06, "loss": 0.6138, "step": 130 }, { "epoch": 0.029132148105760102, "grad_norm": 0.41815245151519775, "learning_rate": 3.883495145631068e-06, "loss": 0.6197, "step": 140 }, { "epoch": 0.03121301582760011, "grad_norm": 0.4409431219100952, "learning_rate": 4.160887656033287e-06, "loss": 0.6137, "step": 150 }, { "epoch": 0.033293883549440115, "grad_norm": 0.4535723924636841, "learning_rate": 4.438280166435507e-06, "loss": 0.5994, "step": 160 }, { "epoch": 0.035374751271280126, "grad_norm": 0.412886381149292, "learning_rate": 4.715672676837726e-06, "loss": 0.6064, "step": 170 }, { "epoch": 0.03745561899312013, "grad_norm": 0.470053106546402, "learning_rate": 4.993065187239945e-06, "loss": 0.6014, "step": 180 }, { "epoch": 0.039536486714960135, "grad_norm": 0.4806094765663147, "learning_rate": 5.2704576976421636e-06, "loss": 0.5915, "step": 190 }, { "epoch": 0.04161735443680015, "grad_norm": 0.4421631395816803, "learning_rate": 5.547850208044383e-06, "loss": 0.5826, "step": 200 }, { "epoch": 0.04369822215864015, "grad_norm": 0.5408437848091125, "learning_rate": 5.825242718446602e-06, "loss": 0.5838, "step": 210 }, { "epoch": 0.04577908988048016, "grad_norm": 0.4925328493118286, "learning_rate": 6.102635228848821e-06, "loss": 0.5687, "step": 220 }, { "epoch": 0.04785995760232017, "grad_norm": 0.48189249634742737, "learning_rate": 6.380027739251041e-06, "loss": 0.5693, "step": 230 }, { "epoch": 0.04994082532416017, "grad_norm": 0.48665720224380493, "learning_rate": 6.657420249653261e-06, "loss": 0.566, "step": 240 }, { "epoch": 0.052021693046000184, "grad_norm": 0.5162879228591919, "learning_rate": 6.93481276005548e-06, "loss": 0.5545, "step": 250 }, { "epoch": 0.05410256076784019, "grad_norm": 0.4470868408679962, "learning_rate": 7.2122052704576985e-06, "loss": 0.556, "step": 260 }, { "epoch": 0.0561834284896802, "grad_norm": 0.49502116441726685, "learning_rate": 7.489597780859917e-06, "loss": 0.5556, "step": 270 }, { "epoch": 0.058264296211520204, "grad_norm": 0.5173251032829285, "learning_rate": 7.766990291262136e-06, "loss": 0.5374, "step": 280 }, { "epoch": 0.06034516393336021, "grad_norm": 0.5482571721076965, "learning_rate": 8.044382801664356e-06, "loss": 0.5388, "step": 290 }, { "epoch": 0.06242603165520022, "grad_norm": 0.4879547953605652, "learning_rate": 8.321775312066574e-06, "loss": 0.5219, "step": 300 }, { "epoch": 0.06450689937704022, "grad_norm": 0.5000894069671631, "learning_rate": 8.599167822468794e-06, "loss": 0.5338, "step": 310 }, { "epoch": 0.06658776709888023, "grad_norm": 0.547823965549469, "learning_rate": 8.876560332871014e-06, "loss": 0.5217, "step": 320 }, { "epoch": 0.06866863482072023, "grad_norm": 0.5172742605209351, "learning_rate": 9.153952843273233e-06, "loss": 0.5158, "step": 330 }, { "epoch": 0.07074950254256025, "grad_norm": 0.550304651260376, "learning_rate": 9.431345353675451e-06, "loss": 0.5234, "step": 340 }, { "epoch": 0.07283037026440026, "grad_norm": 0.5031601786613464, "learning_rate": 9.708737864077671e-06, "loss": 0.5001, "step": 350 }, { "epoch": 0.07491123798624026, "grad_norm": 0.5092944502830505, "learning_rate": 9.98613037447989e-06, "loss": 0.5063, "step": 360 }, { "epoch": 0.07699210570808027, "grad_norm": 0.5248615741729736, "learning_rate": 1.0263522884882109e-05, "loss": 0.498, "step": 370 }, { "epoch": 0.07907297342992027, "grad_norm": 0.5415518283843994, "learning_rate": 1.0540915395284327e-05, "loss": 0.4987, "step": 380 }, { "epoch": 0.08115384115176029, "grad_norm": 0.5518950819969177, "learning_rate": 1.0818307905686547e-05, "loss": 0.5021, "step": 390 }, { "epoch": 0.0832347088736003, "grad_norm": 0.5505532026290894, "learning_rate": 1.1095700416088767e-05, "loss": 0.4922, "step": 400 }, { "epoch": 0.0853155765954403, "grad_norm": 0.5796090960502625, "learning_rate": 1.1373092926490985e-05, "loss": 0.4714, "step": 410 }, { "epoch": 0.0873964443172803, "grad_norm": 0.6258472204208374, "learning_rate": 1.1650485436893204e-05, "loss": 0.4798, "step": 420 }, { "epoch": 0.08947731203912031, "grad_norm": 0.5451054573059082, "learning_rate": 1.1927877947295423e-05, "loss": 0.4835, "step": 430 }, { "epoch": 0.09155817976096033, "grad_norm": 0.5761281251907349, "learning_rate": 1.2205270457697642e-05, "loss": 0.4844, "step": 440 }, { "epoch": 0.09363904748280033, "grad_norm": 0.5135563015937805, "learning_rate": 1.248266296809986e-05, "loss": 0.4591, "step": 450 }, { "epoch": 0.09571991520464034, "grad_norm": 0.6883898973464966, "learning_rate": 1.2760055478502082e-05, "loss": 0.4657, "step": 460 }, { "epoch": 0.09780078292648034, "grad_norm": 0.5398361682891846, "learning_rate": 1.3037447988904302e-05, "loss": 0.4552, "step": 470 }, { "epoch": 0.09988165064832034, "grad_norm": 0.556560218334198, "learning_rate": 1.3314840499306521e-05, "loss": 0.4575, "step": 480 }, { "epoch": 0.10196251837016036, "grad_norm": 0.5584908723831177, "learning_rate": 1.359223300970874e-05, "loss": 0.4565, "step": 490 }, { "epoch": 0.10404338609200037, "grad_norm": 0.5383917689323425, "learning_rate": 1.386962552011096e-05, "loss": 0.4494, "step": 500 }, { "epoch": 0.10612425381384037, "grad_norm": 0.5997122526168823, "learning_rate": 1.4147018030513177e-05, "loss": 0.4588, "step": 510 }, { "epoch": 0.10820512153568038, "grad_norm": 0.5846486687660217, "learning_rate": 1.4424410540915397e-05, "loss": 0.4503, "step": 520 }, { "epoch": 0.11028598925752038, "grad_norm": 0.6166787147521973, "learning_rate": 1.4701803051317615e-05, "loss": 0.4568, "step": 530 }, { "epoch": 0.1123668569793604, "grad_norm": 0.5325435400009155, "learning_rate": 1.4979195561719835e-05, "loss": 0.4486, "step": 540 }, { "epoch": 0.1144477247012004, "grad_norm": 0.6046930551528931, "learning_rate": 1.5256588072122055e-05, "loss": 0.4323, "step": 550 }, { "epoch": 0.11652859242304041, "grad_norm": 0.5020564794540405, "learning_rate": 1.5533980582524273e-05, "loss": 0.4424, "step": 560 }, { "epoch": 0.11860946014488041, "grad_norm": 0.5796077847480774, "learning_rate": 1.5811373092926492e-05, "loss": 0.4201, "step": 570 }, { "epoch": 0.12069032786672042, "grad_norm": 0.5508378744125366, "learning_rate": 1.6088765603328712e-05, "loss": 0.4355, "step": 580 }, { "epoch": 0.12277119558856044, "grad_norm": 0.5415584444999695, "learning_rate": 1.636615811373093e-05, "loss": 0.426, "step": 590 }, { "epoch": 0.12485206331040044, "grad_norm": 0.5371257066726685, "learning_rate": 1.6643550624133148e-05, "loss": 0.4299, "step": 600 }, { "epoch": 0.12693293103224043, "grad_norm": 0.5836284756660461, "learning_rate": 1.6920943134535368e-05, "loss": 0.4176, "step": 610 }, { "epoch": 0.12901379875408045, "grad_norm": 0.5548219084739685, "learning_rate": 1.7198335644937588e-05, "loss": 0.4192, "step": 620 }, { "epoch": 0.13109466647592047, "grad_norm": 0.5828205347061157, "learning_rate": 1.7475728155339808e-05, "loss": 0.4204, "step": 630 }, { "epoch": 0.13317553419776046, "grad_norm": 0.6023383140563965, "learning_rate": 1.7753120665742027e-05, "loss": 0.428, "step": 640 }, { "epoch": 0.13525640191960048, "grad_norm": 0.5714589357376099, "learning_rate": 1.8030513176144247e-05, "loss": 0.4181, "step": 650 }, { "epoch": 0.13733726964144047, "grad_norm": 0.5483116507530212, "learning_rate": 1.8307905686546467e-05, "loss": 0.4297, "step": 660 }, { "epoch": 0.1394181373632805, "grad_norm": 0.5876002311706543, "learning_rate": 1.8585298196948683e-05, "loss": 0.4103, "step": 670 }, { "epoch": 0.1414990050851205, "grad_norm": 0.5004014372825623, "learning_rate": 1.8862690707350903e-05, "loss": 0.4253, "step": 680 }, { "epoch": 0.1435798728069605, "grad_norm": 0.538354218006134, "learning_rate": 1.9140083217753123e-05, "loss": 0.4155, "step": 690 }, { "epoch": 0.14566074052880051, "grad_norm": 0.5196648836135864, "learning_rate": 1.9417475728155343e-05, "loss": 0.4116, "step": 700 }, { "epoch": 0.1477416082506405, "grad_norm": 0.545829176902771, "learning_rate": 1.9694868238557562e-05, "loss": 0.3977, "step": 710 }, { "epoch": 0.14982247597248052, "grad_norm": 0.6036495566368103, "learning_rate": 1.997226074895978e-05, "loss": 0.404, "step": 720 }, { "epoch": 0.15190334369432054, "grad_norm": 0.5703802108764648, "learning_rate": 2.0249653259362e-05, "loss": 0.3957, "step": 730 }, { "epoch": 0.15398421141616053, "grad_norm": 0.5305600166320801, "learning_rate": 2.0527045769764218e-05, "loss": 0.3826, "step": 740 }, { "epoch": 0.15606507913800055, "grad_norm": 0.7438121438026428, "learning_rate": 2.0804438280166438e-05, "loss": 0.3964, "step": 750 }, { "epoch": 0.15814594685984054, "grad_norm": 0.497429758310318, "learning_rate": 2.1081830790568654e-05, "loss": 0.3898, "step": 760 }, { "epoch": 0.16022681458168056, "grad_norm": 0.511633038520813, "learning_rate": 2.1359223300970874e-05, "loss": 0.3941, "step": 770 }, { "epoch": 0.16230768230352058, "grad_norm": 0.6061072945594788, "learning_rate": 2.1636615811373094e-05, "loss": 0.3937, "step": 780 }, { "epoch": 0.16438855002536057, "grad_norm": 0.5171635150909424, "learning_rate": 2.1914008321775314e-05, "loss": 0.3892, "step": 790 }, { "epoch": 0.1664694177472006, "grad_norm": 0.5516608953475952, "learning_rate": 2.2191400832177533e-05, "loss": 0.3877, "step": 800 }, { "epoch": 0.16855028546904058, "grad_norm": 0.5124514698982239, "learning_rate": 2.246879334257975e-05, "loss": 0.3834, "step": 810 }, { "epoch": 0.1706311531908806, "grad_norm": 0.590711772441864, "learning_rate": 2.274618585298197e-05, "loss": 0.3845, "step": 820 }, { "epoch": 0.17271202091272062, "grad_norm": 0.4929494559764862, "learning_rate": 2.302357836338419e-05, "loss": 0.3788, "step": 830 }, { "epoch": 0.1747928886345606, "grad_norm": 0.5041933059692383, "learning_rate": 2.330097087378641e-05, "loss": 0.3709, "step": 840 }, { "epoch": 0.17687375635640062, "grad_norm": 0.5636357665061951, "learning_rate": 2.357836338418863e-05, "loss": 0.3804, "step": 850 }, { "epoch": 0.17895462407824062, "grad_norm": 0.5682321786880493, "learning_rate": 2.3855755894590845e-05, "loss": 0.3786, "step": 860 }, { "epoch": 0.18103549180008063, "grad_norm": 0.5313034057617188, "learning_rate": 2.4133148404993065e-05, "loss": 0.3746, "step": 870 }, { "epoch": 0.18311635952192065, "grad_norm": 0.47882601618766785, "learning_rate": 2.4410540915395285e-05, "loss": 0.377, "step": 880 }, { "epoch": 0.18519722724376064, "grad_norm": 0.571884274482727, "learning_rate": 2.4687933425797504e-05, "loss": 0.3834, "step": 890 }, { "epoch": 0.18727809496560066, "grad_norm": 0.53108811378479, "learning_rate": 2.496532593619972e-05, "loss": 0.3688, "step": 900 }, { "epoch": 0.18935896268744065, "grad_norm": 0.6008568406105042, "learning_rate": 2.5242718446601947e-05, "loss": 0.3723, "step": 910 }, { "epoch": 0.19143983040928067, "grad_norm": 0.5836086869239807, "learning_rate": 2.5520110957004164e-05, "loss": 0.3506, "step": 920 }, { "epoch": 0.1935206981311207, "grad_norm": 0.5546506643295288, "learning_rate": 2.5797503467406383e-05, "loss": 0.3613, "step": 930 }, { "epoch": 0.19560156585296068, "grad_norm": 0.6195765137672424, "learning_rate": 2.6074895977808603e-05, "loss": 0.3624, "step": 940 }, { "epoch": 0.1976824335748007, "grad_norm": 0.4981297552585602, "learning_rate": 2.6352288488210823e-05, "loss": 0.3607, "step": 950 }, { "epoch": 0.1997633012966407, "grad_norm": 0.5058258175849915, "learning_rate": 2.6629680998613043e-05, "loss": 0.3577, "step": 960 }, { "epoch": 0.2018441690184807, "grad_norm": 0.5119690895080566, "learning_rate": 2.690707350901526e-05, "loss": 0.3444, "step": 970 }, { "epoch": 0.20392503674032073, "grad_norm": 0.5586859583854675, "learning_rate": 2.718446601941748e-05, "loss": 0.3545, "step": 980 }, { "epoch": 0.20600590446216072, "grad_norm": 0.5182698965072632, "learning_rate": 2.74618585298197e-05, "loss": 0.3409, "step": 990 }, { "epoch": 0.20808677218400073, "grad_norm": 0.5207095146179199, "learning_rate": 2.773925104022192e-05, "loss": 0.3599, "step": 1000 }, { "epoch": 0.21016763990584073, "grad_norm": 0.5361712574958801, "learning_rate": 2.8016643550624135e-05, "loss": 0.3553, "step": 1010 }, { "epoch": 0.21224850762768074, "grad_norm": 0.5423237681388855, "learning_rate": 2.8294036061026354e-05, "loss": 0.3579, "step": 1020 }, { "epoch": 0.21432937534952076, "grad_norm": 0.6166819930076599, "learning_rate": 2.8571428571428574e-05, "loss": 0.346, "step": 1030 }, { "epoch": 0.21641024307136075, "grad_norm": 0.46887466311454773, "learning_rate": 2.8848821081830794e-05, "loss": 0.35, "step": 1040 }, { "epoch": 0.21849111079320077, "grad_norm": 0.5211971998214722, "learning_rate": 2.9126213592233014e-05, "loss": 0.3493, "step": 1050 }, { "epoch": 0.22057197851504076, "grad_norm": 0.5611045956611633, "learning_rate": 2.940360610263523e-05, "loss": 0.3446, "step": 1060 }, { "epoch": 0.22265284623688078, "grad_norm": 0.4875233769416809, "learning_rate": 2.968099861303745e-05, "loss": 0.3632, "step": 1070 }, { "epoch": 0.2247337139587208, "grad_norm": 0.5015367865562439, "learning_rate": 2.995839112343967e-05, "loss": 0.33, "step": 1080 }, { "epoch": 0.2268145816805608, "grad_norm": 0.5094844698905945, "learning_rate": 3.023578363384189e-05, "loss": 0.3511, "step": 1090 }, { "epoch": 0.2288954494024008, "grad_norm": 0.5108461380004883, "learning_rate": 3.051317614424411e-05, "loss": 0.3477, "step": 1100 }, { "epoch": 0.2309763171242408, "grad_norm": 0.46363887190818787, "learning_rate": 3.079056865464633e-05, "loss": 0.3402, "step": 1110 }, { "epoch": 0.23305718484608082, "grad_norm": 0.5344491600990295, "learning_rate": 3.1067961165048545e-05, "loss": 0.3368, "step": 1120 }, { "epoch": 0.23513805256792084, "grad_norm": 0.5576034784317017, "learning_rate": 3.134535367545076e-05, "loss": 0.3347, "step": 1130 }, { "epoch": 0.23721892028976083, "grad_norm": 0.6531635522842407, "learning_rate": 3.1622746185852985e-05, "loss": 0.3316, "step": 1140 }, { "epoch": 0.23929978801160084, "grad_norm": 0.4941900372505188, "learning_rate": 3.19001386962552e-05, "loss": 0.3312, "step": 1150 }, { "epoch": 0.24138065573344084, "grad_norm": 0.556009829044342, "learning_rate": 3.2177531206657424e-05, "loss": 0.3195, "step": 1160 }, { "epoch": 0.24346152345528085, "grad_norm": 0.5304979681968689, "learning_rate": 3.245492371705964e-05, "loss": 0.3272, "step": 1170 }, { "epoch": 0.24554239117712087, "grad_norm": 0.5795299410820007, "learning_rate": 3.273231622746186e-05, "loss": 0.3337, "step": 1180 }, { "epoch": 0.24762325889896086, "grad_norm": 0.5468237400054932, "learning_rate": 3.300970873786408e-05, "loss": 0.3283, "step": 1190 }, { "epoch": 0.24970412662080088, "grad_norm": 0.5252871513366699, "learning_rate": 3.3287101248266297e-05, "loss": 0.3356, "step": 1200 }, { "epoch": 0.2517849943426409, "grad_norm": 0.4920332431793213, "learning_rate": 3.356449375866852e-05, "loss": 0.3151, "step": 1210 }, { "epoch": 0.25386586206448086, "grad_norm": 0.5123196244239807, "learning_rate": 3.3841886269070736e-05, "loss": 0.3316, "step": 1220 }, { "epoch": 0.2559467297863209, "grad_norm": 0.5303069353103638, "learning_rate": 3.411927877947295e-05, "loss": 0.3269, "step": 1230 }, { "epoch": 0.2580275975081609, "grad_norm": 0.510540783405304, "learning_rate": 3.4396671289875176e-05, "loss": 0.3315, "step": 1240 }, { "epoch": 0.2601084652300009, "grad_norm": 0.5283976197242737, "learning_rate": 3.467406380027739e-05, "loss": 0.3185, "step": 1250 }, { "epoch": 0.26218933295184094, "grad_norm": 0.5038172006607056, "learning_rate": 3.4951456310679615e-05, "loss": 0.3301, "step": 1260 }, { "epoch": 0.2642702006736809, "grad_norm": 0.4920941889286041, "learning_rate": 3.522884882108184e-05, "loss": 0.3123, "step": 1270 }, { "epoch": 0.2663510683955209, "grad_norm": 0.4713519215583801, "learning_rate": 3.5506241331484055e-05, "loss": 0.3201, "step": 1280 }, { "epoch": 0.26843193611736094, "grad_norm": 0.507668137550354, "learning_rate": 3.578363384188627e-05, "loss": 0.3132, "step": 1290 }, { "epoch": 0.27051280383920095, "grad_norm": 0.5752239227294922, "learning_rate": 3.6061026352288494e-05, "loss": 0.3161, "step": 1300 }, { "epoch": 0.272593671561041, "grad_norm": 0.5016420483589172, "learning_rate": 3.633841886269071e-05, "loss": 0.3042, "step": 1310 }, { "epoch": 0.27467453928288094, "grad_norm": 0.5392054319381714, "learning_rate": 3.6615811373092934e-05, "loss": 0.3059, "step": 1320 }, { "epoch": 0.27675540700472095, "grad_norm": 0.5451818704605103, "learning_rate": 3.689320388349515e-05, "loss": 0.3187, "step": 1330 }, { "epoch": 0.278836274726561, "grad_norm": 0.48481011390686035, "learning_rate": 3.7170596393897366e-05, "loss": 0.3164, "step": 1340 }, { "epoch": 0.280917142448401, "grad_norm": 0.5223284959793091, "learning_rate": 3.744798890429959e-05, "loss": 0.3015, "step": 1350 }, { "epoch": 0.282998010170241, "grad_norm": 0.4905010461807251, "learning_rate": 3.7725381414701806e-05, "loss": 0.3021, "step": 1360 }, { "epoch": 0.285078877892081, "grad_norm": 0.5254350304603577, "learning_rate": 3.800277392510403e-05, "loss": 0.3048, "step": 1370 }, { "epoch": 0.287159745613921, "grad_norm": 0.5294822454452515, "learning_rate": 3.8280166435506246e-05, "loss": 0.3115, "step": 1380 }, { "epoch": 0.289240613335761, "grad_norm": 0.5686126947402954, "learning_rate": 3.855755894590846e-05, "loss": 0.3006, "step": 1390 }, { "epoch": 0.29132148105760103, "grad_norm": 0.49913033843040466, "learning_rate": 3.8834951456310685e-05, "loss": 0.3066, "step": 1400 }, { "epoch": 0.29340234877944105, "grad_norm": 0.5537289381027222, "learning_rate": 3.91123439667129e-05, "loss": 0.3049, "step": 1410 }, { "epoch": 0.295483216501281, "grad_norm": 0.5877476334571838, "learning_rate": 3.9389736477115125e-05, "loss": 0.3068, "step": 1420 }, { "epoch": 0.29756408422312103, "grad_norm": 0.528241753578186, "learning_rate": 3.966712898751734e-05, "loss": 0.2951, "step": 1430 }, { "epoch": 0.29964495194496105, "grad_norm": 0.487116277217865, "learning_rate": 3.994452149791956e-05, "loss": 0.2978, "step": 1440 }, { "epoch": 0.30172581966680107, "grad_norm": 0.556057870388031, "learning_rate": 3.9999997092238556e-05, "loss": 0.3109, "step": 1450 }, { "epoch": 0.3038066873886411, "grad_norm": 0.5195190906524658, "learning_rate": 3.9999985279459116e-05, "loss": 0.2985, "step": 1460 }, { "epoch": 0.30588755511048105, "grad_norm": 0.5367197394371033, "learning_rate": 3.999996437993197e-05, "loss": 0.2937, "step": 1470 }, { "epoch": 0.30796842283232106, "grad_norm": 0.47495657205581665, "learning_rate": 3.999993439366661e-05, "loss": 0.2849, "step": 1480 }, { "epoch": 0.3100492905541611, "grad_norm": 0.46729224920272827, "learning_rate": 3.999989532067664e-05, "loss": 0.2813, "step": 1490 }, { "epoch": 0.3121301582760011, "grad_norm": 0.4885554015636444, "learning_rate": 3.999984716097984e-05, "loss": 0.2889, "step": 1500 }, { "epoch": 0.3142110259978411, "grad_norm": 0.490421861410141, "learning_rate": 3.9999789914598075e-05, "loss": 0.2941, "step": 1510 }, { "epoch": 0.3162918937196811, "grad_norm": 0.4422338008880615, "learning_rate": 3.999972358155736e-05, "loss": 0.2855, "step": 1520 }, { "epoch": 0.3183727614415211, "grad_norm": 0.4997877776622772, "learning_rate": 3.9999648161887824e-05, "loss": 0.2898, "step": 1530 }, { "epoch": 0.3204536291633611, "grad_norm": 0.4314012825489044, "learning_rate": 3.999956365562374e-05, "loss": 0.2846, "step": 1540 }, { "epoch": 0.32253449688520114, "grad_norm": 0.4961174428462982, "learning_rate": 3.999947006280351e-05, "loss": 0.2876, "step": 1550 }, { "epoch": 0.32461536460704116, "grad_norm": 0.4634471833705902, "learning_rate": 3.999936738346964e-05, "loss": 0.288, "step": 1560 }, { "epoch": 0.3266962323288811, "grad_norm": 0.5526167154312134, "learning_rate": 3.99992556176688e-05, "loss": 0.2726, "step": 1570 }, { "epoch": 0.32877710005072114, "grad_norm": 0.49075210094451904, "learning_rate": 3.999913476545175e-05, "loss": 0.2726, "step": 1580 }, { "epoch": 1.0021370256935485, "grad_norm": 0.4803968071937561, "learning_rate": 3.9999004826873406e-05, "loss": 0.2625, "step": 1590 }, { "epoch": 1.0084397742828162, "grad_norm": 0.5067729353904724, "learning_rate": 3.999886580199281e-05, "loss": 0.2593, "step": 1600 }, { "epoch": 1.0147425228720837, "grad_norm": 0.4871125817298889, "learning_rate": 3.999871769087312e-05, "loss": 0.2529, "step": 1610 }, { "epoch": 1.0210452714613514, "grad_norm": 0.46350720524787903, "learning_rate": 3.999856049358163e-05, "loss": 0.2581, "step": 1620 }, { "epoch": 1.0273480200506189, "grad_norm": 0.46517544984817505, "learning_rate": 3.999839421018975e-05, "loss": 0.2526, "step": 1630 }, { "epoch": 1.0336507686398866, "grad_norm": 0.5775149464607239, "learning_rate": 3.999821884077305e-05, "loss": 0.2618, "step": 1640 }, { "epoch": 1.039953517229154, "grad_norm": 0.474810391664505, "learning_rate": 3.999803438541119e-05, "loss": 0.247, "step": 1650 }, { "epoch": 1.0462562658184218, "grad_norm": 0.4558752477169037, "learning_rate": 3.999784084418799e-05, "loss": 0.2453, "step": 1660 }, { "epoch": 1.0525590144076893, "grad_norm": 0.599254310131073, "learning_rate": 3.9997638217191366e-05, "loss": 0.2532, "step": 1670 }, { "epoch": 1.058861762996957, "grad_norm": 0.4910080134868622, "learning_rate": 3.9997426504513385e-05, "loss": 0.2652, "step": 1680 }, { "epoch": 1.0651645115862245, "grad_norm": 0.46443894505500793, "learning_rate": 3.999720570625024e-05, "loss": 0.2621, "step": 1690 }, { "epoch": 1.0714672601754922, "grad_norm": 0.5010053515434265, "learning_rate": 3.9996975822502245e-05, "loss": 0.242, "step": 1700 }, { "epoch": 1.0777700087647597, "grad_norm": 0.4945682883262634, "learning_rate": 3.999673685337385e-05, "loss": 0.2462, "step": 1710 }, { "epoch": 1.0840727573540274, "grad_norm": 0.5110160708427429, "learning_rate": 3.999648879897362e-05, "loss": 0.25, "step": 1720 }, { "epoch": 1.090375505943295, "grad_norm": 0.5333554148674011, "learning_rate": 3.999623165941427e-05, "loss": 0.2403, "step": 1730 }, { "epoch": 1.0966782545325626, "grad_norm": 0.4536961019039154, "learning_rate": 3.9995965434812605e-05, "loss": 0.2472, "step": 1740 }, { "epoch": 1.1029810031218301, "grad_norm": 0.46494683623313904, "learning_rate": 3.9995690125289597e-05, "loss": 0.2414, "step": 1750 }, { "epoch": 1.1092837517110978, "grad_norm": 0.4517687261104584, "learning_rate": 3.9995405730970325e-05, "loss": 0.254, "step": 1760 }, { "epoch": 1.1155865003003653, "grad_norm": 0.46566787362098694, "learning_rate": 3.9995112251984e-05, "loss": 0.2318, "step": 1770 }, { "epoch": 1.121889248889633, "grad_norm": 0.4896292984485626, "learning_rate": 3.9994809688463965e-05, "loss": 0.2306, "step": 1780 }, { "epoch": 1.1281919974789005, "grad_norm": 0.534498929977417, "learning_rate": 3.9994498040547676e-05, "loss": 0.2366, "step": 1790 }, { "epoch": 1.1344947460681682, "grad_norm": 0.4239828586578369, "learning_rate": 3.999417730837674e-05, "loss": 0.2352, "step": 1800 }, { "epoch": 1.1407974946574357, "grad_norm": 0.5030452609062195, "learning_rate": 3.999384749209687e-05, "loss": 0.2415, "step": 1810 }, { "epoch": 1.1471002432467035, "grad_norm": 0.45849207043647766, "learning_rate": 3.999350859185791e-05, "loss": 0.2449, "step": 1820 }, { "epoch": 1.153402991835971, "grad_norm": 0.5510883331298828, "learning_rate": 3.9993160607813834e-05, "loss": 0.236, "step": 1830 }, { "epoch": 1.1597057404252387, "grad_norm": 0.4516603946685791, "learning_rate": 3.999280354012276e-05, "loss": 0.2288, "step": 1840 }, { "epoch": 1.1660084890145062, "grad_norm": 0.4839198887348175, "learning_rate": 3.9992437388946906e-05, "loss": 0.2392, "step": 1850 }, { "epoch": 1.1723112376037736, "grad_norm": 0.4888216257095337, "learning_rate": 3.9992062154452625e-05, "loss": 0.2363, "step": 1860 }, { "epoch": 1.1786139861930414, "grad_norm": 0.4802798926830292, "learning_rate": 3.99916778368104e-05, "loss": 0.238, "step": 1870 }, { "epoch": 1.184916734782309, "grad_norm": 0.4979405105113983, "learning_rate": 3.999128443619486e-05, "loss": 0.2466, "step": 1880 }, { "epoch": 1.1912194833715766, "grad_norm": 0.483803927898407, "learning_rate": 3.999088195278472e-05, "loss": 0.2398, "step": 1890 }, { "epoch": 1.197522231960844, "grad_norm": 0.5379830598831177, "learning_rate": 3.999047038676285e-05, "loss": 0.2371, "step": 1900 }, { "epoch": 1.2038249805501118, "grad_norm": 0.46652674674987793, "learning_rate": 3.999004973831624e-05, "loss": 0.2353, "step": 1910 }, { "epoch": 1.2101277291393795, "grad_norm": 0.4706651568412781, "learning_rate": 3.9989620007636004e-05, "loss": 0.2329, "step": 1920 }, { "epoch": 1.216430477728647, "grad_norm": 0.4794965982437134, "learning_rate": 3.9989181194917394e-05, "loss": 0.2461, "step": 1930 }, { "epoch": 1.2227332263179145, "grad_norm": 0.43573808670043945, "learning_rate": 3.9988733300359776e-05, "loss": 0.2249, "step": 1940 }, { "epoch": 1.2290359749071822, "grad_norm": 0.5253443717956543, "learning_rate": 3.9988276324166637e-05, "loss": 0.2346, "step": 1950 }, { "epoch": 1.2353387234964497, "grad_norm": 0.4183483421802521, "learning_rate": 3.9987810266545606e-05, "loss": 0.2224, "step": 1960 }, { "epoch": 1.2416414720857174, "grad_norm": 0.48613592982292175, "learning_rate": 3.998733512770843e-05, "loss": 0.2257, "step": 1970 }, { "epoch": 1.247944220674985, "grad_norm": 0.4992232024669647, "learning_rate": 3.9986850907870985e-05, "loss": 0.2215, "step": 1980 }, { "epoch": 1.2542469692642526, "grad_norm": 0.44093412160873413, "learning_rate": 3.998635760725326e-05, "loss": 0.2328, "step": 1990 }, { "epoch": 1.26054971785352, "grad_norm": 0.4545539617538452, "learning_rate": 3.9985855226079385e-05, "loss": 0.228, "step": 2000 }, { "epoch": 1.2668524664427878, "grad_norm": 0.5090922117233276, "learning_rate": 3.998534376457761e-05, "loss": 0.2266, "step": 2010 }, { "epoch": 1.2731552150320553, "grad_norm": 0.42511746287345886, "learning_rate": 3.9984823222980326e-05, "loss": 0.2282, "step": 2020 }, { "epoch": 1.279457963621323, "grad_norm": 0.41865649819374084, "learning_rate": 3.9984293601524015e-05, "loss": 0.2162, "step": 2030 }, { "epoch": 1.2857607122105905, "grad_norm": 0.48033300042152405, "learning_rate": 3.998375490044931e-05, "loss": 0.2297, "step": 2040 }, { "epoch": 1.2920634607998582, "grad_norm": 0.49681708216667175, "learning_rate": 3.998320712000096e-05, "loss": 0.225, "step": 2050 }, { "epoch": 1.2983662093891257, "grad_norm": 0.47631439566612244, "learning_rate": 3.998265026042786e-05, "loss": 0.2196, "step": 2060 }, { "epoch": 1.3046689579783934, "grad_norm": 0.5340527892112732, "learning_rate": 3.998208432198299e-05, "loss": 0.2235, "step": 2070 }, { "epoch": 1.310971706567661, "grad_norm": 0.4173406660556793, "learning_rate": 3.998150930492349e-05, "loss": 0.2191, "step": 2080 }, { "epoch": 1.3172744551569286, "grad_norm": 0.4502147436141968, "learning_rate": 3.99809252095106e-05, "loss": 0.2287, "step": 2090 }, { "epoch": 1.3235772037461961, "grad_norm": 0.4420490860939026, "learning_rate": 3.998033203600972e-05, "loss": 0.2133, "step": 2100 }, { "epoch": 1.3298799523354639, "grad_norm": 0.47674500942230225, "learning_rate": 3.9979729784690325e-05, "loss": 0.2299, "step": 2110 }, { "epoch": 1.3361827009247313, "grad_norm": 0.4726238250732422, "learning_rate": 3.997911845582605e-05, "loss": 0.221, "step": 2120 }, { "epoch": 1.342485449513999, "grad_norm": 0.4496481418609619, "learning_rate": 3.997849804969465e-05, "loss": 0.2164, "step": 2130 }, { "epoch": 1.3487881981032666, "grad_norm": 0.4389066994190216, "learning_rate": 3.9977868566577995e-05, "loss": 0.2217, "step": 2140 }, { "epoch": 1.3550909466925343, "grad_norm": 0.46826645731925964, "learning_rate": 3.9977230006762084e-05, "loss": 0.2156, "step": 2150 }, { "epoch": 1.3613936952818018, "grad_norm": 0.48218968510627747, "learning_rate": 3.997658237053703e-05, "loss": 0.218, "step": 2160 }, { "epoch": 1.3676964438710695, "grad_norm": 0.43983015418052673, "learning_rate": 3.9975925658197095e-05, "loss": 0.219, "step": 2170 }, { "epoch": 1.373999192460337, "grad_norm": 0.47644516825675964, "learning_rate": 3.9975259870040643e-05, "loss": 0.2164, "step": 2180 }, { "epoch": 1.3803019410496047, "grad_norm": 0.48191362619400024, "learning_rate": 3.9974585006370156e-05, "loss": 0.211, "step": 2190 }, { "epoch": 1.3866046896388722, "grad_norm": 0.432991087436676, "learning_rate": 3.997390106749226e-05, "loss": 0.2221, "step": 2200 }, { "epoch": 1.3929074382281397, "grad_norm": 0.4472201466560364, "learning_rate": 3.997320805371769e-05, "loss": 0.2144, "step": 2210 }, { "epoch": 1.3992101868174074, "grad_norm": 0.4602055847644806, "learning_rate": 3.997250596536131e-05, "loss": 0.1981, "step": 2220 }, { "epoch": 1.405512935406675, "grad_norm": 0.5007972717285156, "learning_rate": 3.997179480274211e-05, "loss": 0.2194, "step": 2230 }, { "epoch": 1.4118156839959426, "grad_norm": 0.4387381970882416, "learning_rate": 3.9971074566183185e-05, "loss": 0.211, "step": 2240 }, { "epoch": 1.41811843258521, "grad_norm": 0.4816175699234009, "learning_rate": 3.997034525601178e-05, "loss": 0.2139, "step": 2250 }, { "epoch": 1.4244211811744778, "grad_norm": 0.41405102610588074, "learning_rate": 3.9969606872559235e-05, "loss": 0.1982, "step": 2260 }, { "epoch": 1.4307239297637455, "grad_norm": 0.42122140526771545, "learning_rate": 3.996885941616104e-05, "loss": 0.198, "step": 2270 }, { "epoch": 1.437026678353013, "grad_norm": 0.5275078415870667, "learning_rate": 3.9968102887156776e-05, "loss": 0.2103, "step": 2280 }, { "epoch": 1.4433294269422805, "grad_norm": 0.48265472054481506, "learning_rate": 3.9967337285890175e-05, "loss": 0.2072, "step": 2290 }, { "epoch": 1.4496321755315482, "grad_norm": 0.5572926998138428, "learning_rate": 3.996656261270908e-05, "loss": 0.2066, "step": 2300 }, { "epoch": 1.455934924120816, "grad_norm": 0.4106401801109314, "learning_rate": 3.9965778867965445e-05, "loss": 0.1958, "step": 2310 }, { "epoch": 1.4622376727100834, "grad_norm": 0.4621416926383972, "learning_rate": 3.996498605201536e-05, "loss": 0.2149, "step": 2320 }, { "epoch": 1.468540421299351, "grad_norm": 0.4548831284046173, "learning_rate": 3.996418416521903e-05, "loss": 0.1982, "step": 2330 }, { "epoch": 1.4748431698886186, "grad_norm": 0.5119366645812988, "learning_rate": 3.9963373207940785e-05, "loss": 0.2008, "step": 2340 }, { "epoch": 1.4811459184778863, "grad_norm": 0.46298104524612427, "learning_rate": 3.996255318054907e-05, "loss": 0.2042, "step": 2350 }, { "epoch": 1.4874486670671538, "grad_norm": 0.47201600670814514, "learning_rate": 3.9961724083416455e-05, "loss": 0.2081, "step": 2360 }, { "epoch": 1.4937514156564213, "grad_norm": 0.46017932891845703, "learning_rate": 3.9960885916919635e-05, "loss": 0.2042, "step": 2370 }, { "epoch": 1.500054164245689, "grad_norm": 0.4416022002696991, "learning_rate": 3.996003868143941e-05, "loss": 0.1882, "step": 2380 }, { "epoch": 1.5063569128349568, "grad_norm": 0.45917487144470215, "learning_rate": 3.995918237736072e-05, "loss": 0.197, "step": 2390 }, { "epoch": 1.5126596614242243, "grad_norm": 0.4703538715839386, "learning_rate": 3.9958317005072616e-05, "loss": 0.1893, "step": 2400 }, { "epoch": 1.5189624100134917, "grad_norm": 0.44450321793556213, "learning_rate": 3.995744256496826e-05, "loss": 0.1949, "step": 2410 }, { "epoch": 1.5252651586027595, "grad_norm": 0.45814019441604614, "learning_rate": 3.995655905744495e-05, "loss": 0.191, "step": 2420 }, { "epoch": 1.5315679071920272, "grad_norm": 0.4615474045276642, "learning_rate": 3.99556664829041e-05, "loss": 0.2024, "step": 2430 }, { "epoch": 1.5378706557812947, "grad_norm": 0.4586353898048401, "learning_rate": 3.9954764841751243e-05, "loss": 0.194, "step": 2440 }, { "epoch": 1.5441734043705622, "grad_norm": 0.4379872679710388, "learning_rate": 3.995385413439602e-05, "loss": 0.206, "step": 2450 }, { "epoch": 1.5504761529598299, "grad_norm": 0.47381022572517395, "learning_rate": 3.9952934361252194e-05, "loss": 0.1871, "step": 2460 }, { "epoch": 1.5567789015490974, "grad_norm": 0.42367643117904663, "learning_rate": 3.995200552273767e-05, "loss": 0.19, "step": 2470 }, { "epoch": 1.5630816501383649, "grad_norm": 0.49796268343925476, "learning_rate": 3.995106761927444e-05, "loss": 0.1976, "step": 2480 }, { "epoch": 1.5693843987276326, "grad_norm": 0.4523191452026367, "learning_rate": 3.995012065128864e-05, "loss": 0.1921, "step": 2490 }, { "epoch": 1.5756871473169003, "grad_norm": 0.4541274309158325, "learning_rate": 3.99491646192105e-05, "loss": 0.1909, "step": 2500 }, { "epoch": 1.5819898959061678, "grad_norm": 0.45846495032310486, "learning_rate": 3.994819952347439e-05, "loss": 0.1824, "step": 2510 }, { "epoch": 1.5882926444954353, "grad_norm": 0.43728747963905334, "learning_rate": 3.99472253645188e-05, "loss": 0.192, "step": 2520 }, { "epoch": 1.594595393084703, "grad_norm": 0.4346434473991394, "learning_rate": 3.9946242142786304e-05, "loss": 0.1813, "step": 2530 }, { "epoch": 1.6008981416739707, "grad_norm": 0.48706403374671936, "learning_rate": 3.994524985872363e-05, "loss": 0.1948, "step": 2540 }, { "epoch": 1.6072008902632382, "grad_norm": 0.45505911111831665, "learning_rate": 3.994424851278161e-05, "loss": 0.1899, "step": 2550 }, { "epoch": 1.6135036388525057, "grad_norm": 0.4598616361618042, "learning_rate": 3.994323810541519e-05, "loss": 0.193, "step": 2560 }, { "epoch": 1.6198063874417734, "grad_norm": 0.44961825013160706, "learning_rate": 3.9942218637083445e-05, "loss": 0.1887, "step": 2570 }, { "epoch": 1.6261091360310411, "grad_norm": 0.4273212254047394, "learning_rate": 3.994119010824954e-05, "loss": 0.1891, "step": 2580 }, { "epoch": 1.6324118846203086, "grad_norm": 0.45176175236701965, "learning_rate": 3.994015251938079e-05, "loss": 0.1855, "step": 2590 }, { "epoch": 1.638714633209576, "grad_norm": 0.4702855050563812, "learning_rate": 3.993910587094861e-05, "loss": 0.1889, "step": 2600 }, { "epoch": 1.6450173817988438, "grad_norm": 0.48689204454421997, "learning_rate": 3.993805016342852e-05, "loss": 0.197, "step": 2610 }, { "epoch": 1.6513201303881115, "grad_norm": 0.46135565638542175, "learning_rate": 3.993698539730018e-05, "loss": 0.1865, "step": 2620 }, { "epoch": 1.657622878977379, "grad_norm": 0.4298563003540039, "learning_rate": 3.9935911573047355e-05, "loss": 0.1808, "step": 2630 }, { "epoch": 1.6639256275666465, "grad_norm": 0.442527711391449, "learning_rate": 3.9934828691157905e-05, "loss": 0.1672, "step": 2640 }, { "epoch": 1.6702283761559142, "grad_norm": 0.4763069450855255, "learning_rate": 3.993373675212384e-05, "loss": 0.1922, "step": 2650 }, { "epoch": 1.676531124745182, "grad_norm": 0.4350743889808655, "learning_rate": 3.993263575644127e-05, "loss": 0.173, "step": 2660 }, { "epoch": 1.6828338733344494, "grad_norm": 0.4019092619419098, "learning_rate": 3.993152570461042e-05, "loss": 0.1797, "step": 2670 }, { "epoch": 1.689136621923717, "grad_norm": 0.4276638329029083, "learning_rate": 3.993040659713561e-05, "loss": 0.1739, "step": 2680 }, { "epoch": 1.6954393705129847, "grad_norm": 0.4761333167552948, "learning_rate": 3.992927843452532e-05, "loss": 0.195, "step": 2690 }, { "epoch": 1.7017421191022524, "grad_norm": 0.43450096249580383, "learning_rate": 3.992814121729209e-05, "loss": 0.182, "step": 2700 }, { "epoch": 1.7080448676915199, "grad_norm": 0.45470181107521057, "learning_rate": 3.992699494595262e-05, "loss": 0.1813, "step": 2710 }, { "epoch": 1.7143476162807874, "grad_norm": 0.43640562891960144, "learning_rate": 3.99258396210277e-05, "loss": 0.1844, "step": 2720 }, { "epoch": 1.720650364870055, "grad_norm": 0.4665743112564087, "learning_rate": 3.992467524304223e-05, "loss": 0.1809, "step": 2730 }, { "epoch": 1.7269531134593228, "grad_norm": 0.472085565328598, "learning_rate": 3.992350181252524e-05, "loss": 0.1767, "step": 2740 }, { "epoch": 1.7332558620485903, "grad_norm": 0.4672451615333557, "learning_rate": 3.992231933000986e-05, "loss": 0.1793, "step": 2750 }, { "epoch": 1.7395586106378578, "grad_norm": 0.49024543166160583, "learning_rate": 3.992112779603334e-05, "loss": 0.1724, "step": 2760 }, { "epoch": 1.7458613592271255, "grad_norm": 0.41391807794570923, "learning_rate": 3.991992721113702e-05, "loss": 0.1808, "step": 2770 }, { "epoch": 1.7521641078163932, "grad_norm": 0.5061421394348145, "learning_rate": 3.99187175758664e-05, "loss": 0.1798, "step": 2780 }, { "epoch": 1.7584668564056607, "grad_norm": 0.44387102127075195, "learning_rate": 3.991749889077105e-05, "loss": 0.1809, "step": 2790 }, { "epoch": 1.7647696049949282, "grad_norm": 0.43782511353492737, "learning_rate": 3.991627115640466e-05, "loss": 0.1752, "step": 2800 }, { "epoch": 1.771072353584196, "grad_norm": 0.5715064406394958, "learning_rate": 3.9915034373325034e-05, "loss": 0.1804, "step": 2810 }, { "epoch": 1.7773751021734636, "grad_norm": 0.42727044224739075, "learning_rate": 3.9913788542094104e-05, "loss": 0.1661, "step": 2820 }, { "epoch": 1.783677850762731, "grad_norm": 0.40627333521842957, "learning_rate": 3.9912533663277896e-05, "loss": 0.1658, "step": 2830 }, { "epoch": 1.7899805993519986, "grad_norm": 0.43982890248298645, "learning_rate": 3.991126973744653e-05, "loss": 0.1768, "step": 2840 }, { "epoch": 1.7962833479412663, "grad_norm": 0.4824927747249603, "learning_rate": 3.9909996765174265e-05, "loss": 0.1731, "step": 2850 }, { "epoch": 1.8025860965305338, "grad_norm": 0.472576379776001, "learning_rate": 3.990871474703947e-05, "loss": 0.1755, "step": 2860 }, { "epoch": 1.8088888451198013, "grad_norm": 0.47619909048080444, "learning_rate": 3.99074236836246e-05, "loss": 0.1667, "step": 2870 }, { "epoch": 1.815191593709069, "grad_norm": 0.5371335744857788, "learning_rate": 3.990612357551625e-05, "loss": 0.171, "step": 2880 }, { "epoch": 1.8214943422983367, "grad_norm": 0.4684987962245941, "learning_rate": 3.990481442330509e-05, "loss": 0.1717, "step": 2890 }, { "epoch": 1.8277970908876042, "grad_norm": 0.44325536489486694, "learning_rate": 3.9903496227585926e-05, "loss": 0.1754, "step": 2900 }, { "epoch": 1.8340998394768717, "grad_norm": 0.45618388056755066, "learning_rate": 3.990216898895767e-05, "loss": 0.1821, "step": 2910 }, { "epoch": 1.8404025880661394, "grad_norm": 0.4308132231235504, "learning_rate": 3.990083270802333e-05, "loss": 0.1684, "step": 2920 }, { "epoch": 1.8467053366554071, "grad_norm": 0.4097435772418976, "learning_rate": 3.989948738539003e-05, "loss": 0.1736, "step": 2930 }, { "epoch": 1.8530080852446746, "grad_norm": 0.4433673918247223, "learning_rate": 3.9898133021668994e-05, "loss": 0.1678, "step": 2940 }, { "epoch": 1.8593108338339421, "grad_norm": 0.463981956243515, "learning_rate": 3.989676961747557e-05, "loss": 0.1685, "step": 2950 }, { "epoch": 1.8656135824232098, "grad_norm": 0.4708816111087799, "learning_rate": 3.98953971734292e-05, "loss": 0.1635, "step": 2960 }, { "epoch": 1.8719163310124776, "grad_norm": 0.48943549394607544, "learning_rate": 3.9894015690153435e-05, "loss": 0.1679, "step": 2970 }, { "epoch": 1.878219079601745, "grad_norm": 0.4639098644256592, "learning_rate": 3.989262516827594e-05, "loss": 0.1713, "step": 2980 }, { "epoch": 1.8845218281910125, "grad_norm": 0.4283701479434967, "learning_rate": 3.989122560842848e-05, "loss": 0.1754, "step": 2990 }, { "epoch": 1.8908245767802803, "grad_norm": 0.4343477189540863, "learning_rate": 3.988981701124693e-05, "loss": 0.1699, "step": 3000 }, { "epoch": 1.897127325369548, "grad_norm": 0.4539116322994232, "learning_rate": 3.9888399377371264e-05, "loss": 0.1658, "step": 3010 }, { "epoch": 1.9034300739588155, "grad_norm": 0.4731622040271759, "learning_rate": 3.988697270744557e-05, "loss": 0.1707, "step": 3020 }, { "epoch": 1.909732822548083, "grad_norm": 0.4326985478401184, "learning_rate": 3.9885537002118034e-05, "loss": 0.1658, "step": 3030 }, { "epoch": 1.9160355711373507, "grad_norm": 0.44999977946281433, "learning_rate": 3.988409226204095e-05, "loss": 0.1773, "step": 3040 }, { "epoch": 1.9223383197266184, "grad_norm": 0.4474574625492096, "learning_rate": 3.988263848787073e-05, "loss": 0.1676, "step": 3050 }, { "epoch": 1.9286410683158859, "grad_norm": 0.4656180441379547, "learning_rate": 3.9881175680267866e-05, "loss": 0.1627, "step": 3060 }, { "epoch": 1.9349438169051534, "grad_norm": 0.4666668474674225, "learning_rate": 3.9879703839896964e-05, "loss": 0.1577, "step": 3070 }, { "epoch": 1.941246565494421, "grad_norm": 0.5258769392967224, "learning_rate": 3.987822296742675e-05, "loss": 0.1558, "step": 3080 }, { "epoch": 1.9475493140836888, "grad_norm": 0.43421509861946106, "learning_rate": 3.987673306353003e-05, "loss": 0.1625, "step": 3090 }, { "epoch": 1.9538520626729563, "grad_norm": 0.40720343589782715, "learning_rate": 3.9875234128883734e-05, "loss": 0.1641, "step": 3100 }, { "epoch": 1.9601548112622238, "grad_norm": 0.44685980677604675, "learning_rate": 3.987372616416887e-05, "loss": 0.1649, "step": 3110 }, { "epoch": 1.9664575598514915, "grad_norm": 0.4323420226573944, "learning_rate": 3.987220917007057e-05, "loss": 0.16, "step": 3120 }, { "epoch": 1.9727603084407592, "grad_norm": 0.4535258710384369, "learning_rate": 3.987068314727807e-05, "loss": 0.157, "step": 3130 }, { "epoch": 1.9790630570300267, "grad_norm": 0.41930902004241943, "learning_rate": 3.986914809648468e-05, "loss": 0.162, "step": 3140 }, { "epoch": 1.9853658056192942, "grad_norm": 0.41208091378211975, "learning_rate": 3.9867604018387854e-05, "loss": 0.1596, "step": 3150 }, { "epoch": 1.991668554208562, "grad_norm": 0.4337902367115021, "learning_rate": 3.986605091368911e-05, "loss": 0.1575, "step": 3160 }, { "epoch": 1.9979713027978296, "grad_norm": 0.41506972908973694, "learning_rate": 3.986448878309409e-05, "loss": 0.1678, "step": 3170 }, { "epoch": 2.004274051387097, "grad_norm": 0.4155426621437073, "learning_rate": 3.986291762731252e-05, "loss": 0.1488, "step": 3180 }, { "epoch": 2.0105767999763646, "grad_norm": 0.4222520887851715, "learning_rate": 3.986133744705825e-05, "loss": 0.1627, "step": 3190 }, { "epoch": 2.0168795485656323, "grad_norm": 0.42573752999305725, "learning_rate": 3.98597482430492e-05, "loss": 0.1646, "step": 3200 }, { "epoch": 2.0231822971549, "grad_norm": 0.41649162769317627, "learning_rate": 3.9858150016007405e-05, "loss": 0.1548, "step": 3210 }, { "epoch": 2.0294850457441678, "grad_norm": 0.45249950885772705, "learning_rate": 3.9856542766659016e-05, "loss": 0.1504, "step": 3220 }, { "epoch": 2.035787794333435, "grad_norm": 0.4239870011806488, "learning_rate": 3.985492649573425e-05, "loss": 0.1628, "step": 3230 }, { "epoch": 2.0420905429227028, "grad_norm": 0.4301147758960724, "learning_rate": 3.9853301203967454e-05, "loss": 0.1573, "step": 3240 }, { "epoch": 2.04839329151197, "grad_norm": 0.4281153678894043, "learning_rate": 3.985166689209705e-05, "loss": 0.15, "step": 3250 }, { "epoch": 2.0546960401012377, "grad_norm": 0.46747416257858276, "learning_rate": 3.985002356086557e-05, "loss": 0.1641, "step": 3260 }, { "epoch": 2.0609987886905055, "grad_norm": 0.5423853397369385, "learning_rate": 3.984837121101964e-05, "loss": 0.1675, "step": 3270 }, { "epoch": 2.067301537279773, "grad_norm": 0.417141854763031, "learning_rate": 3.984670984330999e-05, "loss": 0.1483, "step": 3280 }, { "epoch": 2.073604285869041, "grad_norm": 0.5251865983009338, "learning_rate": 3.9845039458491435e-05, "loss": 0.1558, "step": 3290 }, { "epoch": 2.0799070344583086, "grad_norm": 0.43376970291137695, "learning_rate": 3.98433600573229e-05, "loss": 0.1365, "step": 3300 }, { "epoch": 2.086209783047576, "grad_norm": 0.44385552406311035, "learning_rate": 3.9841671640567395e-05, "loss": 0.154, "step": 3310 }, { "epoch": 2.0925125316368436, "grad_norm": 0.4632505476474762, "learning_rate": 3.983997420899203e-05, "loss": 0.156, "step": 3320 }, { "epoch": 2.098815280226111, "grad_norm": 0.40251821279525757, "learning_rate": 3.983826776336803e-05, "loss": 0.1467, "step": 3330 }, { "epoch": 2.1051180288153786, "grad_norm": 0.422961562871933, "learning_rate": 3.983655230447067e-05, "loss": 0.1562, "step": 3340 }, { "epoch": 2.1114207774046463, "grad_norm": 0.40030208230018616, "learning_rate": 3.983482783307937e-05, "loss": 0.1525, "step": 3350 }, { "epoch": 2.117723525993914, "grad_norm": 0.4453134834766388, "learning_rate": 3.983309434997761e-05, "loss": 0.1531, "step": 3360 }, { "epoch": 2.1240262745831817, "grad_norm": 0.46662160754203796, "learning_rate": 3.9831351855952975e-05, "loss": 0.1617, "step": 3370 }, { "epoch": 2.1303290231724494, "grad_norm": 0.40118637681007385, "learning_rate": 3.982960035179716e-05, "loss": 0.1503, "step": 3380 }, { "epoch": 2.1366317717617167, "grad_norm": 0.4365919530391693, "learning_rate": 3.982783983830592e-05, "loss": 0.1505, "step": 3390 }, { "epoch": 2.0025223513151644, "grad_norm": 0.4720030128955841, "learning_rate": 3.982607031627914e-05, "loss": 0.1458, "step": 3400 }, { "epoch": 2.0080507925538806, "grad_norm": 0.4371935725212097, "learning_rate": 3.982429178652078e-05, "loss": 0.1462, "step": 3410 }, { "epoch": 2.0135792337925973, "grad_norm": 0.4923725128173828, "learning_rate": 3.982250424983888e-05, "loss": 0.146, "step": 3420 }, { "epoch": 2.0191076750313135, "grad_norm": 0.4377041161060333, "learning_rate": 3.9820707707045596e-05, "loss": 0.1447, "step": 3430 }, { "epoch": 2.0246361162700297, "grad_norm": 0.4284875690937042, "learning_rate": 3.9818902158957155e-05, "loss": 0.1519, "step": 3440 }, { "epoch": 2.0301645575087464, "grad_norm": 0.46411600708961487, "learning_rate": 3.9817087606393895e-05, "loss": 0.1498, "step": 3450 }, { "epoch": 2.0356929987474626, "grad_norm": 0.4162873327732086, "learning_rate": 3.981526405018024e-05, "loss": 0.1469, "step": 3460 }, { "epoch": 2.0412214399861788, "grad_norm": 0.40271133184432983, "learning_rate": 3.981343149114469e-05, "loss": 0.15, "step": 3470 }, { "epoch": 2.0467498812248954, "grad_norm": 0.4702868163585663, "learning_rate": 3.981158993011984e-05, "loss": 0.1518, "step": 3480 }, { "epoch": 2.0522783224636116, "grad_norm": 0.4136670231819153, "learning_rate": 3.98097393679424e-05, "loss": 0.1366, "step": 3490 }, { "epoch": 2.057806763702328, "grad_norm": 0.4160245358943939, "learning_rate": 3.980787980545313e-05, "loss": 0.1386, "step": 3500 }, { "epoch": 2.0633352049410445, "grad_norm": 0.4755261540412903, "learning_rate": 3.9806011243496915e-05, "loss": 0.1459, "step": 3510 }, { "epoch": 2.0688636461797607, "grad_norm": 0.39648106694221497, "learning_rate": 3.980413368292271e-05, "loss": 0.1375, "step": 3520 }, { "epoch": 2.074392087418477, "grad_norm": 0.44521093368530273, "learning_rate": 3.9802247124583556e-05, "loss": 0.1492, "step": 3530 }, { "epoch": 2.0799205286571936, "grad_norm": 0.4031028151512146, "learning_rate": 3.9800351569336596e-05, "loss": 0.1464, "step": 3540 }, { "epoch": 2.0854489698959098, "grad_norm": 0.44121915102005005, "learning_rate": 3.979844701804304e-05, "loss": 0.1365, "step": 3550 }, { "epoch": 2.090977411134626, "grad_norm": 0.4047275185585022, "learning_rate": 3.979653347156821e-05, "loss": 0.1536, "step": 3560 }, { "epoch": 2.0965058523733426, "grad_norm": 0.5015032291412354, "learning_rate": 3.979461093078149e-05, "loss": 0.1366, "step": 3570 }, { "epoch": 2.102034293612059, "grad_norm": 0.44324538111686707, "learning_rate": 3.979267939655637e-05, "loss": 0.1471, "step": 3580 }, { "epoch": 2.1075627348507755, "grad_norm": 0.44045665860176086, "learning_rate": 3.979073886977042e-05, "loss": 0.1415, "step": 3590 }, { "epoch": 2.1130911760894917, "grad_norm": 0.4131348133087158, "learning_rate": 3.978878935130529e-05, "loss": 0.1434, "step": 3600 }, { "epoch": 2.118619617328208, "grad_norm": 0.44060182571411133, "learning_rate": 3.9786830842046734e-05, "loss": 0.152, "step": 3610 }, { "epoch": 2.1241480585669246, "grad_norm": 0.4803299605846405, "learning_rate": 3.978486334288455e-05, "loss": 0.1487, "step": 3620 }, { "epoch": 2.1296764998056408, "grad_norm": 0.6407479643821716, "learning_rate": 3.978288685471267e-05, "loss": 0.1413, "step": 3630 }, { "epoch": 2.135204941044357, "grad_norm": 0.5021945238113403, "learning_rate": 3.978090137842908e-05, "loss": 0.1349, "step": 3640 }, { "epoch": 2.1407333822830736, "grad_norm": 0.4292374551296234, "learning_rate": 3.9778906914935854e-05, "loss": 0.1406, "step": 3650 }, { "epoch": 2.14626182352179, "grad_norm": 0.4330940246582031, "learning_rate": 3.9776903465139154e-05, "loss": 0.1487, "step": 3660 }, { "epoch": 2.151790264760506, "grad_norm": 0.3934926986694336, "learning_rate": 3.977489102994923e-05, "loss": 0.1402, "step": 3670 }, { "epoch": 2.1573187059992227, "grad_norm": 0.46640247106552124, "learning_rate": 3.977286961028039e-05, "loss": 0.1343, "step": 3680 }, { "epoch": 2.162847147237939, "grad_norm": 0.4113956689834595, "learning_rate": 3.977083920705106e-05, "loss": 0.1331, "step": 3690 }, { "epoch": 2.168375588476655, "grad_norm": 0.48314768075942993, "learning_rate": 3.976879982118371e-05, "loss": 0.1445, "step": 3700 }, { "epoch": 2.1739040297153718, "grad_norm": 0.44583335518836975, "learning_rate": 3.976675145360493e-05, "loss": 0.1334, "step": 3710 }, { "epoch": 2.179432470954088, "grad_norm": 0.42313382029533386, "learning_rate": 3.976469410524536e-05, "loss": 0.1407, "step": 3720 }, { "epoch": 2.184960912192804, "grad_norm": 0.44853222370147705, "learning_rate": 3.9762627777039726e-05, "loss": 0.1397, "step": 3730 }, { "epoch": 2.190489353431521, "grad_norm": 0.46589556336402893, "learning_rate": 3.976055246992685e-05, "loss": 0.1357, "step": 3740 }, { "epoch": 2.196017794670237, "grad_norm": 0.43935269117355347, "learning_rate": 3.9758468184849614e-05, "loss": 0.1435, "step": 3750 }, { "epoch": 2.2015462359089533, "grad_norm": 0.44692733883857727, "learning_rate": 3.975637492275499e-05, "loss": 0.133, "step": 3760 }, { "epoch": 2.20707467714767, "grad_norm": 0.3989916741847992, "learning_rate": 3.975427268459403e-05, "loss": 0.1336, "step": 3770 }, { "epoch": 2.212603118386386, "grad_norm": 0.4053291380405426, "learning_rate": 3.975216147132185e-05, "loss": 0.1311, "step": 3780 }, { "epoch": 2.2181315596251028, "grad_norm": 0.491730272769928, "learning_rate": 3.9750041283897664e-05, "loss": 0.1381, "step": 3790 }, { "epoch": 2.223660000863819, "grad_norm": 0.43411651253700256, "learning_rate": 3.974791212328475e-05, "loss": 0.1295, "step": 3800 }, { "epoch": 2.229188442102535, "grad_norm": 0.4312554895877838, "learning_rate": 3.974577399045046e-05, "loss": 0.1358, "step": 3810 }, { "epoch": 2.234716883341252, "grad_norm": 0.4162611663341522, "learning_rate": 3.974362688636624e-05, "loss": 0.1359, "step": 3820 }, { "epoch": 2.240245324579968, "grad_norm": 0.3859942853450775, "learning_rate": 3.974147081200759e-05, "loss": 0.14, "step": 3830 }, { "epoch": 2.2457737658186843, "grad_norm": 0.4115585684776306, "learning_rate": 3.9739305768354104e-05, "loss": 0.1385, "step": 3840 }, { "epoch": 2.251302207057401, "grad_norm": 0.41153183579444885, "learning_rate": 3.973713175638943e-05, "loss": 0.1323, "step": 3850 }, { "epoch": 2.256830648296117, "grad_norm": 0.4219399392604828, "learning_rate": 3.973494877710132e-05, "loss": 0.133, "step": 3860 }, { "epoch": 2.2623590895348333, "grad_norm": 0.4809802174568176, "learning_rate": 3.973275683148158e-05, "loss": 0.1381, "step": 3870 }, { "epoch": 2.26788753077355, "grad_norm": 0.3913417458534241, "learning_rate": 3.973055592052608e-05, "loss": 0.1406, "step": 3880 }, { "epoch": 2.273415972012266, "grad_norm": 0.3952055275440216, "learning_rate": 3.9728346045234795e-05, "loss": 0.1358, "step": 3890 }, { "epoch": 2.2789444132509824, "grad_norm": 0.40519702434539795, "learning_rate": 3.9726127206611745e-05, "loss": 0.1266, "step": 3900 }, { "epoch": 2.284472854489699, "grad_norm": 0.4266471266746521, "learning_rate": 3.972389940566503e-05, "loss": 0.1398, "step": 3910 }, { "epoch": 2.2900012957284153, "grad_norm": 0.40302401781082153, "learning_rate": 3.972166264340683e-05, "loss": 0.13, "step": 3920 }, { "epoch": 2.295529736967132, "grad_norm": 0.4119429290294647, "learning_rate": 3.971941692085338e-05, "loss": 0.1273, "step": 3930 }, { "epoch": 2.301058178205848, "grad_norm": 0.43267980217933655, "learning_rate": 3.9717162239025016e-05, "loss": 0.1327, "step": 3940 }, { "epoch": 2.3065866194445643, "grad_norm": 0.4143143594264984, "learning_rate": 3.971489859894611e-05, "loss": 0.1351, "step": 3950 }, { "epoch": 2.312115060683281, "grad_norm": 0.4354778230190277, "learning_rate": 3.971262600164512e-05, "loss": 0.1299, "step": 3960 }, { "epoch": 2.317643501921997, "grad_norm": 0.4663635790348053, "learning_rate": 3.9710344448154574e-05, "loss": 0.1216, "step": 3970 }, { "epoch": 2.3231719431607134, "grad_norm": 0.4569236934185028, "learning_rate": 3.9708053939511074e-05, "loss": 0.1259, "step": 3980 }, { "epoch": 2.32870038439943, "grad_norm": 0.46295055747032166, "learning_rate": 3.9705754476755275e-05, "loss": 0.1415, "step": 3990 }, { "epoch": 2.3342288256381463, "grad_norm": 0.4508783221244812, "learning_rate": 3.970344606093192e-05, "loss": 0.1258, "step": 4000 }, { "epoch": 2.3397572668768625, "grad_norm": 0.42322778701782227, "learning_rate": 3.97011286930898e-05, "loss": 0.1326, "step": 4010 }, { "epoch": 2.345285708115579, "grad_norm": 0.5261969566345215, "learning_rate": 3.969880237428179e-05, "loss": 0.1294, "step": 4020 }, { "epoch": 2.3508141493542953, "grad_norm": 0.4136386811733246, "learning_rate": 3.969646710556481e-05, "loss": 0.1251, "step": 4030 }, { "epoch": 2.3563425905930115, "grad_norm": 0.40843337774276733, "learning_rate": 3.969412288799988e-05, "loss": 0.1294, "step": 4040 }, { "epoch": 2.361871031831728, "grad_norm": 0.43243229389190674, "learning_rate": 3.9691769722652055e-05, "loss": 0.1286, "step": 4050 }, { "epoch": 2.3673994730704444, "grad_norm": 0.4426671266555786, "learning_rate": 3.968940761059047e-05, "loss": 0.1387, "step": 4060 }, { "epoch": 2.3729279143091606, "grad_norm": 0.4446999132633209, "learning_rate": 3.968703655288832e-05, "loss": 0.145, "step": 4070 }, { "epoch": 2.3784563555478773, "grad_norm": 0.42282816767692566, "learning_rate": 3.968465655062287e-05, "loss": 0.1273, "step": 4080 }, { "epoch": 2.3839847967865935, "grad_norm": 0.43674221634864807, "learning_rate": 3.968226760487544e-05, "loss": 0.1311, "step": 4090 }, { "epoch": 2.3895132380253097, "grad_norm": 0.4247426688671112, "learning_rate": 3.9679869716731424e-05, "loss": 0.1357, "step": 4100 }, { "epoch": 2.3950416792640263, "grad_norm": 0.3931308686733246, "learning_rate": 3.967746288728027e-05, "loss": 0.1268, "step": 4110 }, { "epoch": 2.4005701205027425, "grad_norm": 0.3881493806838989, "learning_rate": 3.9675047117615475e-05, "loss": 0.1256, "step": 4120 }, { "epoch": 2.4060985617414588, "grad_norm": 0.3917362689971924, "learning_rate": 3.9672622408834645e-05, "loss": 0.1209, "step": 4130 }, { "epoch": 2.4116270029801754, "grad_norm": 0.40662428736686707, "learning_rate": 3.96701887620394e-05, "loss": 0.1298, "step": 4140 }, { "epoch": 2.4171554442188916, "grad_norm": 0.41967180371284485, "learning_rate": 3.9667746178335434e-05, "loss": 0.1222, "step": 4150 }, { "epoch": 2.4226838854576083, "grad_norm": 0.4021008610725403, "learning_rate": 3.966529465883251e-05, "loss": 0.1295, "step": 4160 }, { "epoch": 2.4282123266963245, "grad_norm": 0.4253454804420471, "learning_rate": 3.966283420464445e-05, "loss": 0.1202, "step": 4170 }, { "epoch": 2.4337407679350407, "grad_norm": 0.48380231857299805, "learning_rate": 3.9660364816889124e-05, "loss": 0.1278, "step": 4180 }, { "epoch": 2.4392692091737573, "grad_norm": 0.38834238052368164, "learning_rate": 3.965788649668847e-05, "loss": 0.1299, "step": 4190 }, { "epoch": 2.4447976504124735, "grad_norm": 0.38824462890625, "learning_rate": 3.965539924516848e-05, "loss": 0.1273, "step": 4200 }, { "epoch": 2.4503260916511898, "grad_norm": 0.45680421590805054, "learning_rate": 3.9652903063459216e-05, "loss": 0.1336, "step": 4210 }, { "epoch": 2.4558545328899064, "grad_norm": 0.44908010959625244, "learning_rate": 3.965039795269478e-05, "loss": 0.117, "step": 4220 }, { "epoch": 2.4613829741286226, "grad_norm": 0.40670692920684814, "learning_rate": 3.9647883914013335e-05, "loss": 0.1296, "step": 4230 }, { "epoch": 2.466911415367339, "grad_norm": 0.4207748472690582, "learning_rate": 3.964536094855711e-05, "loss": 0.1318, "step": 4240 }, { "epoch": 2.4724398566060555, "grad_norm": 0.43380653858184814, "learning_rate": 3.9642829057472383e-05, "loss": 0.1211, "step": 4250 }, { "epoch": 2.4779682978447717, "grad_norm": 0.377718061208725, "learning_rate": 3.964028824190948e-05, "loss": 0.1215, "step": 4260 }, { "epoch": 2.4834967390834883, "grad_norm": 0.37562650442123413, "learning_rate": 3.96377385030228e-05, "loss": 0.1213, "step": 4270 }, { "epoch": 2.4890251803222045, "grad_norm": 0.4355523884296417, "learning_rate": 3.9635179841970786e-05, "loss": 0.1185, "step": 4280 }, { "epoch": 2.4945536215609208, "grad_norm": 0.4724717438220978, "learning_rate": 3.963261225991592e-05, "loss": 0.1209, "step": 4290 }, { "epoch": 2.5000820627996374, "grad_norm": 0.4334560036659241, "learning_rate": 3.963003575802476e-05, "loss": 0.1181, "step": 4300 }, { "epoch": 2.5056105040383536, "grad_norm": 0.3827645182609558, "learning_rate": 3.962745033746791e-05, "loss": 0.1234, "step": 4310 }, { "epoch": 2.51113894527707, "grad_norm": 0.4257426857948303, "learning_rate": 3.962485599942002e-05, "loss": 0.1182, "step": 4320 }, { "epoch": 2.5166673865157865, "grad_norm": 0.46148166060447693, "learning_rate": 3.96222527450598e-05, "loss": 0.1248, "step": 4330 }, { "epoch": 2.5221958277545027, "grad_norm": 0.4582226276397705, "learning_rate": 3.9619640575570006e-05, "loss": 0.1136, "step": 4340 }, { "epoch": 2.527724268993219, "grad_norm": 0.46801555156707764, "learning_rate": 3.961701949213745e-05, "loss": 0.1176, "step": 4350 }, { "epoch": 2.5332527102319355, "grad_norm": 0.3948918282985687, "learning_rate": 3.961438949595297e-05, "loss": 0.1283, "step": 4360 }, { "epoch": 2.5387811514706518, "grad_norm": 0.3989269435405731, "learning_rate": 3.9611750588211496e-05, "loss": 0.1201, "step": 4370 }, { "epoch": 2.544309592709368, "grad_norm": 0.4456348419189453, "learning_rate": 3.9609102770111966e-05, "loss": 0.1112, "step": 4380 }, { "epoch": 2.5498380339480846, "grad_norm": 0.44770199060440063, "learning_rate": 3.9606446042857394e-05, "loss": 0.1209, "step": 4390 }, { "epoch": 2.555366475186801, "grad_norm": 0.4409154951572418, "learning_rate": 3.960378040765483e-05, "loss": 0.1248, "step": 4400 }, { "epoch": 2.560894916425517, "grad_norm": 0.3940699100494385, "learning_rate": 3.960110586571536e-05, "loss": 0.114, "step": 4410 }, { "epoch": 2.5664233576642337, "grad_norm": 0.4262852668762207, "learning_rate": 3.9598422418254146e-05, "loss": 0.1209, "step": 4420 }, { "epoch": 2.57195179890295, "grad_norm": 0.361361026763916, "learning_rate": 3.959573006649037e-05, "loss": 0.1156, "step": 4430 }, { "epoch": 2.577480240141666, "grad_norm": 0.42728179693222046, "learning_rate": 3.959302881164728e-05, "loss": 0.1243, "step": 4440 }, { "epoch": 2.5830086813803828, "grad_norm": 0.3920799493789673, "learning_rate": 3.959031865495214e-05, "loss": 0.1143, "step": 4450 }, { "epoch": 2.588537122619099, "grad_norm": 0.46031931042671204, "learning_rate": 3.95875995976363e-05, "loss": 0.1153, "step": 4460 }, { "epoch": 2.594065563857815, "grad_norm": 0.39148467779159546, "learning_rate": 3.9584871640935095e-05, "loss": 0.1183, "step": 4470 }, { "epoch": 2.599594005096532, "grad_norm": 0.38433676958084106, "learning_rate": 3.958213478608798e-05, "loss": 0.1179, "step": 4480 }, { "epoch": 2.605122446335248, "grad_norm": 0.40717417001724243, "learning_rate": 3.9579389034338376e-05, "loss": 0.1102, "step": 4490 }, { "epoch": 2.6106508875739642, "grad_norm": 0.49776431918144226, "learning_rate": 3.957663438693381e-05, "loss": 0.1176, "step": 4500 }, { "epoch": 2.616179328812681, "grad_norm": 0.43069538474082947, "learning_rate": 3.9573870845125804e-05, "loss": 0.1149, "step": 4510 }, { "epoch": 2.621707770051397, "grad_norm": 0.4419122636318207, "learning_rate": 3.9571098410169944e-05, "loss": 0.1103, "step": 4520 }, { "epoch": 2.6272362112901138, "grad_norm": 0.4307602047920227, "learning_rate": 3.9568317083325855e-05, "loss": 0.1139, "step": 4530 }, { "epoch": 2.63276465252883, "grad_norm": 0.49441689252853394, "learning_rate": 3.9565526865857194e-05, "loss": 0.1165, "step": 4540 }, { "epoch": 2.638293093767546, "grad_norm": 0.403817355632782, "learning_rate": 3.9562727759031666e-05, "loss": 0.1151, "step": 4550 }, { "epoch": 2.643821535006263, "grad_norm": 0.4570995569229126, "learning_rate": 3.955991976412101e-05, "loss": 0.1206, "step": 4560 }, { "epoch": 2.649349976244979, "grad_norm": 0.4763033986091614, "learning_rate": 3.9557102882401004e-05, "loss": 0.1226, "step": 4570 }, { "epoch": 2.6548784174836957, "grad_norm": 0.38693809509277344, "learning_rate": 3.955427711515146e-05, "loss": 0.1222, "step": 4580 }, { "epoch": 2.660406858722412, "grad_norm": 0.4658221900463104, "learning_rate": 3.955144246365623e-05, "loss": 0.1105, "step": 4590 }, { "epoch": 2.665935299961128, "grad_norm": 0.4216407835483551, "learning_rate": 3.954859892920321e-05, "loss": 0.1169, "step": 4600 }, { "epoch": 2.6714637411998448, "grad_norm": 0.4151599407196045, "learning_rate": 3.954574651308432e-05, "loss": 0.1168, "step": 4610 }, { "epoch": 2.676992182438561, "grad_norm": 0.4108287990093231, "learning_rate": 3.954288521659552e-05, "loss": 0.1167, "step": 4620 }, { "epoch": 2.682520623677277, "grad_norm": 0.4123769998550415, "learning_rate": 3.9540015041036814e-05, "loss": 0.1121, "step": 4630 }, { "epoch": 2.688049064915994, "grad_norm": 0.3626314699649811, "learning_rate": 3.953713598771222e-05, "loss": 0.1155, "step": 4640 }, { "epoch": 2.69357750615471, "grad_norm": 0.4260972738265991, "learning_rate": 3.9534248057929806e-05, "loss": 0.1221, "step": 4650 }, { "epoch": 2.6991059473934262, "grad_norm": 0.43823426961898804, "learning_rate": 3.953135125300166e-05, "loss": 0.1168, "step": 4660 }, { "epoch": 2.704634388632143, "grad_norm": 0.4378599524497986, "learning_rate": 3.9528445574243915e-05, "loss": 0.1151, "step": 4670 }, { "epoch": 2.710162829870859, "grad_norm": 0.4458305239677429, "learning_rate": 3.9525531022976735e-05, "loss": 0.1174, "step": 4680 }, { "epoch": 2.7156912711095753, "grad_norm": 0.45899659395217896, "learning_rate": 3.95226076005243e-05, "loss": 0.1162, "step": 4690 }, { "epoch": 2.721219712348292, "grad_norm": 0.4139960706233978, "learning_rate": 3.9519675308214844e-05, "loss": 0.1132, "step": 4700 }, { "epoch": 2.726748153587008, "grad_norm": 0.41143807768821716, "learning_rate": 3.951673414738061e-05, "loss": 0.117, "step": 4710 }, { "epoch": 2.7322765948257244, "grad_norm": 0.36129775643348694, "learning_rate": 3.951378411935788e-05, "loss": 0.1132, "step": 4720 }, { "epoch": 2.737805036064441, "grad_norm": 0.40212559700012207, "learning_rate": 3.9510825225486957e-05, "loss": 0.1056, "step": 4730 }, { "epoch": 2.7433334773031572, "grad_norm": 0.3516486585140228, "learning_rate": 3.950785746711219e-05, "loss": 0.1085, "step": 4740 }, { "epoch": 2.7488619185418735, "grad_norm": 0.3914463520050049, "learning_rate": 3.950488084558194e-05, "loss": 0.1076, "step": 4750 }, { "epoch": 2.75439035978059, "grad_norm": 0.3894129693508148, "learning_rate": 3.950189536224858e-05, "loss": 0.1149, "step": 4760 }, { "epoch": 2.7599188010193063, "grad_norm": 0.4161078929901123, "learning_rate": 3.949890101846856e-05, "loss": 0.1138, "step": 4770 }, { "epoch": 2.7654472422580225, "grad_norm": 0.4811912178993225, "learning_rate": 3.949589781560231e-05, "loss": 0.114, "step": 4780 }, { "epoch": 2.770975683496739, "grad_norm": 0.43853452801704407, "learning_rate": 3.9492885755014285e-05, "loss": 0.1148, "step": 4790 }, { "epoch": 2.7765041247354554, "grad_norm": 0.43201422691345215, "learning_rate": 3.948986483807299e-05, "loss": 0.1065, "step": 4800 }, { "epoch": 2.7820325659741716, "grad_norm": 0.38787567615509033, "learning_rate": 3.9486835066150934e-05, "loss": 0.113, "step": 4810 }, { "epoch": 2.7875610072128882, "grad_norm": 0.4203093349933624, "learning_rate": 3.948379644062467e-05, "loss": 0.1102, "step": 4820 }, { "epoch": 2.7930894484516045, "grad_norm": 0.47427502274513245, "learning_rate": 3.9480748962874746e-05, "loss": 0.1129, "step": 4830 }, { "epoch": 2.7986178896903207, "grad_norm": 0.40976908802986145, "learning_rate": 3.9477692634285763e-05, "loss": 0.1099, "step": 4840 }, { "epoch": 2.8041463309290373, "grad_norm": 0.43594345450401306, "learning_rate": 3.9474627456246305e-05, "loss": 0.113, "step": 4850 }, { "epoch": 2.8096747721677535, "grad_norm": 0.4238104522228241, "learning_rate": 3.9471553430149014e-05, "loss": 0.1064, "step": 4860 }, { "epoch": 2.8152032134064697, "grad_norm": 0.4034004509449005, "learning_rate": 3.946847055739053e-05, "loss": 0.1084, "step": 4870 }, { "epoch": 2.8207316546451864, "grad_norm": 0.4135122299194336, "learning_rate": 3.946537883937152e-05, "loss": 0.1025, "step": 4880 }, { "epoch": 2.8262600958839026, "grad_norm": 0.40541017055511475, "learning_rate": 3.9462278277496663e-05, "loss": 0.111, "step": 4890 }, { "epoch": 2.8317885371226192, "grad_norm": 0.4121241569519043, "learning_rate": 3.945916887317467e-05, "loss": 0.1118, "step": 4900 }, { "epoch": 2.8373169783613355, "grad_norm": 0.4163466989994049, "learning_rate": 3.945605062781825e-05, "loss": 0.1013, "step": 4910 }, { "epoch": 2.842845419600052, "grad_norm": 0.41021621227264404, "learning_rate": 3.945292354284415e-05, "loss": 0.1069, "step": 4920 }, { "epoch": 2.8483738608387683, "grad_norm": 0.41538918018341064, "learning_rate": 3.944978761967312e-05, "loss": 0.1044, "step": 4930 }, { "epoch": 2.8539023020774845, "grad_norm": 0.4511255919933319, "learning_rate": 3.944664285972993e-05, "loss": 0.1114, "step": 4940 }, { "epoch": 2.859430743316201, "grad_norm": 0.37908661365509033, "learning_rate": 3.944348926444335e-05, "loss": 0.1048, "step": 4950 }, { "epoch": 2.8649591845549174, "grad_norm": 0.4369373917579651, "learning_rate": 3.944032683524619e-05, "loss": 0.1031, "step": 4960 }, { "epoch": 2.8704876257936336, "grad_norm": 0.3852616548538208, "learning_rate": 3.943715557357526e-05, "loss": 0.1086, "step": 4970 }, { "epoch": 2.8760160670323502, "grad_norm": 0.3848329484462738, "learning_rate": 3.943397548087138e-05, "loss": 0.1103, "step": 4980 }, { "epoch": 2.8815445082710665, "grad_norm": 0.4077957272529602, "learning_rate": 3.943078655857939e-05, "loss": 0.1054, "step": 4990 }, { "epoch": 2.8870729495097827, "grad_norm": 0.40357813239097595, "learning_rate": 3.942758880814813e-05, "loss": 0.1063, "step": 5000 }, { "epoch": 2.8926013907484993, "grad_norm": 0.4105832874774933, "learning_rate": 3.9424382231030465e-05, "loss": 0.1054, "step": 5010 }, { "epoch": 2.8981298319872155, "grad_norm": 0.4045313000679016, "learning_rate": 3.942116682868327e-05, "loss": 0.1088, "step": 5020 }, { "epoch": 2.9036582732259317, "grad_norm": 0.4130319058895111, "learning_rate": 3.941794260256741e-05, "loss": 0.1036, "step": 5030 }, { "epoch": 2.9091867144646484, "grad_norm": 0.421705961227417, "learning_rate": 3.941470955414778e-05, "loss": 0.1052, "step": 5040 }, { "epoch": 2.9147151557033646, "grad_norm": 0.40853264927864075, "learning_rate": 3.9411467684893274e-05, "loss": 0.1067, "step": 5050 }, { "epoch": 2.920243596942081, "grad_norm": 0.40416499972343445, "learning_rate": 3.9408216996276795e-05, "loss": 0.0994, "step": 5060 }, { "epoch": 2.9257720381807975, "grad_norm": 0.390770822763443, "learning_rate": 3.940495748977526e-05, "loss": 0.1064, "step": 5070 }, { "epoch": 2.9313004794195137, "grad_norm": 0.3877948224544525, "learning_rate": 3.940168916686958e-05, "loss": 0.1098, "step": 5080 }, { "epoch": 2.93682892065823, "grad_norm": 0.3807394504547119, "learning_rate": 3.939841202904467e-05, "loss": 0.1022, "step": 5090 }, { "epoch": 2.9423573618969465, "grad_norm": 0.43130242824554443, "learning_rate": 3.939512607778947e-05, "loss": 0.1075, "step": 5100 }, { "epoch": 2.9478858031356627, "grad_norm": 0.42256706953048706, "learning_rate": 3.93918313145969e-05, "loss": 0.1067, "step": 5110 }, { "epoch": 2.953414244374379, "grad_norm": 0.38979974389076233, "learning_rate": 3.9388527740963915e-05, "loss": 0.0981, "step": 5120 }, { "epoch": 2.9589426856130956, "grad_norm": 0.40876826643943787, "learning_rate": 3.9385215358391435e-05, "loss": 0.1102, "step": 5130 }, { "epoch": 2.964471126851812, "grad_norm": 0.4242190420627594, "learning_rate": 3.9381894168384404e-05, "loss": 0.1068, "step": 5140 }, { "epoch": 2.969999568090528, "grad_norm": 0.5033330321311951, "learning_rate": 3.9378564172451765e-05, "loss": 0.106, "step": 5150 }, { "epoch": 2.9755280093292447, "grad_norm": 0.3564286231994629, "learning_rate": 3.937522537210646e-05, "loss": 0.1046, "step": 5160 }, { "epoch": 2.981056450567961, "grad_norm": 0.37209779024124146, "learning_rate": 3.937187776886544e-05, "loss": 0.0978, "step": 5170 }, { "epoch": 2.986584891806677, "grad_norm": 0.40230876207351685, "learning_rate": 3.936852136424963e-05, "loss": 0.1113, "step": 5180 }, { "epoch": 2.9921133330453937, "grad_norm": 0.38621124625205994, "learning_rate": 3.936515615978399e-05, "loss": 0.1105, "step": 5190 }, { "epoch": 2.99764177428411, "grad_norm": 0.4096004068851471, "learning_rate": 3.9361782156997446e-05, "loss": 0.1054, "step": 5200 }, { "epoch": 3.003170215522826, "grad_norm": 0.39862343668937683, "learning_rate": 3.935839935742294e-05, "loss": 0.0979, "step": 5210 }, { "epoch": 3.008698656761543, "grad_norm": 0.4386747181415558, "learning_rate": 3.9355007762597414e-05, "loss": 0.0942, "step": 5220 }, { "epoch": 3.0142270980002595, "grad_norm": 0.4412272572517395, "learning_rate": 3.935160737406178e-05, "loss": 0.0993, "step": 5230 }, { "epoch": 3.019755539238975, "grad_norm": 0.4449138939380646, "learning_rate": 3.934819819336097e-05, "loss": 0.1098, "step": 5240 }, { "epoch": 3.025283980477692, "grad_norm": 0.4455380141735077, "learning_rate": 3.934478022204391e-05, "loss": 0.1003, "step": 5250 }, { "epoch": 3.0308124217164085, "grad_norm": 0.4574155807495117, "learning_rate": 3.9341353461663504e-05, "loss": 0.1041, "step": 5260 }, { "epoch": 3.0363408629551243, "grad_norm": 0.3965187966823578, "learning_rate": 3.9337917913776665e-05, "loss": 0.1075, "step": 5270 }, { "epoch": 3.041869304193841, "grad_norm": 0.4028269648551941, "learning_rate": 3.933447357994429e-05, "loss": 0.0944, "step": 5280 }, { "epoch": 3.0473977454325576, "grad_norm": 0.37240272760391235, "learning_rate": 3.933102046173127e-05, "loss": 0.1019, "step": 5290 }, { "epoch": 3.052926186671274, "grad_norm": 0.4258761703968048, "learning_rate": 3.932755856070648e-05, "loss": 0.1007, "step": 5300 }, { "epoch": 3.05845462790999, "grad_norm": 0.4389885365962982, "learning_rate": 3.93240878784428e-05, "loss": 0.1097, "step": 5310 }, { "epoch": 3.0639830691487067, "grad_norm": 0.43050864338874817, "learning_rate": 3.93206084165171e-05, "loss": 0.0953, "step": 5320 }, { "epoch": 3.069511510387423, "grad_norm": 0.42751839756965637, "learning_rate": 3.931712017651021e-05, "loss": 0.0981, "step": 5330 }, { "epoch": 3.075039951626139, "grad_norm": 0.3839806318283081, "learning_rate": 3.931362316000699e-05, "loss": 0.0963, "step": 5340 }, { "epoch": 3.0805683928648557, "grad_norm": 0.39270731806755066, "learning_rate": 3.931011736859625e-05, "loss": 0.0978, "step": 5350 }, { "epoch": 3.086096834103572, "grad_norm": 0.3977856934070587, "learning_rate": 3.930660280387081e-05, "loss": 0.1012, "step": 5360 }, { "epoch": 3.091625275342288, "grad_norm": 0.4518764615058899, "learning_rate": 3.930307946742747e-05, "loss": 0.0984, "step": 5370 }, { "epoch": 3.097153716581005, "grad_norm": 0.40980660915374756, "learning_rate": 3.929954736086702e-05, "loss": 0.0939, "step": 5380 }, { "epoch": 3.102682157819721, "grad_norm": 0.43924733996391296, "learning_rate": 3.929600648579421e-05, "loss": 0.0928, "step": 5390 }, { "epoch": 3.108210599058437, "grad_norm": 0.4224366843700409, "learning_rate": 3.929245684381782e-05, "loss": 0.099, "step": 5400 }, { "epoch": 3.113739040297154, "grad_norm": 0.40405547618865967, "learning_rate": 3.928889843655056e-05, "loss": 0.0962, "step": 5410 }, { "epoch": 3.11926748153587, "grad_norm": 0.39338019490242004, "learning_rate": 3.928533126560916e-05, "loss": 0.0971, "step": 5420 }, { "epoch": 3.1247959227745863, "grad_norm": 0.3832258880138397, "learning_rate": 3.928175533261434e-05, "loss": 0.1025, "step": 5430 }, { "epoch": 3.130324364013303, "grad_norm": 0.42379528284072876, "learning_rate": 3.927817063919075e-05, "loss": 0.0972, "step": 5440 }, { "epoch": 3.135852805252019, "grad_norm": 0.4177568852901459, "learning_rate": 3.927457718696706e-05, "loss": 0.0994, "step": 5450 }, { "epoch": 3.1413812464907354, "grad_norm": 0.40894076228141785, "learning_rate": 3.927097497757592e-05, "loss": 0.1, "step": 5460 }, { "epoch": 3.146909687729452, "grad_norm": 0.40326249599456787, "learning_rate": 3.9267364012653945e-05, "loss": 0.0946, "step": 5470 }, { "epoch": 3.152438128968168, "grad_norm": 0.43118080496788025, "learning_rate": 3.9263744293841735e-05, "loss": 0.0971, "step": 5480 }, { "epoch": 3.1579665702068844, "grad_norm": 0.38085389137268066, "learning_rate": 3.926011582278386e-05, "loss": 0.1008, "step": 5490 }, { "epoch": 3.163495011445601, "grad_norm": 0.38605403900146484, "learning_rate": 3.9256478601128873e-05, "loss": 0.0992, "step": 5500 }, { "epoch": 3.1690234526843173, "grad_norm": 0.3789391815662384, "learning_rate": 3.92528326305293e-05, "loss": 0.0967, "step": 5510 }, { "epoch": 3.1745518939230335, "grad_norm": 0.39450758695602417, "learning_rate": 3.924917791264165e-05, "loss": 0.0954, "step": 5520 }, { "epoch": 3.18008033516175, "grad_norm": 0.3809379041194916, "learning_rate": 3.924551444912638e-05, "loss": 0.0913, "step": 5530 }, { "epoch": 3.1856087764004664, "grad_norm": 0.38566291332244873, "learning_rate": 3.9241842241647965e-05, "loss": 0.0908, "step": 5540 }, { "epoch": 3.1911372176391826, "grad_norm": 0.3613402545452118, "learning_rate": 3.923816129187481e-05, "loss": 0.0926, "step": 5550 }, { "epoch": 3.196665658877899, "grad_norm": 0.37487030029296875, "learning_rate": 3.9234471601479315e-05, "loss": 0.0967, "step": 5560 }, { "epoch": 3.202194100116616, "grad_norm": 0.3945640027523041, "learning_rate": 3.923077317213785e-05, "loss": 0.0953, "step": 5570 }, { "epoch": 3.2077225413553316, "grad_norm": 0.42098096013069153, "learning_rate": 3.922706600553074e-05, "loss": 0.0957, "step": 5580 }, { "epoch": 3.2132509825940483, "grad_norm": 0.39167165756225586, "learning_rate": 3.92233501033423e-05, "loss": 0.0945, "step": 5590 }, { "epoch": 3.218779423832765, "grad_norm": 0.4236452281475067, "learning_rate": 3.9219625467260794e-05, "loss": 0.0909, "step": 5600 }, { "epoch": 3.2243078650714807, "grad_norm": 0.4269375205039978, "learning_rate": 3.921589209897848e-05, "loss": 0.0948, "step": 5610 }, { "epoch": 3.2298363063101974, "grad_norm": 0.40583720803260803, "learning_rate": 3.9212150000191565e-05, "loss": 0.0996, "step": 5620 }, { "epoch": 3.235364747548914, "grad_norm": 0.38379621505737305, "learning_rate": 3.920839917260021e-05, "loss": 0.0974, "step": 5630 }, { "epoch": 3.24089318878763, "grad_norm": 0.3955712914466858, "learning_rate": 3.920463961790858e-05, "loss": 0.0929, "step": 5640 }, { "epoch": 3.2464216300263464, "grad_norm": 0.4033460021018982, "learning_rate": 3.920087133782476e-05, "loss": 0.083, "step": 5650 }, { "epoch": 3.251950071265063, "grad_norm": 0.4046616554260254, "learning_rate": 3.919709433406084e-05, "loss": 0.0967, "step": 5660 }, { "epoch": 3.2574785125037793, "grad_norm": 0.37485313415527344, "learning_rate": 3.9193308608332833e-05, "loss": 0.0929, "step": 5670 }, { "epoch": 3.2630069537424955, "grad_norm": 0.4146013557910919, "learning_rate": 3.9189514162360766e-05, "loss": 0.1026, "step": 5680 }, { "epoch": 3.268535394981212, "grad_norm": 0.3998347818851471, "learning_rate": 3.9185710997868584e-05, "loss": 0.099, "step": 5690 }, { "epoch": 3.2740638362199284, "grad_norm": 0.3854697644710541, "learning_rate": 3.91818991165842e-05, "loss": 0.1024, "step": 5700 }, { "epoch": 3.2795922774586446, "grad_norm": 0.3749615252017975, "learning_rate": 3.917807852023951e-05, "loss": 0.0969, "step": 5710 }, { "epoch": 3.285120718697361, "grad_norm": 0.4053827226161957, "learning_rate": 3.917424921057035e-05, "loss": 0.091, "step": 5720 }, { "epoch": 3.2906491599360774, "grad_norm": 0.4030649960041046, "learning_rate": 3.917041118931652e-05, "loss": 0.0896, "step": 5730 }, { "epoch": 3.2961776011747936, "grad_norm": 0.43534618616104126, "learning_rate": 3.916656445822177e-05, "loss": 0.0959, "step": 5740 }, { "epoch": 3.3017060424135103, "grad_norm": 0.403164803981781, "learning_rate": 3.9162709019033835e-05, "loss": 0.096, "step": 5750 }, { "epoch": 3.3072344836522265, "grad_norm": 0.4095669090747833, "learning_rate": 3.915884487350436e-05, "loss": 0.0964, "step": 5760 }, { "epoch": 3.3127629248909427, "grad_norm": 0.3990991413593292, "learning_rate": 3.9154972023388996e-05, "loss": 0.0944, "step": 5770 }, { "epoch": 3.3182913661296594, "grad_norm": 0.3821229934692383, "learning_rate": 3.91510904704473e-05, "loss": 0.0879, "step": 5780 }, { "epoch": 3.3238198073683756, "grad_norm": 0.3778615891933441, "learning_rate": 3.914720021644284e-05, "loss": 0.0978, "step": 5790 }, { "epoch": 3.3293482486070918, "grad_norm": 0.43820732831954956, "learning_rate": 3.9143301263143075e-05, "loss": 0.0955, "step": 5800 }, { "epoch": 3.3348766898458084, "grad_norm": 0.38895854353904724, "learning_rate": 3.913939361231947e-05, "loss": 0.0854, "step": 5810 }, { "epoch": 3.3404051310845246, "grad_norm": 0.4394699037075043, "learning_rate": 3.9135477265747394e-05, "loss": 0.0983, "step": 5820 }, { "epoch": 3.345933572323241, "grad_norm": 0.42309892177581787, "learning_rate": 3.913155222520621e-05, "loss": 0.0897, "step": 5830 }, { "epoch": 3.3514620135619575, "grad_norm": 0.3798421621322632, "learning_rate": 3.912761849247921e-05, "loss": 0.0881, "step": 5840 }, { "epoch": 3.3569904548006737, "grad_norm": 0.385541170835495, "learning_rate": 3.912367606935362e-05, "loss": 0.0888, "step": 5850 }, { "epoch": 3.36251889603939, "grad_norm": 0.4271203875541687, "learning_rate": 3.911972495762066e-05, "loss": 0.0925, "step": 5860 }, { "epoch": 3.3680473372781066, "grad_norm": 0.4005385637283325, "learning_rate": 3.911576515907545e-05, "loss": 0.0902, "step": 5870 }, { "epoch": 3.3735757785168228, "grad_norm": 0.3521791994571686, "learning_rate": 3.911179667551707e-05, "loss": 0.0856, "step": 5880 }, { "epoch": 3.379104219755539, "grad_norm": 0.3891170620918274, "learning_rate": 3.910781950874857e-05, "loss": 0.0922, "step": 5890 }, { "epoch": 3.3846326609942556, "grad_norm": 0.3729146420955658, "learning_rate": 3.910383366057692e-05, "loss": 0.0906, "step": 5900 }, { "epoch": 3.3901611022329723, "grad_norm": 0.3506501317024231, "learning_rate": 3.909983913281304e-05, "loss": 0.0878, "step": 5910 }, { "epoch": 3.395689543471688, "grad_norm": 0.45811739563941956, "learning_rate": 3.9095835927271784e-05, "loss": 0.0909, "step": 5920 }, { "epoch": 3.4012179847104047, "grad_norm": 0.3891924321651459, "learning_rate": 3.9091824045771976e-05, "loss": 0.0884, "step": 5930 }, { "epoch": 3.4067464259491214, "grad_norm": 0.36187615990638733, "learning_rate": 3.9087803490136364e-05, "loss": 0.0878, "step": 5940 }, { "epoch": 3.412274867187837, "grad_norm": 0.4014558792114258, "learning_rate": 3.9083774262191624e-05, "loss": 0.099, "step": 5950 }, { "epoch": 3.4178033084265538, "grad_norm": 0.4023808538913727, "learning_rate": 3.90797363637684e-05, "loss": 0.0842, "step": 5960 }, { "epoch": 3.4233317496652704, "grad_norm": 0.37827253341674805, "learning_rate": 3.907568979670125e-05, "loss": 0.0892, "step": 5970 }, { "epoch": 3.428860190903986, "grad_norm": 0.5029112696647644, "learning_rate": 3.9071634562828694e-05, "loss": 0.0964, "step": 5980 }, { "epoch": 3.434388632142703, "grad_norm": 0.4736369848251343, "learning_rate": 3.9067570663993176e-05, "loss": 0.0967, "step": 5990 }, { "epoch": 3.4399170733814195, "grad_norm": 0.4507979154586792, "learning_rate": 3.906349810204107e-05, "loss": 0.0909, "step": 6000 }, { "epoch": 3.4454455146201357, "grad_norm": 0.4070133566856384, "learning_rate": 3.905941687882269e-05, "loss": 0.0823, "step": 6010 }, { "epoch": 3.450973955858852, "grad_norm": 0.3976655900478363, "learning_rate": 3.9055326996192315e-05, "loss": 0.0826, "step": 6020 }, { "epoch": 3.4565023970975686, "grad_norm": 0.4014241099357605, "learning_rate": 3.905122845600811e-05, "loss": 0.0821, "step": 6030 }, { "epoch": 3.4620308383362848, "grad_norm": 0.38843992352485657, "learning_rate": 3.90471212601322e-05, "loss": 0.0821, "step": 6040 }, { "epoch": 3.467559279575001, "grad_norm": 0.3792579472064972, "learning_rate": 3.904300541043064e-05, "loss": 0.1046, "step": 6050 }, { "epoch": 3.4730877208137176, "grad_norm": 0.4405553638935089, "learning_rate": 3.9038880908773425e-05, "loss": 0.0928, "step": 6060 }, { "epoch": 3.478616162052434, "grad_norm": 0.3564014434814453, "learning_rate": 3.9034747757034466e-05, "loss": 0.0957, "step": 6070 }, { "epoch": 3.48414460329115, "grad_norm": 0.3685324490070343, "learning_rate": 3.90306059570916e-05, "loss": 0.0823, "step": 6080 }, { "epoch": 3.4896730445298667, "grad_norm": 0.3722662925720215, "learning_rate": 3.9026455510826624e-05, "loss": 0.0862, "step": 6090 }, { "epoch": 3.495201485768583, "grad_norm": 0.3736521005630493, "learning_rate": 3.902229642012522e-05, "loss": 0.0933, "step": 6100 }, { "epoch": 3.500729927007299, "grad_norm": 0.37787383794784546, "learning_rate": 3.901812868687703e-05, "loss": 0.0807, "step": 6110 }, { "epoch": 3.000918421233899, "grad_norm": 0.39985015988349915, "learning_rate": 3.901395231297561e-05, "loss": 0.0831, "step": 6120 }, { "epoch": 3.0045921061694947, "grad_norm": 0.36359015107154846, "learning_rate": 3.900976730031845e-05, "loss": 0.0838, "step": 6130 }, { "epoch": 3.0082657911050905, "grad_norm": 0.37965527176856995, "learning_rate": 3.900557365080696e-05, "loss": 0.0796, "step": 6140 }, { "epoch": 3.011939476040686, "grad_norm": 0.4232991933822632, "learning_rate": 3.900137136634646e-05, "loss": 0.0758, "step": 6150 }, { "epoch": 3.0156131609762817, "grad_norm": 0.38618186116218567, "learning_rate": 3.899716044884621e-05, "loss": 0.0736, "step": 6160 }, { "epoch": 3.0192868459118776, "grad_norm": 0.34749099612236023, "learning_rate": 3.8992940900219406e-05, "loss": 0.0729, "step": 6170 }, { "epoch": 3.0229605308474734, "grad_norm": 0.35351717472076416, "learning_rate": 3.8988712722383126e-05, "loss": 0.0716, "step": 6180 }, { "epoch": 3.026634215783069, "grad_norm": 0.372409850358963, "learning_rate": 3.898447591725841e-05, "loss": 0.0757, "step": 6190 }, { "epoch": 3.0303079007186646, "grad_norm": 0.37387657165527344, "learning_rate": 3.898023048677018e-05, "loss": 0.0761, "step": 6200 }, { "epoch": 3.0339815856542605, "grad_norm": 0.3686372935771942, "learning_rate": 3.897597643284731e-05, "loss": 0.0841, "step": 6210 }, { "epoch": 3.037655270589856, "grad_norm": 0.36609306931495667, "learning_rate": 3.897171375742257e-05, "loss": 0.0705, "step": 6220 }, { "epoch": 3.0413289555254517, "grad_norm": 0.39538371562957764, "learning_rate": 3.896744246243264e-05, "loss": 0.0734, "step": 6230 }, { "epoch": 3.0450026404610475, "grad_norm": 0.33480340242385864, "learning_rate": 3.896316254981816e-05, "loss": 0.0735, "step": 6240 }, { "epoch": 3.0486763253966433, "grad_norm": 0.31124961376190186, "learning_rate": 3.895887402152364e-05, "loss": 0.0736, "step": 6250 }, { "epoch": 3.0523500103322387, "grad_norm": 0.3793226182460785, "learning_rate": 3.895457687949752e-05, "loss": 0.0714, "step": 6260 }, { "epoch": 3.0560236952678346, "grad_norm": 0.3642493784427643, "learning_rate": 3.895027112569216e-05, "loss": 0.0765, "step": 6270 }, { "epoch": 3.0596973802034304, "grad_norm": 0.4314895272254944, "learning_rate": 3.8945956762063815e-05, "loss": 0.0717, "step": 6280 }, { "epoch": 3.063371065139026, "grad_norm": 0.3597939610481262, "learning_rate": 3.894163379057267e-05, "loss": 0.0758, "step": 6290 }, { "epoch": 3.0670447500746216, "grad_norm": 0.37375080585479736, "learning_rate": 3.8937302213182814e-05, "loss": 0.0701, "step": 6300 }, { "epoch": 3.0707184350102175, "grad_norm": 0.4299790859222412, "learning_rate": 3.8932962031862246e-05, "loss": 0.0715, "step": 6310 }, { "epoch": 3.0743921199458133, "grad_norm": 0.36089223623275757, "learning_rate": 3.892861324858287e-05, "loss": 0.0766, "step": 6320 }, { "epoch": 3.0780658048814087, "grad_norm": 0.3766305148601532, "learning_rate": 3.892425586532052e-05, "loss": 0.0711, "step": 6330 }, { "epoch": 3.0817394898170045, "grad_norm": 0.37447142601013184, "learning_rate": 3.891988988405489e-05, "loss": 0.0759, "step": 6340 }, { "epoch": 3.0854131747526004, "grad_norm": 0.4209616482257843, "learning_rate": 3.8915515306769636e-05, "loss": 0.0734, "step": 6350 }, { "epoch": 3.089086859688196, "grad_norm": 0.3882461488246918, "learning_rate": 3.891113213545228e-05, "loss": 0.08, "step": 6360 }, { "epoch": 3.0927605446237916, "grad_norm": 0.3927229940891266, "learning_rate": 3.8906740372094256e-05, "loss": 0.0712, "step": 6370 }, { "epoch": 3.0964342295593874, "grad_norm": 0.38795018196105957, "learning_rate": 3.890234001869093e-05, "loss": 0.0695, "step": 6380 }, { "epoch": 3.1001079144949832, "grad_norm": 0.3601156175136566, "learning_rate": 3.889793107724153e-05, "loss": 0.0744, "step": 6390 }, { "epoch": 3.1037815994305786, "grad_norm": 0.4026469886302948, "learning_rate": 3.889351354974921e-05, "loss": 0.074, "step": 6400 }, { "epoch": 3.1074552843661745, "grad_norm": 0.3505708575248718, "learning_rate": 3.888908743822102e-05, "loss": 0.0716, "step": 6410 }, { "epoch": 3.1111289693017703, "grad_norm": 0.3689245879650116, "learning_rate": 3.8884652744667904e-05, "loss": 0.0699, "step": 6420 }, { "epoch": 3.114802654237366, "grad_norm": 0.4046287536621094, "learning_rate": 3.888020947110472e-05, "loss": 0.0781, "step": 6430 }, { "epoch": 3.1184763391729615, "grad_norm": 0.34985780715942383, "learning_rate": 3.887575761955021e-05, "loss": 0.074, "step": 6440 }, { "epoch": 3.1221500241085574, "grad_norm": 0.3824096620082855, "learning_rate": 3.887129719202701e-05, "loss": 0.0668, "step": 6450 }, { "epoch": 3.125823709044153, "grad_norm": 0.36611294746398926, "learning_rate": 3.886682819056167e-05, "loss": 0.0758, "step": 6460 }, { "epoch": 3.129497393979749, "grad_norm": 0.37392356991767883, "learning_rate": 3.8862350617184634e-05, "loss": 0.0699, "step": 6470 }, { "epoch": 3.1331710789153444, "grad_norm": 0.3791895806789398, "learning_rate": 3.885786447393022e-05, "loss": 0.08, "step": 6480 }, { "epoch": 3.1368447638509402, "grad_norm": 0.38579681515693665, "learning_rate": 3.885336976283666e-05, "loss": 0.0676, "step": 6490 }, { "epoch": 3.140518448786536, "grad_norm": 0.3859240412712097, "learning_rate": 3.8848866485946055e-05, "loss": 0.0665, "step": 6500 }, { "epoch": 3.1441921337221315, "grad_norm": 0.38221970200538635, "learning_rate": 3.8844354645304425e-05, "loss": 0.0739, "step": 6510 }, { "epoch": 3.1478658186577273, "grad_norm": 0.39944809675216675, "learning_rate": 3.883983424296167e-05, "loss": 0.073, "step": 6520 }, { "epoch": 3.151539503593323, "grad_norm": 0.3389856815338135, "learning_rate": 3.883530528097158e-05, "loss": 0.0753, "step": 6530 }, { "epoch": 3.155213188528919, "grad_norm": 0.3564107418060303, "learning_rate": 3.8830767761391835e-05, "loss": 0.0765, "step": 6540 }, { "epoch": 3.1588868734645144, "grad_norm": 0.4259626269340515, "learning_rate": 3.882622168628399e-05, "loss": 0.0688, "step": 6550 }, { "epoch": 3.16256055840011, "grad_norm": 0.3969685733318329, "learning_rate": 3.882166705771351e-05, "loss": 0.0706, "step": 6560 }, { "epoch": 3.166234243335706, "grad_norm": 0.3602032959461212, "learning_rate": 3.8817103877749736e-05, "loss": 0.0709, "step": 6570 }, { "epoch": 3.169907928271302, "grad_norm": 0.36066046357154846, "learning_rate": 3.881253214846588e-05, "loss": 0.0778, "step": 6580 }, { "epoch": 3.1735816132068972, "grad_norm": 0.32860085368156433, "learning_rate": 3.880795187193906e-05, "loss": 0.0721, "step": 6590 }, { "epoch": 3.177255298142493, "grad_norm": 0.3962108790874481, "learning_rate": 3.8803363050250264e-05, "loss": 0.0728, "step": 6600 }, { "epoch": 3.180928983078089, "grad_norm": 0.35229527950286865, "learning_rate": 3.8798765685484373e-05, "loss": 0.0707, "step": 6610 }, { "epoch": 3.1846026680136843, "grad_norm": 0.3508455455303192, "learning_rate": 3.879415977973014e-05, "loss": 0.0787, "step": 6620 }, { "epoch": 3.18827635294928, "grad_norm": 0.3952919840812683, "learning_rate": 3.87895453350802e-05, "loss": 0.0751, "step": 6630 }, { "epoch": 3.191950037884876, "grad_norm": 0.34706318378448486, "learning_rate": 3.878492235363107e-05, "loss": 0.0728, "step": 6640 }, { "epoch": 3.195623722820472, "grad_norm": 0.3671702444553375, "learning_rate": 3.8780290837483143e-05, "loss": 0.0764, "step": 6650 }, { "epoch": 3.199297407756067, "grad_norm": 0.3888378143310547, "learning_rate": 3.87756507887407e-05, "loss": 0.0693, "step": 6660 }, { "epoch": 3.202971092691663, "grad_norm": 0.38945361971855164, "learning_rate": 3.877100220951188e-05, "loss": 0.0771, "step": 6670 }, { "epoch": 3.206644777627259, "grad_norm": 0.3618798553943634, "learning_rate": 3.8766345101908714e-05, "loss": 0.0764, "step": 6680 }, { "epoch": 3.2103184625628542, "grad_norm": 0.3384278416633606, "learning_rate": 3.8761679468047104e-05, "loss": 0.0776, "step": 6690 }, { "epoch": 3.21399214749845, "grad_norm": 0.3756442964076996, "learning_rate": 3.875700531004682e-05, "loss": 0.0796, "step": 6700 }, { "epoch": 3.217665832434046, "grad_norm": 0.39590638875961304, "learning_rate": 3.87523226300315e-05, "loss": 0.0772, "step": 6710 }, { "epoch": 3.2213395173696417, "grad_norm": 0.40792858600616455, "learning_rate": 3.874763143012867e-05, "loss": 0.0788, "step": 6720 }, { "epoch": 3.225013202305237, "grad_norm": 0.40718400478363037, "learning_rate": 3.8742931712469725e-05, "loss": 0.0743, "step": 6730 }, { "epoch": 3.228686887240833, "grad_norm": 0.3333398699760437, "learning_rate": 3.873822347918991e-05, "loss": 0.0679, "step": 6740 }, { "epoch": 3.232360572176429, "grad_norm": 0.34743279218673706, "learning_rate": 3.873350673242836e-05, "loss": 0.0745, "step": 6750 }, { "epoch": 3.2360342571120246, "grad_norm": 0.3658008873462677, "learning_rate": 3.872878147432808e-05, "loss": 0.0679, "step": 6760 }, { "epoch": 3.23970794204762, "grad_norm": 0.3572680354118347, "learning_rate": 3.8724047707035924e-05, "loss": 0.077, "step": 6770 }, { "epoch": 3.243381626983216, "grad_norm": 0.36468884348869324, "learning_rate": 3.871930543270262e-05, "loss": 0.076, "step": 6780 }, { "epoch": 3.2470553119188117, "grad_norm": 0.33676353096961975, "learning_rate": 3.871455465348276e-05, "loss": 0.0748, "step": 6790 }, { "epoch": 3.250728996854407, "grad_norm": 0.4322378933429718, "learning_rate": 3.870979537153481e-05, "loss": 0.0714, "step": 6800 }, { "epoch": 3.254402681790003, "grad_norm": 0.4001033306121826, "learning_rate": 3.8705027589021084e-05, "loss": 0.0733, "step": 6810 }, { "epoch": 3.2580763667255987, "grad_norm": 0.397564560174942, "learning_rate": 3.8700251308107766e-05, "loss": 0.0726, "step": 6820 }, { "epoch": 3.2617500516611946, "grad_norm": 0.38845571875572205, "learning_rate": 3.8695466530964916e-05, "loss": 0.0767, "step": 6830 }, { "epoch": 3.26542373659679, "grad_norm": 0.38318613171577454, "learning_rate": 3.8690673259766416e-05, "loss": 0.0759, "step": 6840 }, { "epoch": 3.269097421532386, "grad_norm": 0.3386957049369812, "learning_rate": 3.8685871496690046e-05, "loss": 0.0704, "step": 6850 }, { "epoch": 3.2727711064679816, "grad_norm": 0.41133052110671997, "learning_rate": 3.868106124391742e-05, "loss": 0.0716, "step": 6860 }, { "epoch": 3.276444791403577, "grad_norm": 0.3641374707221985, "learning_rate": 3.8676242503634016e-05, "loss": 0.0728, "step": 6870 }, { "epoch": 3.280118476339173, "grad_norm": 0.3426092267036438, "learning_rate": 3.867141527802917e-05, "loss": 0.0731, "step": 6880 }, { "epoch": 3.2837921612747687, "grad_norm": 0.3867241144180298, "learning_rate": 3.8666579569296085e-05, "loss": 0.0749, "step": 6890 }, { "epoch": 3.2874658462103645, "grad_norm": 0.34291258454322815, "learning_rate": 3.866173537963179e-05, "loss": 0.0682, "step": 6900 }, { "epoch": 3.29113953114596, "grad_norm": 0.3923577666282654, "learning_rate": 3.865688271123719e-05, "loss": 0.0691, "step": 6910 }, { "epoch": 3.2948132160815558, "grad_norm": 0.3977607786655426, "learning_rate": 3.865202156631703e-05, "loss": 0.069, "step": 6920 }, { "epoch": 3.2984869010171516, "grad_norm": 0.4023333489894867, "learning_rate": 3.864715194707992e-05, "loss": 0.0735, "step": 6930 }, { "epoch": 3.3021605859527474, "grad_norm": 0.3720442056655884, "learning_rate": 3.864227385573832e-05, "loss": 0.0734, "step": 6940 }, { "epoch": 3.305834270888343, "grad_norm": 0.32263362407684326, "learning_rate": 3.8637387294508506e-05, "loss": 0.0759, "step": 6950 }, { "epoch": 3.3095079558239386, "grad_norm": 0.3768882751464844, "learning_rate": 3.8632492265610645e-05, "loss": 0.0698, "step": 6960 }, { "epoch": 3.3131816407595345, "grad_norm": 0.3831019401550293, "learning_rate": 3.862758877126873e-05, "loss": 0.0745, "step": 6970 }, { "epoch": 3.31685532569513, "grad_norm": 0.33821457624435425, "learning_rate": 3.8622676813710596e-05, "loss": 0.0708, "step": 6980 }, { "epoch": 3.3205290106307257, "grad_norm": 0.3672340512275696, "learning_rate": 3.861775639516794e-05, "loss": 0.0722, "step": 6990 }, { "epoch": 3.3242026955663215, "grad_norm": 0.35920974612236023, "learning_rate": 3.861282751787629e-05, "loss": 0.0738, "step": 7000 }, { "epoch": 3.3278763805019174, "grad_norm": 0.39940935373306274, "learning_rate": 3.860789018407502e-05, "loss": 0.0723, "step": 7010 }, { "epoch": 3.3315500654375128, "grad_norm": 0.37681448459625244, "learning_rate": 3.860294439600735e-05, "loss": 0.0715, "step": 7020 }, { "epoch": 3.3352237503731086, "grad_norm": 0.34028860926628113, "learning_rate": 3.8597990155920325e-05, "loss": 0.0694, "step": 7030 }, { "epoch": 3.3388974353087044, "grad_norm": 0.3723374009132385, "learning_rate": 3.8593027466064865e-05, "loss": 0.0654, "step": 7040 }, { "epoch": 3.3425711202443003, "grad_norm": 0.3413618505001068, "learning_rate": 3.85880563286957e-05, "loss": 0.0779, "step": 7050 }, { "epoch": 3.3462448051798956, "grad_norm": 0.40859681367874146, "learning_rate": 3.858307674607139e-05, "loss": 0.0748, "step": 7060 }, { "epoch": 3.3499184901154915, "grad_norm": 0.37525543570518494, "learning_rate": 3.8578088720454366e-05, "loss": 0.0711, "step": 7070 }, { "epoch": 3.3535921750510873, "grad_norm": 0.40834948420524597, "learning_rate": 3.857309225411086e-05, "loss": 0.0754, "step": 7080 }, { "epoch": 3.3572658599866827, "grad_norm": 0.3934822976589203, "learning_rate": 3.856808734931098e-05, "loss": 0.0756, "step": 7090 }, { "epoch": 3.3609395449222785, "grad_norm": 0.3892536759376526, "learning_rate": 3.8563074008328616e-05, "loss": 0.0734, "step": 7100 }, { "epoch": 3.3646132298578744, "grad_norm": 0.3682181239128113, "learning_rate": 3.8558052233441536e-05, "loss": 0.0711, "step": 7110 }, { "epoch": 3.36828691479347, "grad_norm": 0.37954092025756836, "learning_rate": 3.8553022026931306e-05, "loss": 0.0701, "step": 7120 }, { "epoch": 3.3719605997290656, "grad_norm": 0.4413468539714813, "learning_rate": 3.854798339108335e-05, "loss": 0.0742, "step": 7130 }, { "epoch": 3.3756342846646614, "grad_norm": 0.36334455013275146, "learning_rate": 3.854293632818691e-05, "loss": 0.07, "step": 7140 }, { "epoch": 3.3793079696002573, "grad_norm": 0.39029690623283386, "learning_rate": 3.8537880840535054e-05, "loss": 0.0789, "step": 7150 }, { "epoch": 3.382981654535853, "grad_norm": 0.38254764676094055, "learning_rate": 3.8532816930424685e-05, "loss": 0.0735, "step": 7160 }, { "epoch": 3.3866553394714485, "grad_norm": 0.3467348515987396, "learning_rate": 3.8527744600156515e-05, "loss": 0.0666, "step": 7170 }, { "epoch": 3.3903290244070443, "grad_norm": 0.3426026701927185, "learning_rate": 3.852266385203511e-05, "loss": 0.0725, "step": 7180 }, { "epoch": 3.39400270934264, "grad_norm": 0.4232753813266754, "learning_rate": 3.8517574688368844e-05, "loss": 0.0696, "step": 7190 }, { "epoch": 3.3976763942782355, "grad_norm": 0.38134926557540894, "learning_rate": 3.851247711146991e-05, "loss": 0.0708, "step": 7200 }, { "epoch": 3.4013500792138314, "grad_norm": 0.43960040807724, "learning_rate": 3.850737112365433e-05, "loss": 0.0734, "step": 7210 }, { "epoch": 3.405023764149427, "grad_norm": 0.34710749983787537, "learning_rate": 3.8502256727241955e-05, "loss": 0.0677, "step": 7220 }, { "epoch": 3.408697449085023, "grad_norm": 0.3959217369556427, "learning_rate": 3.8497133924556436e-05, "loss": 0.0724, "step": 7230 }, { "epoch": 3.4123711340206184, "grad_norm": 0.36663129925727844, "learning_rate": 3.849200271792526e-05, "loss": 0.0616, "step": 7240 }, { "epoch": 3.4160448189562143, "grad_norm": 0.35240399837493896, "learning_rate": 3.848686310967973e-05, "loss": 0.0722, "step": 7250 }, { "epoch": 3.41971850389181, "grad_norm": 0.41409143805503845, "learning_rate": 3.848171510215497e-05, "loss": 0.0733, "step": 7260 }, { "epoch": 3.423392188827406, "grad_norm": 0.39291131496429443, "learning_rate": 3.84765586976899e-05, "loss": 0.0673, "step": 7270 }, { "epoch": 3.4270658737630013, "grad_norm": 0.3210747539997101, "learning_rate": 3.8471393898627276e-05, "loss": 0.0658, "step": 7280 }, { "epoch": 3.430739558698597, "grad_norm": 0.32398563623428345, "learning_rate": 3.846622070731366e-05, "loss": 0.0645, "step": 7290 }, { "epoch": 3.434413243634193, "grad_norm": 0.3727194666862488, "learning_rate": 3.846103912609943e-05, "loss": 0.068, "step": 7300 }, { "epoch": 3.4380869285697884, "grad_norm": 0.42383453249931335, "learning_rate": 3.8455849157338775e-05, "loss": 0.0689, "step": 7310 }, { "epoch": 3.441760613505384, "grad_norm": 0.3685389757156372, "learning_rate": 3.8450650803389694e-05, "loss": 0.0687, "step": 7320 }, { "epoch": 3.44543429844098, "grad_norm": 0.4055418074131012, "learning_rate": 3.844544406661399e-05, "loss": 0.0697, "step": 7330 }, { "epoch": 3.4491079833765754, "grad_norm": 0.3819187879562378, "learning_rate": 3.844022894937728e-05, "loss": 0.0681, "step": 7340 }, { "epoch": 3.4527816683121713, "grad_norm": 0.37955814599990845, "learning_rate": 3.843500545404899e-05, "loss": 0.0615, "step": 7350 }, { "epoch": 3.456455353247767, "grad_norm": 0.4003753662109375, "learning_rate": 3.842977358300235e-05, "loss": 0.0697, "step": 7360 }, { "epoch": 3.460129038183363, "grad_norm": 0.3453231155872345, "learning_rate": 3.842453333861441e-05, "loss": 0.0694, "step": 7370 }, { "epoch": 3.4638027231189588, "grad_norm": 0.3369866609573364, "learning_rate": 3.841928472326599e-05, "loss": 0.0605, "step": 7380 }, { "epoch": 3.467476408054554, "grad_norm": 0.3885999619960785, "learning_rate": 3.841402773934174e-05, "loss": 0.0659, "step": 7390 }, { "epoch": 3.47115009299015, "grad_norm": 0.38517746329307556, "learning_rate": 3.840876238923012e-05, "loss": 0.0694, "step": 7400 }, { "epoch": 3.474823777925746, "grad_norm": 0.3682037591934204, "learning_rate": 3.8403488675323356e-05, "loss": 0.0725, "step": 7410 }, { "epoch": 3.478497462861341, "grad_norm": 0.36746084690093994, "learning_rate": 3.839820660001751e-05, "loss": 0.0643, "step": 7420 }, { "epoch": 3.482171147796937, "grad_norm": 0.40562278032302856, "learning_rate": 3.839291616571243e-05, "loss": 0.0708, "step": 7430 }, { "epoch": 3.485844832732533, "grad_norm": 0.4035206735134125, "learning_rate": 3.838761737481174e-05, "loss": 0.0623, "step": 7440 }, { "epoch": 3.4895185176681283, "grad_norm": 0.4176293611526489, "learning_rate": 3.83823102297229e-05, "loss": 0.0764, "step": 7450 }, { "epoch": 3.493192202603724, "grad_norm": 0.37610921263694763, "learning_rate": 3.837699473285714e-05, "loss": 0.0703, "step": 7460 }, { "epoch": 3.49686588753932, "grad_norm": 0.41398000717163086, "learning_rate": 3.837167088662949e-05, "loss": 0.0659, "step": 7470 }, { "epoch": 3.5005395724749158, "grad_norm": 0.35997703671455383, "learning_rate": 3.8366338693458775e-05, "loss": 0.0618, "step": 7480 }, { "epoch": 3.5042132574105116, "grad_norm": 0.35327473282814026, "learning_rate": 3.836099815576761e-05, "loss": 0.0612, "step": 7490 }, { "epoch": 3.507886942346107, "grad_norm": 0.37258461117744446, "learning_rate": 3.835564927598241e-05, "loss": 0.0669, "step": 7500 }, { "epoch": 3.511560627281703, "grad_norm": 0.3680964708328247, "learning_rate": 3.835029205653337e-05, "loss": 0.066, "step": 7510 }, { "epoch": 3.5152343122172987, "grad_norm": 0.34851181507110596, "learning_rate": 3.834492649985446e-05, "loss": 0.0678, "step": 7520 }, { "epoch": 3.518907997152894, "grad_norm": 0.34612253308296204, "learning_rate": 3.8339552608383485e-05, "loss": 0.0659, "step": 7530 }, { "epoch": 3.52258168208849, "grad_norm": 0.3771200180053711, "learning_rate": 3.833417038456198e-05, "loss": 0.0623, "step": 7540 }, { "epoch": 3.5262553670240857, "grad_norm": 0.41751450300216675, "learning_rate": 3.83287798308353e-05, "loss": 0.0692, "step": 7550 }, { "epoch": 3.529929051959681, "grad_norm": 0.38141199946403503, "learning_rate": 3.832338094965258e-05, "loss": 0.0688, "step": 7560 }, { "epoch": 3.533602736895277, "grad_norm": 0.3504331409931183, "learning_rate": 3.831797374346674e-05, "loss": 0.0701, "step": 7570 }, { "epoch": 3.5372764218308728, "grad_norm": 0.368120402097702, "learning_rate": 3.831255821473447e-05, "loss": 0.0656, "step": 7580 }, { "epoch": 3.5409501067664686, "grad_norm": 0.3137691617012024, "learning_rate": 3.830713436591625e-05, "loss": 0.0617, "step": 7590 }, { "epoch": 3.544623791702064, "grad_norm": 0.3407038748264313, "learning_rate": 3.8301702199476346e-05, "loss": 0.0645, "step": 7600 }, { "epoch": 3.54829747663766, "grad_norm": 0.372866690158844, "learning_rate": 3.8296261717882786e-05, "loss": 0.0693, "step": 7610 }, { "epoch": 3.5519711615732557, "grad_norm": 0.35678520798683167, "learning_rate": 3.829081292360739e-05, "loss": 0.0735, "step": 7620 }, { "epoch": 3.5556448465088515, "grad_norm": 0.3527635931968689, "learning_rate": 3.828535581912576e-05, "loss": 0.071, "step": 7630 }, { "epoch": 3.559318531444447, "grad_norm": 0.3414013683795929, "learning_rate": 3.827989040691725e-05, "loss": 0.0657, "step": 7640 }, { "epoch": 3.5629922163800427, "grad_norm": 0.3904476761817932, "learning_rate": 3.8274416689465e-05, "loss": 0.0641, "step": 7650 }, { "epoch": 3.5666659013156385, "grad_norm": 0.3812848925590515, "learning_rate": 3.8268934669255946e-05, "loss": 0.0608, "step": 7660 }, { "epoch": 3.570339586251234, "grad_norm": 0.3596697449684143, "learning_rate": 3.826344434878077e-05, "loss": 0.0642, "step": 7670 }, { "epoch": 3.5740132711868298, "grad_norm": 0.35639938712120056, "learning_rate": 3.825794573053392e-05, "loss": 0.0622, "step": 7680 }, { "epoch": 3.5776869561224256, "grad_norm": 0.348125159740448, "learning_rate": 3.8252438817013636e-05, "loss": 0.0625, "step": 7690 }, { "epoch": 3.581360641058021, "grad_norm": 0.3511098623275757, "learning_rate": 3.8246923610721906e-05, "loss": 0.0683, "step": 7700 }, { "epoch": 3.585034325993617, "grad_norm": 0.34246912598609924, "learning_rate": 3.824140011416451e-05, "loss": 0.0601, "step": 7710 }, { "epoch": 3.5887080109292127, "grad_norm": 0.347200870513916, "learning_rate": 3.823586832985097e-05, "loss": 0.064, "step": 7720 }, { "epoch": 3.5923816958648085, "grad_norm": 0.3503406345844269, "learning_rate": 3.8230328260294596e-05, "loss": 0.0719, "step": 7730 }, { "epoch": 3.5960553808004043, "grad_norm": 0.33439186215400696, "learning_rate": 3.8224779908012444e-05, "loss": 0.0554, "step": 7740 }, { "epoch": 3.5997290657359997, "grad_norm": 0.3767121136188507, "learning_rate": 3.8219223275525335e-05, "loss": 0.0676, "step": 7750 }, { "epoch": 3.6034027506715955, "grad_norm": 0.3346874713897705, "learning_rate": 3.821365836535786e-05, "loss": 0.0682, "step": 7760 }, { "epoch": 3.6070764356071914, "grad_norm": 0.39006564021110535, "learning_rate": 3.820808518003837e-05, "loss": 0.0652, "step": 7770 }, { "epoch": 3.6107501205427868, "grad_norm": 0.35553354024887085, "learning_rate": 3.8202503722098976e-05, "loss": 0.0627, "step": 7780 }, { "epoch": 3.6144238054783826, "grad_norm": 0.3937147855758667, "learning_rate": 3.819691399407553e-05, "loss": 0.0703, "step": 7790 }, { "epoch": 3.6180974904139784, "grad_norm": 0.3638565242290497, "learning_rate": 3.819131599850768e-05, "loss": 0.0654, "step": 7800 }, { "epoch": 3.621771175349574, "grad_norm": 0.35591986775398254, "learning_rate": 3.818570973793879e-05, "loss": 0.0664, "step": 7810 }, { "epoch": 3.6254448602851697, "grad_norm": 0.3683430850505829, "learning_rate": 3.818009521491599e-05, "loss": 0.0661, "step": 7820 }, { "epoch": 3.6291185452207655, "grad_norm": 0.3660489022731781, "learning_rate": 3.817447243199019e-05, "loss": 0.0604, "step": 7830 }, { "epoch": 3.6327922301563613, "grad_norm": 0.3491329252719879, "learning_rate": 3.8168841391716016e-05, "loss": 0.0651, "step": 7840 }, { "epoch": 3.636465915091957, "grad_norm": 0.3848625123500824, "learning_rate": 3.816320209665187e-05, "loss": 0.0714, "step": 7850 }, { "epoch": 3.6401396000275525, "grad_norm": 0.3493505120277405, "learning_rate": 3.81575545493599e-05, "loss": 0.0662, "step": 7860 }, { "epoch": 3.6438132849631484, "grad_norm": 0.37918442487716675, "learning_rate": 3.8151898752405975e-05, "loss": 0.0613, "step": 7870 }, { "epoch": 3.647486969898744, "grad_norm": 0.3876636326313019, "learning_rate": 3.8146234708359766e-05, "loss": 0.065, "step": 7880 }, { "epoch": 3.6511606548343396, "grad_norm": 0.3761611580848694, "learning_rate": 3.814056241979465e-05, "loss": 0.0648, "step": 7890 }, { "epoch": 3.6548343397699354, "grad_norm": 0.36565306782722473, "learning_rate": 3.8134881889287776e-05, "loss": 0.0644, "step": 7900 }, { "epoch": 3.6585080247055313, "grad_norm": 0.3948571979999542, "learning_rate": 3.812919311941999e-05, "loss": 0.0676, "step": 7910 }, { "epoch": 3.6621817096411267, "grad_norm": 0.3455873727798462, "learning_rate": 3.812349611277594e-05, "loss": 0.0592, "step": 7920 }, { "epoch": 3.6658553945767225, "grad_norm": 0.3156326413154602, "learning_rate": 3.8117790871944e-05, "loss": 0.0586, "step": 7930 }, { "epoch": 3.6695290795123183, "grad_norm": 0.35699769854545593, "learning_rate": 3.811207739951625e-05, "loss": 0.0668, "step": 7940 }, { "epoch": 3.673202764447914, "grad_norm": 0.35023537278175354, "learning_rate": 3.810635569808855e-05, "loss": 0.0652, "step": 7950 }, { "epoch": 3.67687644938351, "grad_norm": 0.3936767876148224, "learning_rate": 3.810062577026048e-05, "loss": 0.0673, "step": 7960 }, { "epoch": 3.6805501343191054, "grad_norm": 0.36356693506240845, "learning_rate": 3.809488761863536e-05, "loss": 0.0625, "step": 7970 }, { "epoch": 3.684223819254701, "grad_norm": 0.3651621341705322, "learning_rate": 3.8089141245820264e-05, "loss": 0.0662, "step": 7980 }, { "epoch": 3.687897504190297, "grad_norm": 0.34833160042762756, "learning_rate": 3.808338665442597e-05, "loss": 0.0621, "step": 7990 }, { "epoch": 3.6915711891258924, "grad_norm": 0.33042076230049133, "learning_rate": 3.807762384706702e-05, "loss": 0.0587, "step": 8000 }, { "epoch": 3.6952448740614883, "grad_norm": 0.31583070755004883, "learning_rate": 3.8071852826361656e-05, "loss": 0.0683, "step": 8010 }, { "epoch": 3.698918558997084, "grad_norm": 0.35448724031448364, "learning_rate": 3.806607359493189e-05, "loss": 0.0624, "step": 8020 }, { "epoch": 3.7025922439326795, "grad_norm": 0.42939212918281555, "learning_rate": 3.806028615540343e-05, "loss": 0.0642, "step": 8030 }, { "epoch": 3.7062659288682753, "grad_norm": 0.3486788868904114, "learning_rate": 3.8054490510405736e-05, "loss": 0.0658, "step": 8040 }, { "epoch": 3.709939613803871, "grad_norm": 0.4249204993247986, "learning_rate": 3.804868666257199e-05, "loss": 0.0688, "step": 8050 }, { "epoch": 3.713613298739467, "grad_norm": 0.35971787571907043, "learning_rate": 3.804287461453909e-05, "loss": 0.0637, "step": 8060 }, { "epoch": 3.717286983675063, "grad_norm": 0.3460175096988678, "learning_rate": 3.803705436894767e-05, "loss": 0.0632, "step": 8070 }, { "epoch": 3.720960668610658, "grad_norm": 0.368527889251709, "learning_rate": 3.8031225928442095e-05, "loss": 0.0668, "step": 8080 }, { "epoch": 3.724634353546254, "grad_norm": 0.3620864450931549, "learning_rate": 3.8025389295670444e-05, "loss": 0.0623, "step": 8090 }, { "epoch": 3.72830803848185, "grad_norm": 0.4405253529548645, "learning_rate": 3.8019544473284516e-05, "loss": 0.0638, "step": 8100 }, { "epoch": 3.7319817234174453, "grad_norm": 0.38408783078193665, "learning_rate": 3.801369146393983e-05, "loss": 0.0587, "step": 8110 }, { "epoch": 3.735655408353041, "grad_norm": 0.3630177974700928, "learning_rate": 3.8007830270295634e-05, "loss": 0.064, "step": 8120 }, { "epoch": 3.739329093288637, "grad_norm": 0.41500958800315857, "learning_rate": 3.800196089501489e-05, "loss": 0.0616, "step": 8130 }, { "epoch": 3.7430027782242323, "grad_norm": 0.39276209473609924, "learning_rate": 3.7996083340764276e-05, "loss": 0.065, "step": 8140 }, { "epoch": 3.746676463159828, "grad_norm": 0.33958104252815247, "learning_rate": 3.799019761021419e-05, "loss": 0.0604, "step": 8150 }, { "epoch": 3.750350148095424, "grad_norm": 0.3880743384361267, "learning_rate": 3.798430370603873e-05, "loss": 0.0607, "step": 8160 }, { "epoch": 3.75402383303102, "grad_norm": 0.4502705931663513, "learning_rate": 3.7978401630915735e-05, "loss": 0.0665, "step": 8170 }, { "epoch": 3.7576975179666157, "grad_norm": 0.3808438777923584, "learning_rate": 3.7972491387526726e-05, "loss": 0.0608, "step": 8180 }, { "epoch": 3.761371202902211, "grad_norm": 0.38702285289764404, "learning_rate": 3.796657297855696e-05, "loss": 0.0663, "step": 8190 }, { "epoch": 3.765044887837807, "grad_norm": 0.36310693621635437, "learning_rate": 3.7960646406695385e-05, "loss": 0.063, "step": 8200 }, { "epoch": 3.7687185727734027, "grad_norm": 0.38645002245903015, "learning_rate": 3.795471167463467e-05, "loss": 0.0647, "step": 8210 }, { "epoch": 3.772392257708998, "grad_norm": 0.34869855642318726, "learning_rate": 3.794876878507119e-05, "loss": 0.0674, "step": 8220 }, { "epoch": 3.776065942644594, "grad_norm": 0.3708350658416748, "learning_rate": 3.794281774070502e-05, "loss": 0.0619, "step": 8230 }, { "epoch": 3.7797396275801898, "grad_norm": 0.35810190439224243, "learning_rate": 3.7936858544239944e-05, "loss": 0.0637, "step": 8240 }, { "epoch": 3.783413312515785, "grad_norm": 0.39535266160964966, "learning_rate": 3.793089119838345e-05, "loss": 0.0624, "step": 8250 }, { "epoch": 3.787086997451381, "grad_norm": 0.33278441429138184, "learning_rate": 3.792491570584674e-05, "loss": 0.0584, "step": 8260 }, { "epoch": 3.790760682386977, "grad_norm": 0.37573954463005066, "learning_rate": 3.791893206934469e-05, "loss": 0.0654, "step": 8270 }, { "epoch": 3.794434367322572, "grad_norm": 0.3730659782886505, "learning_rate": 3.791294029159589e-05, "loss": 0.066, "step": 8280 }, { "epoch": 3.798108052258168, "grad_norm": 0.3069198727607727, "learning_rate": 3.790694037532265e-05, "loss": 0.0584, "step": 8290 }, { "epoch": 3.801781737193764, "grad_norm": 0.3499199151992798, "learning_rate": 3.7900932323250936e-05, "loss": 0.0576, "step": 8300 }, { "epoch": 3.8054554221293597, "grad_norm": 0.3697301745414734, "learning_rate": 3.789491613811044e-05, "loss": 0.0662, "step": 8310 }, { "epoch": 3.8091291070649556, "grad_norm": 0.3310595452785492, "learning_rate": 3.7888891822634554e-05, "loss": 0.061, "step": 8320 }, { "epoch": 3.812802792000551, "grad_norm": 0.34428343176841736, "learning_rate": 3.788285937956034e-05, "loss": 0.0637, "step": 8330 }, { "epoch": 3.8164764769361468, "grad_norm": 0.3240375518798828, "learning_rate": 3.787681881162856e-05, "loss": 0.0595, "step": 8340 }, { "epoch": 3.8201501618717426, "grad_norm": 0.3121042847633362, "learning_rate": 3.7870770121583687e-05, "loss": 0.0592, "step": 8350 }, { "epoch": 3.823823846807338, "grad_norm": 0.37518566846847534, "learning_rate": 3.7864713312173856e-05, "loss": 0.0646, "step": 8360 }, { "epoch": 3.827497531742934, "grad_norm": 0.3310908079147339, "learning_rate": 3.7858648386150904e-05, "loss": 0.0535, "step": 8370 }, { "epoch": 3.8311712166785297, "grad_norm": 0.45427343249320984, "learning_rate": 3.785257534627037e-05, "loss": 0.0635, "step": 8380 }, { "epoch": 3.834844901614125, "grad_norm": 0.35925179719924927, "learning_rate": 3.784649419529145e-05, "loss": 0.0624, "step": 8390 }, { "epoch": 3.838518586549721, "grad_norm": 0.33186519145965576, "learning_rate": 3.784040493597705e-05, "loss": 0.0609, "step": 8400 }, { "epoch": 3.8421922714853167, "grad_norm": 0.3419983983039856, "learning_rate": 3.783430757109374e-05, "loss": 0.0605, "step": 8410 }, { "epoch": 3.8458659564209126, "grad_norm": 0.3384428918361664, "learning_rate": 3.782820210341179e-05, "loss": 0.0615, "step": 8420 }, { "epoch": 3.8495396413565084, "grad_norm": 0.3448338806629181, "learning_rate": 3.782208853570514e-05, "loss": 0.0601, "step": 8430 }, { "epoch": 3.853213326292104, "grad_norm": 0.37677285075187683, "learning_rate": 3.781596687075143e-05, "loss": 0.0658, "step": 8440 }, { "epoch": 3.8568870112276996, "grad_norm": 0.3536309599876404, "learning_rate": 3.780983711133194e-05, "loss": 0.0615, "step": 8450 }, { "epoch": 3.8605606961632954, "grad_norm": 0.3391807973384857, "learning_rate": 3.780369926023165e-05, "loss": 0.0537, "step": 8460 }, { "epoch": 3.864234381098891, "grad_norm": 0.3494960367679596, "learning_rate": 3.779755332023925e-05, "loss": 0.058, "step": 8470 }, { "epoch": 3.8679080660344867, "grad_norm": 0.3329848647117615, "learning_rate": 3.779139929414704e-05, "loss": 0.0617, "step": 8480 }, { "epoch": 3.8715817509700825, "grad_norm": 0.3815949857234955, "learning_rate": 3.778523718475104e-05, "loss": 0.061, "step": 8490 }, { "epoch": 3.875255435905678, "grad_norm": 0.327009916305542, "learning_rate": 3.777906699485092e-05, "loss": 0.0572, "step": 8500 }, { "epoch": 3.8789291208412737, "grad_norm": 0.34351393580436707, "learning_rate": 3.7772888727250047e-05, "loss": 0.0604, "step": 8510 }, { "epoch": 3.8826028057768696, "grad_norm": 0.3600986897945404, "learning_rate": 3.776670238475543e-05, "loss": 0.0589, "step": 8520 }, { "epoch": 3.8862764907124654, "grad_norm": 0.29365289211273193, "learning_rate": 3.776050797017775e-05, "loss": 0.0604, "step": 8530 }, { "epoch": 3.8899501756480612, "grad_norm": 0.37498974800109863, "learning_rate": 3.775430548633137e-05, "loss": 0.0582, "step": 8540 }, { "epoch": 3.8936238605836566, "grad_norm": 0.3378652036190033, "learning_rate": 3.774809493603432e-05, "loss": 0.06, "step": 8550 }, { "epoch": 3.8972975455192524, "grad_norm": 0.3303601145744324, "learning_rate": 3.774187632210828e-05, "loss": 0.0572, "step": 8560 }, { "epoch": 3.9009712304548483, "grad_norm": 0.3994413912296295, "learning_rate": 3.77356496473786e-05, "loss": 0.0628, "step": 8570 }, { "epoch": 3.9046449153904437, "grad_norm": 0.339233934879303, "learning_rate": 3.772941491467429e-05, "loss": 0.0611, "step": 8580 }, { "epoch": 3.9083186003260395, "grad_norm": 0.37156522274017334, "learning_rate": 3.7723172126828034e-05, "loss": 0.0568, "step": 8590 }, { "epoch": 3.9119922852616353, "grad_norm": 0.3347920775413513, "learning_rate": 3.771692128667616e-05, "loss": 0.0593, "step": 8600 }, { "epoch": 3.9156659701972307, "grad_norm": 0.33663907647132874, "learning_rate": 3.771066239705866e-05, "loss": 0.0563, "step": 8610 }, { "epoch": 3.9193396551328266, "grad_norm": 0.34232792258262634, "learning_rate": 3.770439546081919e-05, "loss": 0.0669, "step": 8620 }, { "epoch": 3.9230133400684224, "grad_norm": 0.38553059101104736, "learning_rate": 3.7698120480805044e-05, "loss": 0.0592, "step": 8630 }, { "epoch": 3.9266870250040182, "grad_norm": 0.3767704963684082, "learning_rate": 3.7691837459867196e-05, "loss": 0.0564, "step": 8640 }, { "epoch": 3.930360709939614, "grad_norm": 0.34299132227897644, "learning_rate": 3.768554640086025e-05, "loss": 0.0593, "step": 8650 }, { "epoch": 3.9340343948752095, "grad_norm": 0.3847411572933197, "learning_rate": 3.767924730664247e-05, "loss": 0.0638, "step": 8660 }, { "epoch": 3.9377080798108053, "grad_norm": 0.394016832113266, "learning_rate": 3.7672940180075775e-05, "loss": 0.0592, "step": 8670 }, { "epoch": 3.941381764746401, "grad_norm": 0.3355691134929657, "learning_rate": 3.7666625024025735e-05, "loss": 0.0576, "step": 8680 }, { "epoch": 3.9450554496819965, "grad_norm": 0.3539235591888428, "learning_rate": 3.766030184136156e-05, "loss": 0.0621, "step": 8690 }, { "epoch": 3.9487291346175923, "grad_norm": 0.3478702902793884, "learning_rate": 3.765397063495611e-05, "loss": 0.0593, "step": 8700 }, { "epoch": 3.952402819553188, "grad_norm": 0.40612536668777466, "learning_rate": 3.764763140768588e-05, "loss": 0.0626, "step": 8710 }, { "epoch": 3.9560765044887836, "grad_norm": 0.388936847448349, "learning_rate": 3.764128416243105e-05, "loss": 0.0605, "step": 8720 }, { "epoch": 3.9597501894243794, "grad_norm": 0.40920427441596985, "learning_rate": 3.763492890207538e-05, "loss": 0.0582, "step": 8730 }, { "epoch": 3.9634238743599752, "grad_norm": 0.34851428866386414, "learning_rate": 3.7628565629506326e-05, "loss": 0.0596, "step": 8740 }, { "epoch": 3.967097559295571, "grad_norm": 0.35320472717285156, "learning_rate": 3.7622194347614946e-05, "loss": 0.0578, "step": 8750 }, { "epoch": 3.970771244231167, "grad_norm": 0.3640444278717041, "learning_rate": 3.761581505929597e-05, "loss": 0.0584, "step": 8760 }, { "epoch": 3.9744449291667623, "grad_norm": 0.33672264218330383, "learning_rate": 3.7609427767447744e-05, "loss": 0.0607, "step": 8770 }, { "epoch": 3.978118614102358, "grad_norm": 0.34837546944618225, "learning_rate": 3.7603032474972255e-05, "loss": 0.0567, "step": 8780 }, { "epoch": 3.981792299037954, "grad_norm": 0.3596403896808624, "learning_rate": 3.759662918477512e-05, "loss": 0.0594, "step": 8790 }, { "epoch": 3.9854659839735493, "grad_norm": 0.3376719057559967, "learning_rate": 3.75902178997656e-05, "loss": 0.0553, "step": 8800 }, { "epoch": 3.989139668909145, "grad_norm": 0.3064998686313629, "learning_rate": 3.758379862285658e-05, "loss": 0.0553, "step": 8810 }, { "epoch": 3.992813353844741, "grad_norm": 0.39124488830566406, "learning_rate": 3.757737135696459e-05, "loss": 0.061, "step": 8820 }, { "epoch": 3.9964870387803364, "grad_norm": 0.3533807694911957, "learning_rate": 3.7570936105009766e-05, "loss": 0.0611, "step": 8830 }, { "epoch": 4.000160723715933, "grad_norm": 0.4401451647281647, "learning_rate": 3.7564492869915895e-05, "loss": 0.0573, "step": 8840 }, { "epoch": 4.003834408651528, "grad_norm": 0.38754549622535706, "learning_rate": 3.755804165461038e-05, "loss": 0.0616, "step": 8850 }, { "epoch": 4.0075080935871235, "grad_norm": 0.34620991349220276, "learning_rate": 3.755158246202425e-05, "loss": 0.0658, "step": 8860 }, { "epoch": 4.01118177852272, "grad_norm": 0.3822627365589142, "learning_rate": 3.754511529509216e-05, "loss": 0.0559, "step": 8870 }, { "epoch": 4.014855463458315, "grad_norm": 0.33620741963386536, "learning_rate": 3.753864015675238e-05, "loss": 0.0554, "step": 8880 }, { "epoch": 4.018529148393911, "grad_norm": 0.3410077393054962, "learning_rate": 3.753215704994683e-05, "loss": 0.06, "step": 8890 }, { "epoch": 4.022202833329507, "grad_norm": 0.3564826250076294, "learning_rate": 3.7525665977621016e-05, "loss": 0.0608, "step": 8900 }, { "epoch": 4.025876518265102, "grad_norm": 0.33676818013191223, "learning_rate": 3.7519166942724074e-05, "loss": 0.055, "step": 8910 }, { "epoch": 4.029550203200698, "grad_norm": 0.3729001581668854, "learning_rate": 3.751265994820876e-05, "loss": 0.0559, "step": 8920 }, { "epoch": 4.033223888136294, "grad_norm": 0.3194955885410309, "learning_rate": 3.750614499703146e-05, "loss": 0.0557, "step": 8930 }, { "epoch": 4.036897573071889, "grad_norm": 0.3385716676712036, "learning_rate": 3.749962209215215e-05, "loss": 0.0568, "step": 8940 }, { "epoch": 4.0405712580074855, "grad_norm": 0.3649482727050781, "learning_rate": 3.749309123653444e-05, "loss": 0.0568, "step": 8950 }, { "epoch": 4.044244942943081, "grad_norm": 0.31956973671913147, "learning_rate": 3.748655243314554e-05, "loss": 0.0585, "step": 8960 }, { "epoch": 4.047918627878676, "grad_norm": 0.32628512382507324, "learning_rate": 3.748000568495628e-05, "loss": 0.0565, "step": 8970 }, { "epoch": 4.051592312814273, "grad_norm": 0.40425875782966614, "learning_rate": 3.747345099494108e-05, "loss": 0.0553, "step": 8980 }, { "epoch": 4.055265997749868, "grad_norm": 0.38358399271965027, "learning_rate": 3.7466888366077996e-05, "loss": 0.0539, "step": 8990 }, { "epoch": 4.058939682685464, "grad_norm": 0.35680723190307617, "learning_rate": 3.746031780134868e-05, "loss": 0.063, "step": 9000 }, { "epoch": 4.06261336762106, "grad_norm": 0.3092491626739502, "learning_rate": 3.745373930373837e-05, "loss": 0.0593, "step": 9010 }, { "epoch": 4.066287052556655, "grad_norm": 0.31899896264076233, "learning_rate": 3.7447152876235945e-05, "loss": 0.0562, "step": 9020 }, { "epoch": 4.06996073749225, "grad_norm": 0.34592658281326294, "learning_rate": 3.744055852183386e-05, "loss": 0.0571, "step": 9030 }, { "epoch": 4.073634422427847, "grad_norm": 0.3316452205181122, "learning_rate": 3.743395624352817e-05, "loss": 0.0658, "step": 9040 }, { "epoch": 4.077308107363442, "grad_norm": 0.3339042067527771, "learning_rate": 3.7427346044318565e-05, "loss": 0.0562, "step": 9050 }, { "epoch": 4.080981792299038, "grad_norm": 0.3353637754917145, "learning_rate": 3.742072792720828e-05, "loss": 0.0571, "step": 9060 }, { "epoch": 4.084655477234634, "grad_norm": 0.4055820107460022, "learning_rate": 3.7414101895204186e-05, "loss": 0.0558, "step": 9070 }, { "epoch": 4.088329162170229, "grad_norm": 0.36465486884117126, "learning_rate": 3.7407467951316744e-05, "loss": 0.054, "step": 9080 }, { "epoch": 4.092002847105825, "grad_norm": 0.40922459959983826, "learning_rate": 3.740082609856e-05, "loss": 0.0558, "step": 9090 }, { "epoch": 4.095676532041421, "grad_norm": 0.3609990179538727, "learning_rate": 3.7394176339951594e-05, "loss": 0.0529, "step": 9100 }, { "epoch": 4.099350216977016, "grad_norm": 0.3209187090396881, "learning_rate": 3.7387518678512764e-05, "loss": 0.0597, "step": 9110 }, { "epoch": 4.1030239019126125, "grad_norm": 0.3588849604129791, "learning_rate": 3.738085311726835e-05, "loss": 0.0552, "step": 9120 }, { "epoch": 4.106697586848208, "grad_norm": 0.40738099813461304, "learning_rate": 3.737417965924675e-05, "loss": 0.0584, "step": 9130 }, { "epoch": 4.110371271783803, "grad_norm": 0.3822818994522095, "learning_rate": 3.736749830747997e-05, "loss": 0.0619, "step": 9140 }, { "epoch": 4.1140449567193995, "grad_norm": 0.294779509305954, "learning_rate": 3.7360809065003613e-05, "loss": 0.0561, "step": 9150 }, { "epoch": 4.117718641654995, "grad_norm": 0.33281511068344116, "learning_rate": 3.735411193485685e-05, "loss": 0.0561, "step": 9160 }, { "epoch": 4.121392326590591, "grad_norm": 0.4370999038219452, "learning_rate": 3.7347406920082435e-05, "loss": 0.0633, "step": 9170 }, { "epoch": 4.125066011526187, "grad_norm": 0.30675435066223145, "learning_rate": 3.7340694023726704e-05, "loss": 0.0527, "step": 9180 }, { "epoch": 4.128739696461782, "grad_norm": 0.30382096767425537, "learning_rate": 3.733397324883959e-05, "loss": 0.0604, "step": 9190 }, { "epoch": 4.132413381397378, "grad_norm": 0.339853972196579, "learning_rate": 3.732724459847458e-05, "loss": 0.0565, "step": 9200 }, { "epoch": 4.136087066332974, "grad_norm": 0.33548668026924133, "learning_rate": 3.732050807568878e-05, "loss": 0.0532, "step": 9210 }, { "epoch": 4.139760751268569, "grad_norm": 0.38548097014427185, "learning_rate": 3.7313763683542816e-05, "loss": 0.0584, "step": 9220 }, { "epoch": 4.143434436204165, "grad_norm": 0.3988785445690155, "learning_rate": 3.7307011425100944e-05, "loss": 0.0558, "step": 9230 }, { "epoch": 4.147108121139761, "grad_norm": 0.31689491868019104, "learning_rate": 3.730025130343095e-05, "loss": 0.0513, "step": 9240 }, { "epoch": 4.150781806075356, "grad_norm": 0.3752569556236267, "learning_rate": 3.729348332160423e-05, "loss": 0.0603, "step": 9250 }, { "epoch": 4.154455491010952, "grad_norm": 0.3657349944114685, "learning_rate": 3.7286707482695714e-05, "loss": 0.0529, "step": 9260 }, { "epoch": 4.158129175946548, "grad_norm": 0.37941595911979675, "learning_rate": 3.727992378978394e-05, "loss": 0.0519, "step": 9270 }, { "epoch": 4.161802860882144, "grad_norm": 0.4194289445877075, "learning_rate": 3.7273132245950987e-05, "loss": 0.0622, "step": 9280 }, { "epoch": 4.165476545817739, "grad_norm": 0.3783503770828247, "learning_rate": 3.726633285428251e-05, "loss": 0.0627, "step": 9290 }, { "epoch": 4.169150230753335, "grad_norm": 0.3589822053909302, "learning_rate": 3.7259525617867736e-05, "loss": 0.055, "step": 9300 }, { "epoch": 4.17282391568893, "grad_norm": 0.354597270488739, "learning_rate": 3.725271053979944e-05, "loss": 0.0587, "step": 9310 }, { "epoch": 4.1764976006245265, "grad_norm": 0.37541255354881287, "learning_rate": 3.7245887623173964e-05, "loss": 0.0542, "step": 9320 }, { "epoch": 4.180171285560122, "grad_norm": 0.3665836453437805, "learning_rate": 3.723905687109123e-05, "loss": 0.0563, "step": 9330 }, { "epoch": 4.183844970495718, "grad_norm": 0.3444162607192993, "learning_rate": 3.7232218286654694e-05, "loss": 0.0547, "step": 9340 }, { "epoch": 4.1875186554313135, "grad_norm": 0.3163646161556244, "learning_rate": 3.72253718729714e-05, "loss": 0.0553, "step": 9350 }, { "epoch": 4.191192340366909, "grad_norm": 0.3367973268032074, "learning_rate": 3.721851763315191e-05, "loss": 0.0558, "step": 9360 }, { "epoch": 4.002306421868154, "grad_norm": 0.3380977511405945, "learning_rate": 3.7211655570310385e-05, "loss": 0.0479, "step": 9370 }, { "epoch": 4.005388071128317, "grad_norm": 0.2721419930458069, "learning_rate": 3.72047856875645e-05, "loss": 0.0444, "step": 9380 }, { "epoch": 4.00846972038848, "grad_norm": 0.3046185076236725, "learning_rate": 3.7197907988035524e-05, "loss": 0.04, "step": 9390 }, { "epoch": 4.011551369648644, "grad_norm": 0.30911293625831604, "learning_rate": 3.719102247484823e-05, "loss": 0.038, "step": 9400 }, { "epoch": 4.014633018908807, "grad_norm": 0.30588841438293457, "learning_rate": 3.718412915113098e-05, "loss": 0.0428, "step": 9410 }, { "epoch": 4.017714668168971, "grad_norm": 0.2892226278781891, "learning_rate": 3.717722802001568e-05, "loss": 0.0405, "step": 9420 }, { "epoch": 4.020796317429134, "grad_norm": 0.2794080674648285, "learning_rate": 3.717031908463775e-05, "loss": 0.0419, "step": 9430 }, { "epoch": 4.023877966689297, "grad_norm": 0.34222760796546936, "learning_rate": 3.71634023481362e-05, "loss": 0.046, "step": 9440 }, { "epoch": 4.026959615949461, "grad_norm": 0.36665770411491394, "learning_rate": 3.715647781365356e-05, "loss": 0.0434, "step": 9450 }, { "epoch": 4.030041265209625, "grad_norm": 0.301266610622406, "learning_rate": 3.71495454843359e-05, "loss": 0.0416, "step": 9460 }, { "epoch": 4.0331229144697875, "grad_norm": 0.31478413939476013, "learning_rate": 3.714260536333285e-05, "loss": 0.0463, "step": 9470 }, { "epoch": 4.036204563729951, "grad_norm": 0.3083324432373047, "learning_rate": 3.713565745379756e-05, "loss": 0.0409, "step": 9480 }, { "epoch": 4.039286212990115, "grad_norm": 0.32863229513168335, "learning_rate": 3.712870175888673e-05, "loss": 0.042, "step": 9490 }, { "epoch": 4.042367862250278, "grad_norm": 0.3115628659725189, "learning_rate": 3.7121738281760595e-05, "loss": 0.0406, "step": 9500 }, { "epoch": 4.045449511510442, "grad_norm": 0.3127463757991791, "learning_rate": 3.711476702558292e-05, "loss": 0.0409, "step": 9510 }, { "epoch": 4.0485311607706045, "grad_norm": 0.2912161946296692, "learning_rate": 3.710778799352102e-05, "loss": 0.0391, "step": 9520 }, { "epoch": 4.051612810030768, "grad_norm": 0.2777150273323059, "learning_rate": 3.7100801188745726e-05, "loss": 0.0435, "step": 9530 }, { "epoch": 4.054694459290932, "grad_norm": 0.3739161193370819, "learning_rate": 3.709380661443141e-05, "loss": 0.0435, "step": 9540 }, { "epoch": 4.057776108551095, "grad_norm": 0.3583177924156189, "learning_rate": 3.708680427375597e-05, "loss": 0.0453, "step": 9550 }, { "epoch": 4.060857757811259, "grad_norm": 0.3123624324798584, "learning_rate": 3.7079794169900824e-05, "loss": 0.0435, "step": 9560 }, { "epoch": 4.063939407071422, "grad_norm": 0.3254983425140381, "learning_rate": 3.7072776306050944e-05, "loss": 0.0459, "step": 9570 }, { "epoch": 4.067021056331585, "grad_norm": 0.352689266204834, "learning_rate": 3.70657506853948e-05, "loss": 0.0409, "step": 9580 }, { "epoch": 4.070102705591749, "grad_norm": 0.3007083535194397, "learning_rate": 3.7058717311124395e-05, "loss": 0.0463, "step": 9590 }, { "epoch": 4.073184354851913, "grad_norm": 0.37921011447906494, "learning_rate": 3.7051676186435256e-05, "loss": 0.0411, "step": 9600 }, { "epoch": 4.0762660041120755, "grad_norm": 0.2605467438697815, "learning_rate": 3.704462731452644e-05, "loss": 0.0471, "step": 9610 }, { "epoch": 4.079347653372239, "grad_norm": 0.3103994131088257, "learning_rate": 3.703757069860051e-05, "loss": 0.0424, "step": 9620 }, { "epoch": 4.082429302632403, "grad_norm": 0.2943425476551056, "learning_rate": 3.703050634186355e-05, "loss": 0.0429, "step": 9630 }, { "epoch": 4.085510951892566, "grad_norm": 0.32114508748054504, "learning_rate": 3.702343424752517e-05, "loss": 0.0412, "step": 9640 }, { "epoch": 4.08859260115273, "grad_norm": 0.33326685428619385, "learning_rate": 3.701635441879848e-05, "loss": 0.0428, "step": 9650 }, { "epoch": 4.0916742504128925, "grad_norm": 0.32998064160346985, "learning_rate": 3.700926685890012e-05, "loss": 0.0424, "step": 9660 }, { "epoch": 4.094755899673056, "grad_norm": 0.3350954055786133, "learning_rate": 3.7002171571050237e-05, "loss": 0.0423, "step": 9670 }, { "epoch": 4.09783754893322, "grad_norm": 0.29235246777534485, "learning_rate": 3.699506855847249e-05, "loss": 0.0395, "step": 9680 }, { "epoch": 4.100919198193383, "grad_norm": 0.31485816836357117, "learning_rate": 3.6987957824394035e-05, "loss": 0.0463, "step": 9690 }, { "epoch": 4.1040008474535465, "grad_norm": 0.3026577830314636, "learning_rate": 3.698083937204555e-05, "loss": 0.04, "step": 9700 }, { "epoch": 4.10708249671371, "grad_norm": 0.3695080280303955, "learning_rate": 3.697371320466123e-05, "loss": 0.0356, "step": 9710 }, { "epoch": 4.110164145973873, "grad_norm": 0.3437636196613312, "learning_rate": 3.696657932547874e-05, "loss": 0.0382, "step": 9720 }, { "epoch": 4.113245795234037, "grad_norm": 0.29336875677108765, "learning_rate": 3.695943773773929e-05, "loss": 0.0412, "step": 9730 }, { "epoch": 4.116327444494201, "grad_norm": 0.34003928303718567, "learning_rate": 3.6952288444687556e-05, "loss": 0.0404, "step": 9740 }, { "epoch": 4.1194090937543635, "grad_norm": 0.37962788343429565, "learning_rate": 3.694513144957175e-05, "loss": 0.0425, "step": 9750 }, { "epoch": 4.122490743014527, "grad_norm": 0.26789405941963196, "learning_rate": 3.6937966755643545e-05, "loss": 0.0416, "step": 9760 }, { "epoch": 4.12557239227469, "grad_norm": 0.2703173756599426, "learning_rate": 3.6930794366158146e-05, "loss": 0.0383, "step": 9770 }, { "epoch": 4.128654041534854, "grad_norm": 0.32025593519210815, "learning_rate": 3.6923614284374234e-05, "loss": 0.0459, "step": 9780 }, { "epoch": 4.131735690795018, "grad_norm": 0.27730244398117065, "learning_rate": 3.691642651355399e-05, "loss": 0.0411, "step": 9790 }, { "epoch": 4.13481734005518, "grad_norm": 0.2933109700679779, "learning_rate": 3.6909231056963095e-05, "loss": 0.0374, "step": 9800 }, { "epoch": 4.137898989315344, "grad_norm": 0.2956279218196869, "learning_rate": 3.690202791787071e-05, "loss": 0.0405, "step": 9810 }, { "epoch": 4.140980638575508, "grad_norm": 0.32838159799575806, "learning_rate": 3.68948170995495e-05, "loss": 0.0403, "step": 9820 }, { "epoch": 4.144062287835671, "grad_norm": 0.3620191216468811, "learning_rate": 3.6887598605275606e-05, "loss": 0.0425, "step": 9830 }, { "epoch": 4.1471439370958345, "grad_norm": 0.41367578506469727, "learning_rate": 3.688037243832866e-05, "loss": 0.0497, "step": 9840 }, { "epoch": 4.150225586355998, "grad_norm": 0.32846391201019287, "learning_rate": 3.6873138601991786e-05, "loss": 0.0432, "step": 9850 }, { "epoch": 4.153307235616161, "grad_norm": 0.3001476526260376, "learning_rate": 3.686589709955159e-05, "loss": 0.0442, "step": 9860 }, { "epoch": 4.156388884876325, "grad_norm": 0.3394618332386017, "learning_rate": 3.685864793429816e-05, "loss": 0.0455, "step": 9870 }, { "epoch": 4.159470534136489, "grad_norm": 0.3439669609069824, "learning_rate": 3.6851391109525066e-05, "loss": 0.0462, "step": 9880 }, { "epoch": 4.1625521833966515, "grad_norm": 0.30198949575424194, "learning_rate": 3.684412662852935e-05, "loss": 0.0385, "step": 9890 }, { "epoch": 4.165633832656815, "grad_norm": 0.3063207268714905, "learning_rate": 3.683685449461155e-05, "loss": 0.0426, "step": 9900 }, { "epoch": 4.168715481916978, "grad_norm": 0.32436466217041016, "learning_rate": 3.6829574711075654e-05, "loss": 0.0439, "step": 9910 }, { "epoch": 4.171797131177142, "grad_norm": 0.3263995349407196, "learning_rate": 3.682228728122916e-05, "loss": 0.0414, "step": 9920 }, { "epoch": 4.1748787804373055, "grad_norm": 0.3167702853679657, "learning_rate": 3.6814992208383024e-05, "loss": 0.0401, "step": 9930 }, { "epoch": 4.177960429697468, "grad_norm": 0.3152026832103729, "learning_rate": 3.680768949585166e-05, "loss": 0.0425, "step": 9940 }, { "epoch": 4.181042078957632, "grad_norm": 0.32295241951942444, "learning_rate": 3.680037914695296e-05, "loss": 0.0432, "step": 9950 }, { "epoch": 4.184123728217796, "grad_norm": 0.3271796703338623, "learning_rate": 3.679306116500831e-05, "loss": 0.0433, "step": 9960 }, { "epoch": 4.187205377477959, "grad_norm": 0.32456493377685547, "learning_rate": 3.678573555334253e-05, "loss": 0.0437, "step": 9970 }, { "epoch": 4.1902870267381225, "grad_norm": 0.3132164776325226, "learning_rate": 3.6778402315283944e-05, "loss": 0.0411, "step": 9980 }, { "epoch": 4.193368675998286, "grad_norm": 0.27927663922309875, "learning_rate": 3.677106145416429e-05, "loss": 0.04, "step": 9990 }, { "epoch": 4.196450325258449, "grad_norm": 0.29946237802505493, "learning_rate": 3.676371297331882e-05, "loss": 0.0367, "step": 10000 }, { "epoch": 4.199531974518613, "grad_norm": 0.3281599283218384, "learning_rate": 3.675635687608621e-05, "loss": 0.0426, "step": 10010 }, { "epoch": 4.202613623778776, "grad_norm": 0.3641193211078644, "learning_rate": 3.674899316580862e-05, "loss": 0.0445, "step": 10020 }, { "epoch": 4.205695273038939, "grad_norm": 0.3226047158241272, "learning_rate": 3.674162184583167e-05, "loss": 0.04, "step": 10030 }, { "epoch": 4.208776922299103, "grad_norm": 0.29529958963394165, "learning_rate": 3.673424291950441e-05, "loss": 0.0424, "step": 10040 }, { "epoch": 4.211858571559266, "grad_norm": 0.32880842685699463, "learning_rate": 3.672685639017937e-05, "loss": 0.0455, "step": 10050 }, { "epoch": 4.21494022081943, "grad_norm": 0.36458897590637207, "learning_rate": 3.671946226121255e-05, "loss": 0.0426, "step": 10060 }, { "epoch": 4.2180218700795935, "grad_norm": 0.3117377460002899, "learning_rate": 3.671206053596335e-05, "loss": 0.0403, "step": 10070 }, { "epoch": 4.221103519339756, "grad_norm": 0.3280072808265686, "learning_rate": 3.6704651217794675e-05, "loss": 0.0425, "step": 10080 }, { "epoch": 4.22418516859992, "grad_norm": 0.37662777304649353, "learning_rate": 3.669723431007285e-05, "loss": 0.0407, "step": 10090 }, { "epoch": 4.227266817860084, "grad_norm": 0.32971858978271484, "learning_rate": 3.6689809816167656e-05, "loss": 0.0405, "step": 10100 }, { "epoch": 4.230348467120247, "grad_norm": 0.3146090507507324, "learning_rate": 3.668237773945232e-05, "loss": 0.0432, "step": 10110 }, { "epoch": 4.2334301163804104, "grad_norm": 0.30962038040161133, "learning_rate": 3.6674938083303516e-05, "loss": 0.0423, "step": 10120 }, { "epoch": 4.236511765640574, "grad_norm": 0.2676369845867157, "learning_rate": 3.666749085110136e-05, "loss": 0.043, "step": 10130 }, { "epoch": 4.239593414900737, "grad_norm": 0.3658912181854248, "learning_rate": 3.6660036046229415e-05, "loss": 0.0429, "step": 10140 }, { "epoch": 4.242675064160901, "grad_norm": 0.33384639024734497, "learning_rate": 3.665257367207467e-05, "loss": 0.04, "step": 10150 }, { "epoch": 4.245756713421064, "grad_norm": 0.3115827441215515, "learning_rate": 3.664510373202757e-05, "loss": 0.0408, "step": 10160 }, { "epoch": 4.248838362681227, "grad_norm": 0.29378828406333923, "learning_rate": 3.6637626229481995e-05, "loss": 0.0389, "step": 10170 }, { "epoch": 4.251920011941391, "grad_norm": 0.31682097911834717, "learning_rate": 3.663014116783525e-05, "loss": 0.041, "step": 10180 }, { "epoch": 4.255001661201554, "grad_norm": 0.3610402047634125, "learning_rate": 3.6622648550488076e-05, "loss": 0.0405, "step": 10190 }, { "epoch": 4.258083310461718, "grad_norm": 0.3358828127384186, "learning_rate": 3.661514838084467e-05, "loss": 0.0436, "step": 10200 }, { "epoch": 4.2611649597218815, "grad_norm": 0.3402293622493744, "learning_rate": 3.660764066231261e-05, "loss": 0.0426, "step": 10210 }, { "epoch": 4.264246608982044, "grad_norm": 0.29273271560668945, "learning_rate": 3.660012539830297e-05, "loss": 0.0452, "step": 10220 }, { "epoch": 4.267328258242208, "grad_norm": 0.3252599537372589, "learning_rate": 3.65926025922302e-05, "loss": 0.0415, "step": 10230 }, { "epoch": 4.270409907502372, "grad_norm": 0.3180873990058899, "learning_rate": 3.65850722475122e-05, "loss": 0.0441, "step": 10240 }, { "epoch": 4.273491556762535, "grad_norm": 0.3306243121623993, "learning_rate": 3.657753436757028e-05, "loss": 0.0414, "step": 10250 }, { "epoch": 4.276573206022698, "grad_norm": 0.3268897533416748, "learning_rate": 3.65699889558292e-05, "loss": 0.0437, "step": 10260 }, { "epoch": 4.279654855282862, "grad_norm": 0.31589773297309875, "learning_rate": 3.656243601571711e-05, "loss": 0.0444, "step": 10270 }, { "epoch": 4.282736504543025, "grad_norm": 0.30028030276298523, "learning_rate": 3.65548755506656e-05, "loss": 0.0444, "step": 10280 }, { "epoch": 4.285818153803189, "grad_norm": 0.30416807532310486, "learning_rate": 3.654730756410968e-05, "loss": 0.0463, "step": 10290 }, { "epoch": 4.288899803063352, "grad_norm": 0.3526163101196289, "learning_rate": 3.653973205948776e-05, "loss": 0.0419, "step": 10300 }, { "epoch": 4.291981452323515, "grad_norm": 0.30602598190307617, "learning_rate": 3.653214904024169e-05, "loss": 0.0474, "step": 10310 }, { "epoch": 4.295063101583679, "grad_norm": 0.34762275218963623, "learning_rate": 3.652455850981672e-05, "loss": 0.043, "step": 10320 }, { "epoch": 4.298144750843842, "grad_norm": 0.3712126314640045, "learning_rate": 3.651696047166151e-05, "loss": 0.042, "step": 10330 }, { "epoch": 4.301226400104006, "grad_norm": 0.2854516804218292, "learning_rate": 3.650935492922813e-05, "loss": 0.0466, "step": 10340 }, { "epoch": 4.304308049364169, "grad_norm": 0.33146342635154724, "learning_rate": 3.650174188597207e-05, "loss": 0.042, "step": 10350 }, { "epoch": 4.307389698624332, "grad_norm": 0.3074069321155548, "learning_rate": 3.649412134535223e-05, "loss": 0.0394, "step": 10360 }, { "epoch": 4.310471347884496, "grad_norm": 0.34020891785621643, "learning_rate": 3.648649331083091e-05, "loss": 0.0441, "step": 10370 }, { "epoch": 4.31355299714466, "grad_norm": 0.30666857957839966, "learning_rate": 3.647885778587379e-05, "loss": 0.0372, "step": 10380 }, { "epoch": 4.316634646404823, "grad_norm": 0.3114950656890869, "learning_rate": 3.647121477395001e-05, "loss": 0.041, "step": 10390 }, { "epoch": 4.319716295664986, "grad_norm": 0.33240383863449097, "learning_rate": 3.6463564278532056e-05, "loss": 0.0417, "step": 10400 }, { "epoch": 4.322797944925149, "grad_norm": 0.3683377504348755, "learning_rate": 3.645590630309584e-05, "loss": 0.0435, "step": 10410 }, { "epoch": 4.325879594185313, "grad_norm": 0.30594033002853394, "learning_rate": 3.644824085112067e-05, "loss": 0.0418, "step": 10420 }, { "epoch": 4.328961243445477, "grad_norm": 0.2731369733810425, "learning_rate": 3.6440567926089264e-05, "loss": 0.0456, "step": 10430 }, { "epoch": 4.33204289270564, "grad_norm": 0.33231619000434875, "learning_rate": 3.6432887531487695e-05, "loss": 0.0463, "step": 10440 }, { "epoch": 4.335124541965803, "grad_norm": 0.2976283133029938, "learning_rate": 3.642519967080548e-05, "loss": 0.0419, "step": 10450 }, { "epoch": 4.338206191225967, "grad_norm": 0.29846516251564026, "learning_rate": 3.641750434753549e-05, "loss": 0.044, "step": 10460 }, { "epoch": 4.34128784048613, "grad_norm": 0.3507411777973175, "learning_rate": 3.640980156517401e-05, "loss": 0.0446, "step": 10470 }, { "epoch": 4.344369489746294, "grad_norm": 0.3180818259716034, "learning_rate": 3.640209132722069e-05, "loss": 0.0433, "step": 10480 }, { "epoch": 4.347451139006457, "grad_norm": 0.31917834281921387, "learning_rate": 3.6394373637178594e-05, "loss": 0.0418, "step": 10490 }, { "epoch": 4.35053278826662, "grad_norm": 0.3200957179069519, "learning_rate": 3.6386648498554166e-05, "loss": 0.0427, "step": 10500 }, { "epoch": 4.353614437526784, "grad_norm": 0.28474539518356323, "learning_rate": 3.637891591485721e-05, "loss": 0.0423, "step": 10510 }, { "epoch": 4.356696086786947, "grad_norm": 0.29221653938293457, "learning_rate": 3.637117588960095e-05, "loss": 0.0428, "step": 10520 }, { "epoch": 4.359777736047111, "grad_norm": 0.33411985635757446, "learning_rate": 3.636342842630195e-05, "loss": 0.0433, "step": 10530 }, { "epoch": 4.362859385307274, "grad_norm": 0.3637237548828125, "learning_rate": 3.635567352848019e-05, "loss": 0.0413, "step": 10540 }, { "epoch": 4.365941034567437, "grad_norm": 0.31617382168769836, "learning_rate": 3.634791119965901e-05, "loss": 0.0414, "step": 10550 }, { "epoch": 4.369022683827601, "grad_norm": 0.3295506536960602, "learning_rate": 3.6340141443365125e-05, "loss": 0.0396, "step": 10560 }, { "epoch": 4.372104333087765, "grad_norm": 0.3292025029659271, "learning_rate": 3.633236426312863e-05, "loss": 0.0437, "step": 10570 }, { "epoch": 4.3751859823479275, "grad_norm": 0.26511362195014954, "learning_rate": 3.632457966248299e-05, "loss": 0.0404, "step": 10580 }, { "epoch": 4.378267631608091, "grad_norm": 0.33575162291526794, "learning_rate": 3.631678764496505e-05, "loss": 0.0437, "step": 10590 }, { "epoch": 4.381349280868255, "grad_norm": 0.34593477845191956, "learning_rate": 3.630898821411501e-05, "loss": 0.0441, "step": 10600 }, { "epoch": 4.384430930128418, "grad_norm": 0.32075434923171997, "learning_rate": 3.630118137347645e-05, "loss": 0.0408, "step": 10610 }, { "epoch": 4.387512579388582, "grad_norm": 0.3471469581127167, "learning_rate": 3.629336712659631e-05, "loss": 0.0408, "step": 10620 }, { "epoch": 4.3905942286487445, "grad_norm": 0.3235209584236145, "learning_rate": 3.6285545477024886e-05, "loss": 0.0387, "step": 10630 }, { "epoch": 4.393675877908908, "grad_norm": 0.29633450508117676, "learning_rate": 3.627771642831587e-05, "loss": 0.0424, "step": 10640 }, { "epoch": 4.396757527169072, "grad_norm": 0.3094930350780487, "learning_rate": 3.626987998402628e-05, "loss": 0.0388, "step": 10650 }, { "epoch": 4.399839176429235, "grad_norm": 0.36882707476615906, "learning_rate": 3.626203614771651e-05, "loss": 0.0456, "step": 10660 }, { "epoch": 4.402920825689399, "grad_norm": 0.3292597830295563, "learning_rate": 3.6254184922950314e-05, "loss": 0.0442, "step": 10670 }, { "epoch": 4.406002474949562, "grad_norm": 0.361396461725235, "learning_rate": 3.6246326313294804e-05, "loss": 0.0434, "step": 10680 }, { "epoch": 4.409084124209725, "grad_norm": 0.33741047978401184, "learning_rate": 3.623846032232043e-05, "loss": 0.0468, "step": 10690 }, { "epoch": 4.412165773469889, "grad_norm": 0.3026597499847412, "learning_rate": 3.6230586953601015e-05, "loss": 0.0458, "step": 10700 }, { "epoch": 4.415247422730053, "grad_norm": 0.3589765727519989, "learning_rate": 3.6222706210713734e-05, "loss": 0.0412, "step": 10710 }, { "epoch": 4.4183290719902155, "grad_norm": 0.34596431255340576, "learning_rate": 3.621481809723909e-05, "loss": 0.0465, "step": 10720 }, { "epoch": 4.421410721250379, "grad_norm": 0.34278133511543274, "learning_rate": 3.620692261676097e-05, "loss": 0.0425, "step": 10730 }, { "epoch": 4.424492370510543, "grad_norm": 0.3546799123287201, "learning_rate": 3.6199019772866575e-05, "loss": 0.0402, "step": 10740 }, { "epoch": 4.427574019770706, "grad_norm": 0.34626173973083496, "learning_rate": 3.6191109569146467e-05, "loss": 0.0457, "step": 10750 }, { "epoch": 4.43065566903087, "grad_norm": 0.3132508099079132, "learning_rate": 3.618319200919455e-05, "loss": 0.0489, "step": 10760 }, { "epoch": 4.433737318291033, "grad_norm": 0.30217868089675903, "learning_rate": 3.617526709660807e-05, "loss": 0.0427, "step": 10770 }, { "epoch": 4.436818967551196, "grad_norm": 0.33972036838531494, "learning_rate": 3.616733483498762e-05, "loss": 0.0408, "step": 10780 }, { "epoch": 4.43990061681136, "grad_norm": 0.39725083112716675, "learning_rate": 3.615939522793712e-05, "loss": 0.0422, "step": 10790 }, { "epoch": 4.442982266071523, "grad_norm": 0.32679492235183716, "learning_rate": 3.615144827906383e-05, "loss": 0.0391, "step": 10800 }, { "epoch": 4.4460639153316865, "grad_norm": 0.33433806896209717, "learning_rate": 3.6143493991978355e-05, "loss": 0.0456, "step": 10810 }, { "epoch": 4.44914556459185, "grad_norm": 0.3037717938423157, "learning_rate": 3.613553237029462e-05, "loss": 0.0382, "step": 10820 }, { "epoch": 4.452227213852013, "grad_norm": 0.3224252164363861, "learning_rate": 3.612756341762991e-05, "loss": 0.0424, "step": 10830 }, { "epoch": 4.455308863112177, "grad_norm": 0.288066565990448, "learning_rate": 3.61195871376048e-05, "loss": 0.0405, "step": 10840 }, { "epoch": 4.458390512372341, "grad_norm": 0.3300135135650635, "learning_rate": 3.611160353384322e-05, "loss": 0.0426, "step": 10850 }, { "epoch": 4.4614721616325035, "grad_norm": 0.25224244594573975, "learning_rate": 3.610361260997242e-05, "loss": 0.044, "step": 10860 }, { "epoch": 4.464553810892667, "grad_norm": 0.29439032077789307, "learning_rate": 3.609561436962298e-05, "loss": 0.0403, "step": 10870 }, { "epoch": 4.467635460152831, "grad_norm": 0.3318496346473694, "learning_rate": 3.608760881642881e-05, "loss": 0.0441, "step": 10880 }, { "epoch": 4.470717109412994, "grad_norm": 0.3241943418979645, "learning_rate": 3.6079595954027126e-05, "loss": 0.0455, "step": 10890 }, { "epoch": 4.473798758673158, "grad_norm": 0.36015984416007996, "learning_rate": 3.607157578605847e-05, "loss": 0.0413, "step": 10900 }, { "epoch": 4.47688040793332, "grad_norm": 0.3000034987926483, "learning_rate": 3.6063548316166726e-05, "loss": 0.0402, "step": 10910 }, { "epoch": 4.479962057193484, "grad_norm": 0.3295136094093323, "learning_rate": 3.605551354799905e-05, "loss": 0.041, "step": 10920 }, { "epoch": 4.483043706453648, "grad_norm": 0.3507075607776642, "learning_rate": 3.6047471485205955e-05, "loss": 0.0399, "step": 10930 }, { "epoch": 4.486125355713811, "grad_norm": 0.3125300407409668, "learning_rate": 3.603942213144126e-05, "loss": 0.0406, "step": 10940 }, { "epoch": 4.4892070049739745, "grad_norm": 0.29813337326049805, "learning_rate": 3.603136549036208e-05, "loss": 0.0388, "step": 10950 }, { "epoch": 4.492288654234138, "grad_norm": 0.2965327799320221, "learning_rate": 3.602330156562885e-05, "loss": 0.0461, "step": 10960 }, { "epoch": 4.495370303494301, "grad_norm": 0.32570433616638184, "learning_rate": 3.601523036090532e-05, "loss": 0.0416, "step": 10970 }, { "epoch": 4.498451952754465, "grad_norm": 0.3090783655643463, "learning_rate": 3.600715187985854e-05, "loss": 0.0417, "step": 10980 }, { "epoch": 4.501533602014629, "grad_norm": 0.29515308141708374, "learning_rate": 3.599906612615887e-05, "loss": 0.0401, "step": 10990 }, { "epoch": 4.5046152512747915, "grad_norm": 0.30470454692840576, "learning_rate": 3.5990973103479976e-05, "loss": 0.0387, "step": 11000 }, { "epoch": 4.507696900534955, "grad_norm": 0.3292587995529175, "learning_rate": 3.598287281549881e-05, "loss": 0.0378, "step": 11010 }, { "epoch": 4.510778549795118, "grad_norm": 0.33807411789894104, "learning_rate": 3.597476526589566e-05, "loss": 0.039, "step": 11020 }, { "epoch": 4.513860199055282, "grad_norm": 0.3111196756362915, "learning_rate": 3.596665045835408e-05, "loss": 0.0423, "step": 11030 }, { "epoch": 4.5169418483154455, "grad_norm": 0.30805960297584534, "learning_rate": 3.5958528396560925e-05, "loss": 0.0394, "step": 11040 }, { "epoch": 4.520023497575608, "grad_norm": 0.3363763391971588, "learning_rate": 3.595039908420637e-05, "loss": 0.0399, "step": 11050 }, { "epoch": 2.301725819666801, "grad_norm": 0.30268439650535583, "learning_rate": 3.594226252498385e-05, "loss": 0.0381, "step": 11060 }, { "epoch": 2.303806687388641, "grad_norm": 0.2891196608543396, "learning_rate": 3.593411872259013e-05, "loss": 0.035, "step": 11070 }, { "epoch": 2.305887555110481, "grad_norm": 0.2854248881340027, "learning_rate": 3.5925967680725236e-05, "loss": 0.0377, "step": 11080 }, { "epoch": 2.307968422832321, "grad_norm": 0.3208191394805908, "learning_rate": 3.591780940309249e-05, "loss": 0.037, "step": 11090 }, { "epoch": 2.310049290554161, "grad_norm": 0.27559521794319153, "learning_rate": 3.590964389339852e-05, "loss": 0.0355, "step": 11100 }, { "epoch": 2.312130158276001, "grad_norm": 0.2856278121471405, "learning_rate": 3.5901471155353204e-05, "loss": 0.0358, "step": 11110 }, { "epoch": 2.314211025997841, "grad_norm": 0.2758062183856964, "learning_rate": 3.5893291192669746e-05, "loss": 0.0418, "step": 11120 }, { "epoch": 2.316291893719681, "grad_norm": 0.29176098108291626, "learning_rate": 3.5885104009064605e-05, "loss": 0.0338, "step": 11130 }, { "epoch": 2.318372761441521, "grad_norm": 0.3503691554069519, "learning_rate": 3.587690960825752e-05, "loss": 0.0388, "step": 11140 }, { "epoch": 2.320453629163361, "grad_norm": 0.3110394775867462, "learning_rate": 3.586870799397152e-05, "loss": 0.0379, "step": 11150 }, { "epoch": 2.322534496885201, "grad_norm": 0.2984427511692047, "learning_rate": 3.586049916993291e-05, "loss": 0.0392, "step": 11160 }, { "epoch": 2.324615364607041, "grad_norm": 0.3040499687194824, "learning_rate": 3.585228313987127e-05, "loss": 0.0336, "step": 11170 }, { "epoch": 2.326696232328881, "grad_norm": 0.2670856714248657, "learning_rate": 3.584405990751945e-05, "loss": 0.0346, "step": 11180 }, { "epoch": 2.328777100050721, "grad_norm": 0.30487948656082153, "learning_rate": 3.583582947661357e-05, "loss": 0.039, "step": 11190 }, { "epoch": 2.330857967772561, "grad_norm": 0.3018737733364105, "learning_rate": 3.582759185089304e-05, "loss": 0.0381, "step": 11200 }, { "epoch": 2.332938835494401, "grad_norm": 0.31703054904937744, "learning_rate": 3.58193470341005e-05, "loss": 0.036, "step": 11210 }, { "epoch": 2.335019703216241, "grad_norm": 0.3566927909851074, "learning_rate": 3.58110950299819e-05, "loss": 0.035, "step": 11220 }, { "epoch": 2.337100570938081, "grad_norm": 0.2971939444541931, "learning_rate": 3.580283584228643e-05, "loss": 0.0392, "step": 11230 }, { "epoch": 2.3391814386599212, "grad_norm": 0.3059505224227905, "learning_rate": 3.579456947476656e-05, "loss": 0.0353, "step": 11240 }, { "epoch": 2.3412623063817612, "grad_norm": 0.2918558716773987, "learning_rate": 3.5786295931178e-05, "loss": 0.0332, "step": 11250 }, { "epoch": 2.3433431741036013, "grad_norm": 0.2899167239665985, "learning_rate": 3.577801521527974e-05, "loss": 0.0336, "step": 11260 }, { "epoch": 2.3454240418254413, "grad_norm": 0.2957324683666229, "learning_rate": 3.576972733083402e-05, "loss": 0.034, "step": 11270 }, { "epoch": 2.3475049095472813, "grad_norm": 0.35977745056152344, "learning_rate": 3.576143228160633e-05, "loss": 0.0431, "step": 11280 }, { "epoch": 2.3495857772691213, "grad_norm": 0.265982985496521, "learning_rate": 3.5753130071365437e-05, "loss": 0.035, "step": 11290 }, { "epoch": 2.3516666449909613, "grad_norm": 0.2920279800891876, "learning_rate": 3.574482070388334e-05, "loss": 0.0376, "step": 11300 }, { "epoch": 2.3537475127128014, "grad_norm": 0.29442721605300903, "learning_rate": 3.573650418293531e-05, "loss": 0.0344, "step": 11310 }, { "epoch": 2.3558283804346414, "grad_norm": 0.29460766911506653, "learning_rate": 3.572818051229984e-05, "loss": 0.0356, "step": 11320 }, { "epoch": 2.3579092481564814, "grad_norm": 0.3080938160419464, "learning_rate": 3.57198496957587e-05, "loss": 0.035, "step": 11330 }, { "epoch": 2.3599901158783214, "grad_norm": 0.3513731062412262, "learning_rate": 3.571151173709688e-05, "loss": 0.0379, "step": 11340 }, { "epoch": 2.3620709836001614, "grad_norm": 0.2877587080001831, "learning_rate": 3.570316664010265e-05, "loss": 0.0363, "step": 11350 }, { "epoch": 2.3641518513220015, "grad_norm": 0.3214273750782013, "learning_rate": 3.569481440856748e-05, "loss": 0.034, "step": 11360 }, { "epoch": 2.3662327190438415, "grad_norm": 0.36585351824760437, "learning_rate": 3.568645504628612e-05, "loss": 0.0418, "step": 11370 }, { "epoch": 2.368313586765681, "grad_norm": 0.3446786105632782, "learning_rate": 3.567808855705654e-05, "loss": 0.0378, "step": 11380 }, { "epoch": 2.3703944544875215, "grad_norm": 0.3472091853618622, "learning_rate": 3.5669714944679935e-05, "loss": 0.0343, "step": 11390 }, { "epoch": 2.372475322209361, "grad_norm": 0.3366478383541107, "learning_rate": 3.566133421296078e-05, "loss": 0.0383, "step": 11400 }, { "epoch": 2.3745561899312015, "grad_norm": 0.2659747898578644, "learning_rate": 3.5652946365706744e-05, "loss": 0.0367, "step": 11410 }, { "epoch": 2.376637057653041, "grad_norm": 0.3204088807106018, "learning_rate": 3.564455140672874e-05, "loss": 0.0409, "step": 11420 }, { "epoch": 2.378717925374881, "grad_norm": 0.30545324087142944, "learning_rate": 3.563614933984092e-05, "loss": 0.0346, "step": 11430 }, { "epoch": 2.380798793096721, "grad_norm": 0.33762693405151367, "learning_rate": 3.562774016886066e-05, "loss": 0.0369, "step": 11440 }, { "epoch": 2.382879660818561, "grad_norm": 0.3624821901321411, "learning_rate": 3.5619323897608555e-05, "loss": 0.0395, "step": 11450 }, { "epoch": 2.384960528540401, "grad_norm": 0.3454044461250305, "learning_rate": 3.561090052990845e-05, "loss": 0.0371, "step": 11460 }, { "epoch": 2.387041396262241, "grad_norm": 0.29174354672431946, "learning_rate": 3.560247006958738e-05, "loss": 0.0385, "step": 11470 }, { "epoch": 2.3891222639840812, "grad_norm": 0.304726779460907, "learning_rate": 3.5594032520475636e-05, "loss": 0.0364, "step": 11480 }, { "epoch": 2.3912031317059212, "grad_norm": 0.28460097312927246, "learning_rate": 3.558558788640671e-05, "loss": 0.0353, "step": 11490 }, { "epoch": 2.3932839994277613, "grad_norm": 0.29018497467041016, "learning_rate": 3.5577136171217316e-05, "loss": 0.0344, "step": 11500 }, { "epoch": 2.3953648671496013, "grad_norm": 0.3227468729019165, "learning_rate": 3.556867737874738e-05, "loss": 0.0368, "step": 11510 }, { "epoch": 2.3974457348714413, "grad_norm": 0.34828704595565796, "learning_rate": 3.556021151284007e-05, "loss": 0.0388, "step": 11520 }, { "epoch": 2.3995266025932813, "grad_norm": 0.303180992603302, "learning_rate": 3.5551738577341734e-05, "loss": 0.0344, "step": 11530 }, { "epoch": 2.4016074703151213, "grad_norm": 0.3118494749069214, "learning_rate": 3.554325857610195e-05, "loss": 0.0356, "step": 11540 }, { "epoch": 2.4036883380369614, "grad_norm": 0.3530193865299225, "learning_rate": 3.55347715129735e-05, "loss": 0.0349, "step": 11550 }, { "epoch": 2.4057692057588014, "grad_norm": 0.2898083031177521, "learning_rate": 3.552627739181238e-05, "loss": 0.0347, "step": 11560 }, { "epoch": 2.4078500734806414, "grad_norm": 0.2934173345565796, "learning_rate": 3.551777621647779e-05, "loss": 0.0355, "step": 11570 }, { "epoch": 2.4099309412024814, "grad_norm": 0.3120407164096832, "learning_rate": 3.550926799083213e-05, "loss": 0.0356, "step": 11580 }, { "epoch": 2.4120118089243214, "grad_norm": 0.36177578568458557, "learning_rate": 3.550075271874101e-05, "loss": 0.0366, "step": 11590 }, { "epoch": 2.4140926766461615, "grad_norm": 0.3461219072341919, "learning_rate": 3.549223040407323e-05, "loss": 0.0365, "step": 11600 }, { "epoch": 2.4161735443680015, "grad_norm": 0.28728434443473816, "learning_rate": 3.548370105070083e-05, "loss": 0.0359, "step": 11610 }, { "epoch": 2.4182544120898415, "grad_norm": 0.3010919988155365, "learning_rate": 3.547516466249898e-05, "loss": 0.0381, "step": 11620 }, { "epoch": 2.4203352798116815, "grad_norm": 0.35254737734794617, "learning_rate": 3.546662124334611e-05, "loss": 0.0392, "step": 11630 }, { "epoch": 2.4224161475335215, "grad_norm": 0.3366020917892456, "learning_rate": 3.54580707971238e-05, "loss": 0.035, "step": 11640 }, { "epoch": 2.4244970152553615, "grad_norm": 0.3122754395008087, "learning_rate": 3.5449513327716846e-05, "loss": 0.0381, "step": 11650 }, { "epoch": 2.4265778829772016, "grad_norm": 0.26322808861732483, "learning_rate": 3.544094883901323e-05, "loss": 0.0322, "step": 11660 }, { "epoch": 2.4286587506990416, "grad_norm": 0.3448466956615448, "learning_rate": 3.543237733490413e-05, "loss": 0.0371, "step": 11670 }, { "epoch": 2.4307396184208816, "grad_norm": 0.305071622133255, "learning_rate": 3.542379881928389e-05, "loss": 0.0376, "step": 11680 }, { "epoch": 2.4328204861427216, "grad_norm": 0.3848431408405304, "learning_rate": 3.5415213296050065e-05, "loss": 0.0386, "step": 11690 }, { "epoch": 2.4349013538645616, "grad_norm": 0.3197186589241028, "learning_rate": 3.5406620769103364e-05, "loss": 0.0376, "step": 11700 }, { "epoch": 2.4369822215864017, "grad_norm": 0.296649307012558, "learning_rate": 3.539802124234771e-05, "loss": 0.0391, "step": 11710 }, { "epoch": 2.4390630893082417, "grad_norm": 0.33391237258911133, "learning_rate": 3.53894147196902e-05, "loss": 0.0425, "step": 11720 }, { "epoch": 2.4411439570300817, "grad_norm": 0.33323973417282104, "learning_rate": 3.538080120504109e-05, "loss": 0.0391, "step": 11730 }, { "epoch": 2.4432248247519217, "grad_norm": 0.2884734272956848, "learning_rate": 3.537218070231383e-05, "loss": 0.038, "step": 11740 }, { "epoch": 2.4453056924737617, "grad_norm": 0.3230050802230835, "learning_rate": 3.536355321542503e-05, "loss": 0.0361, "step": 11750 }, { "epoch": 2.4473865601956017, "grad_norm": 0.32755517959594727, "learning_rate": 3.535491874829449e-05, "loss": 0.035, "step": 11760 }, { "epoch": 2.4494674279174418, "grad_norm": 0.2821137309074402, "learning_rate": 3.5346277304845173e-05, "loss": 0.0374, "step": 11770 }, { "epoch": 2.451548295639282, "grad_norm": 0.3168204128742218, "learning_rate": 3.533762888900322e-05, "loss": 0.0406, "step": 11780 }, { "epoch": 2.4536291633611214, "grad_norm": 0.35218650102615356, "learning_rate": 3.532897350469792e-05, "loss": 0.0327, "step": 11790 }, { "epoch": 2.455710031082962, "grad_norm": 0.2872650921344757, "learning_rate": 3.5320311155861745e-05, "loss": 0.0434, "step": 11800 }, { "epoch": 2.4577908988048014, "grad_norm": 0.31303051114082336, "learning_rate": 3.5311641846430335e-05, "loss": 0.0392, "step": 11810 }, { "epoch": 2.459871766526642, "grad_norm": 0.3220546841621399, "learning_rate": 3.530296558034248e-05, "loss": 0.0375, "step": 11820 }, { "epoch": 2.4619526342484814, "grad_norm": 0.292793869972229, "learning_rate": 3.529428236154012e-05, "loss": 0.0386, "step": 11830 }, { "epoch": 2.4640335019703214, "grad_norm": 0.2619633972644806, "learning_rate": 3.528559219396838e-05, "loss": 0.0351, "step": 11840 }, { "epoch": 2.4661143696921615, "grad_norm": 0.32612138986587524, "learning_rate": 3.527689508157555e-05, "loss": 0.0389, "step": 11850 }, { "epoch": 2.4681952374140015, "grad_norm": 0.3157649338245392, "learning_rate": 3.526819102831302e-05, "loss": 0.0394, "step": 11860 }, { "epoch": 2.4702761051358415, "grad_norm": 0.32882100343704224, "learning_rate": 3.5259480038135404e-05, "loss": 0.0315, "step": 11870 }, { "epoch": 2.4723569728576815, "grad_norm": 0.28571221232414246, "learning_rate": 3.525076211500041e-05, "loss": 0.0366, "step": 11880 }, { "epoch": 2.4744378405795215, "grad_norm": 0.3465617001056671, "learning_rate": 3.5242037262868923e-05, "loss": 0.0361, "step": 11890 }, { "epoch": 2.4765187083013616, "grad_norm": 0.30079367756843567, "learning_rate": 3.523330548570498e-05, "loss": 0.0391, "step": 11900 }, { "epoch": 2.4785995760232016, "grad_norm": 0.2995516359806061, "learning_rate": 3.522456678747576e-05, "loss": 0.0378, "step": 11910 }, { "epoch": 2.4806804437450416, "grad_norm": 0.2697989344596863, "learning_rate": 3.5215821172151566e-05, "loss": 0.0362, "step": 11920 }, { "epoch": 2.4827613114668816, "grad_norm": 0.2952364981174469, "learning_rate": 3.5207068643705876e-05, "loss": 0.0415, "step": 11930 }, { "epoch": 2.4848421791887216, "grad_norm": 0.37823086977005005, "learning_rate": 3.5198309206115286e-05, "loss": 0.0372, "step": 11940 }, { "epoch": 2.4869230469105617, "grad_norm": 0.31549614667892456, "learning_rate": 3.518954286335954e-05, "loss": 0.0342, "step": 11950 }, { "epoch": 2.4890039146324017, "grad_norm": 0.3042607605457306, "learning_rate": 3.518076961942153e-05, "loss": 0.0393, "step": 11960 }, { "epoch": 2.4910847823542417, "grad_norm": 0.33682650327682495, "learning_rate": 3.5171989478287255e-05, "loss": 0.0368, "step": 11970 }, { "epoch": 2.4931656500760817, "grad_norm": 0.3069358170032501, "learning_rate": 3.516320244394587e-05, "loss": 0.0345, "step": 11980 }, { "epoch": 2.4952465177979217, "grad_norm": 0.31737157702445984, "learning_rate": 3.515440852038966e-05, "loss": 0.04, "step": 11990 }, { "epoch": 2.4973273855197617, "grad_norm": 0.30659279227256775, "learning_rate": 3.5145607711614026e-05, "loss": 0.0375, "step": 12000 }, { "epoch": 2.4994082532416018, "grad_norm": 0.3092106580734253, "learning_rate": 3.513680002161752e-05, "loss": 0.0347, "step": 12010 }, { "epoch": 2.501489120963442, "grad_norm": 0.31598004698753357, "learning_rate": 3.512798545440181e-05, "loss": 0.0362, "step": 12020 }, { "epoch": 2.503569988685282, "grad_norm": 0.2942461669445038, "learning_rate": 3.511916401397167e-05, "loss": 0.0362, "step": 12030 }, { "epoch": 2.505650856407122, "grad_norm": 0.29198601841926575, "learning_rate": 3.5110335704335025e-05, "loss": 0.0362, "step": 12040 }, { "epoch": 2.507731724128962, "grad_norm": 0.3306974172592163, "learning_rate": 3.5101500529502904e-05, "loss": 0.0351, "step": 12050 }, { "epoch": 2.509812591850802, "grad_norm": 0.2895551919937134, "learning_rate": 3.509265849348946e-05, "loss": 0.0414, "step": 12060 }, { "epoch": 2.511893459572642, "grad_norm": 0.27072635293006897, "learning_rate": 3.508380960031197e-05, "loss": 0.0336, "step": 12070 }, { "epoch": 2.513974327294482, "grad_norm": 0.2842830717563629, "learning_rate": 3.507495385399082e-05, "loss": 0.0372, "step": 12080 }, { "epoch": 2.516055195016322, "grad_norm": 0.3376796841621399, "learning_rate": 3.50660912585495e-05, "loss": 0.0367, "step": 12090 }, { "epoch": 2.518136062738162, "grad_norm": 0.31657376885414124, "learning_rate": 3.5057221818014626e-05, "loss": 0.0351, "step": 12100 }, { "epoch": 2.520216930460002, "grad_norm": 0.34025925397872925, "learning_rate": 3.504834553641592e-05, "loss": 0.0382, "step": 12110 }, { "epoch": 2.522297798181842, "grad_norm": 0.3006317913532257, "learning_rate": 3.503946241778621e-05, "loss": 0.0356, "step": 12120 }, { "epoch": 2.524378665903682, "grad_norm": 0.2715776264667511, "learning_rate": 3.5030572466161445e-05, "loss": 0.0346, "step": 12130 }, { "epoch": 2.526459533625522, "grad_norm": 0.31406304240226746, "learning_rate": 3.502167568558064e-05, "loss": 0.038, "step": 12140 }, { "epoch": 2.528540401347362, "grad_norm": 0.3206557035446167, "learning_rate": 3.5012772080085964e-05, "loss": 0.0344, "step": 12150 }, { "epoch": 2.5306212690692016, "grad_norm": 0.27863243222236633, "learning_rate": 3.5003861653722646e-05, "loss": 0.0317, "step": 12160 }, { "epoch": 2.532702136791042, "grad_norm": 0.3108104467391968, "learning_rate": 3.499494441053903e-05, "loss": 0.0374, "step": 12170 }, { "epoch": 2.5347830045128816, "grad_norm": 0.2993543744087219, "learning_rate": 3.498602035458655e-05, "loss": 0.0376, "step": 12180 }, { "epoch": 2.536863872234722, "grad_norm": 0.3016905188560486, "learning_rate": 3.4977089489919755e-05, "loss": 0.0386, "step": 12190 }, { "epoch": 2.5389447399565617, "grad_norm": 0.3379725217819214, "learning_rate": 3.496815182059627e-05, "loss": 0.0362, "step": 12200 }, { "epoch": 2.541025607678402, "grad_norm": 0.28729456663131714, "learning_rate": 3.4959207350676805e-05, "loss": 0.0367, "step": 12210 }, { "epoch": 2.5431064754002417, "grad_norm": 0.3103012144565582, "learning_rate": 3.4950256084225185e-05, "loss": 0.0351, "step": 12220 }, { "epoch": 2.545187343122082, "grad_norm": 0.2896404266357422, "learning_rate": 3.49412980253083e-05, "loss": 0.0328, "step": 12230 }, { "epoch": 2.5472682108439217, "grad_norm": 0.29043471813201904, "learning_rate": 3.4932333177996135e-05, "loss": 0.0354, "step": 12240 }, { "epoch": 2.5493490785657618, "grad_norm": 0.3100534677505493, "learning_rate": 3.492336154636175e-05, "loss": 0.0326, "step": 12250 }, { "epoch": 2.551429946287602, "grad_norm": 0.34353360533714294, "learning_rate": 3.491438313448131e-05, "loss": 0.0354, "step": 12260 }, { "epoch": 2.553510814009442, "grad_norm": 0.3380125164985657, "learning_rate": 3.4905397946434044e-05, "loss": 0.0386, "step": 12270 }, { "epoch": 2.555591681731282, "grad_norm": 0.2802422344684601, "learning_rate": 3.4896405986302254e-05, "loss": 0.0394, "step": 12280 }, { "epoch": 2.557672549453122, "grad_norm": 0.29394522309303284, "learning_rate": 3.4887407258171326e-05, "loss": 0.033, "step": 12290 }, { "epoch": 2.559753417174962, "grad_norm": 0.35762542486190796, "learning_rate": 3.487840176612973e-05, "loss": 0.0347, "step": 12300 }, { "epoch": 2.561834284896802, "grad_norm": 0.3386646509170532, "learning_rate": 3.4869389514269e-05, "loss": 0.0347, "step": 12310 }, { "epoch": 2.563915152618642, "grad_norm": 0.29994726181030273, "learning_rate": 3.486037050668374e-05, "loss": 0.0337, "step": 12320 }, { "epoch": 2.565996020340482, "grad_norm": 0.34277448058128357, "learning_rate": 3.485134474747162e-05, "loss": 0.0383, "step": 12330 }, { "epoch": 2.568076888062322, "grad_norm": 0.32160839438438416, "learning_rate": 3.484231224073338e-05, "loss": 0.0344, "step": 12340 }, { "epoch": 2.570157755784162, "grad_norm": 0.3271612226963043, "learning_rate": 3.483327299057285e-05, "loss": 0.0334, "step": 12350 }, { "epoch": 2.572238623506002, "grad_norm": 0.30362653732299805, "learning_rate": 3.482422700109687e-05, "loss": 0.0348, "step": 12360 }, { "epoch": 2.574319491227842, "grad_norm": 0.3347521126270294, "learning_rate": 3.481517427641541e-05, "loss": 0.0332, "step": 12370 }, { "epoch": 2.576400358949682, "grad_norm": 0.2925850450992584, "learning_rate": 3.4806114820641444e-05, "loss": 0.0356, "step": 12380 }, { "epoch": 2.578481226671522, "grad_norm": 0.26429876685142517, "learning_rate": 3.4797048637891016e-05, "loss": 0.0356, "step": 12390 }, { "epoch": 2.580562094393362, "grad_norm": 0.2946569621562958, "learning_rate": 3.478797573228326e-05, "loss": 0.0335, "step": 12400 }, { "epoch": 2.582642962115202, "grad_norm": 0.30468007922172546, "learning_rate": 3.477889610794031e-05, "loss": 0.0302, "step": 12410 }, { "epoch": 2.584723829837042, "grad_norm": 0.38808274269104004, "learning_rate": 3.4769809768987404e-05, "loss": 0.0389, "step": 12420 }, { "epoch": 2.586804697558882, "grad_norm": 0.2821674048900604, "learning_rate": 3.4760716719552806e-05, "loss": 0.0383, "step": 12430 }, { "epoch": 2.588885565280722, "grad_norm": 0.2982695698738098, "learning_rate": 3.475161696376782e-05, "loss": 0.0392, "step": 12440 }, { "epoch": 2.590966433002562, "grad_norm": 0.29223743081092834, "learning_rate": 3.474251050576682e-05, "loss": 0.0342, "step": 12450 }, { "epoch": 2.593047300724402, "grad_norm": 0.3061562478542328, "learning_rate": 3.47333973496872e-05, "loss": 0.0385, "step": 12460 }, { "epoch": 2.595128168446242, "grad_norm": 0.3548818826675415, "learning_rate": 3.472427749966942e-05, "loss": 0.0352, "step": 12470 }, { "epoch": 2.597209036168082, "grad_norm": 0.3254649341106415, "learning_rate": 3.4715150959856976e-05, "loss": 0.0374, "step": 12480 }, { "epoch": 2.599289903889922, "grad_norm": 0.29093897342681885, "learning_rate": 3.470601773439639e-05, "loss": 0.0351, "step": 12490 }, { "epoch": 2.6013707716117622, "grad_norm": 0.3551461696624756, "learning_rate": 3.469687782743724e-05, "loss": 0.037, "step": 12500 }, { "epoch": 2.6034516393336022, "grad_norm": 0.3390995264053345, "learning_rate": 3.468773124313212e-05, "loss": 0.0359, "step": 12510 }, { "epoch": 2.6055325070554423, "grad_norm": 0.3200623393058777, "learning_rate": 3.4678577985636676e-05, "loss": 0.0363, "step": 12520 }, { "epoch": 2.6076133747772823, "grad_norm": 0.36658236384391785, "learning_rate": 3.466941805910958e-05, "loss": 0.0325, "step": 12530 }, { "epoch": 2.6096942424991223, "grad_norm": 0.29639938473701477, "learning_rate": 3.4660251467712515e-05, "loss": 0.0364, "step": 12540 }, { "epoch": 2.611775110220962, "grad_norm": 0.27615025639533997, "learning_rate": 3.4651078215610235e-05, "loss": 0.0374, "step": 12550 }, { "epoch": 2.6138559779428023, "grad_norm": 0.3296145498752594, "learning_rate": 3.464189830697048e-05, "loss": 0.0371, "step": 12560 }, { "epoch": 2.615936845664642, "grad_norm": 0.32468146085739136, "learning_rate": 3.463271174596402e-05, "loss": 0.0339, "step": 12570 }, { "epoch": 2.6180177133864824, "grad_norm": 0.31952914595603943, "learning_rate": 3.462351853676468e-05, "loss": 0.0377, "step": 12580 }, { "epoch": 2.620098581108322, "grad_norm": 0.3083640933036804, "learning_rate": 3.461431868354926e-05, "loss": 0.0313, "step": 12590 }, { "epoch": 2.6221794488301624, "grad_norm": 0.3006347417831421, "learning_rate": 3.4605112190497606e-05, "loss": 0.0412, "step": 12600 }, { "epoch": 2.624260316552002, "grad_norm": 0.2754458785057068, "learning_rate": 3.459589906179258e-05, "loss": 0.0368, "step": 12610 }, { "epoch": 2.6263411842738424, "grad_norm": 0.2939032316207886, "learning_rate": 3.458667930162006e-05, "loss": 0.0375, "step": 12620 }, { "epoch": 2.628422051995682, "grad_norm": 0.33566784858703613, "learning_rate": 3.457745291416891e-05, "loss": 0.0329, "step": 12630 }, { "epoch": 2.6305029197175225, "grad_norm": 0.3357202410697937, "learning_rate": 3.456821990363104e-05, "loss": 0.0348, "step": 12640 }, { "epoch": 2.632583787439362, "grad_norm": 0.29309195280075073, "learning_rate": 3.455898027420136e-05, "loss": 0.0369, "step": 12650 }, { "epoch": 2.634664655161202, "grad_norm": 0.28545552492141724, "learning_rate": 3.454973403007777e-05, "loss": 0.0351, "step": 12660 }, { "epoch": 2.636745522883042, "grad_norm": 0.2724292278289795, "learning_rate": 3.45404811754612e-05, "loss": 0.0326, "step": 12670 }, { "epoch": 2.638826390604882, "grad_norm": 0.2767535448074341, "learning_rate": 3.453122171455556e-05, "loss": 0.0323, "step": 12680 }, { "epoch": 2.640907258326722, "grad_norm": 0.30563876032829285, "learning_rate": 3.452195565156777e-05, "loss": 0.0371, "step": 12690 }, { "epoch": 2.642988126048562, "grad_norm": 0.25403088331222534, "learning_rate": 3.451268299070776e-05, "loss": 0.0326, "step": 12700 }, { "epoch": 2.645068993770402, "grad_norm": 0.39842280745506287, "learning_rate": 3.4503403736188455e-05, "loss": 0.0384, "step": 12710 }, { "epoch": 2.647149861492242, "grad_norm": 0.2673007547855377, "learning_rate": 3.449411789222575e-05, "loss": 0.0334, "step": 12720 }, { "epoch": 2.649230729214082, "grad_norm": 0.3502455949783325, "learning_rate": 3.448482546303858e-05, "loss": 0.0364, "step": 12730 }, { "epoch": 2.651311596935922, "grad_norm": 0.30819061398506165, "learning_rate": 3.447552645284883e-05, "loss": 0.0398, "step": 12740 }, { "epoch": 2.6533924646577622, "grad_norm": 0.2750566899776459, "learning_rate": 3.4466220865881396e-05, "loss": 0.0348, "step": 12750 }, { "epoch": 2.6554733323796023, "grad_norm": 0.2821422517299652, "learning_rate": 3.445690870636414e-05, "loss": 0.0357, "step": 12760 }, { "epoch": 2.6575542001014423, "grad_norm": 0.3496500849723816, "learning_rate": 3.444758997852796e-05, "loss": 0.0402, "step": 12770 }, { "epoch": 2.6596350678232823, "grad_norm": 0.26533234119415283, "learning_rate": 3.4438264686606676e-05, "loss": 0.0345, "step": 12780 }, { "epoch": 2.6617159355451223, "grad_norm": 0.3063587248325348, "learning_rate": 3.4428932834837136e-05, "loss": 0.0352, "step": 12790 }, { "epoch": 2.6637968032669623, "grad_norm": 0.3205823600292206, "learning_rate": 3.4419594427459156e-05, "loss": 0.0365, "step": 12800 }, { "epoch": 2.6658776709888024, "grad_norm": 0.27043455839157104, "learning_rate": 3.441024946871551e-05, "loss": 0.0391, "step": 12810 }, { "epoch": 2.6679585387106424, "grad_norm": 0.3282109200954437, "learning_rate": 3.440089796285198e-05, "loss": 0.0364, "step": 12820 }, { "epoch": 2.6700394064324824, "grad_norm": 0.30918750166893005, "learning_rate": 3.43915399141173e-05, "loss": 0.0377, "step": 12830 }, { "epoch": 2.6721202741543224, "grad_norm": 0.3233569860458374, "learning_rate": 3.438217532676318e-05, "loss": 0.0345, "step": 12840 }, { "epoch": 2.6742011418761624, "grad_norm": 0.29556748270988464, "learning_rate": 3.437280420504432e-05, "loss": 0.0358, "step": 12850 }, { "epoch": 2.6762820095980024, "grad_norm": 0.31145018339157104, "learning_rate": 3.436342655321837e-05, "loss": 0.037, "step": 12860 }, { "epoch": 2.6783628773198425, "grad_norm": 0.2875843942165375, "learning_rate": 3.435404237554594e-05, "loss": 0.0353, "step": 12870 }, { "epoch": 2.6804437450416825, "grad_norm": 0.32330521941185, "learning_rate": 3.4344651676290623e-05, "loss": 0.0337, "step": 12880 }, { "epoch": 2.6825246127635225, "grad_norm": 0.3145151436328888, "learning_rate": 3.433525445971897e-05, "loss": 0.0337, "step": 12890 }, { "epoch": 2.6846054804853625, "grad_norm": 0.3058949112892151, "learning_rate": 3.432585073010049e-05, "loss": 0.037, "step": 12900 }, { "epoch": 2.6866863482072025, "grad_norm": 0.32106122374534607, "learning_rate": 3.431644049170765e-05, "loss": 0.0384, "step": 12910 }, { "epoch": 2.6887672159290426, "grad_norm": 0.33399197459220886, "learning_rate": 3.430702374881588e-05, "loss": 0.0367, "step": 12920 }, { "epoch": 2.6908480836508826, "grad_norm": 0.3249880075454712, "learning_rate": 3.429760050570355e-05, "loss": 0.0352, "step": 12930 }, { "epoch": 2.6929289513727226, "grad_norm": 0.3036189675331116, "learning_rate": 3.4288170766652e-05, "loss": 0.0365, "step": 12940 }, { "epoch": 2.6950098190945626, "grad_norm": 0.3243730366230011, "learning_rate": 3.4278734535945536e-05, "loss": 0.0338, "step": 12950 }, { "epoch": 2.697090686816402, "grad_norm": 0.25106537342071533, "learning_rate": 3.426929181787136e-05, "loss": 0.0373, "step": 12960 }, { "epoch": 2.6991715545382426, "grad_norm": 0.3977755308151245, "learning_rate": 3.425984261671968e-05, "loss": 0.036, "step": 12970 }, { "epoch": 2.701252422260082, "grad_norm": 0.3329249620437622, "learning_rate": 3.425038693678362e-05, "loss": 0.0352, "step": 12980 }, { "epoch": 2.7033332899819227, "grad_norm": 0.26118066906929016, "learning_rate": 3.424092478235925e-05, "loss": 0.033, "step": 12990 }, { "epoch": 2.7054141577037623, "grad_norm": 0.30967018008232117, "learning_rate": 3.423145615774558e-05, "loss": 0.0308, "step": 13000 }, { "epoch": 2.7074950254256027, "grad_norm": 0.3503299355506897, "learning_rate": 3.422198106724456e-05, "loss": 0.0376, "step": 13010 }, { "epoch": 2.7095758931474423, "grad_norm": 0.38717955350875854, "learning_rate": 3.4212499515161083e-05, "loss": 0.0385, "step": 13020 }, { "epoch": 2.7116567608692828, "grad_norm": 0.28374406695365906, "learning_rate": 3.420301150580299e-05, "loss": 0.0361, "step": 13030 }, { "epoch": 2.7137376285911223, "grad_norm": 0.3024613559246063, "learning_rate": 3.4193517043481024e-05, "loss": 0.0314, "step": 13040 }, { "epoch": 2.7158184963129623, "grad_norm": 0.2909814417362213, "learning_rate": 3.418401613250889e-05, "loss": 0.0326, "step": 13050 }, { "epoch": 2.7178993640348024, "grad_norm": 0.305775910615921, "learning_rate": 3.4174508777203204e-05, "loss": 0.0357, "step": 13060 }, { "epoch": 2.7199802317566424, "grad_norm": 0.3072550594806671, "learning_rate": 3.416499498188351e-05, "loss": 0.0381, "step": 13070 }, { "epoch": 2.7220610994784824, "grad_norm": 0.2828276455402374, "learning_rate": 3.4155474750872296e-05, "loss": 0.0332, "step": 13080 }, { "epoch": 2.7241419672003224, "grad_norm": 0.26082268357276917, "learning_rate": 3.4145948088494966e-05, "loss": 0.0355, "step": 13090 }, { "epoch": 2.7262228349221624, "grad_norm": 0.31082993745803833, "learning_rate": 3.413641499907981e-05, "loss": 0.0377, "step": 13100 }, { "epoch": 2.7283037026440025, "grad_norm": 0.31411805748939514, "learning_rate": 3.412687548695811e-05, "loss": 0.0366, "step": 13110 }, { "epoch": 2.7303845703658425, "grad_norm": 0.3904567062854767, "learning_rate": 3.4117329556464006e-05, "loss": 0.0303, "step": 13120 }, { "epoch": 2.7324654380876825, "grad_norm": 0.23070377111434937, "learning_rate": 3.410777721193458e-05, "loss": 0.0345, "step": 13130 }, { "epoch": 2.7345463058095225, "grad_norm": 0.31763890385627747, "learning_rate": 3.409821845770981e-05, "loss": 0.0356, "step": 13140 }, { "epoch": 2.7366271735313625, "grad_norm": 0.3090628385543823, "learning_rate": 3.408865329813262e-05, "loss": 0.0364, "step": 13150 }, { "epoch": 2.7387080412532026, "grad_norm": 0.24769167602062225, "learning_rate": 3.407908173754881e-05, "loss": 0.0348, "step": 13160 }, { "epoch": 2.7407889089750426, "grad_norm": 0.29511526226997375, "learning_rate": 3.406950378030709e-05, "loss": 0.0333, "step": 13170 }, { "epoch": 2.7428697766968826, "grad_norm": 0.29970523715019226, "learning_rate": 3.405991943075911e-05, "loss": 0.0363, "step": 13180 }, { "epoch": 2.7449506444187226, "grad_norm": 0.3412422835826874, "learning_rate": 3.4050328693259386e-05, "loss": 0.0336, "step": 13190 }, { "epoch": 3.0013597852236673, "grad_norm": 0.24482403695583344, "learning_rate": 3.404073157216536e-05, "loss": 0.028, "step": 13200 }, { "epoch": 3.0041490882465745, "grad_norm": 0.26707157492637634, "learning_rate": 3.403112807183736e-05, "loss": 0.0208, "step": 13210 }, { "epoch": 3.0069383912694816, "grad_norm": 0.25593703985214233, "learning_rate": 3.402151819663863e-05, "loss": 0.0244, "step": 13220 }, { "epoch": 3.0097276942923887, "grad_norm": 0.24734297394752502, "learning_rate": 3.401190195093528e-05, "loss": 0.0224, "step": 13230 }, { "epoch": 3.012516997315296, "grad_norm": 0.26002949476242065, "learning_rate": 3.4002279339096346e-05, "loss": 0.0228, "step": 13240 }, { "epoch": 3.015306300338203, "grad_norm": 0.3302578926086426, "learning_rate": 3.399265036549374e-05, "loss": 0.0214, "step": 13250 }, { "epoch": 3.01809560336111, "grad_norm": 0.27390220761299133, "learning_rate": 3.398301503450228e-05, "loss": 0.0203, "step": 13260 }, { "epoch": 3.0208849063840173, "grad_norm": 0.3026038706302643, "learning_rate": 3.397337335049963e-05, "loss": 0.022, "step": 13270 }, { "epoch": 3.0236742094069244, "grad_norm": 0.24820047616958618, "learning_rate": 3.396372531786641e-05, "loss": 0.0229, "step": 13280 }, { "epoch": 3.0264635124298316, "grad_norm": 0.2632322609424591, "learning_rate": 3.395407094098605e-05, "loss": 0.0219, "step": 13290 }, { "epoch": 3.0292528154527387, "grad_norm": 0.2500011920928955, "learning_rate": 3.3944410224244926e-05, "loss": 0.0223, "step": 13300 }, { "epoch": 3.032042118475646, "grad_norm": 0.19556154310703278, "learning_rate": 3.393474317203223e-05, "loss": 0.0228, "step": 13310 }, { "epoch": 3.034831421498553, "grad_norm": 0.2560470998287201, "learning_rate": 3.392506978874011e-05, "loss": 0.0222, "step": 13320 }, { "epoch": 3.03762072452146, "grad_norm": 0.2904638350009918, "learning_rate": 3.391539007876353e-05, "loss": 0.0229, "step": 13330 }, { "epoch": 3.0404100275443673, "grad_norm": 0.2634361982345581, "learning_rate": 3.390570404650034e-05, "loss": 0.0222, "step": 13340 }, { "epoch": 3.0431993305672744, "grad_norm": 0.29922613501548767, "learning_rate": 3.389601169635128e-05, "loss": 0.023, "step": 13350 }, { "epoch": 3.0459886335901816, "grad_norm": 0.2858612537384033, "learning_rate": 3.3886313032719944e-05, "loss": 0.0215, "step": 13360 }, { "epoch": 3.0487779366130887, "grad_norm": 0.2993389070034027, "learning_rate": 3.387660806001281e-05, "loss": 0.0241, "step": 13370 }, { "epoch": 3.051567239635996, "grad_norm": 0.24013638496398926, "learning_rate": 3.386689678263921e-05, "loss": 0.0248, "step": 13380 }, { "epoch": 3.054356542658903, "grad_norm": 0.3608420193195343, "learning_rate": 3.385717920501133e-05, "loss": 0.0216, "step": 13390 }, { "epoch": 3.05714584568181, "grad_norm": 0.2794843912124634, "learning_rate": 3.384745533154425e-05, "loss": 0.0213, "step": 13400 }, { "epoch": 3.0599351487047173, "grad_norm": 0.28975316882133484, "learning_rate": 3.3837725166655885e-05, "loss": 0.0244, "step": 13410 }, { "epoch": 3.0627244517276244, "grad_norm": 0.26695263385772705, "learning_rate": 3.3827988714767014e-05, "loss": 0.0206, "step": 13420 }, { "epoch": 3.0655137547505316, "grad_norm": 0.26984328031539917, "learning_rate": 3.381824598030128e-05, "loss": 0.0239, "step": 13430 }, { "epoch": 3.0683030577734387, "grad_norm": 0.27714720368385315, "learning_rate": 3.380849696768516e-05, "loss": 0.0242, "step": 13440 }, { "epoch": 3.071092360796346, "grad_norm": 0.27931588888168335, "learning_rate": 3.379874168134802e-05, "loss": 0.0214, "step": 13450 }, { "epoch": 3.073881663819253, "grad_norm": 0.2815168499946594, "learning_rate": 3.3788980125722054e-05, "loss": 0.0204, "step": 13460 }, { "epoch": 3.07667096684216, "grad_norm": 0.26290053129196167, "learning_rate": 3.3779212305242285e-05, "loss": 0.0246, "step": 13470 }, { "epoch": 3.0794602698650673, "grad_norm": 0.2690380811691284, "learning_rate": 3.376943822434661e-05, "loss": 0.0243, "step": 13480 }, { "epoch": 3.0822495728879744, "grad_norm": 0.27221551537513733, "learning_rate": 3.3759657887475766e-05, "loss": 0.0212, "step": 13490 }, { "epoch": 3.0850388759108816, "grad_norm": 0.24697984755039215, "learning_rate": 3.374987129907333e-05, "loss": 0.0212, "step": 13500 }, { "epoch": 3.0878281789337887, "grad_norm": 0.28138527274131775, "learning_rate": 3.374007846358572e-05, "loss": 0.0228, "step": 13510 }, { "epoch": 3.0906174819566963, "grad_norm": 0.22284817695617676, "learning_rate": 3.373027938546218e-05, "loss": 0.0218, "step": 13520 }, { "epoch": 3.0934067849796034, "grad_norm": 0.2623077929019928, "learning_rate": 3.3720474069154805e-05, "loss": 0.0246, "step": 13530 }, { "epoch": 3.0961960880025106, "grad_norm": 0.2587921917438507, "learning_rate": 3.3710662519118526e-05, "loss": 0.0236, "step": 13540 }, { "epoch": 3.0989853910254177, "grad_norm": 0.2497584968805313, "learning_rate": 3.37008447398111e-05, "loss": 0.0257, "step": 13550 }, { "epoch": 3.101774694048325, "grad_norm": 0.2643270194530487, "learning_rate": 3.369102073569312e-05, "loss": 0.0235, "step": 13560 }, { "epoch": 3.104563997071232, "grad_norm": 0.27453988790512085, "learning_rate": 3.368119051122798e-05, "loss": 0.0234, "step": 13570 }, { "epoch": 3.107353300094139, "grad_norm": 0.25669676065444946, "learning_rate": 3.367135407088193e-05, "loss": 0.0232, "step": 13580 }, { "epoch": 3.1101426031170463, "grad_norm": 0.2662254273891449, "learning_rate": 3.366151141912405e-05, "loss": 0.0262, "step": 13590 }, { "epoch": 3.1129319061399534, "grad_norm": 0.31864455342292786, "learning_rate": 3.365166256042622e-05, "loss": 0.0246, "step": 13600 }, { "epoch": 3.1157212091628605, "grad_norm": 0.2810993492603302, "learning_rate": 3.3641807499263147e-05, "loss": 0.0218, "step": 13610 }, { "epoch": 3.1185105121857677, "grad_norm": 0.2380981743335724, "learning_rate": 3.363194624011235e-05, "loss": 0.0225, "step": 13620 }, { "epoch": 3.121299815208675, "grad_norm": 0.2637837529182434, "learning_rate": 3.3622078787454186e-05, "loss": 0.0213, "step": 13630 }, { "epoch": 3.124089118231582, "grad_norm": 0.29635053873062134, "learning_rate": 3.36122051457718e-05, "loss": 0.0225, "step": 13640 }, { "epoch": 3.126878421254489, "grad_norm": 0.23687735199928284, "learning_rate": 3.360232531955116e-05, "loss": 0.0231, "step": 13650 }, { "epoch": 3.1296677242773963, "grad_norm": 0.2480635941028595, "learning_rate": 3.359243931328105e-05, "loss": 0.0226, "step": 13660 }, { "epoch": 3.1324570273003034, "grad_norm": 0.25591012835502625, "learning_rate": 3.358254713145304e-05, "loss": 0.0231, "step": 13670 }, { "epoch": 3.1352463303232105, "grad_norm": 0.2124357670545578, "learning_rate": 3.357264877856155e-05, "loss": 0.0212, "step": 13680 }, { "epoch": 3.1380356333461177, "grad_norm": 0.26496636867523193, "learning_rate": 3.356274425910375e-05, "loss": 0.0218, "step": 13690 }, { "epoch": 3.140824936369025, "grad_norm": 0.2616378962993622, "learning_rate": 3.355283357757965e-05, "loss": 0.0231, "step": 13700 }, { "epoch": 3.143614239391932, "grad_norm": 0.2625362277030945, "learning_rate": 3.3542916738492035e-05, "loss": 0.0228, "step": 13710 }, { "epoch": 3.146403542414839, "grad_norm": 0.30131980776786804, "learning_rate": 3.3532993746346507e-05, "loss": 0.0243, "step": 13720 }, { "epoch": 3.1491928454377462, "grad_norm": 0.27038127183914185, "learning_rate": 3.3523064605651453e-05, "loss": 0.0231, "step": 13730 }, { "epoch": 3.1519821484606534, "grad_norm": 0.22497743368148804, "learning_rate": 3.351312932091806e-05, "loss": 0.0234, "step": 13740 }, { "epoch": 3.1547714514835605, "grad_norm": 0.24452827870845795, "learning_rate": 3.35031878966603e-05, "loss": 0.0228, "step": 13750 }, { "epoch": 3.1575607545064677, "grad_norm": 0.2604657709598541, "learning_rate": 3.3493240337394937e-05, "loss": 0.0215, "step": 13760 }, { "epoch": 3.160350057529375, "grad_norm": 0.2851611077785492, "learning_rate": 3.348328664764152e-05, "loss": 0.0229, "step": 13770 }, { "epoch": 3.163139360552282, "grad_norm": 0.2635997533798218, "learning_rate": 3.3473326831922396e-05, "loss": 0.0233, "step": 13780 }, { "epoch": 3.165928663575189, "grad_norm": 0.3127332627773285, "learning_rate": 3.3463360894762664e-05, "loss": 0.0226, "step": 13790 }, { "epoch": 3.1687179665980962, "grad_norm": 0.2836700677871704, "learning_rate": 3.3453388840690244e-05, "loss": 0.0238, "step": 13800 }, { "epoch": 3.1715072696210034, "grad_norm": 0.2467455267906189, "learning_rate": 3.344341067423581e-05, "loss": 0.0237, "step": 13810 }, { "epoch": 3.1742965726439105, "grad_norm": 0.25613921880722046, "learning_rate": 3.343342639993282e-05, "loss": 0.0217, "step": 13820 }, { "epoch": 3.1770858756668177, "grad_norm": 0.26561596989631653, "learning_rate": 3.342343602231752e-05, "loss": 0.0219, "step": 13830 }, { "epoch": 3.179875178689725, "grad_norm": 0.2683209776878357, "learning_rate": 3.3413439545928877e-05, "loss": 0.0209, "step": 13840 }, { "epoch": 3.182664481712632, "grad_norm": 0.2540314197540283, "learning_rate": 3.340343697530871e-05, "loss": 0.022, "step": 13850 }, { "epoch": 3.185453784735539, "grad_norm": 0.26656922698020935, "learning_rate": 3.339342831500154e-05, "loss": 0.0253, "step": 13860 }, { "epoch": 3.1882430877584462, "grad_norm": 0.23745234310626984, "learning_rate": 3.338341356955469e-05, "loss": 0.0247, "step": 13870 }, { "epoch": 3.1910323907813534, "grad_norm": 0.2430545687675476, "learning_rate": 3.337339274351822e-05, "loss": 0.0241, "step": 13880 }, { "epoch": 3.1938216938042605, "grad_norm": 0.2104274034500122, "learning_rate": 3.336336584144499e-05, "loss": 0.0212, "step": 13890 }, { "epoch": 3.1966109968271677, "grad_norm": 0.2963005602359772, "learning_rate": 3.335333286789059e-05, "loss": 0.0214, "step": 13900 }, { "epoch": 3.199400299850075, "grad_norm": 0.3048710525035858, "learning_rate": 3.334329382741338e-05, "loss": 0.0229, "step": 13910 }, { "epoch": 3.202189602872982, "grad_norm": 0.2970116436481476, "learning_rate": 3.3333248724574466e-05, "loss": 0.0307, "step": 13920 }, { "epoch": 3.204978905895889, "grad_norm": 0.31085214018821716, "learning_rate": 3.332319756393773e-05, "loss": 0.0222, "step": 13930 }, { "epoch": 3.207768208918796, "grad_norm": 0.27891308069229126, "learning_rate": 3.3313140350069793e-05, "loss": 0.0218, "step": 13940 }, { "epoch": 3.210557511941704, "grad_norm": 0.2988732159137726, "learning_rate": 3.330307708754002e-05, "loss": 0.0227, "step": 13950 }, { "epoch": 3.213346814964611, "grad_norm": 0.28284892439842224, "learning_rate": 3.3293007780920535e-05, "loss": 0.0222, "step": 13960 }, { "epoch": 3.216136117987518, "grad_norm": 0.28325435519218445, "learning_rate": 3.328293243478619e-05, "loss": 0.0241, "step": 13970 }, { "epoch": 3.2189254210104252, "grad_norm": 0.2990381717681885, "learning_rate": 3.327285105371461e-05, "loss": 0.023, "step": 13980 }, { "epoch": 3.2217147240333324, "grad_norm": 0.27463462948799133, "learning_rate": 3.326276364228615e-05, "loss": 0.0235, "step": 13990 }, { "epoch": 3.2245040270562395, "grad_norm": 0.3132324814796448, "learning_rate": 3.3252670205083894e-05, "loss": 0.0219, "step": 14000 }, { "epoch": 3.2272933300791466, "grad_norm": 0.2535857856273651, "learning_rate": 3.3242570746693665e-05, "loss": 0.0226, "step": 14010 }, { "epoch": 3.230082633102054, "grad_norm": 0.2753618061542511, "learning_rate": 3.3232465271704034e-05, "loss": 0.0247, "step": 14020 }, { "epoch": 3.232871936124961, "grad_norm": 0.29607653617858887, "learning_rate": 3.322235378470631e-05, "loss": 0.0217, "step": 14030 }, { "epoch": 3.235661239147868, "grad_norm": 0.26469945907592773, "learning_rate": 3.321223629029451e-05, "loss": 0.0222, "step": 14040 }, { "epoch": 3.238450542170775, "grad_norm": 0.24293407797813416, "learning_rate": 3.320211279306539e-05, "loss": 0.0236, "step": 14050 }, { "epoch": 3.2412398451936824, "grad_norm": 0.2802630364894867, "learning_rate": 3.3191983297618447e-05, "loss": 0.0209, "step": 14060 }, { "epoch": 3.2440291482165895, "grad_norm": 0.2546704411506653, "learning_rate": 3.3181847808555896e-05, "loss": 0.021, "step": 14070 }, { "epoch": 3.2468184512394966, "grad_norm": 0.28556132316589355, "learning_rate": 3.317170633048265e-05, "loss": 0.024, "step": 14080 }, { "epoch": 3.249607754262404, "grad_norm": 0.26946327090263367, "learning_rate": 3.3161558868006394e-05, "loss": 0.0208, "step": 14090 }, { "epoch": 3.252397057285311, "grad_norm": 0.24964095652103424, "learning_rate": 3.3151405425737485e-05, "loss": 0.0219, "step": 14100 }, { "epoch": 3.255186360308218, "grad_norm": 0.2816349267959595, "learning_rate": 3.3141246008289017e-05, "loss": 0.0234, "step": 14110 }, { "epoch": 3.257975663331125, "grad_norm": 0.2552763819694519, "learning_rate": 3.31310806202768e-05, "loss": 0.0213, "step": 14120 }, { "epoch": 3.2607649663540323, "grad_norm": 0.3010943531990051, "learning_rate": 3.312090926631935e-05, "loss": 0.0235, "step": 14130 }, { "epoch": 3.2635542693769395, "grad_norm": 0.26190343499183655, "learning_rate": 3.31107319510379e-05, "loss": 0.025, "step": 14140 }, { "epoch": 3.2663435723998466, "grad_norm": 0.2976658344268799, "learning_rate": 3.3100548679056384e-05, "loss": 0.0212, "step": 14150 }, { "epoch": 3.2691328754227538, "grad_norm": 0.2628508508205414, "learning_rate": 3.3090359455001446e-05, "loss": 0.0227, "step": 14160 }, { "epoch": 3.271922178445661, "grad_norm": 0.30362480878829956, "learning_rate": 3.3080164283502446e-05, "loss": 0.0239, "step": 14170 }, { "epoch": 3.274711481468568, "grad_norm": 0.2450343817472458, "learning_rate": 3.306996316919142e-05, "loss": 0.0228, "step": 14180 }, { "epoch": 3.277500784491475, "grad_norm": 0.2693016827106476, "learning_rate": 3.305975611670313e-05, "loss": 0.0228, "step": 14190 }, { "epoch": 3.2802900875143823, "grad_norm": 0.24091771245002747, "learning_rate": 3.304954313067502e-05, "loss": 0.023, "step": 14200 }, { "epoch": 3.2830793905372895, "grad_norm": 0.311689555644989, "learning_rate": 3.303932421574723e-05, "loss": 0.0225, "step": 14210 }, { "epoch": 3.2858686935601966, "grad_norm": 0.24628707766532898, "learning_rate": 3.3029099376562614e-05, "loss": 0.0251, "step": 14220 }, { "epoch": 3.2886579965831038, "grad_norm": 0.2675122320652008, "learning_rate": 3.3018868617766684e-05, "loss": 0.0233, "step": 14230 }, { "epoch": 3.291447299606011, "grad_norm": 0.2579371929168701, "learning_rate": 3.3008631944007675e-05, "loss": 0.0219, "step": 14240 }, { "epoch": 3.294236602628918, "grad_norm": 0.2687638998031616, "learning_rate": 3.299838935993648e-05, "loss": 0.0229, "step": 14250 }, { "epoch": 3.297025905651825, "grad_norm": 0.2942163050174713, "learning_rate": 3.2988140870206704e-05, "loss": 0.0239, "step": 14260 }, { "epoch": 3.2998152086747323, "grad_norm": 0.28006604313850403, "learning_rate": 3.297788647947462e-05, "loss": 0.0228, "step": 14270 }, { "epoch": 3.3026045116976395, "grad_norm": 0.27043041586875916, "learning_rate": 3.2967626192399174e-05, "loss": 0.023, "step": 14280 }, { "epoch": 3.3053938147205466, "grad_norm": 0.29186487197875977, "learning_rate": 3.295736001364201e-05, "loss": 0.0253, "step": 14290 }, { "epoch": 3.3081831177434537, "grad_norm": 0.26711907982826233, "learning_rate": 3.2947087947867435e-05, "loss": 0.0251, "step": 14300 }, { "epoch": 3.310972420766361, "grad_norm": 0.25996002554893494, "learning_rate": 3.293680999974245e-05, "loss": 0.0227, "step": 14310 }, { "epoch": 3.313761723789268, "grad_norm": 0.28949686884880066, "learning_rate": 3.292652617393669e-05, "loss": 0.0231, "step": 14320 }, { "epoch": 3.316551026812175, "grad_norm": 0.2576659619808197, "learning_rate": 3.291623647512252e-05, "loss": 0.0269, "step": 14330 }, { "epoch": 3.3193403298350823, "grad_norm": 0.24940505623817444, "learning_rate": 3.29059409079749e-05, "loss": 0.0266, "step": 14340 }, { "epoch": 3.3221296328579895, "grad_norm": 0.2991146743297577, "learning_rate": 3.2895639477171515e-05, "loss": 0.0233, "step": 14350 }, { "epoch": 3.3249189358808966, "grad_norm": 0.308015376329422, "learning_rate": 3.288533218739269e-05, "loss": 0.0233, "step": 14360 }, { "epoch": 3.3277082389038037, "grad_norm": 0.26003509759902954, "learning_rate": 3.2875019043321424e-05, "loss": 0.0247, "step": 14370 }, { "epoch": 3.330497541926711, "grad_norm": 0.2725379765033722, "learning_rate": 3.286470004964335e-05, "loss": 0.0239, "step": 14380 }, { "epoch": 3.333286844949618, "grad_norm": 0.2688729763031006, "learning_rate": 3.28543752110468e-05, "loss": 0.0234, "step": 14390 }, { "epoch": 3.336076147972525, "grad_norm": 0.30011212825775146, "learning_rate": 3.284404453222271e-05, "loss": 0.0239, "step": 14400 }, { "epoch": 3.3388654509954323, "grad_norm": 0.2591323256492615, "learning_rate": 3.283370801786472e-05, "loss": 0.0237, "step": 14410 }, { "epoch": 3.3416547540183394, "grad_norm": 0.25189006328582764, "learning_rate": 3.282336567266908e-05, "loss": 0.0261, "step": 14420 }, { "epoch": 3.3444440570412466, "grad_norm": 0.2549824118614197, "learning_rate": 3.281301750133472e-05, "loss": 0.0243, "step": 14430 }, { "epoch": 3.3472333600641537, "grad_norm": 0.26575151085853577, "learning_rate": 3.28026635085632e-05, "loss": 0.0227, "step": 14440 }, { "epoch": 3.3500226630870613, "grad_norm": 0.2813517451286316, "learning_rate": 3.279230369905873e-05, "loss": 0.0239, "step": 14450 }, { "epoch": 3.3528119661099685, "grad_norm": 0.3055305778980255, "learning_rate": 3.278193807752816e-05, "loss": 0.0231, "step": 14460 }, { "epoch": 3.3556012691328756, "grad_norm": 0.27121463418006897, "learning_rate": 3.2771566648680996e-05, "loss": 0.0281, "step": 14470 }, { "epoch": 3.3583905721557827, "grad_norm": 0.32952556014060974, "learning_rate": 3.276118941722935e-05, "loss": 0.0265, "step": 14480 }, { "epoch": 3.36117987517869, "grad_norm": 0.2479351907968521, "learning_rate": 3.275080638788799e-05, "loss": 0.0235, "step": 14490 }, { "epoch": 3.363969178201597, "grad_norm": 0.2834234833717346, "learning_rate": 3.274041756537434e-05, "loss": 0.0256, "step": 14500 }, { "epoch": 3.366758481224504, "grad_norm": 0.24205411970615387, "learning_rate": 3.273002295440842e-05, "loss": 0.0235, "step": 14510 }, { "epoch": 3.3695477842474113, "grad_norm": 0.31610462069511414, "learning_rate": 3.271962255971289e-05, "loss": 0.0251, "step": 14520 }, { "epoch": 3.3723370872703184, "grad_norm": 0.26715707778930664, "learning_rate": 3.270921638601305e-05, "loss": 0.0232, "step": 14530 }, { "epoch": 3.3751263902932256, "grad_norm": 0.2567874789237976, "learning_rate": 3.2698804438036804e-05, "loss": 0.0239, "step": 14540 }, { "epoch": 3.3779156933161327, "grad_norm": 0.2602570354938507, "learning_rate": 3.2688386720514704e-05, "loss": 0.0239, "step": 14550 }, { "epoch": 3.38070499633904, "grad_norm": 0.30415239930152893, "learning_rate": 3.267796323817991e-05, "loss": 0.0234, "step": 14560 }, { "epoch": 3.383494299361947, "grad_norm": 0.24899551272392273, "learning_rate": 3.2667533995768215e-05, "loss": 0.0243, "step": 14570 }, { "epoch": 3.386283602384854, "grad_norm": 0.2832050323486328, "learning_rate": 3.265709899801799e-05, "loss": 0.025, "step": 14580 }, { "epoch": 3.3890729054077613, "grad_norm": 0.2666321098804474, "learning_rate": 3.264665824967027e-05, "loss": 0.0238, "step": 14590 }, { "epoch": 3.3918622084306684, "grad_norm": 0.28956279158592224, "learning_rate": 3.263621175546867e-05, "loss": 0.0256, "step": 14600 }, { "epoch": 3.3946515114535756, "grad_norm": 0.2588934898376465, "learning_rate": 3.262575952015944e-05, "loss": 0.0247, "step": 14610 }, { "epoch": 3.3974408144764827, "grad_norm": 0.2793014645576477, "learning_rate": 3.26153015484914e-05, "loss": 0.025, "step": 14620 }, { "epoch": 3.40023011749939, "grad_norm": 0.3003716766834259, "learning_rate": 3.2604837845216025e-05, "loss": 0.025, "step": 14630 }, { "epoch": 3.403019420522297, "grad_norm": 0.29944756627082825, "learning_rate": 3.259436841508736e-05, "loss": 0.0232, "step": 14640 }, { "epoch": 3.405808723545204, "grad_norm": 0.28853851556777954, "learning_rate": 3.2583893262862065e-05, "loss": 0.0243, "step": 14650 }, { "epoch": 3.4085980265681113, "grad_norm": 0.2709476351737976, "learning_rate": 3.2573412393299395e-05, "loss": 0.0249, "step": 14660 }, { "epoch": 3.4113873295910184, "grad_norm": 0.27466580271720886, "learning_rate": 3.25629258111612e-05, "loss": 0.0233, "step": 14670 }, { "epoch": 3.4141766326139256, "grad_norm": 0.2762468457221985, "learning_rate": 3.255243352121194e-05, "loss": 0.0219, "step": 14680 }, { "epoch": 3.4169659356368327, "grad_norm": 0.3155626654624939, "learning_rate": 3.2541935528218647e-05, "loss": 0.0243, "step": 14690 }, { "epoch": 3.41975523865974, "grad_norm": 0.27512988448143005, "learning_rate": 3.253143183695096e-05, "loss": 0.0249, "step": 14700 }, { "epoch": 3.422544541682647, "grad_norm": 0.27925392985343933, "learning_rate": 3.25209224521811e-05, "loss": 0.0223, "step": 14710 }, { "epoch": 3.425333844705554, "grad_norm": 0.25881728529930115, "learning_rate": 3.251040737868389e-05, "loss": 0.0231, "step": 14720 }, { "epoch": 3.4281231477284613, "grad_norm": 0.35452213883399963, "learning_rate": 3.249988662123671e-05, "loss": 0.0262, "step": 14730 }, { "epoch": 3.4309124507513684, "grad_norm": 0.26180315017700195, "learning_rate": 3.248936018461954e-05, "loss": 0.0232, "step": 14740 }, { "epoch": 3.4337017537742756, "grad_norm": 0.2888478636741638, "learning_rate": 3.247882807361494e-05, "loss": 0.0234, "step": 14750 }, { "epoch": 3.4364910567971827, "grad_norm": 0.28038153052330017, "learning_rate": 3.246829029300804e-05, "loss": 0.0229, "step": 14760 }, { "epoch": 3.43928035982009, "grad_norm": 0.2777486741542816, "learning_rate": 3.245774684758655e-05, "loss": 0.0228, "step": 14770 }, { "epoch": 3.442069662842997, "grad_norm": 0.28087764978408813, "learning_rate": 3.244719774214077e-05, "loss": 0.0256, "step": 14780 }, { "epoch": 3.444858965865904, "grad_norm": 0.27317070960998535, "learning_rate": 3.243664298146354e-05, "loss": 0.024, "step": 14790 }, { "epoch": 3.4476482688888113, "grad_norm": 0.23199228942394257, "learning_rate": 3.2426082570350294e-05, "loss": 0.023, "step": 14800 }, { "epoch": 3.4504375719117184, "grad_norm": 0.2767881453037262, "learning_rate": 3.241551651359902e-05, "loss": 0.0258, "step": 14810 }, { "epoch": 3.4532268749346255, "grad_norm": 0.26215067505836487, "learning_rate": 3.240494481601029e-05, "loss": 0.0207, "step": 14820 }, { "epoch": 3.4560161779575327, "grad_norm": 0.24118299782276154, "learning_rate": 3.239436748238721e-05, "loss": 0.024, "step": 14830 }, { "epoch": 3.45880548098044, "grad_norm": 0.2798198163509369, "learning_rate": 3.238378451753546e-05, "loss": 0.0238, "step": 14840 }, { "epoch": 3.461594784003347, "grad_norm": 0.30132144689559937, "learning_rate": 3.237319592626329e-05, "loss": 0.0239, "step": 14850 }, { "epoch": 3.4643840870262546, "grad_norm": 0.2743625342845917, "learning_rate": 3.236260171338149e-05, "loss": 0.0265, "step": 14860 }, { "epoch": 3.4671733900491617, "grad_norm": 0.276888370513916, "learning_rate": 3.2352001883703417e-05, "loss": 0.0242, "step": 14870 }, { "epoch": 3.469962693072069, "grad_norm": 0.27659326791763306, "learning_rate": 3.234139644204497e-05, "loss": 0.0226, "step": 14880 }, { "epoch": 3.472751996094976, "grad_norm": 0.2512148320674896, "learning_rate": 3.2330785393224595e-05, "loss": 0.0248, "step": 14890 }, { "epoch": 3.475541299117883, "grad_norm": 0.2746523916721344, "learning_rate": 3.23201687420633e-05, "loss": 0.0244, "step": 14900 }, { "epoch": 3.4783306021407903, "grad_norm": 0.2753903865814209, "learning_rate": 3.230954649338463e-05, "loss": 0.023, "step": 14910 }, { "epoch": 3.4811199051636974, "grad_norm": 0.30259764194488525, "learning_rate": 3.229891865201466e-05, "loss": 0.0239, "step": 14920 }, { "epoch": 3.4839092081866045, "grad_norm": 0.2955324649810791, "learning_rate": 3.228828522278204e-05, "loss": 0.0249, "step": 14930 }, { "epoch": 3.4866985112095117, "grad_norm": 0.2676406800746918, "learning_rate": 3.227764621051793e-05, "loss": 0.0237, "step": 14940 }, { "epoch": 3.489487814232419, "grad_norm": 0.2758808434009552, "learning_rate": 3.226700162005603e-05, "loss": 0.0232, "step": 14950 }, { "epoch": 3.492277117255326, "grad_norm": 0.2755467891693115, "learning_rate": 3.2256351456232584e-05, "loss": 0.0226, "step": 14960 }, { "epoch": 3.495066420278233, "grad_norm": 0.24590110778808594, "learning_rate": 3.224569572388635e-05, "loss": 0.0232, "step": 14970 }, { "epoch": 3.4978557233011403, "grad_norm": 0.2946254312992096, "learning_rate": 3.2235034427858654e-05, "loss": 0.0235, "step": 14980 }, { "epoch": 3.5006450263240474, "grad_norm": 0.2832624912261963, "learning_rate": 3.2224367572993305e-05, "loss": 0.022, "step": 14990 }, { "epoch": 3.5034343293469545, "grad_norm": 0.2314351499080658, "learning_rate": 3.221369516413666e-05, "loss": 0.0234, "step": 15000 }, { "epoch": 3.5062236323698617, "grad_norm": 0.3111531436443329, "learning_rate": 3.22030172061376e-05, "loss": 0.0235, "step": 15010 }, { "epoch": 3.509012935392769, "grad_norm": 0.2579207122325897, "learning_rate": 3.219233370384753e-05, "loss": 0.0244, "step": 15020 }, { "epoch": 3.511802238415676, "grad_norm": 0.24636487662792206, "learning_rate": 3.218164466212035e-05, "loss": 0.0259, "step": 15030 }, { "epoch": 3.514591541438583, "grad_norm": 0.29913580417633057, "learning_rate": 3.217095008581252e-05, "loss": 0.0226, "step": 15040 }, { "epoch": 3.5173808444614902, "grad_norm": 0.2680075466632843, "learning_rate": 3.216024997978297e-05, "loss": 0.0235, "step": 15050 }, { "epoch": 3.5201701474843974, "grad_norm": 0.2441624402999878, "learning_rate": 3.214954434889317e-05, "loss": 0.0251, "step": 15060 }, { "epoch": 3.5229594505073045, "grad_norm": 0.25790488719940186, "learning_rate": 3.213883319800709e-05, "loss": 0.0227, "step": 15070 }, { "epoch": 3.5257487535302117, "grad_norm": 0.23142455518245697, "learning_rate": 3.212811653199121e-05, "loss": 0.025, "step": 15080 }, { "epoch": 3.528538056553119, "grad_norm": 0.2899198532104492, "learning_rate": 3.2117394355714524e-05, "loss": 0.0222, "step": 15090 }, { "epoch": 3.531327359576026, "grad_norm": 0.27671676874160767, "learning_rate": 3.21066666740485e-05, "loss": 0.0261, "step": 15100 }, { "epoch": 3.534116662598933, "grad_norm": 0.28205421566963196, "learning_rate": 3.209593349186715e-05, "loss": 0.0225, "step": 15110 }, { "epoch": 3.5369059656218402, "grad_norm": 0.26121020317077637, "learning_rate": 3.208519481404695e-05, "loss": 0.024, "step": 15120 }, { "epoch": 3.5396952686447474, "grad_norm": 0.21435414254665375, "learning_rate": 3.20744506454669e-05, "loss": 0.0215, "step": 15130 }, { "epoch": 3.5424845716676545, "grad_norm": 0.298065721988678, "learning_rate": 3.206370099100846e-05, "loss": 0.0231, "step": 15140 }, { "epoch": 3.5452738746905617, "grad_norm": 0.277558833360672, "learning_rate": 3.205294585555562e-05, "loss": 0.0259, "step": 15150 }, { "epoch": 3.548063177713469, "grad_norm": 0.25043636560440063, "learning_rate": 3.204218524399484e-05, "loss": 0.0219, "step": 15160 }, { "epoch": 3.550852480736376, "grad_norm": 0.25726521015167236, "learning_rate": 3.2031419161215074e-05, "loss": 0.022, "step": 15170 }, { "epoch": 3.553641783759283, "grad_norm": 0.315782368183136, "learning_rate": 3.2020647612107754e-05, "loss": 0.0254, "step": 15180 }, { "epoch": 3.55643108678219, "grad_norm": 0.27541130781173706, "learning_rate": 3.20098706015668e-05, "loss": 0.0238, "step": 15190 }, { "epoch": 3.5592203898050974, "grad_norm": 0.25939613580703735, "learning_rate": 3.199908813448862e-05, "loss": 0.0238, "step": 15200 }, { "epoch": 3.5620096928280045, "grad_norm": 0.2744806110858917, "learning_rate": 3.19883002157721e-05, "loss": 0.024, "step": 15210 }, { "epoch": 3.5647989958509116, "grad_norm": 0.2605966329574585, "learning_rate": 3.1977506850318595e-05, "loss": 0.0268, "step": 15220 }, { "epoch": 3.567588298873819, "grad_norm": 0.27557557821273804, "learning_rate": 3.1966708043031925e-05, "loss": 0.0231, "step": 15230 }, { "epoch": 3.570377601896726, "grad_norm": 0.2341373711824417, "learning_rate": 3.195590379881841e-05, "loss": 0.0251, "step": 15240 }, { "epoch": 3.573166904919633, "grad_norm": 0.29587018489837646, "learning_rate": 3.194509412258682e-05, "loss": 0.0222, "step": 15250 }, { "epoch": 3.57595620794254, "grad_norm": 0.2542768120765686, "learning_rate": 3.1934279019248396e-05, "loss": 0.0247, "step": 15260 }, { "epoch": 3.5787455109654474, "grad_norm": 0.28459104895591736, "learning_rate": 3.1923458493716855e-05, "loss": 0.0232, "step": 15270 }, { "epoch": 3.5815348139883545, "grad_norm": 0.2627626657485962, "learning_rate": 3.191263255090837e-05, "loss": 0.0242, "step": 15280 }, { "epoch": 3.5843241170112616, "grad_norm": 0.2537837028503418, "learning_rate": 3.190180119574157e-05, "loss": 0.0247, "step": 15290 }, { "epoch": 3.5871134200341688, "grad_norm": 0.2855553925037384, "learning_rate": 3.189096443313755e-05, "loss": 0.0221, "step": 15300 }, { "epoch": 3.589902723057076, "grad_norm": 0.28314700722694397, "learning_rate": 3.188012226801985e-05, "loss": 0.0231, "step": 15310 }, { "epoch": 3.592692026079983, "grad_norm": 0.2773413062095642, "learning_rate": 3.1869274705314495e-05, "loss": 0.0235, "step": 15320 }, { "epoch": 3.59548132910289, "grad_norm": 0.2644776701927185, "learning_rate": 3.185842174994993e-05, "loss": 0.0239, "step": 15330 }, { "epoch": 3.5982706321257973, "grad_norm": 0.2433871030807495, "learning_rate": 3.184756340685706e-05, "loss": 0.0227, "step": 15340 }, { "epoch": 3.6010599351487045, "grad_norm": 0.26916709542274475, "learning_rate": 3.183669968096925e-05, "loss": 0.0221, "step": 15350 }, { "epoch": 3.6038492381716116, "grad_norm": 0.27984699606895447, "learning_rate": 3.182583057722228e-05, "loss": 0.0256, "step": 15360 }, { "epoch": 3.6066385411945188, "grad_norm": 0.283122181892395, "learning_rate": 3.181495610055442e-05, "loss": 0.024, "step": 15370 }, { "epoch": 3.609427844217426, "grad_norm": 0.27689045667648315, "learning_rate": 3.1804076255906336e-05, "loss": 0.022, "step": 15380 }, { "epoch": 3.612217147240333, "grad_norm": 0.27164074778556824, "learning_rate": 3.179319104822116e-05, "loss": 0.0227, "step": 15390 }, { "epoch": 3.6150064502632406, "grad_norm": 0.3222748935222626, "learning_rate": 3.178230048244445e-05, "loss": 0.0218, "step": 15400 }, { "epoch": 3.6177957532861478, "grad_norm": 0.27783316373825073, "learning_rate": 3.17714045635242e-05, "loss": 0.0233, "step": 15410 }, { "epoch": 3.620585056309055, "grad_norm": 0.26954102516174316, "learning_rate": 3.1760503296410835e-05, "loss": 0.0239, "step": 15420 }, { "epoch": 3.623374359331962, "grad_norm": 0.25336700677871704, "learning_rate": 3.174959668605722e-05, "loss": 0.0226, "step": 15430 }, { "epoch": 3.626163662354869, "grad_norm": 0.30870893597602844, "learning_rate": 3.1738684737418635e-05, "loss": 0.0239, "step": 15440 }, { "epoch": 3.6289529653777763, "grad_norm": 0.2561623454093933, "learning_rate": 3.172776745545279e-05, "loss": 0.0261, "step": 15450 }, { "epoch": 3.6317422684006835, "grad_norm": 0.2751530110836029, "learning_rate": 3.171684484511981e-05, "loss": 0.0227, "step": 15460 }, { "epoch": 3.6345315714235906, "grad_norm": 0.26530447602272034, "learning_rate": 3.1705916911382265e-05, "loss": 0.0242, "step": 15470 }, { "epoch": 3.6373208744464978, "grad_norm": 0.27005353569984436, "learning_rate": 3.169498365920512e-05, "loss": 0.021, "step": 15480 }, { "epoch": 3.640110177469405, "grad_norm": 0.2548362612724304, "learning_rate": 3.168404509355576e-05, "loss": 0.0225, "step": 15490 }, { "epoch": 3.642899480492312, "grad_norm": 0.25435373187065125, "learning_rate": 3.167310121940398e-05, "loss": 0.022, "step": 15500 }, { "epoch": 3.645688783515219, "grad_norm": 0.27601340413093567, "learning_rate": 3.1662152041722016e-05, "loss": 0.0258, "step": 15510 }, { "epoch": 3.6484780865381263, "grad_norm": 0.22557896375656128, "learning_rate": 3.1651197565484486e-05, "loss": 0.0234, "step": 15520 }, { "epoch": 3.6512673895610335, "grad_norm": 0.3238334655761719, "learning_rate": 3.164023779566841e-05, "loss": 0.0243, "step": 15530 }, { "epoch": 3.6540566925839406, "grad_norm": 0.2556135952472687, "learning_rate": 3.162927273725323e-05, "loss": 0.0237, "step": 15540 }, { "epoch": 3.6568459956068478, "grad_norm": 0.2570909857749939, "learning_rate": 3.1618302395220797e-05, "loss": 0.0239, "step": 15550 }, { "epoch": 3.659635298629755, "grad_norm": 0.2553325295448303, "learning_rate": 3.160732677455534e-05, "loss": 0.0236, "step": 15560 }, { "epoch": 3.662424601652662, "grad_norm": 0.27167630195617676, "learning_rate": 3.1596345880243494e-05, "loss": 0.0237, "step": 15570 }, { "epoch": 3.665213904675569, "grad_norm": 0.23852258920669556, "learning_rate": 3.15853597172743e-05, "loss": 0.0257, "step": 15580 }, { "epoch": 3.6680032076984763, "grad_norm": 0.2575397491455078, "learning_rate": 3.157436829063919e-05, "loss": 0.0227, "step": 15590 }, { "epoch": 3.6707925107213835, "grad_norm": 0.2545246183872223, "learning_rate": 3.1563371605331985e-05, "loss": 0.0248, "step": 15600 }, { "epoch": 3.6735818137442906, "grad_norm": 0.27988600730895996, "learning_rate": 3.155236966634888e-05, "loss": 0.0251, "step": 15610 }, { "epoch": 3.6763711167671977, "grad_norm": 0.24943014979362488, "learning_rate": 3.154136247868848e-05, "loss": 0.0219, "step": 15620 }, { "epoch": 3.679160419790105, "grad_norm": 0.2614193558692932, "learning_rate": 3.153035004735176e-05, "loss": 0.0258, "step": 15630 }, { "epoch": 3.681949722813012, "grad_norm": 0.36057183146476746, "learning_rate": 3.1519332377342095e-05, "loss": 0.0235, "step": 15640 }, { "epoch": 3.684739025835919, "grad_norm": 0.2620885372161865, "learning_rate": 3.150830947366522e-05, "loss": 0.0228, "step": 15650 }, { "epoch": 3.6875283288588263, "grad_norm": 0.24786575138568878, "learning_rate": 3.149728134132925e-05, "loss": 0.0242, "step": 15660 }, { "epoch": 3.6903176318817335, "grad_norm": 0.2653746008872986, "learning_rate": 3.148624798534469e-05, "loss": 0.0247, "step": 15670 }, { "epoch": 3.6931069349046406, "grad_norm": 0.2877577841281891, "learning_rate": 3.147520941072442e-05, "loss": 0.0242, "step": 15680 }, { "epoch": 3.6958962379275477, "grad_norm": 0.2853648066520691, "learning_rate": 3.146416562248365e-05, "loss": 0.0239, "step": 15690 }, { "epoch": 3.698685540950455, "grad_norm": 0.2354426085948944, "learning_rate": 3.145311662564002e-05, "loss": 0.0243, "step": 15700 }, { "epoch": 3.701474843973362, "grad_norm": 0.2665269374847412, "learning_rate": 3.14420624252135e-05, "loss": 0.0238, "step": 15710 }, { "epoch": 3.704264146996269, "grad_norm": 0.26391300559043884, "learning_rate": 3.14310030262264e-05, "loss": 0.0227, "step": 15720 }, { "epoch": 3.7070534500191763, "grad_norm": 0.21704114973545074, "learning_rate": 3.141993843370347e-05, "loss": 0.0211, "step": 15730 }, { "epoch": 3.709842753042084, "grad_norm": 0.3036942780017853, "learning_rate": 3.140886865267174e-05, "loss": 0.024, "step": 15740 }, { "epoch": 3.712632056064991, "grad_norm": 0.24613241851329803, "learning_rate": 3.139779368816064e-05, "loss": 0.0242, "step": 15750 }, { "epoch": 3.715421359087898, "grad_norm": 0.25333818793296814, "learning_rate": 3.138671354520194e-05, "loss": 0.0218, "step": 15760 }, { "epoch": 3.7182106621108053, "grad_norm": 0.2559027671813965, "learning_rate": 3.1375628228829774e-05, "loss": 0.0209, "step": 15770 }, { "epoch": 3.7209999651337125, "grad_norm": 0.26799312233924866, "learning_rate": 3.13645377440806e-05, "loss": 0.0245, "step": 15780 }, { "epoch": 3.7237892681566196, "grad_norm": 0.2522207796573639, "learning_rate": 3.135344209599327e-05, "loss": 0.0263, "step": 15790 }, { "epoch": 3.7265785711795267, "grad_norm": 0.2654907703399658, "learning_rate": 3.134234128960893e-05, "loss": 0.0233, "step": 15800 }, { "epoch": 3.729367874202434, "grad_norm": 0.2869943678379059, "learning_rate": 3.133123532997112e-05, "loss": 0.0238, "step": 15810 }, { "epoch": 3.732157177225341, "grad_norm": 0.27265143394470215, "learning_rate": 3.132012422212569e-05, "loss": 0.0251, "step": 15820 }, { "epoch": 3.734946480248248, "grad_norm": 0.22739249467849731, "learning_rate": 3.1309007971120815e-05, "loss": 0.0216, "step": 15830 }, { "epoch": 3.7377357832711553, "grad_norm": 0.2616797387599945, "learning_rate": 3.129788658200705e-05, "loss": 0.0218, "step": 15840 }, { "epoch": 3.7405250862940624, "grad_norm": 0.27246034145355225, "learning_rate": 3.1286760059837244e-05, "loss": 0.0252, "step": 15850 }, { "epoch": 3.7433143893169696, "grad_norm": 0.276716411113739, "learning_rate": 3.127562840966661e-05, "loss": 0.0228, "step": 15860 }, { "epoch": 3.7461036923398767, "grad_norm": 0.2592340111732483, "learning_rate": 3.126449163655268e-05, "loss": 0.0234, "step": 15870 }, { "epoch": 3.748892995362784, "grad_norm": 0.28145483136177063, "learning_rate": 3.125334974555529e-05, "loss": 0.0218, "step": 15880 }, { "epoch": 3.751682298385691, "grad_norm": 0.24650302529335022, "learning_rate": 3.124220274173664e-05, "loss": 0.0232, "step": 15890 }, { "epoch": 3.754471601408598, "grad_norm": 0.31845226883888245, "learning_rate": 3.123105063016122e-05, "loss": 0.0225, "step": 15900 }, { "epoch": 3.7572609044315053, "grad_norm": 0.25776994228363037, "learning_rate": 3.121989341589587e-05, "loss": 0.0217, "step": 15910 }, { "epoch": 3.7600502074544124, "grad_norm": 0.27747273445129395, "learning_rate": 3.1208731104009727e-05, "loss": 0.0241, "step": 15920 }, { "epoch": 3.7628395104773196, "grad_norm": 0.25583693385124207, "learning_rate": 3.119756369957425e-05, "loss": 0.0239, "step": 15930 }, { "epoch": 3.7656288135002267, "grad_norm": 0.2807357907295227, "learning_rate": 3.1186391207663204e-05, "loss": 0.0208, "step": 15940 }, { "epoch": 3.768418116523134, "grad_norm": 0.2773926258087158, "learning_rate": 3.117521363335268e-05, "loss": 0.0205, "step": 15950 }, { "epoch": 3.771207419546041, "grad_norm": 0.22191046178340912, "learning_rate": 3.116403098172108e-05, "loss": 0.0221, "step": 15960 }, { "epoch": 3.773996722568948, "grad_norm": 0.25844356417655945, "learning_rate": 3.115284325784909e-05, "loss": 0.0234, "step": 15970 }, { "epoch": 3.7767860255918553, "grad_norm": 0.25318723917007446, "learning_rate": 3.114165046681972e-05, "loss": 0.0218, "step": 15980 }, { "epoch": 3.7795753286147624, "grad_norm": 0.25427284836769104, "learning_rate": 3.113045261371828e-05, "loss": 0.0235, "step": 15990 }, { "epoch": 3.7823646316376696, "grad_norm": 0.2764810025691986, "learning_rate": 3.111924970363238e-05, "loss": 0.0236, "step": 16000 }, { "epoch": 3.7851539346605767, "grad_norm": 0.23988427221775055, "learning_rate": 3.1108041741651925e-05, "loss": 0.0239, "step": 16010 }, { "epoch": 3.787943237683484, "grad_norm": 0.3185794949531555, "learning_rate": 3.10968287328691e-05, "loss": 0.0255, "step": 16020 }, { "epoch": 3.790732540706391, "grad_norm": 0.24892133474349976, "learning_rate": 3.1085610682378416e-05, "loss": 0.0222, "step": 16030 }, { "epoch": 3.793521843729298, "grad_norm": 0.25834664702415466, "learning_rate": 3.107438759527664e-05, "loss": 0.0233, "step": 16040 }, { "epoch": 3.7963111467522053, "grad_norm": 0.25952237844467163, "learning_rate": 3.106315947666286e-05, "loss": 0.0282, "step": 16050 }, { "epoch": 3.7991004497751124, "grad_norm": 0.3096574544906616, "learning_rate": 3.105192633163842e-05, "loss": 0.0239, "step": 16060 }, { "epoch": 3.8018897527980196, "grad_norm": 0.2849220633506775, "learning_rate": 3.104068816530697e-05, "loss": 0.0202, "step": 16070 }, { "epoch": 3.8046790558209267, "grad_norm": 0.27324458956718445, "learning_rate": 3.1029444982774434e-05, "loss": 0.0236, "step": 16080 }, { "epoch": 3.807468358843834, "grad_norm": 0.31851494312286377, "learning_rate": 3.1018196789149016e-05, "loss": 0.024, "step": 16090 }, { "epoch": 3.810257661866741, "grad_norm": 0.22566501796245575, "learning_rate": 3.100694358954118e-05, "loss": 0.0232, "step": 16100 }, { "epoch": 3.813046964889648, "grad_norm": 0.2573205530643463, "learning_rate": 3.0995685389063684e-05, "loss": 0.0209, "step": 16110 }, { "epoch": 3.8158362679125553, "grad_norm": 0.21723061800003052, "learning_rate": 3.0984422192831565e-05, "loss": 0.0257, "step": 16120 }, { "epoch": 3.8186255709354624, "grad_norm": 0.2644743025302887, "learning_rate": 3.097315400596211e-05, "loss": 0.0221, "step": 16130 }, { "epoch": 3.8214148739583695, "grad_norm": 0.3371449112892151, "learning_rate": 3.096188083357488e-05, "loss": 0.0256, "step": 16140 }, { "epoch": 3.8242041769812767, "grad_norm": 0.2705473303794861, "learning_rate": 3.09506026807917e-05, "loss": 0.0208, "step": 16150 }, { "epoch": 3.826993480004184, "grad_norm": 0.22667215764522552, "learning_rate": 3.093931955273667e-05, "loss": 0.0223, "step": 16160 }, { "epoch": 4.003076793391217, "grad_norm": 0.24000266194343567, "learning_rate": 3.092803145453614e-05, "loss": 0.0193, "step": 16170 }, { "epoch": 4.006448621765152, "grad_norm": 0.28797316551208496, "learning_rate": 3.0916738391318704e-05, "loss": 0.0184, "step": 16180 }, { "epoch": 4.009820450139088, "grad_norm": 0.21582230925559998, "learning_rate": 3.0905440368215244e-05, "loss": 0.0195, "step": 16190 }, { "epoch": 4.013192278513023, "grad_norm": 0.1924019753932953, "learning_rate": 3.089413739035886e-05, "loss": 0.0182, "step": 16200 }, { "epoch": 4.0165641068869595, "grad_norm": 0.21489696204662323, "learning_rate": 3.088282946288493e-05, "loss": 0.0188, "step": 16210 }, { "epoch": 4.019935935260896, "grad_norm": 0.19778066873550415, "learning_rate": 3.087151659093109e-05, "loss": 0.0158, "step": 16220 }, { "epoch": 4.023307763634831, "grad_norm": 0.21411612629890442, "learning_rate": 3.086019877963718e-05, "loss": 0.0175, "step": 16230 }, { "epoch": 4.026679592008767, "grad_norm": 0.23039782047271729, "learning_rate": 3.084887603414531e-05, "loss": 0.0157, "step": 16240 }, { "epoch": 4.030051420382702, "grad_norm": 0.23105810582637787, "learning_rate": 3.083754835959985e-05, "loss": 0.0175, "step": 16250 }, { "epoch": 4.033423248756638, "grad_norm": 0.23968879878520966, "learning_rate": 3.082621576114737e-05, "loss": 0.0176, "step": 16260 }, { "epoch": 4.036795077130574, "grad_norm": 0.20048338174819946, "learning_rate": 3.08148782439367e-05, "loss": 0.0175, "step": 16270 }, { "epoch": 4.04016690550451, "grad_norm": 0.26422345638275146, "learning_rate": 3.080353581311891e-05, "loss": 0.0178, "step": 16280 }, { "epoch": 4.043538733878446, "grad_norm": 0.2384038269519806, "learning_rate": 3.079218847384729e-05, "loss": 0.0185, "step": 16290 }, { "epoch": 4.046910562252381, "grad_norm": 0.2000572383403778, "learning_rate": 3.0780836231277366e-05, "loss": 0.0171, "step": 16300 }, { "epoch": 4.050282390626317, "grad_norm": 0.25314298272132874, "learning_rate": 3.0769479090566884e-05, "loss": 0.0169, "step": 16310 }, { "epoch": 4.0536542190002525, "grad_norm": 0.2290262132883072, "learning_rate": 3.075811705687582e-05, "loss": 0.018, "step": 16320 }, { "epoch": 4.057026047374189, "grad_norm": 0.206838458776474, "learning_rate": 3.074675013536639e-05, "loss": 0.019, "step": 16330 }, { "epoch": 4.060397875748125, "grad_norm": 0.23652002215385437, "learning_rate": 3.0735378331203e-05, "loss": 0.0187, "step": 16340 }, { "epoch": 4.06376970412206, "grad_norm": 0.20325829088687897, "learning_rate": 3.072400164955229e-05, "loss": 0.0163, "step": 16350 }, { "epoch": 4.067141532495996, "grad_norm": 0.22494150698184967, "learning_rate": 3.071262009558314e-05, "loss": 0.0159, "step": 16360 }, { "epoch": 4.070513360869931, "grad_norm": 0.25647035241127014, "learning_rate": 3.070123367446658e-05, "loss": 0.0164, "step": 16370 }, { "epoch": 4.073885189243867, "grad_norm": 0.24968741834163666, "learning_rate": 3.068984239137591e-05, "loss": 0.018, "step": 16380 }, { "epoch": 4.077257017617804, "grad_norm": 0.2361917495727539, "learning_rate": 3.067844625148663e-05, "loss": 0.0175, "step": 16390 }, { "epoch": 4.080628845991739, "grad_norm": 0.23595644533634186, "learning_rate": 3.066704525997642e-05, "loss": 0.0184, "step": 16400 }, { "epoch": 4.084000674365675, "grad_norm": 0.2332417368888855, "learning_rate": 3.065563942202518e-05, "loss": 0.016, "step": 16410 }, { "epoch": 4.08737250273961, "grad_norm": 0.20604532957077026, "learning_rate": 3.064422874281503e-05, "loss": 0.0171, "step": 16420 }, { "epoch": 4.090744331113546, "grad_norm": 0.21294823288917542, "learning_rate": 3.063281322753025e-05, "loss": 0.0158, "step": 16430 }, { "epoch": 4.094116159487482, "grad_norm": 0.24685785174369812, "learning_rate": 3.062139288135736e-05, "loss": 0.0186, "step": 16440 }, { "epoch": 4.097487987861418, "grad_norm": 0.221607506275177, "learning_rate": 3.0609967709485045e-05, "loss": 0.0174, "step": 16450 }, { "epoch": 4.100859816235354, "grad_norm": 0.22848044335842133, "learning_rate": 3.059853771710418e-05, "loss": 0.0184, "step": 16460 }, { "epoch": 4.104231644609289, "grad_norm": 0.2465742528438568, "learning_rate": 3.058710290940786e-05, "loss": 0.017, "step": 16470 }, { "epoch": 4.107603472983225, "grad_norm": 0.21509014070034027, "learning_rate": 3.057566329159134e-05, "loss": 0.0196, "step": 16480 }, { "epoch": 4.110975301357161, "grad_norm": 0.23679278790950775, "learning_rate": 3.056421886885207e-05, "loss": 0.0186, "step": 16490 }, { "epoch": 4.1143471297310965, "grad_norm": 0.2243577241897583, "learning_rate": 3.055276964638969e-05, "loss": 0.0181, "step": 16500 }, { "epoch": 4.117718958105033, "grad_norm": 0.21534140408039093, "learning_rate": 3.054131562940601e-05, "loss": 0.018, "step": 16510 }, { "epoch": 4.121090786478968, "grad_norm": 0.21232305467128754, "learning_rate": 3.052985682310501e-05, "loss": 0.0177, "step": 16520 }, { "epoch": 4.124462614852904, "grad_norm": 0.19061733782291412, "learning_rate": 3.0518393232692876e-05, "loss": 0.0168, "step": 16530 }, { "epoch": 4.12783444322684, "grad_norm": 0.2137490063905716, "learning_rate": 3.0506924863377932e-05, "loss": 0.019, "step": 16540 }, { "epoch": 4.131206271600775, "grad_norm": 0.20542873442173004, "learning_rate": 3.049545172037071e-05, "loss": 0.0172, "step": 16550 }, { "epoch": 4.1345780999747115, "grad_norm": 0.23066620528697968, "learning_rate": 3.048397380888388e-05, "loss": 0.0171, "step": 16560 }, { "epoch": 4.137949928348647, "grad_norm": 0.2558150291442871, "learning_rate": 3.0472491134132287e-05, "loss": 0.0182, "step": 16570 }, { "epoch": 4.141321756722583, "grad_norm": 0.2511146068572998, "learning_rate": 3.046100370133295e-05, "loss": 0.0187, "step": 16580 }, { "epoch": 4.144693585096519, "grad_norm": 0.21473099291324615, "learning_rate": 3.044951151570504e-05, "loss": 0.0177, "step": 16590 }, { "epoch": 4.148065413470454, "grad_norm": 0.23352792859077454, "learning_rate": 3.0438014582469898e-05, "loss": 0.0169, "step": 16600 }, { "epoch": 4.15143724184439, "grad_norm": 0.2596484124660492, "learning_rate": 3.0426512906851004e-05, "loss": 0.0167, "step": 16610 }, { "epoch": 4.154809070218326, "grad_norm": 0.219725102186203, "learning_rate": 3.0415006494074012e-05, "loss": 0.0182, "step": 16620 }, { "epoch": 4.158180898592262, "grad_norm": 0.2419557273387909, "learning_rate": 3.0403495349366712e-05, "loss": 0.0182, "step": 16630 }, { "epoch": 4.161552726966198, "grad_norm": 0.2511640787124634, "learning_rate": 3.0391979477959048e-05, "loss": 0.0165, "step": 16640 }, { "epoch": 4.164924555340133, "grad_norm": 0.27945205569267273, "learning_rate": 3.0380458885083128e-05, "loss": 0.0187, "step": 16650 }, { "epoch": 4.168296383714069, "grad_norm": 0.21342603862285614, "learning_rate": 3.0368933575973182e-05, "loss": 0.0168, "step": 16660 }, { "epoch": 4.171668212088004, "grad_norm": 0.2467052936553955, "learning_rate": 3.0357403555865602e-05, "loss": 0.0193, "step": 16670 }, { "epoch": 4.1750400404619405, "grad_norm": 0.25043046474456787, "learning_rate": 3.034586882999889e-05, "loss": 0.0189, "step": 16680 }, { "epoch": 4.178411868835877, "grad_norm": 0.2660280168056488, "learning_rate": 3.0334329403613728e-05, "loss": 0.0203, "step": 16690 }, { "epoch": 4.181783697209812, "grad_norm": 0.26268699765205383, "learning_rate": 3.032278528195291e-05, "loss": 0.0185, "step": 16700 }, { "epoch": 4.185155525583748, "grad_norm": 0.21762637794017792, "learning_rate": 3.031123647026136e-05, "loss": 0.0184, "step": 16710 }, { "epoch": 4.188527353957683, "grad_norm": 0.26516997814178467, "learning_rate": 3.0299682973786137e-05, "loss": 0.0194, "step": 16720 }, { "epoch": 4.191899182331619, "grad_norm": 0.27233436703681946, "learning_rate": 3.0288124797776438e-05, "loss": 0.0167, "step": 16730 }, { "epoch": 4.1952710107055555, "grad_norm": 0.2358931601047516, "learning_rate": 3.0276561947483575e-05, "loss": 0.0178, "step": 16740 }, { "epoch": 4.198642839079491, "grad_norm": 0.28013119101524353, "learning_rate": 3.0264994428160985e-05, "loss": 0.0184, "step": 16750 }, { "epoch": 4.202014667453427, "grad_norm": 0.24036946892738342, "learning_rate": 3.0253422245064227e-05, "loss": 0.0179, "step": 16760 }, { "epoch": 4.205386495827362, "grad_norm": 0.26984143257141113, "learning_rate": 3.0241845403450984e-05, "loss": 0.0173, "step": 16770 }, { "epoch": 4.208758324201298, "grad_norm": 0.23088659346103668, "learning_rate": 3.023026390858105e-05, "loss": 0.019, "step": 16780 }, { "epoch": 4.212130152575234, "grad_norm": 0.24600031971931458, "learning_rate": 3.021867776571634e-05, "loss": 0.0171, "step": 16790 }, { "epoch": 4.21550198094917, "grad_norm": 0.2503112852573395, "learning_rate": 3.0207086980120867e-05, "loss": 0.0168, "step": 16800 }, { "epoch": 4.218873809323106, "grad_norm": 0.24345998466014862, "learning_rate": 3.0195491557060767e-05, "loss": 0.0167, "step": 16810 }, { "epoch": 4.222245637697041, "grad_norm": 0.22057320177555084, "learning_rate": 3.018389150180428e-05, "loss": 0.0178, "step": 16820 }, { "epoch": 4.225617466070977, "grad_norm": 0.25742119550704956, "learning_rate": 3.0172286819621742e-05, "loss": 0.0187, "step": 16830 }, { "epoch": 4.228989294444912, "grad_norm": 0.2528959810733795, "learning_rate": 3.016067751578561e-05, "loss": 0.018, "step": 16840 }, { "epoch": 4.2323611228188485, "grad_norm": 0.2312375009059906, "learning_rate": 3.0149063595570415e-05, "loss": 0.0184, "step": 16850 }, { "epoch": 4.235732951192785, "grad_norm": 0.19597618281841278, "learning_rate": 3.0137445064252806e-05, "loss": 0.0183, "step": 16860 }, { "epoch": 4.23910477956672, "grad_norm": 0.27132663130760193, "learning_rate": 3.0125821927111523e-05, "loss": 0.0168, "step": 16870 }, { "epoch": 4.242476607940656, "grad_norm": 0.2568066120147705, "learning_rate": 3.0114194189427386e-05, "loss": 0.0184, "step": 16880 }, { "epoch": 4.245848436314591, "grad_norm": 0.2455788105726242, "learning_rate": 3.0102561856483316e-05, "loss": 0.0187, "step": 16890 }, { "epoch": 4.249220264688527, "grad_norm": 0.25140148401260376, "learning_rate": 3.0090924933564335e-05, "loss": 0.0188, "step": 16900 }, { "epoch": 4.252592093062463, "grad_norm": 0.23564386367797852, "learning_rate": 3.0079283425957523e-05, "loss": 0.0186, "step": 16910 }, { "epoch": 4.255963921436399, "grad_norm": 0.22422507405281067, "learning_rate": 3.0067637338952056e-05, "loss": 0.0191, "step": 16920 }, { "epoch": 4.259335749810335, "grad_norm": 0.25312647223472595, "learning_rate": 3.00559866778392e-05, "loss": 0.0186, "step": 16930 }, { "epoch": 4.26270757818427, "grad_norm": 0.2401970475912094, "learning_rate": 3.0044331447912274e-05, "loss": 0.0185, "step": 16940 }, { "epoch": 4.266079406558206, "grad_norm": 0.2565164864063263, "learning_rate": 3.00326716544667e-05, "loss": 0.0181, "step": 16950 }, { "epoch": 4.269451234932142, "grad_norm": 0.32464268803596497, "learning_rate": 3.0021007302799966e-05, "loss": 0.0179, "step": 16960 }, { "epoch": 4.2728230633060775, "grad_norm": 0.22168785333633423, "learning_rate": 3.000933839821162e-05, "loss": 0.0187, "step": 16970 }, { "epoch": 4.276194891680014, "grad_norm": 0.24025274813175201, "learning_rate": 2.9997664946003283e-05, "loss": 0.0185, "step": 16980 }, { "epoch": 4.279566720053949, "grad_norm": 0.23596760630607605, "learning_rate": 2.998598695147865e-05, "loss": 0.0196, "step": 16990 }, { "epoch": 4.282938548427885, "grad_norm": 0.23789100348949432, "learning_rate": 2.9974304419943475e-05, "loss": 0.0178, "step": 17000 }, { "epoch": 4.286310376801821, "grad_norm": 0.27643904089927673, "learning_rate": 2.9962617356705567e-05, "loss": 0.0199, "step": 17010 }, { "epoch": 4.289682205175756, "grad_norm": 0.23902755975723267, "learning_rate": 2.9950925767074802e-05, "loss": 0.0192, "step": 17020 }, { "epoch": 4.2930540335496925, "grad_norm": 0.2404031753540039, "learning_rate": 2.9939229656363112e-05, "loss": 0.0161, "step": 17030 }, { "epoch": 4.296425861923628, "grad_norm": 0.21523749828338623, "learning_rate": 2.992752902988448e-05, "loss": 0.0181, "step": 17040 }, { "epoch": 4.299797690297564, "grad_norm": 0.2594034671783447, "learning_rate": 2.9915823892954946e-05, "loss": 0.0194, "step": 17050 }, { "epoch": 4.3031695186715, "grad_norm": 0.23049363493919373, "learning_rate": 2.9904114250892588e-05, "loss": 0.0171, "step": 17060 }, { "epoch": 4.306541347045435, "grad_norm": 0.3011740446090698, "learning_rate": 2.9892400109017537e-05, "loss": 0.0178, "step": 17070 }, { "epoch": 4.309913175419371, "grad_norm": 0.2514997124671936, "learning_rate": 2.988068147265198e-05, "loss": 0.0173, "step": 17080 }, { "epoch": 4.313285003793307, "grad_norm": 0.2322489619255066, "learning_rate": 2.9868958347120133e-05, "loss": 0.0185, "step": 17090 }, { "epoch": 4.316656832167243, "grad_norm": 0.22080634534358978, "learning_rate": 2.985723073774825e-05, "loss": 0.0192, "step": 17100 }, { "epoch": 4.320028660541179, "grad_norm": 0.2336024045944214, "learning_rate": 2.9845498649864618e-05, "loss": 0.0173, "step": 17110 }, { "epoch": 4.323400488915114, "grad_norm": 0.21369606256484985, "learning_rate": 2.983376208879958e-05, "loss": 0.0201, "step": 17120 }, { "epoch": 4.32677231728905, "grad_norm": 0.2630109190940857, "learning_rate": 2.98220210598855e-05, "loss": 0.0189, "step": 17130 }, { "epoch": 4.3301441456629854, "grad_norm": 0.22298294305801392, "learning_rate": 2.9810275568456767e-05, "loss": 0.0194, "step": 17140 }, { "epoch": 4.333515974036922, "grad_norm": 0.25575923919677734, "learning_rate": 2.979852561984979e-05, "loss": 0.0178, "step": 17150 }, { "epoch": 4.336887802410858, "grad_norm": 0.2305557280778885, "learning_rate": 2.9786771219403022e-05, "loss": 0.0205, "step": 17160 }, { "epoch": 4.340259630784793, "grad_norm": 0.25408709049224854, "learning_rate": 2.9775012372456936e-05, "loss": 0.018, "step": 17170 }, { "epoch": 4.343631459158729, "grad_norm": 0.2112557291984558, "learning_rate": 2.9763249084354013e-05, "loss": 0.0202, "step": 17180 }, { "epoch": 4.347003287532664, "grad_norm": 0.233245387673378, "learning_rate": 2.9751481360438763e-05, "loss": 0.0172, "step": 17190 }, { "epoch": 4.3503751159066, "grad_norm": 0.24801577627658844, "learning_rate": 2.9739709206057702e-05, "loss": 0.0191, "step": 17200 }, { "epoch": 4.353746944280536, "grad_norm": 0.23003992438316345, "learning_rate": 2.9727932626559366e-05, "loss": 0.0175, "step": 17210 }, { "epoch": 4.357118772654472, "grad_norm": 0.2133759707212448, "learning_rate": 2.97161516272943e-05, "loss": 0.0203, "step": 17220 }, { "epoch": 4.360490601028408, "grad_norm": 0.22024774551391602, "learning_rate": 2.9704366213615055e-05, "loss": 0.0176, "step": 17230 }, { "epoch": 4.363862429402343, "grad_norm": 0.20226849615573883, "learning_rate": 2.9692576390876183e-05, "loss": 0.0179, "step": 17240 }, { "epoch": 4.367234257776279, "grad_norm": 0.26265543699264526, "learning_rate": 2.968078216443426e-05, "loss": 0.0174, "step": 17250 }, { "epoch": 4.3706060861502145, "grad_norm": 0.21632713079452515, "learning_rate": 2.966898353964784e-05, "loss": 0.0192, "step": 17260 }, { "epoch": 4.373977914524151, "grad_norm": 0.27335110306739807, "learning_rate": 2.965718052187748e-05, "loss": 0.0188, "step": 17270 }, { "epoch": 4.377349742898087, "grad_norm": 0.20728182792663574, "learning_rate": 2.9645373116485738e-05, "loss": 0.0188, "step": 17280 }, { "epoch": 4.380721571272022, "grad_norm": 0.24149763584136963, "learning_rate": 2.9633561328837167e-05, "loss": 0.0162, "step": 17290 }, { "epoch": 4.384093399645958, "grad_norm": 0.23386479914188385, "learning_rate": 2.9621745164298304e-05, "loss": 0.0181, "step": 17300 }, { "epoch": 4.387465228019893, "grad_norm": 0.2273348867893219, "learning_rate": 2.960992462823768e-05, "loss": 0.019, "step": 17310 }, { "epoch": 4.3908370563938295, "grad_norm": 0.2784688472747803, "learning_rate": 2.959809972602581e-05, "loss": 0.0177, "step": 17320 }, { "epoch": 4.394208884767766, "grad_norm": 0.255176305770874, "learning_rate": 2.9586270463035195e-05, "loss": 0.0216, "step": 17330 }, { "epoch": 4.397580713141701, "grad_norm": 0.25486600399017334, "learning_rate": 2.9574436844640314e-05, "loss": 0.0176, "step": 17340 }, { "epoch": 4.400952541515637, "grad_norm": 0.2591731548309326, "learning_rate": 2.956259887621762e-05, "loss": 0.0191, "step": 17350 }, { "epoch": 4.404324369889572, "grad_norm": 0.21950682997703552, "learning_rate": 2.955075656314556e-05, "loss": 0.019, "step": 17360 }, { "epoch": 4.407696198263508, "grad_norm": 0.2394210547208786, "learning_rate": 2.9538909910804534e-05, "loss": 0.0173, "step": 17370 }, { "epoch": 4.4110680266374445, "grad_norm": 0.26499056816101074, "learning_rate": 2.9527058924576932e-05, "loss": 0.0204, "step": 17380 }, { "epoch": 4.41443985501138, "grad_norm": 0.27804988622665405, "learning_rate": 2.9515203609847093e-05, "loss": 0.018, "step": 17390 }, { "epoch": 4.417811683385316, "grad_norm": 0.22676849365234375, "learning_rate": 2.9503343972001346e-05, "loss": 0.0186, "step": 17400 }, { "epoch": 4.421183511759251, "grad_norm": 0.24507412314414978, "learning_rate": 2.9491480016427966e-05, "loss": 0.0201, "step": 17410 }, { "epoch": 4.424555340133187, "grad_norm": 0.24344073235988617, "learning_rate": 2.9479611748517186e-05, "loss": 0.0204, "step": 17420 }, { "epoch": 4.427927168507123, "grad_norm": 0.27451786398887634, "learning_rate": 2.9467739173661218e-05, "loss": 0.0188, "step": 17430 }, { "epoch": 4.4312989968810585, "grad_norm": 0.221276193857193, "learning_rate": 2.9455862297254225e-05, "loss": 0.0196, "step": 17440 }, { "epoch": 4.434670825254995, "grad_norm": 0.21992751955986023, "learning_rate": 2.944398112469232e-05, "loss": 0.0218, "step": 17450 }, { "epoch": 4.43804265362893, "grad_norm": 0.21258202195167542, "learning_rate": 2.943209566137355e-05, "loss": 0.0183, "step": 17460 }, { "epoch": 4.441414482002866, "grad_norm": 0.24209187924861908, "learning_rate": 2.9420205912697947e-05, "loss": 0.0188, "step": 17470 }, { "epoch": 4.444786310376802, "grad_norm": 0.2790079414844513, "learning_rate": 2.9408311884067465e-05, "loss": 0.018, "step": 17480 }, { "epoch": 4.448158138750737, "grad_norm": 0.21273599565029144, "learning_rate": 2.939641358088601e-05, "loss": 0.0172, "step": 17490 }, { "epoch": 4.4515299671246735, "grad_norm": 0.27751797437667847, "learning_rate": 2.9384511008559437e-05, "loss": 0.0204, "step": 17500 }, { "epoch": 4.454901795498609, "grad_norm": 0.22481313347816467, "learning_rate": 2.9372604172495524e-05, "loss": 0.0164, "step": 17510 }, { "epoch": 4.458273623872545, "grad_norm": 0.23746679723262787, "learning_rate": 2.9360693078104e-05, "loss": 0.0201, "step": 17520 }, { "epoch": 4.461645452246481, "grad_norm": 0.22043335437774658, "learning_rate": 2.9348777730796523e-05, "loss": 0.0178, "step": 17530 }, { "epoch": 4.465017280620416, "grad_norm": 0.21736320853233337, "learning_rate": 2.9336858135986693e-05, "loss": 0.0185, "step": 17540 }, { "epoch": 4.468389108994352, "grad_norm": 0.2547312080860138, "learning_rate": 2.932493429909002e-05, "loss": 0.0197, "step": 17550 }, { "epoch": 4.471760937368288, "grad_norm": 0.24983647465705872, "learning_rate": 2.9313006225523954e-05, "loss": 0.0201, "step": 17560 }, { "epoch": 4.475132765742224, "grad_norm": 0.1984914094209671, "learning_rate": 2.9301073920707876e-05, "loss": 0.0171, "step": 17570 }, { "epoch": 4.47850459411616, "grad_norm": 0.261827677488327, "learning_rate": 2.9289137390063082e-05, "loss": 0.0206, "step": 17580 }, { "epoch": 4.481876422490095, "grad_norm": 0.2424766570329666, "learning_rate": 2.9277196639012776e-05, "loss": 0.0169, "step": 17590 }, { "epoch": 4.485248250864031, "grad_norm": 0.21655333042144775, "learning_rate": 2.9265251672982105e-05, "loss": 0.0188, "step": 17600 }, { "epoch": 4.4886200792379665, "grad_norm": 0.26524239778518677, "learning_rate": 2.925330249739811e-05, "loss": 0.0193, "step": 17610 }, { "epoch": 4.491991907611903, "grad_norm": 0.22429201006889343, "learning_rate": 2.924134911768976e-05, "loss": 0.0177, "step": 17620 }, { "epoch": 4.495363735985839, "grad_norm": 0.21605223417282104, "learning_rate": 2.922939153928792e-05, "loss": 0.0204, "step": 17630 }, { "epoch": 4.498735564359774, "grad_norm": 0.27601373195648193, "learning_rate": 2.9217429767625365e-05, "loss": 0.0171, "step": 17640 }, { "epoch": 4.50210739273371, "grad_norm": 0.2599527835845947, "learning_rate": 2.920546380813679e-05, "loss": 0.0186, "step": 17650 }, { "epoch": 4.505479221107645, "grad_norm": 0.23922976851463318, "learning_rate": 2.919349366625877e-05, "loss": 0.0199, "step": 17660 }, { "epoch": 4.508851049481581, "grad_norm": 0.19961631298065186, "learning_rate": 2.9181519347429805e-05, "loss": 0.0176, "step": 17670 }, { "epoch": 4.512222877855518, "grad_norm": 0.23411454260349274, "learning_rate": 2.916954085709027e-05, "loss": 0.0176, "step": 17680 }, { "epoch": 4.515594706229453, "grad_norm": 0.2304239124059677, "learning_rate": 2.9157558200682446e-05, "loss": 0.0189, "step": 17690 }, { "epoch": 4.518966534603389, "grad_norm": 0.2844913899898529, "learning_rate": 2.914557138365051e-05, "loss": 0.0191, "step": 17700 }, { "epoch": 4.522338362977324, "grad_norm": 0.2197534292936325, "learning_rate": 2.9133580411440522e-05, "loss": 0.0193, "step": 17710 }, { "epoch": 4.52571019135126, "grad_norm": 0.23582908511161804, "learning_rate": 2.912158528950043e-05, "loss": 0.0204, "step": 17720 }, { "epoch": 4.529082019725196, "grad_norm": 0.27614346146583557, "learning_rate": 2.9109586023280075e-05, "loss": 0.0182, "step": 17730 }, { "epoch": 4.532453848099132, "grad_norm": 0.2718147933483124, "learning_rate": 2.9097582618231176e-05, "loss": 0.0205, "step": 17740 }, { "epoch": 4.535825676473068, "grad_norm": 0.24264606833457947, "learning_rate": 2.9085575079807332e-05, "loss": 0.019, "step": 17750 }, { "epoch": 4.539197504847003, "grad_norm": 0.22118732333183289, "learning_rate": 2.907356341346402e-05, "loss": 0.0199, "step": 17760 }, { "epoch": 4.542569333220939, "grad_norm": 0.23295961320400238, "learning_rate": 2.9061547624658594e-05, "loss": 0.0201, "step": 17770 }, { "epoch": 4.545941161594875, "grad_norm": 0.26126810908317566, "learning_rate": 2.904952771885028e-05, "loss": 0.0185, "step": 17780 }, { "epoch": 4.5493129899688105, "grad_norm": 0.24917489290237427, "learning_rate": 2.9037503701500177e-05, "loss": 0.0174, "step": 17790 }, { "epoch": 4.552684818342747, "grad_norm": 0.23107214272022247, "learning_rate": 2.9025475578071245e-05, "loss": 0.0183, "step": 17800 }, { "epoch": 4.556056646716682, "grad_norm": 0.26870423555374146, "learning_rate": 2.9013443354028313e-05, "loss": 0.0223, "step": 17810 }, { "epoch": 4.559428475090618, "grad_norm": 0.2437800168991089, "learning_rate": 2.9001407034838078e-05, "loss": 0.022, "step": 17820 }, { "epoch": 4.562800303464554, "grad_norm": 0.25699543952941895, "learning_rate": 2.89893666259691e-05, "loss": 0.0201, "step": 17830 }, { "epoch": 4.566172131838489, "grad_norm": 0.2056744545698166, "learning_rate": 2.8977322132891776e-05, "loss": 0.0188, "step": 17840 }, { "epoch": 4.5695439602124255, "grad_norm": 0.2761535942554474, "learning_rate": 2.8965273561078383e-05, "loss": 0.0169, "step": 17850 }, { "epoch": 4.572915788586361, "grad_norm": 0.26450860500335693, "learning_rate": 2.8953220916003036e-05, "loss": 0.0184, "step": 17860 }, { "epoch": 4.576287616960297, "grad_norm": 0.23280870914459229, "learning_rate": 2.894116420314172e-05, "loss": 0.0178, "step": 17870 }, { "epoch": 4.579659445334233, "grad_norm": 0.23211678862571716, "learning_rate": 2.8929103427972242e-05, "loss": 0.0186, "step": 17880 }, { "epoch": 4.583031273708168, "grad_norm": 0.2506985366344452, "learning_rate": 2.891703859597427e-05, "loss": 0.0186, "step": 17890 }, { "epoch": 4.586403102082104, "grad_norm": 0.2500843107700348, "learning_rate": 2.8904969712629304e-05, "loss": 0.0184, "step": 17900 }, { "epoch": 4.58977493045604, "grad_norm": 0.2584279179573059, "learning_rate": 2.889289678342071e-05, "loss": 0.0175, "step": 17910 }, { "epoch": 4.593146758829976, "grad_norm": 0.245949849486351, "learning_rate": 2.888081981383367e-05, "loss": 0.0201, "step": 17920 }, { "epoch": 4.596518587203911, "grad_norm": 0.20596468448638916, "learning_rate": 2.8868738809355198e-05, "loss": 0.0185, "step": 17930 }, { "epoch": 4.599890415577847, "grad_norm": 0.22271639108657837, "learning_rate": 2.8856653775474153e-05, "loss": 0.0171, "step": 17940 }, { "epoch": 4.603262243951783, "grad_norm": 0.21134240925312042, "learning_rate": 2.8844564717681224e-05, "loss": 0.0183, "step": 17950 }, { "epoch": 4.606634072325718, "grad_norm": 0.22465282678604126, "learning_rate": 2.8832471641468926e-05, "loss": 0.0195, "step": 17960 }, { "epoch": 4.6100059006996545, "grad_norm": 0.2657279670238495, "learning_rate": 2.8820374552331603e-05, "loss": 0.0181, "step": 17970 }, { "epoch": 4.61337772907359, "grad_norm": 0.24450133740901947, "learning_rate": 2.8808273455765413e-05, "loss": 0.0195, "step": 17980 }, { "epoch": 4.616749557447526, "grad_norm": 0.27537646889686584, "learning_rate": 2.8796168357268343e-05, "loss": 0.0185, "step": 17990 }, { "epoch": 4.620121385821462, "grad_norm": 0.23843339085578918, "learning_rate": 2.8784059262340194e-05, "loss": 0.0186, "step": 18000 }, { "epoch": 4.623493214195397, "grad_norm": 0.2412642538547516, "learning_rate": 2.877194617648259e-05, "loss": 0.022, "step": 18010 }, { "epoch": 4.626865042569333, "grad_norm": 0.2599549889564514, "learning_rate": 2.8759829105198962e-05, "loss": 0.0192, "step": 18020 }, { "epoch": 4.630236870943269, "grad_norm": 0.2924239933490753, "learning_rate": 2.8747708053994547e-05, "loss": 0.0193, "step": 18030 }, { "epoch": 4.633608699317205, "grad_norm": 0.24405671656131744, "learning_rate": 2.8735583028376397e-05, "loss": 0.0176, "step": 18040 }, { "epoch": 4.636980527691141, "grad_norm": 0.2686670422554016, "learning_rate": 2.872345403385337e-05, "loss": 0.0174, "step": 18050 }, { "epoch": 4.640352356065076, "grad_norm": 0.24921433627605438, "learning_rate": 2.8711321075936133e-05, "loss": 0.0188, "step": 18060 }, { "epoch": 4.643724184439012, "grad_norm": 0.1993594765663147, "learning_rate": 2.869918416013713e-05, "loss": 0.0182, "step": 18070 }, { "epoch": 4.6470960128129475, "grad_norm": 0.21975001692771912, "learning_rate": 2.868704329197063e-05, "loss": 0.0188, "step": 18080 }, { "epoch": 4.650467841186884, "grad_norm": 0.2692890167236328, "learning_rate": 2.867489847695269e-05, "loss": 0.0185, "step": 18090 }, { "epoch": 4.65383966956082, "grad_norm": 0.2486664354801178, "learning_rate": 2.866274972060115e-05, "loss": 0.0197, "step": 18100 }, { "epoch": 4.657211497934755, "grad_norm": 0.257823646068573, "learning_rate": 2.865059702843566e-05, "loss": 0.0206, "step": 18110 }, { "epoch": 4.660583326308691, "grad_norm": 0.2407723218202591, "learning_rate": 2.863844040597763e-05, "loss": 0.0176, "step": 18120 }, { "epoch": 4.663955154682626, "grad_norm": 0.2435118854045868, "learning_rate": 2.8626279858750283e-05, "loss": 0.0173, "step": 18130 }, { "epoch": 4.6673269830565625, "grad_norm": 0.20741723477840424, "learning_rate": 2.8614115392278612e-05, "loss": 0.018, "step": 18140 }, { "epoch": 4.670698811430498, "grad_norm": 0.2491118311882019, "learning_rate": 2.8601947012089396e-05, "loss": 0.0211, "step": 18150 }, { "epoch": 4.674070639804434, "grad_norm": 0.22737187147140503, "learning_rate": 2.8589774723711182e-05, "loss": 0.0182, "step": 18160 }, { "epoch": 4.67744246817837, "grad_norm": 0.22033344209194183, "learning_rate": 2.8577598532674308e-05, "loss": 0.0202, "step": 18170 }, { "epoch": 4.680814296552305, "grad_norm": 0.2168598622083664, "learning_rate": 2.856541844451087e-05, "loss": 0.0199, "step": 18180 }, { "epoch": 4.684186124926241, "grad_norm": 0.24684607982635498, "learning_rate": 2.855323446475475e-05, "loss": 0.0197, "step": 18190 }, { "epoch": 4.6875579533001765, "grad_norm": 0.2370768040418625, "learning_rate": 2.8541046598941574e-05, "loss": 0.0173, "step": 18200 }, { "epoch": 4.690929781674113, "grad_norm": 0.25768986344337463, "learning_rate": 2.8528854852608763e-05, "loss": 0.021, "step": 18210 }, { "epoch": 4.694301610048049, "grad_norm": 0.253317654132843, "learning_rate": 2.8516659231295488e-05, "loss": 0.0189, "step": 18220 }, { "epoch": 4.697673438421984, "grad_norm": 0.2158082276582718, "learning_rate": 2.850445974054267e-05, "loss": 0.0179, "step": 18230 }, { "epoch": 4.70104526679592, "grad_norm": 0.23523399233818054, "learning_rate": 2.8492256385893006e-05, "loss": 0.018, "step": 18240 }, { "epoch": 4.704417095169855, "grad_norm": 0.2246246486902237, "learning_rate": 2.848004917289093e-05, "loss": 0.0203, "step": 18250 }, { "epoch": 4.7077889235437915, "grad_norm": 0.25570428371429443, "learning_rate": 2.846783810708265e-05, "loss": 0.0181, "step": 18260 }, { "epoch": 4.711160751917728, "grad_norm": 0.22391094267368317, "learning_rate": 2.8455623194016114e-05, "loss": 0.0173, "step": 18270 }, { "epoch": 4.714532580291663, "grad_norm": 0.22978243231773376, "learning_rate": 2.8443404439241017e-05, "loss": 0.0166, "step": 18280 }, { "epoch": 4.717904408665599, "grad_norm": 0.2589948773384094, "learning_rate": 2.8431181848308783e-05, "loss": 0.0204, "step": 18290 }, { "epoch": 4.721276237039534, "grad_norm": 0.27853938937187195, "learning_rate": 2.841895542677262e-05, "loss": 0.0188, "step": 18300 }, { "epoch": 4.72464806541347, "grad_norm": 0.23950879275798798, "learning_rate": 2.8406725180187444e-05, "loss": 0.0177, "step": 18310 }, { "epoch": 4.7280198937874065, "grad_norm": 0.2429947853088379, "learning_rate": 2.8394491114109913e-05, "loss": 0.0181, "step": 18320 }, { "epoch": 4.731391722161342, "grad_norm": 0.2340213507413864, "learning_rate": 2.8382253234098432e-05, "loss": 0.0197, "step": 18330 }, { "epoch": 4.734763550535278, "grad_norm": 0.22026726603507996, "learning_rate": 2.8370011545713124e-05, "loss": 0.0187, "step": 18340 }, { "epoch": 4.738135378909213, "grad_norm": 0.23555444180965424, "learning_rate": 2.8357766054515852e-05, "loss": 0.0212, "step": 18350 }, { "epoch": 4.741507207283149, "grad_norm": 0.25672611594200134, "learning_rate": 2.8345516766070206e-05, "loss": 0.0186, "step": 18360 }, { "epoch": 4.744879035657085, "grad_norm": 0.24224528670310974, "learning_rate": 2.83332636859415e-05, "loss": 0.0174, "step": 18370 }, { "epoch": 4.748250864031021, "grad_norm": 0.22105005383491516, "learning_rate": 2.832100681969677e-05, "loss": 0.0185, "step": 18380 }, { "epoch": 4.751622692404957, "grad_norm": 0.22094480693340302, "learning_rate": 2.8308746172904773e-05, "loss": 0.0188, "step": 18390 }, { "epoch": 4.754994520778892, "grad_norm": 0.2546200454235077, "learning_rate": 2.8296481751135987e-05, "loss": 0.0181, "step": 18400 }, { "epoch": 4.758366349152828, "grad_norm": 0.2571086287498474, "learning_rate": 2.828421355996259e-05, "loss": 0.0185, "step": 18410 }, { "epoch": 4.761738177526764, "grad_norm": 0.22212448716163635, "learning_rate": 2.827194160495849e-05, "loss": 0.019, "step": 18420 }, { "epoch": 4.765110005900699, "grad_norm": 0.2524887025356293, "learning_rate": 2.825966589169931e-05, "loss": 0.0181, "step": 18430 }, { "epoch": 4.768481834274636, "grad_norm": 0.2396986484527588, "learning_rate": 2.8247386425762356e-05, "loss": 0.0224, "step": 18440 }, { "epoch": 4.771853662648571, "grad_norm": 0.22813208401203156, "learning_rate": 2.8235103212726655e-05, "loss": 0.0181, "step": 18450 }, { "epoch": 4.775225491022507, "grad_norm": 0.261733740568161, "learning_rate": 2.8222816258172936e-05, "loss": 0.0186, "step": 18460 }, { "epoch": 4.778597319396443, "grad_norm": 0.25770998001098633, "learning_rate": 2.821052556768362e-05, "loss": 0.0186, "step": 18470 }, { "epoch": 4.781969147770378, "grad_norm": 0.2484024316072464, "learning_rate": 2.8198231146842843e-05, "loss": 0.0173, "step": 18480 }, { "epoch": 4.785340976144314, "grad_norm": 0.2538393437862396, "learning_rate": 2.818593300123642e-05, "loss": 0.0195, "step": 18490 }, { "epoch": 4.78871280451825, "grad_norm": 0.2410932183265686, "learning_rate": 2.8173631136451857e-05, "loss": 0.0186, "step": 18500 }, { "epoch": 4.792084632892186, "grad_norm": 0.22359521687030792, "learning_rate": 2.816132555807836e-05, "loss": 0.019, "step": 18510 }, { "epoch": 4.795456461266122, "grad_norm": 0.2423936128616333, "learning_rate": 2.8149016271706816e-05, "loss": 0.0203, "step": 18520 }, { "epoch": 4.798828289640057, "grad_norm": 0.25296926498413086, "learning_rate": 2.8136703282929797e-05, "loss": 0.0202, "step": 18530 }, { "epoch": 4.802200118013993, "grad_norm": 0.2153603732585907, "learning_rate": 2.8124386597341563e-05, "loss": 0.0179, "step": 18540 }, { "epoch": 4.8055719463879285, "grad_norm": 0.25383952260017395, "learning_rate": 2.8112066220538037e-05, "loss": 0.0185, "step": 18550 }, { "epoch": 4.808943774761865, "grad_norm": 0.2132316380739212, "learning_rate": 2.8099742158116843e-05, "loss": 0.0175, "step": 18560 }, { "epoch": 4.812315603135801, "grad_norm": 0.24596697092056274, "learning_rate": 2.8087414415677265e-05, "loss": 0.018, "step": 18570 }, { "epoch": 4.815687431509736, "grad_norm": 0.2689269185066223, "learning_rate": 2.807508299882026e-05, "loss": 0.0177, "step": 18580 }, { "epoch": 4.819059259883672, "grad_norm": 0.2696880102157593, "learning_rate": 2.8062747913148453e-05, "loss": 0.0206, "step": 18590 }, { "epoch": 4.822431088257607, "grad_norm": 0.24183709919452667, "learning_rate": 2.805040916426614e-05, "loss": 0.0202, "step": 18600 }, { "epoch": 4.8258029166315435, "grad_norm": 0.24690236151218414, "learning_rate": 2.803806675777929e-05, "loss": 0.0184, "step": 18610 }, { "epoch": 4.82917474500548, "grad_norm": 0.24896670877933502, "learning_rate": 2.8025720699295503e-05, "loss": 0.0166, "step": 18620 }, { "epoch": 4.832546573379415, "grad_norm": 0.2462640404701233, "learning_rate": 2.801337099442408e-05, "loss": 0.0187, "step": 18630 }, { "epoch": 4.835918401753351, "grad_norm": 0.22907690703868866, "learning_rate": 2.800101764877595e-05, "loss": 0.0189, "step": 18640 }, { "epoch": 4.839290230127286, "grad_norm": 0.24833475053310394, "learning_rate": 2.79886606679637e-05, "loss": 0.018, "step": 18650 }, { "epoch": 4.842662058501222, "grad_norm": 0.25465553998947144, "learning_rate": 2.7976300057601576e-05, "loss": 0.0191, "step": 18660 }, { "epoch": 4.8460338868751585, "grad_norm": 0.2279806286096573, "learning_rate": 2.7963935823305468e-05, "loss": 0.0177, "step": 18670 }, { "epoch": 4.849405715249094, "grad_norm": 0.22825105488300323, "learning_rate": 2.7951567970692912e-05, "loss": 0.019, "step": 18680 }, { "epoch": 4.85277754362303, "grad_norm": 0.27101466059684753, "learning_rate": 2.79391965053831e-05, "loss": 0.0182, "step": 18690 }, { "epoch": 4.856149371996965, "grad_norm": 0.2463192641735077, "learning_rate": 2.7926821432996845e-05, "loss": 0.0212, "step": 18700 }, { "epoch": 4.859521200370901, "grad_norm": 0.24148598313331604, "learning_rate": 2.791444275915661e-05, "loss": 0.0193, "step": 18710 }, { "epoch": 4.862893028744837, "grad_norm": 0.2546135485172272, "learning_rate": 2.7902060489486502e-05, "loss": 0.0185, "step": 18720 }, { "epoch": 4.8662648571187725, "grad_norm": 0.26038461923599243, "learning_rate": 2.7889674629612238e-05, "loss": 0.0194, "step": 18730 }, { "epoch": 4.869636685492709, "grad_norm": 0.2719481587409973, "learning_rate": 2.78772851851612e-05, "loss": 0.0184, "step": 18740 }, { "epoch": 4.873008513866644, "grad_norm": 0.23753610253334045, "learning_rate": 2.7864892161762367e-05, "loss": 0.0198, "step": 18750 }, { "epoch": 4.87638034224058, "grad_norm": 0.24012060463428497, "learning_rate": 2.7852495565046363e-05, "loss": 0.0181, "step": 18760 }, { "epoch": 4.879752170614516, "grad_norm": 0.2063847780227661, "learning_rate": 2.784009540064542e-05, "loss": 0.0181, "step": 18770 }, { "epoch": 4.883123998988451, "grad_norm": 0.2662946581840515, "learning_rate": 2.7827691674193408e-05, "loss": 0.0206, "step": 18780 }, { "epoch": 4.8864958273623875, "grad_norm": 0.2130303829908371, "learning_rate": 2.781528439132581e-05, "loss": 0.0206, "step": 18790 }, { "epoch": 4.889867655736323, "grad_norm": 0.24531124532222748, "learning_rate": 2.780287355767972e-05, "loss": 0.019, "step": 18800 }, { "epoch": 4.893239484110259, "grad_norm": 0.24486882984638214, "learning_rate": 2.7790459178893852e-05, "loss": 0.0183, "step": 18810 }, { "epoch": 4.896611312484195, "grad_norm": 0.23218926787376404, "learning_rate": 2.7778041260608514e-05, "loss": 0.0201, "step": 18820 }, { "epoch": 4.89998314085813, "grad_norm": 0.2808361053466797, "learning_rate": 2.7765619808465642e-05, "loss": 0.0165, "step": 18830 }, { "epoch": 4.903354969232066, "grad_norm": 0.22755496203899384, "learning_rate": 2.775319482810878e-05, "loss": 0.019, "step": 18840 }, { "epoch": 4.906726797606002, "grad_norm": 0.18933406472206116, "learning_rate": 2.774076632518305e-05, "loss": 0.018, "step": 18850 }, { "epoch": 4.910098625979938, "grad_norm": 0.2426774650812149, "learning_rate": 2.77283343053352e-05, "loss": 0.0188, "step": 18860 }, { "epoch": 4.913470454353874, "grad_norm": 0.2520374059677124, "learning_rate": 2.771589877421356e-05, "loss": 0.0181, "step": 18870 }, { "epoch": 4.916842282727809, "grad_norm": 0.23606373369693756, "learning_rate": 2.7703459737468058e-05, "loss": 0.0184, "step": 18880 }, { "epoch": 4.920214111101745, "grad_norm": 0.2664554715156555, "learning_rate": 2.769101720075023e-05, "loss": 0.016, "step": 18890 }, { "epoch": 4.9235859394756805, "grad_norm": 0.2304772138595581, "learning_rate": 2.7678571169713175e-05, "loss": 0.0176, "step": 18900 }, { "epoch": 4.926957767849617, "grad_norm": 0.2606422007083893, "learning_rate": 2.76661216500116e-05, "loss": 0.0203, "step": 18910 }, { "epoch": 4.930329596223553, "grad_norm": 0.28550800681114197, "learning_rate": 2.7653668647301797e-05, "loss": 0.0184, "step": 18920 }, { "epoch": 4.933701424597488, "grad_norm": 0.24595999717712402, "learning_rate": 2.7641212167241633e-05, "loss": 0.0203, "step": 18930 }, { "epoch": 4.937073252971424, "grad_norm": 0.20649367570877075, "learning_rate": 2.762875221549055e-05, "loss": 0.0167, "step": 18940 }, { "epoch": 4.940445081345359, "grad_norm": 0.20479141175746918, "learning_rate": 2.7616288797709577e-05, "loss": 0.0205, "step": 18950 }, { "epoch": 4.943816909719295, "grad_norm": 0.2306610494852066, "learning_rate": 2.7603821919561315e-05, "loss": 0.0173, "step": 18960 }, { "epoch": 4.947188738093231, "grad_norm": 0.20074597001075745, "learning_rate": 2.7591351586709945e-05, "loss": 0.0187, "step": 18970 }, { "epoch": 4.950560566467167, "grad_norm": 0.26092153787612915, "learning_rate": 2.7578877804821194e-05, "loss": 0.0194, "step": 18980 }, { "epoch": 4.953932394841103, "grad_norm": 0.2369234263896942, "learning_rate": 2.7566400579562386e-05, "loss": 0.0183, "step": 18990 }, { "epoch": 4.957304223215038, "grad_norm": 0.26205873489379883, "learning_rate": 2.7553919916602388e-05, "loss": 0.0185, "step": 19000 }, { "epoch": 4.960676051588974, "grad_norm": 0.25922486186027527, "learning_rate": 2.7541435821611638e-05, "loss": 0.0203, "step": 19010 }, { "epoch": 4.9640478799629095, "grad_norm": 0.3143695890903473, "learning_rate": 2.7528948300262132e-05, "loss": 0.02, "step": 19020 }, { "epoch": 4.967419708336846, "grad_norm": 0.2248535305261612, "learning_rate": 2.7516457358227417e-05, "loss": 0.0192, "step": 19030 }, { "epoch": 4.970791536710782, "grad_norm": 0.22288578748703003, "learning_rate": 2.7503963001182602e-05, "loss": 0.0184, "step": 19040 }, { "epoch": 4.974163365084717, "grad_norm": 0.2738927900791168, "learning_rate": 2.749146523480435e-05, "loss": 0.0175, "step": 19050 }, { "epoch": 4.977535193458653, "grad_norm": 0.2193661481142044, "learning_rate": 2.747896406477086e-05, "loss": 0.0184, "step": 19060 }, { "epoch": 4.980907021832588, "grad_norm": 0.2397335171699524, "learning_rate": 2.7466459496761886e-05, "loss": 0.0177, "step": 19070 }, { "epoch": 4.9842788502065245, "grad_norm": 0.20521509647369385, "learning_rate": 2.7453951536458726e-05, "loss": 0.0193, "step": 19080 }, { "epoch": 4.987650678580461, "grad_norm": 0.2836483418941498, "learning_rate": 2.7441440189544226e-05, "loss": 0.0194, "step": 19090 }, { "epoch": 4.991022506954396, "grad_norm": 0.25265297293663025, "learning_rate": 2.7428925461702754e-05, "loss": 0.0174, "step": 19100 }, { "epoch": 4.994394335328332, "grad_norm": 0.23713137209415436, "learning_rate": 2.7416407358620224e-05, "loss": 0.0181, "step": 19110 }, { "epoch": 4.997766163702267, "grad_norm": 0.2274797260761261, "learning_rate": 2.7403885885984082e-05, "loss": 0.0179, "step": 19120 }, { "epoch": 5.001137992076203, "grad_norm": 0.24704012274742126, "learning_rate": 2.739136104948331e-05, "loss": 0.0186, "step": 19130 }, { "epoch": 5.004509820450139, "grad_norm": 0.24430058896541595, "learning_rate": 2.7378832854808405e-05, "loss": 0.0155, "step": 19140 }, { "epoch": 5.007881648824075, "grad_norm": 0.21568675339221954, "learning_rate": 2.736630130765141e-05, "loss": 0.0173, "step": 19150 }, { "epoch": 5.011253477198011, "grad_norm": 0.2656593918800354, "learning_rate": 2.7353766413705866e-05, "loss": 0.0195, "step": 19160 }, { "epoch": 5.014625305571946, "grad_norm": 0.23040729761123657, "learning_rate": 2.734122817866686e-05, "loss": 0.0186, "step": 19170 }, { "epoch": 5.017997133945882, "grad_norm": 0.2545837461948395, "learning_rate": 2.732868660823098e-05, "loss": 0.0213, "step": 19180 }, { "epoch": 5.021368962319817, "grad_norm": 0.22912153601646423, "learning_rate": 2.731614170809634e-05, "loss": 0.0187, "step": 19190 }, { "epoch": 5.024740790693754, "grad_norm": 0.23088446259498596, "learning_rate": 2.730359348396255e-05, "loss": 0.0168, "step": 19200 }, { "epoch": 5.02811261906769, "grad_norm": 0.26153862476348877, "learning_rate": 2.7291041941530747e-05, "loss": 0.0178, "step": 19210 }, { "epoch": 5.031484447441625, "grad_norm": 0.20643839240074158, "learning_rate": 2.7278487086503576e-05, "loss": 0.0193, "step": 19220 }, { "epoch": 5.034856275815561, "grad_norm": 0.22487160563468933, "learning_rate": 2.7265928924585172e-05, "loss": 0.0177, "step": 19230 }, { "epoch": 5.038228104189496, "grad_norm": 0.23614230751991272, "learning_rate": 2.725336746148119e-05, "loss": 0.0168, "step": 19240 }, { "epoch": 5.041599932563432, "grad_norm": 0.2734718918800354, "learning_rate": 2.724080270289877e-05, "loss": 0.0176, "step": 19250 }, { "epoch": 5.0449717609373685, "grad_norm": 0.22847409546375275, "learning_rate": 2.7228234654546554e-05, "loss": 0.0173, "step": 19260 }, { "epoch": 5.048343589311304, "grad_norm": 0.24270017445087433, "learning_rate": 2.721566332213469e-05, "loss": 0.0182, "step": 19270 }, { "epoch": 5.05171541768524, "grad_norm": 0.2439003586769104, "learning_rate": 2.7203088711374808e-05, "loss": 0.0166, "step": 19280 }, { "epoch": 5.055087246059175, "grad_norm": 0.19013996422290802, "learning_rate": 2.7190510827980016e-05, "loss": 0.0188, "step": 19290 }, { "epoch": 5.058459074433111, "grad_norm": 0.24405580759048462, "learning_rate": 2.7177929677664927e-05, "loss": 0.0173, "step": 19300 }, { "epoch": 5.061830902807047, "grad_norm": 0.2211460918188095, "learning_rate": 2.7165345266145635e-05, "loss": 0.0168, "step": 19310 }, { "epoch": 5.065202731180983, "grad_norm": 0.20776285231113434, "learning_rate": 2.7152757599139713e-05, "loss": 0.015, "step": 19320 }, { "epoch": 5.068574559554919, "grad_norm": 0.1957291215658188, "learning_rate": 2.714016668236621e-05, "loss": 0.0194, "step": 19330 }, { "epoch": 5.071946387928854, "grad_norm": 0.2540283203125, "learning_rate": 2.7127572521545658e-05, "loss": 0.0175, "step": 19340 }, { "epoch": 5.07531821630279, "grad_norm": 0.24939708411693573, "learning_rate": 2.7114975122400054e-05, "loss": 0.0207, "step": 19350 }, { "epoch": 5.078690044676726, "grad_norm": 0.2639811933040619, "learning_rate": 2.7102374490652876e-05, "loss": 0.0201, "step": 19360 }, { "epoch": 5.0820618730506615, "grad_norm": 0.20437470078468323, "learning_rate": 2.708977063202906e-05, "loss": 0.0182, "step": 19370 }, { "epoch": 5.085433701424598, "grad_norm": 0.20920786261558533, "learning_rate": 2.707716355225502e-05, "loss": 0.0176, "step": 19380 }, { "epoch": 5.088805529798533, "grad_norm": 0.23277322947978973, "learning_rate": 2.7064553257058636e-05, "loss": 0.0181, "step": 19390 }, { "epoch": 5.092177358172469, "grad_norm": 0.22038480639457703, "learning_rate": 2.7051939752169222e-05, "loss": 0.0174, "step": 19400 }, { "epoch": 5.095549186546405, "grad_norm": 0.24343299865722656, "learning_rate": 2.7039323043317583e-05, "loss": 0.0167, "step": 19410 }, { "epoch": 5.09892101492034, "grad_norm": 0.24059222638607025, "learning_rate": 2.702670313623596e-05, "loss": 0.0172, "step": 19420 }, { "epoch": 5.1022928432942765, "grad_norm": 0.21403351426124573, "learning_rate": 2.701408003665805e-05, "loss": 0.016, "step": 19430 }, { "epoch": 5.105664671668212, "grad_norm": 0.27803653478622437, "learning_rate": 2.7001453750319014e-05, "loss": 0.0189, "step": 19440 }, { "epoch": 5.109036500042148, "grad_norm": 0.2149626761674881, "learning_rate": 2.698882428295544e-05, "loss": 0.0181, "step": 19450 }, { "epoch": 5.112408328416084, "grad_norm": 0.2034926563501358, "learning_rate": 2.6976191640305378e-05, "loss": 0.0188, "step": 19460 }, { "epoch": 5.115780156790019, "grad_norm": 0.250324547290802, "learning_rate": 2.696355582810831e-05, "loss": 0.017, "step": 19470 }, { "epoch": 5.119151985163955, "grad_norm": 0.228510782122612, "learning_rate": 2.6950916852105164e-05, "loss": 0.0158, "step": 19480 }, { "epoch": 5.1225238135378905, "grad_norm": 0.20537246763706207, "learning_rate": 2.69382747180383e-05, "loss": 0.0157, "step": 19490 }, { "epoch": 5.125895641911827, "grad_norm": 0.22593852877616882, "learning_rate": 2.692562943165152e-05, "loss": 0.017, "step": 19500 }, { "epoch": 5.129267470285763, "grad_norm": 0.2446858435869217, "learning_rate": 2.6912980998690056e-05, "loss": 0.0184, "step": 19510 }, { "epoch": 5.000730820613796, "grad_norm": 0.22124513983726501, "learning_rate": 2.6900329424900563e-05, "loss": 0.0176, "step": 19520 }, { "epoch": 5.003709961969399, "grad_norm": 0.24168118834495544, "learning_rate": 2.6887674716031136e-05, "loss": 0.0221, "step": 19530 }, { "epoch": 5.006689103325001, "grad_norm": 0.2069394290447235, "learning_rate": 2.687501687783128e-05, "loss": 0.02, "step": 19540 }, { "epoch": 5.009668244680603, "grad_norm": 0.20717620849609375, "learning_rate": 2.6862355916051925e-05, "loss": 0.0181, "step": 19550 }, { "epoch": 5.012647386036206, "grad_norm": 0.1857452392578125, "learning_rate": 2.6849691836445432e-05, "loss": 0.0203, "step": 19560 }, { "epoch": 5.015626527391809, "grad_norm": 0.23387710750102997, "learning_rate": 2.6837024644765572e-05, "loss": 0.0197, "step": 19570 }, { "epoch": 5.01860566874741, "grad_norm": 0.23339876532554626, "learning_rate": 2.6824354346767517e-05, "loss": 0.0168, "step": 19580 }, { "epoch": 5.021584810103013, "grad_norm": 0.28826475143432617, "learning_rate": 2.6811680948207866e-05, "loss": 0.0201, "step": 19590 }, { "epoch": 5.024563951458616, "grad_norm": 0.18750245869159698, "learning_rate": 2.6799004454844624e-05, "loss": 0.0157, "step": 19600 }, { "epoch": 5.027543092814218, "grad_norm": 0.1882340908050537, "learning_rate": 2.6786324872437197e-05, "loss": 0.0188, "step": 19610 }, { "epoch": 5.030522234169821, "grad_norm": 0.20195451378822327, "learning_rate": 2.6773642206746403e-05, "loss": 0.018, "step": 19620 }, { "epoch": 5.0335013755254225, "grad_norm": 0.20809027552604675, "learning_rate": 2.6760956463534453e-05, "loss": 0.019, "step": 19630 }, { "epoch": 5.036480516881025, "grad_norm": 0.28460267186164856, "learning_rate": 2.6748267648564955e-05, "loss": 0.0187, "step": 19640 }, { "epoch": 5.039459658236628, "grad_norm": 0.2244839072227478, "learning_rate": 2.673557576760292e-05, "loss": 0.0176, "step": 19650 }, { "epoch": 5.04243879959223, "grad_norm": 0.18880613148212433, "learning_rate": 2.6722880826414742e-05, "loss": 0.0204, "step": 19660 }, { "epoch": 5.045417940947832, "grad_norm": 0.16829293966293335, "learning_rate": 2.671018283076823e-05, "loss": 0.0166, "step": 19670 }, { "epoch": 5.048397082303435, "grad_norm": 0.19682909548282623, "learning_rate": 2.6697481786432535e-05, "loss": 0.0203, "step": 19680 }, { "epoch": 5.051376223659037, "grad_norm": 0.1722119301557541, "learning_rate": 2.668477769917824e-05, "loss": 0.0196, "step": 19690 }, { "epoch": 5.05435536501464, "grad_norm": 0.2024022936820984, "learning_rate": 2.667207057477729e-05, "loss": 0.0211, "step": 19700 }, { "epoch": 5.057334506370242, "grad_norm": 0.19601456820964813, "learning_rate": 2.6659360419003006e-05, "loss": 0.0173, "step": 19710 }, { "epoch": 5.060313647725844, "grad_norm": 0.19464105367660522, "learning_rate": 2.664664723763009e-05, "loss": 0.0205, "step": 19720 }, { "epoch": 5.063292789081447, "grad_norm": 0.23889558017253876, "learning_rate": 2.6633931036434618e-05, "loss": 0.0183, "step": 19730 }, { "epoch": 5.06627193043705, "grad_norm": 0.24776266515254974, "learning_rate": 2.6621211821194043e-05, "loss": 0.0222, "step": 19740 }, { "epoch": 5.069251071792651, "grad_norm": 0.20345711708068848, "learning_rate": 2.660848959768719e-05, "loss": 0.02, "step": 19750 }, { "epoch": 5.072230213148254, "grad_norm": 0.1981542557477951, "learning_rate": 2.659576437169424e-05, "loss": 0.0199, "step": 19760 }, { "epoch": 5.075209354503857, "grad_norm": 0.2311795949935913, "learning_rate": 2.6583036148996742e-05, "loss": 0.0211, "step": 19770 }, { "epoch": 5.078188495859459, "grad_norm": 0.2115679383277893, "learning_rate": 2.6570304935377606e-05, "loss": 0.0183, "step": 19780 }, { "epoch": 5.081167637215062, "grad_norm": 0.2868432104587555, "learning_rate": 2.6557570736621107e-05, "loss": 0.0231, "step": 19790 }, { "epoch": 5.0841467785706635, "grad_norm": 0.21249128878116608, "learning_rate": 2.654483355851287e-05, "loss": 0.0224, "step": 19800 }, { "epoch": 5.087125919926266, "grad_norm": 0.25142914056777954, "learning_rate": 2.6532093406839873e-05, "loss": 0.0219, "step": 19810 }, { "epoch": 5.090105061281869, "grad_norm": 0.18741977214813232, "learning_rate": 2.6519350287390455e-05, "loss": 0.0194, "step": 19820 }, { "epoch": 5.093084202637471, "grad_norm": 0.27068814635276794, "learning_rate": 2.650660420595429e-05, "loss": 0.0237, "step": 19830 }, { "epoch": 5.096063343993073, "grad_norm": 0.17731668055057526, "learning_rate": 2.6493855168322405e-05, "loss": 0.0178, "step": 19840 }, { "epoch": 5.099042485348676, "grad_norm": 0.2347021847963333, "learning_rate": 2.648110318028717e-05, "loss": 0.0187, "step": 19850 }, { "epoch": 5.102021626704278, "grad_norm": 0.2854408919811249, "learning_rate": 2.6468348247642283e-05, "loss": 0.0182, "step": 19860 }, { "epoch": 5.105000768059881, "grad_norm": 0.23762382566928864, "learning_rate": 2.6455590376182806e-05, "loss": 0.0189, "step": 19870 }, { "epoch": 5.107979909415483, "grad_norm": 0.2182045578956604, "learning_rate": 2.6442829571705114e-05, "loss": 0.0182, "step": 19880 }, { "epoch": 5.110959050771085, "grad_norm": 0.1992790251970291, "learning_rate": 2.643006584000693e-05, "loss": 0.018, "step": 19890 }, { "epoch": 5.113938192126688, "grad_norm": 0.21208135783672333, "learning_rate": 2.6417299186887285e-05, "loss": 0.0173, "step": 19900 }, { "epoch": 5.116917333482291, "grad_norm": 0.2490251213312149, "learning_rate": 2.640452961814656e-05, "loss": 0.0196, "step": 19910 }, { "epoch": 5.119896474837893, "grad_norm": 0.23001495003700256, "learning_rate": 2.639175713958645e-05, "loss": 0.0189, "step": 19920 }, { "epoch": 5.122875616193495, "grad_norm": 0.1747155338525772, "learning_rate": 2.6378981757009978e-05, "loss": 0.0188, "step": 19930 }, { "epoch": 5.125854757549098, "grad_norm": 0.2475578933954239, "learning_rate": 2.636620347622148e-05, "loss": 0.0223, "step": 19940 }, { "epoch": 5.1288338989047, "grad_norm": 0.25662654638290405, "learning_rate": 2.6353422303026594e-05, "loss": 0.0186, "step": 19950 }, { "epoch": 5.131813040260303, "grad_norm": 0.20675553381443024, "learning_rate": 2.6340638243232315e-05, "loss": 0.0197, "step": 19960 }, { "epoch": 5.134792181615905, "grad_norm": 0.24988260865211487, "learning_rate": 2.6327851302646913e-05, "loss": 0.0224, "step": 19970 }, { "epoch": 5.137771322971507, "grad_norm": 0.21429187059402466, "learning_rate": 2.6315061487079976e-05, "loss": 0.0205, "step": 19980 }, { "epoch": 5.14075046432711, "grad_norm": 0.2442208081483841, "learning_rate": 2.6302268802342394e-05, "loss": 0.0179, "step": 19990 }, { "epoch": 5.143729605682712, "grad_norm": 0.23651088774204254, "learning_rate": 2.6289473254246384e-05, "loss": 0.0198, "step": 20000 }, { "epoch": 5.146708747038314, "grad_norm": 0.2101641744375229, "learning_rate": 2.6276674848605428e-05, "loss": 0.0214, "step": 20010 }, { "epoch": 5.149687888393917, "grad_norm": 0.2086223065853119, "learning_rate": 2.626387359123433e-05, "loss": 0.0168, "step": 20020 }, { "epoch": 5.152667029749519, "grad_norm": 0.2130655199289322, "learning_rate": 2.625106948794919e-05, "loss": 0.0189, "step": 20030 }, { "epoch": 5.155646171105122, "grad_norm": 0.2284734696149826, "learning_rate": 2.623826254456739e-05, "loss": 0.018, "step": 20040 }, { "epoch": 5.158625312460725, "grad_norm": 0.2123996466398239, "learning_rate": 2.622545276690761e-05, "loss": 0.0212, "step": 20050 }, { "epoch": 5.161604453816326, "grad_norm": 0.23687440156936646, "learning_rate": 2.621264016078981e-05, "loss": 0.0197, "step": 20060 }, { "epoch": 5.164583595171929, "grad_norm": 0.20010609924793243, "learning_rate": 2.619982473203525e-05, "loss": 0.0181, "step": 20070 }, { "epoch": 5.167562736527532, "grad_norm": 0.19987130165100098, "learning_rate": 2.6187006486466443e-05, "loss": 0.0187, "step": 20080 }, { "epoch": 5.170541877883134, "grad_norm": 0.20724903047084808, "learning_rate": 2.6174185429907222e-05, "loss": 0.02, "step": 20090 }, { "epoch": 5.173521019238736, "grad_norm": 0.16666123270988464, "learning_rate": 2.616136156818267e-05, "loss": 0.0185, "step": 20100 }, { "epoch": 5.176500160594339, "grad_norm": 0.2230112999677658, "learning_rate": 2.614853490711915e-05, "loss": 0.02, "step": 20110 }, { "epoch": 5.179479301949941, "grad_norm": 0.18592500686645508, "learning_rate": 2.61357054525443e-05, "loss": 0.0201, "step": 20120 }, { "epoch": 5.182458443305544, "grad_norm": 0.19092586636543274, "learning_rate": 2.6122873210287018e-05, "loss": 0.0165, "step": 20130 }, { "epoch": 5.185437584661146, "grad_norm": 0.15422183275222778, "learning_rate": 2.6110038186177486e-05, "loss": 0.0198, "step": 20140 }, { "epoch": 5.188416726016748, "grad_norm": 0.2983990013599396, "learning_rate": 2.6097200386047136e-05, "loss": 0.0171, "step": 20150 }, { "epoch": 5.191395867372351, "grad_norm": 0.2050776183605194, "learning_rate": 2.6084359815728658e-05, "loss": 0.0217, "step": 20160 }, { "epoch": 5.194375008727953, "grad_norm": 0.21341703832149506, "learning_rate": 2.6071516481056018e-05, "loss": 0.0198, "step": 20170 }, { "epoch": 5.197354150083555, "grad_norm": 0.1904391199350357, "learning_rate": 2.6058670387864423e-05, "loss": 0.0196, "step": 20180 }, { "epoch": 5.200333291439158, "grad_norm": 0.24104022979736328, "learning_rate": 2.6045821541990336e-05, "loss": 0.0167, "step": 20190 }, { "epoch": 5.20331243279476, "grad_norm": 0.21795357763767242, "learning_rate": 2.6032969949271472e-05, "loss": 0.0173, "step": 20200 }, { "epoch": 5.206291574150363, "grad_norm": 0.2408769577741623, "learning_rate": 2.6020115615546794e-05, "loss": 0.0183, "step": 20210 }, { "epoch": 5.209270715505966, "grad_norm": 0.17753422260284424, "learning_rate": 2.6007258546656517e-05, "loss": 0.0172, "step": 20220 }, { "epoch": 5.212249856861567, "grad_norm": 0.2160950005054474, "learning_rate": 2.599439874844208e-05, "loss": 0.0177, "step": 20230 }, { "epoch": 5.21522899821717, "grad_norm": 0.23319309949874878, "learning_rate": 2.598153622674619e-05, "loss": 0.0161, "step": 20240 }, { "epoch": 5.218208139572773, "grad_norm": 0.20214416086673737, "learning_rate": 2.5968670987412767e-05, "loss": 0.0202, "step": 20250 }, { "epoch": 5.221187280928375, "grad_norm": 0.22858619689941406, "learning_rate": 2.595580303628697e-05, "loss": 0.0174, "step": 20260 }, { "epoch": 5.224166422283977, "grad_norm": 0.23673215508460999, "learning_rate": 2.5942932379215203e-05, "loss": 0.0191, "step": 20270 }, { "epoch": 5.22714556363958, "grad_norm": 0.24917764961719513, "learning_rate": 2.5930059022045092e-05, "loss": 0.0167, "step": 20280 }, { "epoch": 5.230124704995182, "grad_norm": 0.21031804382801056, "learning_rate": 2.5917182970625484e-05, "loss": 0.022, "step": 20290 }, { "epoch": 5.233103846350785, "grad_norm": 0.1969963014125824, "learning_rate": 2.590430423080646e-05, "loss": 0.0212, "step": 20300 }, { "epoch": 5.236082987706387, "grad_norm": 0.188373401761055, "learning_rate": 2.5891422808439305e-05, "loss": 0.0174, "step": 20310 }, { "epoch": 5.239062129061989, "grad_norm": 0.24723003804683685, "learning_rate": 2.5878538709376544e-05, "loss": 0.0174, "step": 20320 }, { "epoch": 5.242041270417592, "grad_norm": 0.2471388280391693, "learning_rate": 2.586565193947191e-05, "loss": 0.0188, "step": 20330 }, { "epoch": 5.2450204117731944, "grad_norm": 0.20233100652694702, "learning_rate": 2.5852762504580346e-05, "loss": 0.0165, "step": 20340 }, { "epoch": 5.247999553128797, "grad_norm": 0.20060746371746063, "learning_rate": 2.583987041055801e-05, "loss": 0.017, "step": 20350 }, { "epoch": 5.250978694484399, "grad_norm": 0.21093881130218506, "learning_rate": 2.5826975663262265e-05, "loss": 0.0205, "step": 20360 }, { "epoch": 5.253957835840001, "grad_norm": 0.20225584506988525, "learning_rate": 2.5814078268551685e-05, "loss": 0.0183, "step": 20370 }, { "epoch": 5.256936977195604, "grad_norm": 0.19970782101154327, "learning_rate": 2.5801178232286036e-05, "loss": 0.0167, "step": 20380 }, { "epoch": 5.259916118551207, "grad_norm": 0.2507387101650238, "learning_rate": 2.5788275560326294e-05, "loss": 0.017, "step": 20390 }, { "epoch": 5.262895259906808, "grad_norm": 0.21313156187534332, "learning_rate": 2.577537025853464e-05, "loss": 0.0211, "step": 20400 }, { "epoch": 5.265874401262411, "grad_norm": 0.20348569750785828, "learning_rate": 2.5762462332774422e-05, "loss": 0.021, "step": 20410 }, { "epoch": 5.268853542618014, "grad_norm": 0.24487127363681793, "learning_rate": 2.574955178891021e-05, "loss": 0.0227, "step": 20420 }, { "epoch": 5.271832683973616, "grad_norm": 0.21400628983974457, "learning_rate": 2.573663863280774e-05, "loss": 0.0158, "step": 20430 }, { "epoch": 5.274811825329218, "grad_norm": 0.21882222592830658, "learning_rate": 2.5723722870333955e-05, "loss": 0.0185, "step": 20440 }, { "epoch": 5.277790966684821, "grad_norm": 0.351329505443573, "learning_rate": 2.5710804507356973e-05, "loss": 0.0193, "step": 20450 }, { "epoch": 5.280770108040423, "grad_norm": 0.20631736516952515, "learning_rate": 2.569788354974609e-05, "loss": 0.0205, "step": 20460 }, { "epoch": 5.283749249396026, "grad_norm": 0.2229851335287094, "learning_rate": 2.568496000337179e-05, "loss": 0.0173, "step": 20470 }, { "epoch": 5.2867283907516285, "grad_norm": 0.23499158024787903, "learning_rate": 2.5672033874105714e-05, "loss": 0.0188, "step": 20480 }, { "epoch": 5.28970753210723, "grad_norm": 0.21765021979808807, "learning_rate": 2.5659105167820703e-05, "loss": 0.021, "step": 20490 }, { "epoch": 5.292686673462833, "grad_norm": 0.21503254771232605, "learning_rate": 2.5646173890390756e-05, "loss": 0.0196, "step": 20500 }, { "epoch": 5.2956658148184355, "grad_norm": 0.21782803535461426, "learning_rate": 2.563324004769103e-05, "loss": 0.0177, "step": 20510 }, { "epoch": 5.298644956174038, "grad_norm": 0.19380000233650208, "learning_rate": 2.562030364559787e-05, "loss": 0.0203, "step": 20520 }, { "epoch": 5.30162409752964, "grad_norm": 0.22888386249542236, "learning_rate": 2.5607364689988766e-05, "loss": 0.019, "step": 20530 }, { "epoch": 5.304603238885242, "grad_norm": 0.353510320186615, "learning_rate": 2.5594423186742372e-05, "loss": 0.0199, "step": 20540 }, { "epoch": 5.307582380240845, "grad_norm": 0.30894705653190613, "learning_rate": 2.55814791417385e-05, "loss": 0.02, "step": 20550 }, { "epoch": 5.310561521596448, "grad_norm": 0.1973877102136612, "learning_rate": 2.5568532560858118e-05, "loss": 0.023, "step": 20560 }, { "epoch": 5.313540662952049, "grad_norm": 0.20646296441555023, "learning_rate": 2.555558344998335e-05, "loss": 0.0231, "step": 20570 }, { "epoch": 5.316519804307652, "grad_norm": 0.17725211381912231, "learning_rate": 2.5542631814997466e-05, "loss": 0.0214, "step": 20580 }, { "epoch": 5.319498945663255, "grad_norm": 0.2485491931438446, "learning_rate": 2.5529677661784878e-05, "loss": 0.0191, "step": 20590 }, { "epoch": 5.322478087018857, "grad_norm": 0.22124578058719635, "learning_rate": 2.5516720996231147e-05, "loss": 0.0189, "step": 20600 }, { "epoch": 5.325457228374459, "grad_norm": 0.2329595685005188, "learning_rate": 2.5503761824222975e-05, "loss": 0.0233, "step": 20610 }, { "epoch": 5.328436369730062, "grad_norm": 0.25131696462631226, "learning_rate": 2.5490800151648208e-05, "loss": 0.021, "step": 20620 }, { "epoch": 5.331415511085664, "grad_norm": 0.27258753776550293, "learning_rate": 2.5477835984395817e-05, "loss": 0.0217, "step": 20630 }, { "epoch": 5.334394652441267, "grad_norm": 0.31886351108551025, "learning_rate": 2.546486932835591e-05, "loss": 0.0187, "step": 20640 }, { "epoch": 5.3373737937968695, "grad_norm": 0.21077263355255127, "learning_rate": 2.5451900189419738e-05, "loss": 0.0207, "step": 20650 }, { "epoch": 5.340352935152471, "grad_norm": 0.2056349664926529, "learning_rate": 2.5438928573479657e-05, "loss": 0.017, "step": 20660 }, { "epoch": 5.343332076508074, "grad_norm": 0.2254790961742401, "learning_rate": 2.5425954486429176e-05, "loss": 0.0188, "step": 20670 }, { "epoch": 5.3463112178636765, "grad_norm": 0.2554806172847748, "learning_rate": 2.54129779341629e-05, "loss": 0.0182, "step": 20680 }, { "epoch": 5.349290359219279, "grad_norm": 0.24541519582271576, "learning_rate": 2.539999892257657e-05, "loss": 0.0157, "step": 20690 }, { "epoch": 5.352269500574881, "grad_norm": 0.18887488543987274, "learning_rate": 2.5387017457567038e-05, "loss": 0.0234, "step": 20700 }, { "epoch": 5.3552486419304834, "grad_norm": 0.2789071500301361, "learning_rate": 2.5374033545032283e-05, "loss": 0.0223, "step": 20710 }, { "epoch": 5.358227783286086, "grad_norm": 0.27655836939811707, "learning_rate": 2.5361047190871375e-05, "loss": 0.0232, "step": 20720 }, { "epoch": 5.361206924641689, "grad_norm": 0.18944162130355835, "learning_rate": 2.534805840098451e-05, "loss": 0.0176, "step": 20730 }, { "epoch": 5.364186065997291, "grad_norm": 0.20149090886116028, "learning_rate": 2.5335067181272983e-05, "loss": 0.0204, "step": 20740 }, { "epoch": 5.367165207352893, "grad_norm": 0.20144148170948029, "learning_rate": 2.5322073537639193e-05, "loss": 0.0187, "step": 20750 }, { "epoch": 5.370144348708496, "grad_norm": 0.22650809586048126, "learning_rate": 2.530907747598666e-05, "loss": 0.0202, "step": 20760 }, { "epoch": 5.373123490064098, "grad_norm": 0.18886767327785492, "learning_rate": 2.529607900221996e-05, "loss": 0.0166, "step": 20770 }, { "epoch": 5.376102631419701, "grad_norm": 0.20368419587612152, "learning_rate": 2.5283078122244796e-05, "loss": 0.0175, "step": 20780 }, { "epoch": 5.379081772775303, "grad_norm": 0.19322335720062256, "learning_rate": 2.5270074841967967e-05, "loss": 0.0183, "step": 20790 }, { "epoch": 5.382060914130905, "grad_norm": 0.2526770830154419, "learning_rate": 2.5257069167297353e-05, "loss": 0.0207, "step": 20800 }, { "epoch": 5.385040055486508, "grad_norm": 0.203384667634964, "learning_rate": 2.5244061104141916e-05, "loss": 0.0217, "step": 20810 }, { "epoch": 5.3880191968421105, "grad_norm": 0.2576720714569092, "learning_rate": 2.5231050658411717e-05, "loss": 0.0205, "step": 20820 }, { "epoch": 5.390998338197712, "grad_norm": 0.20031099021434784, "learning_rate": 2.5218037836017883e-05, "loss": 0.0203, "step": 20830 }, { "epoch": 5.393977479553315, "grad_norm": 0.21334253251552582, "learning_rate": 2.5205022642872637e-05, "loss": 0.019, "step": 20840 }, { "epoch": 5.3969566209089175, "grad_norm": 0.2244141399860382, "learning_rate": 2.5192005084889272e-05, "loss": 0.02, "step": 20850 }, { "epoch": 5.39993576226452, "grad_norm": 0.22002896666526794, "learning_rate": 2.5178985167982154e-05, "loss": 0.0191, "step": 20860 }, { "epoch": 5.402914903620122, "grad_norm": 0.21281297504901886, "learning_rate": 2.516596289806672e-05, "loss": 0.0215, "step": 20870 }, { "epoch": 5.4058940449757245, "grad_norm": 0.21919624507427216, "learning_rate": 2.5152938281059483e-05, "loss": 0.0189, "step": 20880 }, { "epoch": 5.408873186331327, "grad_norm": 0.2174314707517624, "learning_rate": 2.5139911322878012e-05, "loss": 0.0174, "step": 20890 }, { "epoch": 5.41185232768693, "grad_norm": 0.2169213891029358, "learning_rate": 2.5126882029440953e-05, "loss": 0.0186, "step": 20900 }, { "epoch": 5.414831469042532, "grad_norm": 0.24731837213039398, "learning_rate": 2.5113850406667996e-05, "loss": 0.0166, "step": 20910 }, { "epoch": 5.417810610398134, "grad_norm": 0.25800901651382446, "learning_rate": 2.510081646047991e-05, "loss": 0.0199, "step": 20920 }, { "epoch": 5.420789751753737, "grad_norm": 0.260555624961853, "learning_rate": 2.5087780196798503e-05, "loss": 0.0207, "step": 20930 }, { "epoch": 5.423768893109339, "grad_norm": 0.22725388407707214, "learning_rate": 2.507474162154664e-05, "loss": 0.0227, "step": 20940 }, { "epoch": 5.426748034464942, "grad_norm": 0.22360342741012573, "learning_rate": 2.506170074064824e-05, "loss": 0.0189, "step": 20950 }, { "epoch": 5.429727175820544, "grad_norm": 0.2062200903892517, "learning_rate": 2.5048657560028256e-05, "loss": 0.0177, "step": 20960 }, { "epoch": 5.432706317176146, "grad_norm": 0.22473278641700745, "learning_rate": 2.5035612085612714e-05, "loss": 0.0198, "step": 20970 }, { "epoch": 5.435685458531749, "grad_norm": 0.2287294715642929, "learning_rate": 2.502256432332866e-05, "loss": 0.0211, "step": 20980 }, { "epoch": 5.4386645998873515, "grad_norm": 0.18008819222450256, "learning_rate": 2.500951427910418e-05, "loss": 0.0155, "step": 20990 }, { "epoch": 5.441643741242953, "grad_norm": 0.1961473524570465, "learning_rate": 2.49964619588684e-05, "loss": 0.0166, "step": 21000 }, { "epoch": 5.444622882598556, "grad_norm": 0.2201472371816635, "learning_rate": 2.4983407368551494e-05, "loss": 0.0188, "step": 21010 }, { "epoch": 5.4476020239541585, "grad_norm": 0.18927574157714844, "learning_rate": 2.497035051408464e-05, "loss": 0.0164, "step": 21020 }, { "epoch": 5.450581165309761, "grad_norm": 0.20658504962921143, "learning_rate": 2.4957291401400065e-05, "loss": 0.0181, "step": 21030 }, { "epoch": 5.453560306665363, "grad_norm": 0.23681022226810455, "learning_rate": 2.4944230036431015e-05, "loss": 0.018, "step": 21040 }, { "epoch": 5.4565394480209655, "grad_norm": 0.22746481001377106, "learning_rate": 2.4931166425111765e-05, "loss": 0.022, "step": 21050 }, { "epoch": 5.459518589376568, "grad_norm": 0.23896481096744537, "learning_rate": 2.4918100573377606e-05, "loss": 0.0207, "step": 21060 }, { "epoch": 5.462497730732171, "grad_norm": 0.22099269926548004, "learning_rate": 2.4905032487164844e-05, "loss": 0.0172, "step": 21070 }, { "epoch": 5.465476872087773, "grad_norm": 0.23113621771335602, "learning_rate": 2.4891962172410803e-05, "loss": 0.0203, "step": 21080 }, { "epoch": 5.468456013443375, "grad_norm": 0.24825778603553772, "learning_rate": 2.4878889635053818e-05, "loss": 0.0186, "step": 21090 }, { "epoch": 5.471435154798978, "grad_norm": 0.19311337172985077, "learning_rate": 2.4865814881033243e-05, "loss": 0.0177, "step": 21100 }, { "epoch": 5.47441429615458, "grad_norm": 0.18993736803531647, "learning_rate": 2.4852737916289432e-05, "loss": 0.0227, "step": 21110 }, { "epoch": 5.477393437510183, "grad_norm": 0.23083285987377167, "learning_rate": 2.483965874676374e-05, "loss": 0.0224, "step": 21120 }, { "epoch": 5.480372578865785, "grad_norm": 0.19622445106506348, "learning_rate": 2.4826577378398526e-05, "loss": 0.0185, "step": 21130 }, { "epoch": 5.483351720221387, "grad_norm": 0.23498232662677765, "learning_rate": 2.481349381713715e-05, "loss": 0.0167, "step": 21140 }, { "epoch": 5.48633086157699, "grad_norm": 0.278945654630661, "learning_rate": 2.4800408068923965e-05, "loss": 0.018, "step": 21150 }, { "epoch": 5.4893100029325925, "grad_norm": 0.17457830905914307, "learning_rate": 2.4787320139704326e-05, "loss": 0.0209, "step": 21160 }, { "epoch": 5.492289144288195, "grad_norm": 0.24865612387657166, "learning_rate": 2.4774230035424564e-05, "loss": 0.0164, "step": 21170 }, { "epoch": 5.495268285643797, "grad_norm": 0.18730874359607697, "learning_rate": 2.4761137762032012e-05, "loss": 0.018, "step": 21180 }, { "epoch": 5.4982474269993995, "grad_norm": 0.2409316599369049, "learning_rate": 2.474804332547499e-05, "loss": 0.0193, "step": 21190 }, { "epoch": 5.501226568355002, "grad_norm": 0.20427021384239197, "learning_rate": 2.4734946731702786e-05, "loss": 0.0185, "step": 21200 }, { "epoch": 5.504205709710604, "grad_norm": 0.20248211920261383, "learning_rate": 2.472184798666568e-05, "loss": 0.0169, "step": 21210 }, { "epoch": 5.5071848510662065, "grad_norm": 0.24101264774799347, "learning_rate": 2.4708747096314922e-05, "loss": 0.015, "step": 21220 }, { "epoch": 5.510163992421809, "grad_norm": 0.19237981736660004, "learning_rate": 2.469564406660275e-05, "loss": 0.0184, "step": 21230 }, { "epoch": 5.513143133777412, "grad_norm": 0.17479051649570465, "learning_rate": 2.4682538903482357e-05, "loss": 0.0176, "step": 21240 }, { "epoch": 5.516122275133014, "grad_norm": 0.2143075317144394, "learning_rate": 2.4669431612907907e-05, "loss": 0.0197, "step": 21250 }, { "epoch": 5.519101416488616, "grad_norm": 0.19491387903690338, "learning_rate": 2.4656322200834547e-05, "loss": 0.0221, "step": 21260 }, { "epoch": 5.522080557844219, "grad_norm": 0.2281254678964615, "learning_rate": 2.4643210673218377e-05, "loss": 0.0179, "step": 21270 }, { "epoch": 5.525059699199821, "grad_norm": 0.1873811036348343, "learning_rate": 2.4630097036016456e-05, "loss": 0.0202, "step": 21280 }, { "epoch": 5.528038840555424, "grad_norm": 0.1974356770515442, "learning_rate": 2.46169812951868e-05, "loss": 0.0193, "step": 21290 }, { "epoch": 5.531017981911026, "grad_norm": 0.21751587092876434, "learning_rate": 2.4603863456688393e-05, "loss": 0.0183, "step": 21300 }, { "epoch": 5.533997123266628, "grad_norm": 0.21447227895259857, "learning_rate": 2.4590743526481154e-05, "loss": 0.0164, "step": 21310 }, { "epoch": 5.536976264622231, "grad_norm": 0.1859472095966339, "learning_rate": 2.4577621510525972e-05, "loss": 0.0157, "step": 21320 }, { "epoch": 5.5399554059778335, "grad_norm": 0.2001761943101883, "learning_rate": 2.456449741478467e-05, "loss": 0.0177, "step": 21330 }, { "epoch": 5.542934547333436, "grad_norm": 0.29732081294059753, "learning_rate": 2.455137124522002e-05, "loss": 0.0206, "step": 21340 }, { "epoch": 5.545913688689038, "grad_norm": 0.19343514740467072, "learning_rate": 2.453824300779573e-05, "loss": 0.0158, "step": 21350 }, { "epoch": 5.5488928300446405, "grad_norm": 0.22099259495735168, "learning_rate": 2.4525112708476463e-05, "loss": 0.0178, "step": 21360 }, { "epoch": 5.551871971400243, "grad_norm": 0.20886610448360443, "learning_rate": 2.4511980353227803e-05, "loss": 0.0184, "step": 21370 }, { "epoch": 5.554851112755846, "grad_norm": 0.27647680044174194, "learning_rate": 2.4498845948016276e-05, "loss": 0.0196, "step": 21380 }, { "epoch": 5.5578302541114475, "grad_norm": 0.205998957157135, "learning_rate": 2.4485709498809337e-05, "loss": 0.0178, "step": 21390 }, { "epoch": 5.56080939546705, "grad_norm": 0.1798434555530548, "learning_rate": 2.4472571011575373e-05, "loss": 0.0213, "step": 21400 }, { "epoch": 5.563788536822653, "grad_norm": 0.24621230363845825, "learning_rate": 2.4459430492283694e-05, "loss": 0.0205, "step": 21410 }, { "epoch": 5.566767678178255, "grad_norm": 0.2601448595523834, "learning_rate": 2.444628794690453e-05, "loss": 0.0201, "step": 21420 }, { "epoch": 5.569746819533858, "grad_norm": 0.20717591047286987, "learning_rate": 2.443314338140904e-05, "loss": 0.0177, "step": 21430 }, { "epoch": 5.57272596088946, "grad_norm": 0.2132101058959961, "learning_rate": 2.4419996801769284e-05, "loss": 0.0166, "step": 21440 }, { "epoch": 5.575705102245062, "grad_norm": 0.1951812356710434, "learning_rate": 2.440684821395827e-05, "loss": 0.0203, "step": 21450 }, { "epoch": 5.578684243600665, "grad_norm": 0.20292185246944427, "learning_rate": 2.4393697623949878e-05, "loss": 0.0222, "step": 21460 }, { "epoch": 5.581663384956267, "grad_norm": 0.21427802741527557, "learning_rate": 2.438054503771893e-05, "loss": 0.0206, "step": 21470 }, { "epoch": 5.584642526311869, "grad_norm": 0.21059536933898926, "learning_rate": 2.4367390461241135e-05, "loss": 0.019, "step": 21480 }, { "epoch": 5.587621667667472, "grad_norm": 0.24670438468456268, "learning_rate": 2.4354233900493115e-05, "loss": 0.0192, "step": 21490 }, { "epoch": 5.5906008090230745, "grad_norm": 0.228184774518013, "learning_rate": 2.4341075361452392e-05, "loss": 0.0171, "step": 21500 }, { "epoch": 5.593579950378677, "grad_norm": 0.28955164551734924, "learning_rate": 2.4327914850097383e-05, "loss": 0.0163, "step": 21510 }, { "epoch": 5.596559091734279, "grad_norm": 0.24084962904453278, "learning_rate": 2.431475237240741e-05, "loss": 0.018, "step": 21520 }, { "epoch": 5.5995382330898815, "grad_norm": 0.22638435661792755, "learning_rate": 2.430158793436268e-05, "loss": 0.0183, "step": 21530 }, { "epoch": 5.602517374445484, "grad_norm": 0.1989947408437729, "learning_rate": 2.4288421541944297e-05, "loss": 0.0161, "step": 21540 }, { "epoch": 5.605496515801087, "grad_norm": 0.28115466237068176, "learning_rate": 2.427525320113425e-05, "loss": 0.0195, "step": 21550 }, { "epoch": 5.6084756571566885, "grad_norm": 0.19656367599964142, "learning_rate": 2.426208291791541e-05, "loss": 0.0167, "step": 21560 }, { "epoch": 5.611454798512291, "grad_norm": 0.2094409316778183, "learning_rate": 2.4248910698271527e-05, "loss": 0.0166, "step": 21570 }, { "epoch": 5.614433939867894, "grad_norm": 0.18330973386764526, "learning_rate": 2.4235736548187253e-05, "loss": 0.0163, "step": 21580 }, { "epoch": 5.617413081223496, "grad_norm": 0.2599663734436035, "learning_rate": 2.4222560473648096e-05, "loss": 0.0156, "step": 21590 }, { "epoch": 5.620392222579099, "grad_norm": 0.19073952734470367, "learning_rate": 2.4209382480640433e-05, "loss": 0.0189, "step": 21600 }, { "epoch": 5.623371363934701, "grad_norm": 0.1978006660938263, "learning_rate": 2.4196202575151532e-05, "loss": 0.0166, "step": 21610 }, { "epoch": 5.626350505290303, "grad_norm": 0.24230985343456268, "learning_rate": 2.4183020763169516e-05, "loss": 0.0186, "step": 21620 }, { "epoch": 5.629329646645906, "grad_norm": 0.20139309763908386, "learning_rate": 2.416983705068339e-05, "loss": 0.0182, "step": 21630 }, { "epoch": 5.632308788001508, "grad_norm": 0.18129126727581024, "learning_rate": 2.4156651443683005e-05, "loss": 0.0186, "step": 21640 }, { "epoch": 5.63528792935711, "grad_norm": 0.20581819117069244, "learning_rate": 2.4143463948159075e-05, "loss": 0.0214, "step": 21650 }, { "epoch": 5.638267070712713, "grad_norm": 0.29402652382850647, "learning_rate": 2.4130274570103183e-05, "loss": 0.0206, "step": 21660 }, { "epoch": 5.641246212068316, "grad_norm": 0.21806025505065918, "learning_rate": 2.4117083315507762e-05, "loss": 0.0164, "step": 21670 }, { "epoch": 5.644225353423918, "grad_norm": 0.3084464967250824, "learning_rate": 2.4103890190366092e-05, "loss": 0.0184, "step": 21680 }, { "epoch": 5.64720449477952, "grad_norm": 0.18274804949760437, "learning_rate": 2.409069520067231e-05, "loss": 0.0172, "step": 21690 }, { "epoch": 5.6501836361351225, "grad_norm": 0.2170702964067459, "learning_rate": 2.40774983524214e-05, "loss": 0.0196, "step": 21700 }, { "epoch": 5.653162777490725, "grad_norm": 0.19274817407131195, "learning_rate": 2.406429965160918e-05, "loss": 0.0167, "step": 21710 }, { "epoch": 5.656141918846328, "grad_norm": 0.24421051144599915, "learning_rate": 2.4051099104232325e-05, "loss": 0.018, "step": 21720 }, { "epoch": 5.6591210602019295, "grad_norm": 0.22827836871147156, "learning_rate": 2.4037896716288338e-05, "loss": 0.0198, "step": 21730 }, { "epoch": 5.662100201557532, "grad_norm": 0.20393024384975433, "learning_rate": 2.402469249377556e-05, "loss": 0.0189, "step": 21740 }, { "epoch": 5.665079342913135, "grad_norm": 0.205760657787323, "learning_rate": 2.4011486442693178e-05, "loss": 0.0178, "step": 21750 }, { "epoch": 5.668058484268737, "grad_norm": 0.23849861323833466, "learning_rate": 2.3998278569041186e-05, "loss": 0.0196, "step": 21760 }, { "epoch": 5.67103762562434, "grad_norm": 0.24662956595420837, "learning_rate": 2.3985068878820424e-05, "loss": 0.0194, "step": 21770 }, { "epoch": 5.674016766979942, "grad_norm": 0.22095635533332825, "learning_rate": 2.3971857378032555e-05, "loss": 0.0174, "step": 21780 }, { "epoch": 5.676995908335544, "grad_norm": 0.20252546668052673, "learning_rate": 2.3958644072680057e-05, "loss": 0.0191, "step": 21790 }, { "epoch": 5.679975049691147, "grad_norm": 0.22829298675060272, "learning_rate": 2.394542896876624e-05, "loss": 0.0176, "step": 21800 }, { "epoch": 5.68295419104675, "grad_norm": 0.23207728564739227, "learning_rate": 2.393221207229522e-05, "loss": 0.0197, "step": 21810 }, { "epoch": 5.685933332402351, "grad_norm": 0.25748518109321594, "learning_rate": 2.391899338927193e-05, "loss": 0.0186, "step": 21820 }, { "epoch": 5.688912473757954, "grad_norm": 0.19165389239788055, "learning_rate": 2.3905772925702122e-05, "loss": 0.0188, "step": 21830 }, { "epoch": 5.691891615113557, "grad_norm": 0.19250252842903137, "learning_rate": 2.389255068759235e-05, "loss": 0.016, "step": 21840 }, { "epoch": 5.694870756469159, "grad_norm": 0.21416951715946198, "learning_rate": 2.387932668094997e-05, "loss": 0.0169, "step": 21850 }, { "epoch": 5.697849897824762, "grad_norm": 0.19579045474529266, "learning_rate": 2.386610091178315e-05, "loss": 0.0198, "step": 21860 }, { "epoch": 5.7008290391803635, "grad_norm": 0.22136886417865753, "learning_rate": 2.385287338610085e-05, "loss": 0.0184, "step": 21870 }, { "epoch": 5.703808180535966, "grad_norm": 0.20144161581993103, "learning_rate": 2.3839644109912843e-05, "loss": 0.0162, "step": 21880 }, { "epoch": 5.706787321891569, "grad_norm": 0.2394806444644928, "learning_rate": 2.382641308922969e-05, "loss": 0.0171, "step": 21890 }, { "epoch": 5.7097664632471705, "grad_norm": 0.18181666731834412, "learning_rate": 2.381318033006274e-05, "loss": 0.0169, "step": 21900 }, { "epoch": 5.712745604602773, "grad_norm": 0.23151373863220215, "learning_rate": 2.3799945838424127e-05, "loss": 0.0182, "step": 21910 }, { "epoch": 5.715724745958376, "grad_norm": 0.18917304277420044, "learning_rate": 2.3786709620326783e-05, "loss": 0.0156, "step": 21920 }, { "epoch": 5.718703887313978, "grad_norm": 0.1862196922302246, "learning_rate": 2.3773471681784428e-05, "loss": 0.0155, "step": 21930 }, { "epoch": 5.721683028669581, "grad_norm": 0.18879419565200806, "learning_rate": 2.3760232028811552e-05, "loss": 0.0137, "step": 21940 }, { "epoch": 5.724662170025183, "grad_norm": 0.3036576807498932, "learning_rate": 2.374699066742343e-05, "loss": 0.0198, "step": 21950 }, { "epoch": 5.727641311380785, "grad_norm": 0.16357183456420898, "learning_rate": 2.3733747603636107e-05, "loss": 0.0159, "step": 21960 }, { "epoch": 5.730620452736388, "grad_norm": 0.20785298943519592, "learning_rate": 2.3720502843466413e-05, "loss": 0.0168, "step": 21970 }, { "epoch": 5.733599594091991, "grad_norm": 0.23996159434318542, "learning_rate": 2.370725639293194e-05, "loss": 0.0179, "step": 21980 }, { "epoch": 5.736578735447592, "grad_norm": 0.25150802731513977, "learning_rate": 2.3694008258051046e-05, "loss": 0.0183, "step": 21990 }, { "epoch": 5.739557876803195, "grad_norm": 0.16361966729164124, "learning_rate": 2.3680758444842863e-05, "loss": 0.0172, "step": 22000 }, { "epoch": 5.742537018158798, "grad_norm": 0.2255014181137085, "learning_rate": 2.3667506959327283e-05, "loss": 0.0198, "step": 22010 }, { "epoch": 5.7455161595144, "grad_norm": 0.20593373477458954, "learning_rate": 2.3654253807524952e-05, "loss": 0.0185, "step": 22020 }, { "epoch": 5.748495300870003, "grad_norm": 0.19963236153125763, "learning_rate": 2.3640998995457274e-05, "loss": 0.0154, "step": 22030 }, { "epoch": 5.7514744422256046, "grad_norm": 0.20309461653232574, "learning_rate": 2.3627742529146418e-05, "loss": 0.0175, "step": 22040 }, { "epoch": 5.754453583581207, "grad_norm": 0.2035822868347168, "learning_rate": 2.361448441461529e-05, "loss": 0.0217, "step": 22050 }, { "epoch": 5.75743272493681, "grad_norm": 0.18991628289222717, "learning_rate": 2.3601224657887555e-05, "loss": 0.0193, "step": 22060 }, { "epoch": 5.760411866292412, "grad_norm": 0.20778781175613403, "learning_rate": 2.3587963264987617e-05, "loss": 0.0149, "step": 22070 }, { "epoch": 5.763391007648014, "grad_norm": 0.23419512808322906, "learning_rate": 2.3574700241940633e-05, "loss": 0.0167, "step": 22080 }, { "epoch": 5.766370149003617, "grad_norm": 0.18839669227600098, "learning_rate": 2.3561435594772485e-05, "loss": 0.0162, "step": 22090 }, { "epoch": 5.769349290359219, "grad_norm": 0.21329781413078308, "learning_rate": 2.354816932950981e-05, "loss": 0.0203, "step": 22100 }, { "epoch": 5.772328431714822, "grad_norm": 0.23691023886203766, "learning_rate": 2.3534901452179977e-05, "loss": 0.0184, "step": 22110 }, { "epoch": 5.775307573070424, "grad_norm": 0.2553551495075226, "learning_rate": 2.352163196881107e-05, "loss": 0.0172, "step": 22120 }, { "epoch": 5.778286714426026, "grad_norm": 0.18628904223442078, "learning_rate": 2.3508360885431926e-05, "loss": 0.0171, "step": 22130 }, { "epoch": 5.781265855781629, "grad_norm": 0.19878844916820526, "learning_rate": 2.34950882080721e-05, "loss": 0.0183, "step": 22140 }, { "epoch": 5.784244997137232, "grad_norm": 0.27346497774124146, "learning_rate": 2.3481813942761865e-05, "loss": 0.0176, "step": 22150 }, { "epoch": 5.787224138492833, "grad_norm": 0.16482602059841156, "learning_rate": 2.346853809553222e-05, "loss": 0.016, "step": 22160 }, { "epoch": 5.790203279848436, "grad_norm": 0.2177228033542633, "learning_rate": 2.3455260672414882e-05, "loss": 0.0159, "step": 22170 }, { "epoch": 5.793182421204039, "grad_norm": 0.19182650744915009, "learning_rate": 2.344198167944229e-05, "loss": 0.0174, "step": 22180 }, { "epoch": 5.796161562559641, "grad_norm": 0.23217977583408356, "learning_rate": 2.342870112264759e-05, "loss": 0.0141, "step": 22190 }, { "epoch": 5.799140703915244, "grad_norm": 0.2160997986793518, "learning_rate": 2.3415419008064643e-05, "loss": 0.015, "step": 22200 }, { "epoch": 5.802119845270846, "grad_norm": 0.1828189343214035, "learning_rate": 2.3402135341728002e-05, "loss": 0.0183, "step": 22210 }, { "epoch": 5.805098986626448, "grad_norm": 0.2033083736896515, "learning_rate": 2.3388850129672947e-05, "loss": 0.0193, "step": 22220 }, { "epoch": 5.808078127982051, "grad_norm": 0.19433759152889252, "learning_rate": 2.3375563377935455e-05, "loss": 0.0202, "step": 22230 }, { "epoch": 5.811057269337653, "grad_norm": 0.20388513803482056, "learning_rate": 2.336227509255219e-05, "loss": 0.0191, "step": 22240 }, { "epoch": 5.814036410693255, "grad_norm": 0.2178264558315277, "learning_rate": 2.3348985279560533e-05, "loss": 0.0167, "step": 22250 }, { "epoch": 5.817015552048858, "grad_norm": 0.23809176683425903, "learning_rate": 2.3335693944998533e-05, "loss": 0.0155, "step": 22260 }, { "epoch": 5.81999469340446, "grad_norm": 0.21086148917675018, "learning_rate": 2.3322401094904948e-05, "loss": 0.0155, "step": 22270 }, { "epoch": 5.822973834760063, "grad_norm": 0.2141348421573639, "learning_rate": 2.3309106735319225e-05, "loss": 0.0181, "step": 22280 }, { "epoch": 5.825952976115666, "grad_norm": 0.21441999077796936, "learning_rate": 2.32958108722815e-05, "loss": 0.017, "step": 22290 }, { "epoch": 5.828932117471267, "grad_norm": 0.26823240518569946, "learning_rate": 2.3282513511832572e-05, "loss": 0.0163, "step": 22300 }, { "epoch": 5.83191125882687, "grad_norm": 0.2184048444032669, "learning_rate": 2.326921466001394e-05, "loss": 0.0177, "step": 22310 }, { "epoch": 5.834890400182473, "grad_norm": 0.25245970487594604, "learning_rate": 2.3255914322867774e-05, "loss": 0.0176, "step": 22320 }, { "epoch": 5.837869541538074, "grad_norm": 0.19161777198314667, "learning_rate": 2.3242612506436916e-05, "loss": 0.0152, "step": 22330 }, { "epoch": 5.840848682893677, "grad_norm": 0.22247137129306793, "learning_rate": 2.3229309216764885e-05, "loss": 0.0153, "step": 22340 }, { "epoch": 5.84382782424928, "grad_norm": 0.32918527722358704, "learning_rate": 2.3216004459895865e-05, "loss": 0.0177, "step": 22350 }, { "epoch": 5.846806965604882, "grad_norm": 0.19912783801555634, "learning_rate": 2.3202698241874715e-05, "loss": 0.0165, "step": 22360 }, { "epoch": 5.849786106960485, "grad_norm": 0.2361641824245453, "learning_rate": 2.3189390568746945e-05, "loss": 0.0162, "step": 22370 }, { "epoch": 5.852765248316087, "grad_norm": 0.2050868570804596, "learning_rate": 2.3176081446558736e-05, "loss": 0.0185, "step": 22380 }, { "epoch": 5.855744389671689, "grad_norm": 0.20007012784481049, "learning_rate": 2.3162770881356927e-05, "loss": 0.0152, "step": 22390 }, { "epoch": 5.858723531027292, "grad_norm": 0.21198180317878723, "learning_rate": 2.3149458879189e-05, "loss": 0.0191, "step": 22400 }, { "epoch": 5.861702672382894, "grad_norm": 0.17173025012016296, "learning_rate": 2.3136145446103105e-05, "loss": 0.0155, "step": 22410 }, { "epoch": 5.864681813738496, "grad_norm": 0.2535518705844879, "learning_rate": 2.312283058814805e-05, "loss": 0.0156, "step": 22420 }, { "epoch": 5.867660955094099, "grad_norm": 0.290085107088089, "learning_rate": 2.3109514311373256e-05, "loss": 0.0164, "step": 22430 }, { "epoch": 5.870640096449701, "grad_norm": 0.18783362209796906, "learning_rate": 2.309619662182882e-05, "loss": 0.0188, "step": 22440 }, { "epoch": 5.873619237805304, "grad_norm": 0.1973901093006134, "learning_rate": 2.308287752556547e-05, "loss": 0.0176, "step": 22450 }, { "epoch": 5.876598379160907, "grad_norm": 0.2024088054895401, "learning_rate": 2.3069557028634578e-05, "loss": 0.0159, "step": 22460 }, { "epoch": 5.879577520516508, "grad_norm": 0.20163658261299133, "learning_rate": 2.3056235137088143e-05, "loss": 0.0164, "step": 22470 }, { "epoch": 5.882556661872111, "grad_norm": 0.19487696886062622, "learning_rate": 2.3042911856978797e-05, "loss": 0.0164, "step": 22480 }, { "epoch": 5.885535803227714, "grad_norm": 0.20739944279193878, "learning_rate": 2.302958719435982e-05, "loss": 0.0162, "step": 22490 }, { "epoch": 5.888514944583316, "grad_norm": 0.21064554154872894, "learning_rate": 2.3016261155285102e-05, "loss": 0.0188, "step": 22500 }, { "epoch": 5.891494085938918, "grad_norm": 0.18036708235740662, "learning_rate": 2.300293374580917e-05, "loss": 0.0195, "step": 22510 }, { "epoch": 5.894473227294521, "grad_norm": 0.22244559228420258, "learning_rate": 2.2989604971987168e-05, "loss": 0.0158, "step": 22520 }, { "epoch": 5.897452368650123, "grad_norm": 0.1839110553264618, "learning_rate": 2.2976274839874856e-05, "loss": 0.016, "step": 22530 }, { "epoch": 5.900431510005726, "grad_norm": 0.19280195236206055, "learning_rate": 2.2962943355528623e-05, "loss": 0.0195, "step": 22540 }, { "epoch": 5.903410651361328, "grad_norm": 0.23938816785812378, "learning_rate": 2.2949610525005456e-05, "loss": 0.0169, "step": 22550 }, { "epoch": 5.90638979271693, "grad_norm": 0.1877104789018631, "learning_rate": 2.293627635436297e-05, "loss": 0.0161, "step": 22560 }, { "epoch": 5.909368934072533, "grad_norm": 0.1996297836303711, "learning_rate": 2.2922940849659376e-05, "loss": 0.0162, "step": 22570 }, { "epoch": 5.9123480754281355, "grad_norm": 0.2000254988670349, "learning_rate": 2.2909604016953503e-05, "loss": 0.0167, "step": 22580 }, { "epoch": 5.915327216783737, "grad_norm": 0.17757835984230042, "learning_rate": 2.289626586230478e-05, "loss": 0.0167, "step": 22590 }, { "epoch": 5.91830635813934, "grad_norm": 0.20336347818374634, "learning_rate": 2.288292639177322e-05, "loss": 0.02, "step": 22600 }, { "epoch": 5.921285499494942, "grad_norm": 0.17412486672401428, "learning_rate": 2.286958561141946e-05, "loss": 0.0165, "step": 22610 }, { "epoch": 5.924264640850545, "grad_norm": 0.2400120496749878, "learning_rate": 2.2856243527304712e-05, "loss": 0.0177, "step": 22620 }, { "epoch": 5.927243782206148, "grad_norm": 0.17074301838874817, "learning_rate": 2.2842900145490795e-05, "loss": 0.0187, "step": 22630 }, { "epoch": 5.930222923561749, "grad_norm": 0.19041593372821808, "learning_rate": 2.282955547204011e-05, "loss": 0.0189, "step": 22640 }, { "epoch": 5.933202064917352, "grad_norm": 0.21467278897762299, "learning_rate": 2.281620951301564e-05, "loss": 0.017, "step": 22650 }, { "epoch": 5.936181206272955, "grad_norm": 0.19844286143779755, "learning_rate": 2.280286227448096e-05, "loss": 0.0171, "step": 22660 }, { "epoch": 5.939160347628557, "grad_norm": 0.2394876331090927, "learning_rate": 2.2789513762500226e-05, "loss": 0.0189, "step": 22670 }, { "epoch": 5.942139488984159, "grad_norm": 0.18338021636009216, "learning_rate": 2.2776163983138168e-05, "loss": 0.0148, "step": 22680 }, { "epoch": 5.945118630339762, "grad_norm": 0.17026008665561676, "learning_rate": 2.2762812942460094e-05, "loss": 0.0161, "step": 22690 }, { "epoch": 5.948097771695364, "grad_norm": 0.19046659767627716, "learning_rate": 2.2749460646531887e-05, "loss": 0.0206, "step": 22700 }, { "epoch": 5.951076913050967, "grad_norm": 0.17756961286067963, "learning_rate": 2.273610710142e-05, "loss": 0.0136, "step": 22710 }, { "epoch": 5.9540560544065695, "grad_norm": 0.19623813033103943, "learning_rate": 2.272275231319145e-05, "loss": 0.0164, "step": 22720 }, { "epoch": 5.957035195762171, "grad_norm": 0.16418391466140747, "learning_rate": 2.2709396287913826e-05, "loss": 0.017, "step": 22730 }, { "epoch": 5.960014337117774, "grad_norm": 0.17588043212890625, "learning_rate": 2.2696039031655262e-05, "loss": 0.0204, "step": 22740 }, { "epoch": 5.9629934784733765, "grad_norm": 0.21901248395442963, "learning_rate": 2.2682680550484474e-05, "loss": 0.0168, "step": 22750 }, { "epoch": 5.965972619828978, "grad_norm": 0.2915601134300232, "learning_rate": 2.266932085047072e-05, "loss": 0.0167, "step": 22760 }, { "epoch": 5.968951761184581, "grad_norm": 0.18032124638557434, "learning_rate": 2.265595993768382e-05, "loss": 0.0149, "step": 22770 }, { "epoch": 5.971930902540183, "grad_norm": 0.17078638076782227, "learning_rate": 2.264259781819414e-05, "loss": 0.0207, "step": 22780 }, { "epoch": 5.974910043895786, "grad_norm": 0.21256160736083984, "learning_rate": 2.262923449807258e-05, "loss": 0.014, "step": 22790 }, { "epoch": 5.977889185251389, "grad_norm": 0.1782907396554947, "learning_rate": 2.2615869983390622e-05, "loss": 0.0163, "step": 22800 }, { "epoch": 5.98086832660699, "grad_norm": 0.18739143013954163, "learning_rate": 2.260250428022026e-05, "loss": 0.0161, "step": 22810 }, { "epoch": 5.983847467962593, "grad_norm": 0.18582595884799957, "learning_rate": 2.2589137394634034e-05, "loss": 0.0189, "step": 22820 }, { "epoch": 5.986826609318196, "grad_norm": 0.21469935774803162, "learning_rate": 2.257576933270502e-05, "loss": 0.0163, "step": 22830 }, { "epoch": 5.989805750673798, "grad_norm": 0.1764267235994339, "learning_rate": 2.256240010050685e-05, "loss": 0.0195, "step": 22840 }, { "epoch": 5.9927848920294, "grad_norm": 0.2495085448026657, "learning_rate": 2.254902970411366e-05, "loss": 0.0168, "step": 22850 }, { "epoch": 5.995764033385003, "grad_norm": 0.2139008343219757, "learning_rate": 2.2535658149600127e-05, "loss": 0.0167, "step": 22860 }, { "epoch": 5.998743174740605, "grad_norm": 0.1856565922498703, "learning_rate": 2.252228544304145e-05, "loss": 0.0169, "step": 22870 }, { "epoch": 6.001722316096208, "grad_norm": 0.20691923797130585, "learning_rate": 2.2508911590513347e-05, "loss": 0.0154, "step": 22880 }, { "epoch": 6.0047014574518105, "grad_norm": 0.21897225081920624, "learning_rate": 2.2495536598092082e-05, "loss": 0.015, "step": 22890 }, { "epoch": 6.007680598807412, "grad_norm": 0.22648707032203674, "learning_rate": 2.2482160471854405e-05, "loss": 0.0169, "step": 22900 }, { "epoch": 6.010659740163015, "grad_norm": 0.17344266176223755, "learning_rate": 2.24687832178776e-05, "loss": 0.017, "step": 22910 }, { "epoch": 6.0136388815186175, "grad_norm": 0.2115435004234314, "learning_rate": 2.245540484223945e-05, "loss": 0.0208, "step": 22920 }, { "epoch": 6.016618022874219, "grad_norm": 0.2456045001745224, "learning_rate": 2.2442025351018268e-05, "loss": 0.0177, "step": 22930 }, { "epoch": 6.019597164229822, "grad_norm": 0.1714634746313095, "learning_rate": 2.2428644750292855e-05, "loss": 0.0198, "step": 22940 }, { "epoch": 6.0225763055854244, "grad_norm": 0.19933916628360748, "learning_rate": 2.241526304614252e-05, "loss": 0.0155, "step": 22950 }, { "epoch": 6.025555446941027, "grad_norm": 0.2700021266937256, "learning_rate": 2.2401880244647078e-05, "loss": 0.0158, "step": 22960 }, { "epoch": 6.02853458829663, "grad_norm": 0.19818969070911407, "learning_rate": 2.2388496351886844e-05, "loss": 0.0169, "step": 22970 }, { "epoch": 6.031513729652232, "grad_norm": 0.2162981629371643, "learning_rate": 2.237511137394263e-05, "loss": 0.015, "step": 22980 }, { "epoch": 6.034492871007834, "grad_norm": 0.22279764711856842, "learning_rate": 2.236172531689572e-05, "loss": 0.0174, "step": 22990 }, { "epoch": 6.037472012363437, "grad_norm": 0.19800816476345062, "learning_rate": 2.234833818682792e-05, "loss": 0.0153, "step": 23000 }, { "epoch": 6.040451153719039, "grad_norm": 0.20464114844799042, "learning_rate": 2.23349499898215e-05, "loss": 0.0173, "step": 23010 }, { "epoch": 6.043430295074641, "grad_norm": 0.195867121219635, "learning_rate": 2.2321560731959222e-05, "loss": 0.015, "step": 23020 }, { "epoch": 6.046409436430244, "grad_norm": 0.23268626630306244, "learning_rate": 2.2308170419324337e-05, "loss": 0.0219, "step": 23030 }, { "epoch": 6.049388577785846, "grad_norm": 0.19922053813934326, "learning_rate": 2.2294779058000572e-05, "loss": 0.0147, "step": 23040 }, { "epoch": 6.052367719141449, "grad_norm": 0.18476209044456482, "learning_rate": 2.2281386654072108e-05, "loss": 0.0183, "step": 23050 }, { "epoch": 6.0553468604970515, "grad_norm": 0.20652997493743896, "learning_rate": 2.2267993213623642e-05, "loss": 0.0191, "step": 23060 }, { "epoch": 6.058326001852653, "grad_norm": 0.23201243579387665, "learning_rate": 2.225459874274031e-05, "loss": 0.0176, "step": 23070 }, { "epoch": 6.061305143208256, "grad_norm": 0.198011577129364, "learning_rate": 2.2241203247507724e-05, "loss": 0.0215, "step": 23080 }, { "epoch": 6.0642842845638585, "grad_norm": 0.19807220995426178, "learning_rate": 2.2227806734011966e-05, "loss": 0.0144, "step": 23090 }, { "epoch": 6.067263425919461, "grad_norm": 0.20360785722732544, "learning_rate": 2.221440920833957e-05, "loss": 0.0184, "step": 23100 }, { "epoch": 6.070242567275063, "grad_norm": 0.20980310440063477, "learning_rate": 2.220101067657755e-05, "loss": 0.0141, "step": 23110 }, { "epoch": 6.0732217086306655, "grad_norm": 0.1864863485097885, "learning_rate": 2.218761114481335e-05, "loss": 0.016, "step": 23120 }, { "epoch": 6.076200849986268, "grad_norm": 0.23386231064796448, "learning_rate": 2.2174210619134893e-05, "loss": 0.0174, "step": 23130 }, { "epoch": 6.079179991341871, "grad_norm": 0.19424474239349365, "learning_rate": 2.2160809105630542e-05, "loss": 0.0158, "step": 23140 }, { "epoch": 6.082159132697473, "grad_norm": 0.24687139689922333, "learning_rate": 2.2147406610389113e-05, "loss": 0.0174, "step": 23150 }, { "epoch": 6.085138274053075, "grad_norm": 0.19487439095973969, "learning_rate": 2.2134003139499852e-05, "loss": 0.0173, "step": 23160 }, { "epoch": 6.088117415408678, "grad_norm": 0.18766163289546967, "learning_rate": 2.2120598699052477e-05, "loss": 0.0162, "step": 23170 }, { "epoch": 6.09109655676428, "grad_norm": 0.2412940412759781, "learning_rate": 2.2107193295137116e-05, "loss": 0.0153, "step": 23180 }, { "epoch": 6.094075698119882, "grad_norm": 0.18839281797409058, "learning_rate": 2.2093786933844365e-05, "loss": 0.0213, "step": 23190 }, { "epoch": 6.097054839475485, "grad_norm": 0.1968384087085724, "learning_rate": 2.2080379621265228e-05, "loss": 0.0161, "step": 23200 }, { "epoch": 6.100033980831087, "grad_norm": 0.20817959308624268, "learning_rate": 2.2066971363491153e-05, "loss": 0.0178, "step": 23210 }, { "epoch": 6.10301312218669, "grad_norm": 0.2973457872867584, "learning_rate": 2.2053562166614028e-05, "loss": 0.0165, "step": 23220 }, { "epoch": 6.1059922635422925, "grad_norm": 0.20974795520305634, "learning_rate": 2.2040152036726147e-05, "loss": 0.0163, "step": 23230 }, { "epoch": 6.108971404897894, "grad_norm": 0.1900867074728012, "learning_rate": 2.202674097992024e-05, "loss": 0.0151, "step": 23240 }, { "epoch": 6.111950546253497, "grad_norm": 0.1898990422487259, "learning_rate": 2.2013329002289466e-05, "loss": 0.0176, "step": 23250 }, { "epoch": 6.1149296876090995, "grad_norm": 0.28197193145751953, "learning_rate": 2.1999916109927377e-05, "loss": 0.0183, "step": 23260 }, { "epoch": 6.117908828964702, "grad_norm": 0.1809089183807373, "learning_rate": 2.1986502308927955e-05, "loss": 0.0185, "step": 23270 }, { "epoch": 6.120887970320304, "grad_norm": 0.16906408965587616, "learning_rate": 2.1973087605385614e-05, "loss": 0.0181, "step": 23280 }, { "epoch": 6.1238671116759065, "grad_norm": 0.18763448297977448, "learning_rate": 2.1959672005395142e-05, "loss": 0.0204, "step": 23290 }, { "epoch": 6.001990927002103, "grad_norm": 0.16908138990402222, "learning_rate": 2.194625551505176e-05, "loss": 0.0193, "step": 23300 }, { "epoch": 6.0046399774830705, "grad_norm": 0.18666903674602509, "learning_rate": 2.1932838140451085e-05, "loss": 0.0162, "step": 23310 }, { "epoch": 6.007289027964039, "grad_norm": 0.22214095294475555, "learning_rate": 2.1919419887689134e-05, "loss": 0.0225, "step": 23320 }, { "epoch": 6.009938078445007, "grad_norm": 0.16952256858348846, "learning_rate": 2.1906000762862335e-05, "loss": 0.0183, "step": 23330 }, { "epoch": 6.012587128925976, "grad_norm": 0.1554306596517563, "learning_rate": 2.189258077206749e-05, "loss": 0.0159, "step": 23340 }, { "epoch": 6.015236179406944, "grad_norm": 0.17919299006462097, "learning_rate": 2.187915992140181e-05, "loss": 0.019, "step": 23350 }, { "epoch": 6.017885229887912, "grad_norm": 0.19525647163391113, "learning_rate": 2.18657382169629e-05, "loss": 0.0178, "step": 23360 }, { "epoch": 6.020534280368881, "grad_norm": 0.20352984964847565, "learning_rate": 2.185231566484874e-05, "loss": 0.0181, "step": 23370 }, { "epoch": 6.023183330849848, "grad_norm": 0.1715182065963745, "learning_rate": 2.18388922711577e-05, "loss": 0.0178, "step": 23380 }, { "epoch": 6.025832381330817, "grad_norm": 0.2079107016324997, "learning_rate": 2.1825468041988547e-05, "loss": 0.0163, "step": 23390 }, { "epoch": 6.028481431811785, "grad_norm": 0.21421843767166138, "learning_rate": 2.1812042983440396e-05, "loss": 0.0189, "step": 23400 }, { "epoch": 6.031130482292753, "grad_norm": 0.20399630069732666, "learning_rate": 2.1798617101612767e-05, "loss": 0.0143, "step": 23410 }, { "epoch": 6.033779532773721, "grad_norm": 0.17607998847961426, "learning_rate": 2.1785190402605553e-05, "loss": 0.0168, "step": 23420 }, { "epoch": 6.03642858325469, "grad_norm": 0.2561720907688141, "learning_rate": 2.1771762892518996e-05, "loss": 0.0157, "step": 23430 }, { "epoch": 6.0390776337356575, "grad_norm": 0.1979912668466568, "learning_rate": 2.1758334577453732e-05, "loss": 0.0165, "step": 23440 }, { "epoch": 6.041726684216626, "grad_norm": 0.19932378828525543, "learning_rate": 2.1744905463510736e-05, "loss": 0.0127, "step": 23450 }, { "epoch": 6.044375734697594, "grad_norm": 0.2416635900735855, "learning_rate": 2.1731475556791374e-05, "loss": 0.0176, "step": 23460 }, { "epoch": 6.047024785178563, "grad_norm": 0.21219590306282043, "learning_rate": 2.1718044863397358e-05, "loss": 0.0153, "step": 23470 }, { "epoch": 6.049673835659531, "grad_norm": 0.16920635104179382, "learning_rate": 2.170461338943075e-05, "loss": 0.0158, "step": 23480 }, { "epoch": 6.052322886140499, "grad_norm": 0.20539577305316925, "learning_rate": 2.1691181140993985e-05, "loss": 0.0188, "step": 23490 }, { "epoch": 6.054971936621468, "grad_norm": 0.1724439561367035, "learning_rate": 2.167774812418984e-05, "loss": 0.0167, "step": 23500 }, { "epoch": 6.057620987102435, "grad_norm": 0.17600756883621216, "learning_rate": 2.166431434512143e-05, "loss": 0.0178, "step": 23510 }, { "epoch": 6.060270037583404, "grad_norm": 0.17643724381923676, "learning_rate": 2.165087980989224e-05, "loss": 0.0163, "step": 23520 }, { "epoch": 6.062919088064372, "grad_norm": 0.21768087148666382, "learning_rate": 2.1637444524606072e-05, "loss": 0.0181, "step": 23530 }, { "epoch": 6.06556813854534, "grad_norm": 0.16806823015213013, "learning_rate": 2.16240084953671e-05, "loss": 0.014, "step": 23540 }, { "epoch": 6.068217189026308, "grad_norm": 0.19007423520088196, "learning_rate": 2.161057172827981e-05, "loss": 0.0156, "step": 23550 }, { "epoch": 6.070866239507277, "grad_norm": 0.2108355015516281, "learning_rate": 2.159713422944903e-05, "loss": 0.0183, "step": 23560 }, { "epoch": 6.0735152899882445, "grad_norm": 0.22073814272880554, "learning_rate": 2.1583696004979934e-05, "loss": 0.0152, "step": 23570 }, { "epoch": 6.076164340469213, "grad_norm": 0.17697015404701233, "learning_rate": 2.1570257060977995e-05, "loss": 0.0189, "step": 23580 }, { "epoch": 6.078813390950181, "grad_norm": 0.17683832347393036, "learning_rate": 2.1556817403549047e-05, "loss": 0.0162, "step": 23590 }, { "epoch": 6.08146244143115, "grad_norm": 0.132920041680336, "learning_rate": 2.154337703879923e-05, "loss": 0.0135, "step": 23600 }, { "epoch": 6.084111491912118, "grad_norm": 0.17573633790016174, "learning_rate": 2.152993597283501e-05, "loss": 0.0158, "step": 23610 }, { "epoch": 6.086760542393086, "grad_norm": 0.18283385038375854, "learning_rate": 2.151649421176316e-05, "loss": 0.0185, "step": 23620 }, { "epoch": 6.089409592874055, "grad_norm": 0.1814146637916565, "learning_rate": 2.15030517616908e-05, "loss": 0.0192, "step": 23630 }, { "epoch": 6.092058643355022, "grad_norm": 0.1816597580909729, "learning_rate": 2.148960862872532e-05, "loss": 0.0184, "step": 23640 }, { "epoch": 6.094707693835991, "grad_norm": 0.22600482404232025, "learning_rate": 2.1476164818974453e-05, "loss": 0.0166, "step": 23650 }, { "epoch": 6.097356744316959, "grad_norm": 0.16935217380523682, "learning_rate": 2.1462720338546226e-05, "loss": 0.0172, "step": 23660 }, { "epoch": 6.100005794797927, "grad_norm": 0.1956672966480255, "learning_rate": 2.144927519354898e-05, "loss": 0.017, "step": 23670 }, { "epoch": 6.102654845278895, "grad_norm": 0.15825586020946503, "learning_rate": 2.1435829390091346e-05, "loss": 0.0176, "step": 23680 }, { "epoch": 6.105303895759864, "grad_norm": 0.16650356352329254, "learning_rate": 2.1422382934282258e-05, "loss": 0.0145, "step": 23690 }, { "epoch": 6.1079529462408315, "grad_norm": 0.17535312473773956, "learning_rate": 2.1408935832230955e-05, "loss": 0.0178, "step": 23700 }, { "epoch": 6.1106019967218, "grad_norm": 0.1494160294532776, "learning_rate": 2.1395488090046954e-05, "loss": 0.0152, "step": 23710 }, { "epoch": 6.113251047202768, "grad_norm": 0.23544269800186157, "learning_rate": 2.138203971384008e-05, "loss": 0.0137, "step": 23720 }, { "epoch": 6.1159000976837365, "grad_norm": 0.16862507164478302, "learning_rate": 2.136859070972043e-05, "loss": 0.018, "step": 23730 }, { "epoch": 6.118549148164704, "grad_norm": 0.2202523946762085, "learning_rate": 2.1355141083798396e-05, "loss": 0.0133, "step": 23740 }, { "epoch": 6.121198198645673, "grad_norm": 0.16371504962444305, "learning_rate": 2.1341690842184648e-05, "loss": 0.0111, "step": 23750 }, { "epoch": 6.123847249126642, "grad_norm": 0.18529313802719116, "learning_rate": 2.132823999099015e-05, "loss": 0.0179, "step": 23760 }, { "epoch": 6.126496299607609, "grad_norm": 0.1946994662284851, "learning_rate": 2.131478853632612e-05, "loss": 0.0163, "step": 23770 }, { "epoch": 6.129145350088578, "grad_norm": 0.17480269074440002, "learning_rate": 2.130133648430406e-05, "loss": 0.0166, "step": 23780 }, { "epoch": 6.131794400569546, "grad_norm": 0.17444908618927002, "learning_rate": 2.128788384103575e-05, "loss": 0.0155, "step": 23790 }, { "epoch": 6.134443451050514, "grad_norm": 0.1630752682685852, "learning_rate": 2.1274430612633235e-05, "loss": 0.0168, "step": 23800 }, { "epoch": 6.137092501531482, "grad_norm": 0.16637831926345825, "learning_rate": 2.126097680520882e-05, "loss": 0.0157, "step": 23810 }, { "epoch": 6.139741552012451, "grad_norm": 0.18693393468856812, "learning_rate": 2.124752242487508e-05, "loss": 0.0178, "step": 23820 }, { "epoch": 6.1423906024934185, "grad_norm": 0.20860297977924347, "learning_rate": 2.1234067477744845e-05, "loss": 0.018, "step": 23830 }, { "epoch": 6.145039652974387, "grad_norm": 0.17026910185813904, "learning_rate": 2.1220611969931205e-05, "loss": 0.013, "step": 23840 }, { "epoch": 6.147688703455355, "grad_norm": 0.1537686437368393, "learning_rate": 2.1207155907547508e-05, "loss": 0.0132, "step": 23850 }, { "epoch": 6.1503377539363235, "grad_norm": 0.2108107954263687, "learning_rate": 2.119369929670734e-05, "loss": 0.0185, "step": 23860 }, { "epoch": 6.152986804417291, "grad_norm": 0.15569794178009033, "learning_rate": 2.1180242143524565e-05, "loss": 0.0138, "step": 23870 }, { "epoch": 6.15563585489826, "grad_norm": 0.15059475600719452, "learning_rate": 2.1166784454113257e-05, "loss": 0.0178, "step": 23880 }, { "epoch": 6.158284905379228, "grad_norm": 0.1455257683992386, "learning_rate": 2.1153326234587764e-05, "loss": 0.0157, "step": 23890 }, { "epoch": 6.160933955860196, "grad_norm": 0.1996590942144394, "learning_rate": 2.113986749106266e-05, "loss": 0.0173, "step": 23900 }, { "epoch": 6.163583006341165, "grad_norm": 0.20018821954727173, "learning_rate": 2.112640822965276e-05, "loss": 0.0182, "step": 23910 }, { "epoch": 6.166232056822133, "grad_norm": 0.1552141010761261, "learning_rate": 2.111294845647311e-05, "loss": 0.014, "step": 23920 }, { "epoch": 6.168881107303101, "grad_norm": 0.14196151494979858, "learning_rate": 2.109948817763899e-05, "loss": 0.0188, "step": 23930 }, { "epoch": 6.171530157784069, "grad_norm": 0.18644846975803375, "learning_rate": 2.108602739926593e-05, "loss": 0.0156, "step": 23940 }, { "epoch": 6.174179208265038, "grad_norm": 0.27688154578208923, "learning_rate": 2.107256612746965e-05, "loss": 0.018, "step": 23950 }, { "epoch": 6.1768282587460055, "grad_norm": 0.21413931250572205, "learning_rate": 2.1059104368366125e-05, "loss": 0.016, "step": 23960 }, { "epoch": 6.179477309226974, "grad_norm": 0.2256094068288803, "learning_rate": 2.1045642128071533e-05, "loss": 0.0161, "step": 23970 }, { "epoch": 6.182126359707942, "grad_norm": 0.18175756931304932, "learning_rate": 2.103217941270228e-05, "loss": 0.0149, "step": 23980 }, { "epoch": 6.1847754101889105, "grad_norm": 0.2360486388206482, "learning_rate": 2.101871622837499e-05, "loss": 0.016, "step": 23990 }, { "epoch": 6.187424460669878, "grad_norm": 0.16175700724124908, "learning_rate": 2.100525258120649e-05, "loss": 0.0166, "step": 24000 }, { "epoch": 6.190073511150847, "grad_norm": 0.21664337813854218, "learning_rate": 2.0991788477313818e-05, "loss": 0.0164, "step": 24010 }, { "epoch": 6.192722561631815, "grad_norm": 0.1611761599779129, "learning_rate": 2.097832392281424e-05, "loss": 0.0171, "step": 24020 }, { "epoch": 6.195371612112783, "grad_norm": 0.21873240172863007, "learning_rate": 2.0964858923825196e-05, "loss": 0.0154, "step": 24030 }, { "epoch": 6.198020662593752, "grad_norm": 0.2052845060825348, "learning_rate": 2.0951393486464345e-05, "loss": 0.0127, "step": 24040 }, { "epoch": 6.20066971307472, "grad_norm": 0.2123626470565796, "learning_rate": 2.0937927616849554e-05, "loss": 0.0155, "step": 24050 }, { "epoch": 6.203318763555688, "grad_norm": 0.18041691184043884, "learning_rate": 2.092446132109886e-05, "loss": 0.0168, "step": 24060 }, { "epoch": 6.205967814036656, "grad_norm": 0.18608541786670685, "learning_rate": 2.0910994605330524e-05, "loss": 0.0152, "step": 24070 }, { "epoch": 6.208616864517625, "grad_norm": 0.20297552645206451, "learning_rate": 2.0897527475662978e-05, "loss": 0.0176, "step": 24080 }, { "epoch": 6.2112659149985925, "grad_norm": 0.25770336389541626, "learning_rate": 2.0884059938214838e-05, "loss": 0.0165, "step": 24090 }, { "epoch": 6.213914965479561, "grad_norm": 0.18018676340579987, "learning_rate": 2.0870591999104925e-05, "loss": 0.0169, "step": 24100 }, { "epoch": 6.216564015960529, "grad_norm": 0.1915765255689621, "learning_rate": 2.0857123664452224e-05, "loss": 0.0171, "step": 24110 }, { "epoch": 6.2192130664414975, "grad_norm": 0.20295563340187073, "learning_rate": 2.0843654940375916e-05, "loss": 0.0156, "step": 24120 }, { "epoch": 6.221862116922465, "grad_norm": 0.18338288366794586, "learning_rate": 2.0830185832995343e-05, "loss": 0.0149, "step": 24130 }, { "epoch": 6.224511167403434, "grad_norm": 0.18515871465206146, "learning_rate": 2.081671634843003e-05, "loss": 0.0148, "step": 24140 }, { "epoch": 6.227160217884402, "grad_norm": 0.1871313601732254, "learning_rate": 2.0803246492799673e-05, "loss": 0.016, "step": 24150 }, { "epoch": 6.22980926836537, "grad_norm": 0.19945155084133148, "learning_rate": 2.078977627222414e-05, "loss": 0.0151, "step": 24160 }, { "epoch": 6.232458318846339, "grad_norm": 0.19220130145549774, "learning_rate": 2.0776305692823454e-05, "loss": 0.0157, "step": 24170 }, { "epoch": 6.235107369327307, "grad_norm": 0.16272565722465515, "learning_rate": 2.0762834760717813e-05, "loss": 0.0188, "step": 24180 }, { "epoch": 6.237756419808275, "grad_norm": 0.1353704184293747, "learning_rate": 2.0749363482027562e-05, "loss": 0.0162, "step": 24190 }, { "epoch": 6.240405470289243, "grad_norm": 0.19194744527339935, "learning_rate": 2.073589186287322e-05, "loss": 0.0168, "step": 24200 }, { "epoch": 6.243054520770212, "grad_norm": 0.15786150097846985, "learning_rate": 2.072241990937544e-05, "loss": 0.0176, "step": 24210 }, { "epoch": 6.2457035712511795, "grad_norm": 0.18356406688690186, "learning_rate": 2.0708947627655052e-05, "loss": 0.0151, "step": 24220 }, { "epoch": 6.248352621732148, "grad_norm": 0.18708094954490662, "learning_rate": 2.0695475023833004e-05, "loss": 0.0136, "step": 24230 }, { "epoch": 6.251001672213116, "grad_norm": 0.16073401272296906, "learning_rate": 2.0682002104030427e-05, "loss": 0.0133, "step": 24240 }, { "epoch": 6.2536507226940845, "grad_norm": 0.1949462741613388, "learning_rate": 2.066852887436857e-05, "loss": 0.0166, "step": 24250 }, { "epoch": 6.256299773175052, "grad_norm": 0.15663711726665497, "learning_rate": 2.065505534096883e-05, "loss": 0.0149, "step": 24260 }, { "epoch": 6.258948823656021, "grad_norm": 0.16964711248874664, "learning_rate": 2.0641581509952734e-05, "loss": 0.0161, "step": 24270 }, { "epoch": 6.261597874136989, "grad_norm": 0.1809985190629959, "learning_rate": 2.062810738744196e-05, "loss": 0.0152, "step": 24280 }, { "epoch": 6.264246924617957, "grad_norm": 0.20514865219593048, "learning_rate": 2.0614632979558308e-05, "loss": 0.0177, "step": 24290 }, { "epoch": 6.266895975098926, "grad_norm": 0.1726275235414505, "learning_rate": 2.0601158292423712e-05, "loss": 0.0166, "step": 24300 }, { "epoch": 6.269545025579894, "grad_norm": 0.14912033081054688, "learning_rate": 2.0587683332160223e-05, "loss": 0.0151, "step": 24310 }, { "epoch": 6.272194076060862, "grad_norm": 0.2590118944644928, "learning_rate": 2.0574208104890033e-05, "loss": 0.0157, "step": 24320 }, { "epoch": 6.27484312654183, "grad_norm": 0.18109650909900665, "learning_rate": 2.0560732616735438e-05, "loss": 0.0178, "step": 24330 }, { "epoch": 6.277492177022799, "grad_norm": 0.1803588718175888, "learning_rate": 2.0547256873818865e-05, "loss": 0.0143, "step": 24340 }, { "epoch": 6.2801412275037665, "grad_norm": 0.20140576362609863, "learning_rate": 2.0533780882262855e-05, "loss": 0.017, "step": 24350 }, { "epoch": 6.282790277984735, "grad_norm": 0.16911594569683075, "learning_rate": 2.052030464819005e-05, "loss": 0.0146, "step": 24360 }, { "epoch": 6.285439328465703, "grad_norm": 0.17285048961639404, "learning_rate": 2.050682817772322e-05, "loss": 0.0151, "step": 24370 }, { "epoch": 6.2880883789466715, "grad_norm": 0.23302645981311798, "learning_rate": 2.0493351476985234e-05, "loss": 0.0158, "step": 24380 }, { "epoch": 6.290737429427639, "grad_norm": 0.16739118099212646, "learning_rate": 2.0479874552099062e-05, "loss": 0.018, "step": 24390 }, { "epoch": 6.293386479908608, "grad_norm": 0.1707768738269806, "learning_rate": 2.046639740918778e-05, "loss": 0.0189, "step": 24400 }, { "epoch": 6.296035530389576, "grad_norm": 0.14413221180438995, "learning_rate": 2.045292005437456e-05, "loss": 0.0136, "step": 24410 }, { "epoch": 6.298684580870544, "grad_norm": 0.1712142676115036, "learning_rate": 2.043944249378268e-05, "loss": 0.0161, "step": 24420 }, { "epoch": 6.301333631351512, "grad_norm": 0.22295209765434265, "learning_rate": 2.0425964733535495e-05, "loss": 0.0199, "step": 24430 }, { "epoch": 6.303982681832481, "grad_norm": 0.19478091597557068, "learning_rate": 2.0412486779756468e-05, "loss": 0.0161, "step": 24440 }, { "epoch": 6.306631732313449, "grad_norm": 0.15568624436855316, "learning_rate": 2.039900863856913e-05, "loss": 0.0122, "step": 24450 }, { "epoch": 6.309280782794417, "grad_norm": 0.2137456089258194, "learning_rate": 2.0385530316097123e-05, "loss": 0.0186, "step": 24460 }, { "epoch": 6.311929833275386, "grad_norm": 0.2117794305086136, "learning_rate": 2.0372051818464153e-05, "loss": 0.0169, "step": 24470 }, { "epoch": 6.3145788837563535, "grad_norm": 0.19870400428771973, "learning_rate": 2.0358573151794004e-05, "loss": 0.0175, "step": 24480 }, { "epoch": 6.317227934237322, "grad_norm": 0.17384034395217896, "learning_rate": 2.0345094322210547e-05, "loss": 0.0185, "step": 24490 }, { "epoch": 6.31987698471829, "grad_norm": 0.17434996366500854, "learning_rate": 2.0331615335837723e-05, "loss": 0.0143, "step": 24500 }, { "epoch": 6.3225260351992585, "grad_norm": 0.1839786320924759, "learning_rate": 2.0318136198799544e-05, "loss": 0.0165, "step": 24510 }, { "epoch": 6.325175085680226, "grad_norm": 0.1696794182062149, "learning_rate": 2.0304656917220088e-05, "loss": 0.0142, "step": 24520 }, { "epoch": 6.327824136161195, "grad_norm": 0.1386200338602066, "learning_rate": 2.0291177497223506e-05, "loss": 0.0153, "step": 24530 }, { "epoch": 6.330473186642163, "grad_norm": 0.24540361762046814, "learning_rate": 2.0277697944934004e-05, "loss": 0.0141, "step": 24540 }, { "epoch": 6.333122237123131, "grad_norm": 0.18517781794071198, "learning_rate": 2.0264218266475848e-05, "loss": 0.016, "step": 24550 }, { "epoch": 6.335771287604099, "grad_norm": 0.20238681137561798, "learning_rate": 2.0250738467973367e-05, "loss": 0.0171, "step": 24560 }, { "epoch": 6.338420338085068, "grad_norm": 0.18426138162612915, "learning_rate": 2.023725855555094e-05, "loss": 0.0153, "step": 24570 }, { "epoch": 6.341069388566035, "grad_norm": 0.17255212366580963, "learning_rate": 2.0223778535333003e-05, "loss": 0.0148, "step": 24580 }, { "epoch": 6.343718439047004, "grad_norm": 0.1550869345664978, "learning_rate": 2.0210298413444042e-05, "loss": 0.0127, "step": 24590 }, { "epoch": 6.346367489527973, "grad_norm": 0.20791979134082794, "learning_rate": 2.0196818196008573e-05, "loss": 0.0149, "step": 24600 }, { "epoch": 6.3490165400089404, "grad_norm": 0.15230217576026917, "learning_rate": 2.018333788915118e-05, "loss": 0.0196, "step": 24610 }, { "epoch": 6.351665590489909, "grad_norm": 0.2286357879638672, "learning_rate": 2.016985749899646e-05, "loss": 0.0141, "step": 24620 }, { "epoch": 6.354314640970877, "grad_norm": 0.15895210206508636, "learning_rate": 2.0156377031669074e-05, "loss": 0.0132, "step": 24630 }, { "epoch": 6.3569636914518455, "grad_norm": 0.1633041948080063, "learning_rate": 2.0142896493293705e-05, "loss": 0.024, "step": 24640 }, { "epoch": 6.359612741932813, "grad_norm": 0.1940462589263916, "learning_rate": 2.0129415889995073e-05, "loss": 0.0139, "step": 24650 }, { "epoch": 6.362261792413782, "grad_norm": 0.19415581226348877, "learning_rate": 2.0115935227897915e-05, "loss": 0.0145, "step": 24660 }, { "epoch": 6.36491084289475, "grad_norm": 0.1714337319135666, "learning_rate": 2.0102454513127013e-05, "loss": 0.017, "step": 24670 }, { "epoch": 6.367559893375718, "grad_norm": 0.2829289734363556, "learning_rate": 2.0088973751807165e-05, "loss": 0.0155, "step": 24680 }, { "epoch": 6.370208943856686, "grad_norm": 0.22841329872608185, "learning_rate": 2.007549295006318e-05, "loss": 0.0191, "step": 24690 }, { "epoch": 6.372857994337655, "grad_norm": 0.21563342213630676, "learning_rate": 2.0062012114019903e-05, "loss": 0.0175, "step": 24700 }, { "epoch": 6.375507044818622, "grad_norm": 0.19913151860237122, "learning_rate": 2.0048531249802177e-05, "loss": 0.0184, "step": 24710 }, { "epoch": 6.378156095299591, "grad_norm": 0.17689645290374756, "learning_rate": 2.0035050363534876e-05, "loss": 0.0134, "step": 24720 }, { "epoch": 6.38080514578056, "grad_norm": 0.19643354415893555, "learning_rate": 2.0021569461342878e-05, "loss": 0.0195, "step": 24730 }, { "epoch": 6.383454196261527, "grad_norm": 0.1646076738834381, "learning_rate": 2.0008088549351058e-05, "loss": 0.0128, "step": 24740 }, { "epoch": 6.386103246742496, "grad_norm": 0.20844583213329315, "learning_rate": 1.9994607633684304e-05, "loss": 0.0151, "step": 24750 }, { "epoch": 6.388752297223464, "grad_norm": 0.2326686680316925, "learning_rate": 1.99811267204675e-05, "loss": 0.0167, "step": 24760 }, { "epoch": 6.3914013477044325, "grad_norm": 0.25642824172973633, "learning_rate": 1.9967645815825548e-05, "loss": 0.0144, "step": 24770 }, { "epoch": 6.3940503981854, "grad_norm": 0.16035497188568115, "learning_rate": 1.9954164925883316e-05, "loss": 0.015, "step": 24780 }, { "epoch": 6.396699448666369, "grad_norm": 0.1593172252178192, "learning_rate": 1.994068405676569e-05, "loss": 0.0161, "step": 24790 }, { "epoch": 6.399348499147337, "grad_norm": 0.15887658298015594, "learning_rate": 1.9927203214597538e-05, "loss": 0.0135, "step": 24800 }, { "epoch": 6.401997549628305, "grad_norm": 0.20346663892269135, "learning_rate": 1.9913722405503704e-05, "loss": 0.0184, "step": 24810 }, { "epoch": 6.404646600109273, "grad_norm": 0.18024206161499023, "learning_rate": 1.9900241635609033e-05, "loss": 0.0211, "step": 24820 }, { "epoch": 6.407295650590242, "grad_norm": 0.3013671040534973, "learning_rate": 1.9886760911038363e-05, "loss": 0.0169, "step": 24830 }, { "epoch": 6.409944701071209, "grad_norm": 0.16770537197589874, "learning_rate": 1.987328023791647e-05, "loss": 0.0159, "step": 24840 }, { "epoch": 6.412593751552178, "grad_norm": 0.16426295042037964, "learning_rate": 1.9859799622368157e-05, "loss": 0.0125, "step": 24850 }, { "epoch": 6.415242802033147, "grad_norm": 0.21788346767425537, "learning_rate": 1.984631907051815e-05, "loss": 0.0174, "step": 24860 }, { "epoch": 6.417891852514114, "grad_norm": 0.1903427243232727, "learning_rate": 1.9832838588491192e-05, "loss": 0.0181, "step": 24870 }, { "epoch": 6.420540902995083, "grad_norm": 0.19113053381443024, "learning_rate": 1.9819358182411974e-05, "loss": 0.0182, "step": 24880 }, { "epoch": 6.423189953476051, "grad_norm": 0.15123803913593292, "learning_rate": 1.980587785840514e-05, "loss": 0.0152, "step": 24890 }, { "epoch": 6.4258390039570195, "grad_norm": 0.16768094897270203, "learning_rate": 1.9792397622595324e-05, "loss": 0.0168, "step": 24900 }, { "epoch": 6.428488054437987, "grad_norm": 0.16511386632919312, "learning_rate": 1.97789174811071e-05, "loss": 0.0199, "step": 24910 }, { "epoch": 6.431137104918956, "grad_norm": 0.1606045365333557, "learning_rate": 1.976543744006499e-05, "loss": 0.0149, "step": 24920 }, { "epoch": 6.433786155399924, "grad_norm": 0.1618891954421997, "learning_rate": 1.9751957505593515e-05, "loss": 0.0139, "step": 24930 }, { "epoch": 6.436435205880892, "grad_norm": 0.18794287741184235, "learning_rate": 1.973847768381709e-05, "loss": 0.0135, "step": 24940 }, { "epoch": 6.43908425636186, "grad_norm": 0.18032073974609375, "learning_rate": 1.9724997980860118e-05, "loss": 0.0131, "step": 24950 }, { "epoch": 6.441733306842829, "grad_norm": 0.20993781089782715, "learning_rate": 1.9711518402846944e-05, "loss": 0.0156, "step": 24960 }, { "epoch": 6.444382357323796, "grad_norm": 0.1648184210062027, "learning_rate": 1.9698038955901835e-05, "loss": 0.0164, "step": 24970 }, { "epoch": 6.447031407804765, "grad_norm": 0.19139660894870758, "learning_rate": 1.9684559646149027e-05, "loss": 0.0148, "step": 24980 }, { "epoch": 5.200803735157561, "grad_norm": 0.17137540876865387, "learning_rate": 1.967108047971266e-05, "loss": 0.0158, "step": 24990 }, { "epoch": 5.202884602879401, "grad_norm": 0.19072118401527405, "learning_rate": 1.9657601462716842e-05, "loss": 0.0143, "step": 25000 }, { "epoch": 5.204965470601241, "grad_norm": 0.1782013326883316, "learning_rate": 1.9644122601285596e-05, "loss": 0.0125, "step": 25010 }, { "epoch": 5.207046338323081, "grad_norm": 0.18376041948795319, "learning_rate": 1.9630643901542874e-05, "loss": 0.0151, "step": 25020 }, { "epoch": 5.20912720604492, "grad_norm": 0.18284699320793152, "learning_rate": 1.961716536961256e-05, "loss": 0.0172, "step": 25030 }, { "epoch": 5.211208073766761, "grad_norm": 0.1930585354566574, "learning_rate": 1.960368701161846e-05, "loss": 0.0122, "step": 25040 }, { "epoch": 5.2132889414886, "grad_norm": 0.13902150094509125, "learning_rate": 1.9590208833684297e-05, "loss": 0.013, "step": 25050 }, { "epoch": 5.215369809210441, "grad_norm": 0.22776579856872559, "learning_rate": 1.9576730841933725e-05, "loss": 0.015, "step": 25060 }, { "epoch": 5.21745067693228, "grad_norm": 0.14541983604431152, "learning_rate": 1.9563253042490287e-05, "loss": 0.0122, "step": 25070 }, { "epoch": 5.219531544654121, "grad_norm": 0.1616363227367401, "learning_rate": 1.954977544147747e-05, "loss": 0.0145, "step": 25080 }, { "epoch": 5.22161241237596, "grad_norm": 0.1816519796848297, "learning_rate": 1.953629804501866e-05, "loss": 0.0131, "step": 25090 }, { "epoch": 5.223693280097801, "grad_norm": 0.1482958048582077, "learning_rate": 1.9522820859237127e-05, "loss": 0.0162, "step": 25100 }, { "epoch": 5.2257741478196404, "grad_norm": 0.16515839099884033, "learning_rate": 1.9509343890256087e-05, "loss": 0.0133, "step": 25110 }, { "epoch": 5.227855015541481, "grad_norm": 0.13738195598125458, "learning_rate": 1.949586714419862e-05, "loss": 0.0123, "step": 25120 }, { "epoch": 5.2299358832633205, "grad_norm": 0.27616140246391296, "learning_rate": 1.9482390627187725e-05, "loss": 0.0156, "step": 25130 }, { "epoch": 5.232016750985161, "grad_norm": 0.17256587743759155, "learning_rate": 1.9468914345346294e-05, "loss": 0.0121, "step": 25140 }, { "epoch": 5.2340976187070005, "grad_norm": 0.16098135709762573, "learning_rate": 1.945543830479711e-05, "loss": 0.0126, "step": 25150 }, { "epoch": 5.236178486428841, "grad_norm": 0.12830302119255066, "learning_rate": 1.944196251166285e-05, "loss": 0.0131, "step": 25160 }, { "epoch": 5.238259354150681, "grad_norm": 0.13806670904159546, "learning_rate": 1.942848697206606e-05, "loss": 0.0133, "step": 25170 }, { "epoch": 5.240340221872521, "grad_norm": 0.21071235835552216, "learning_rate": 1.9415011692129195e-05, "loss": 0.0139, "step": 25180 }, { "epoch": 5.242421089594361, "grad_norm": 0.16545610129833221, "learning_rate": 1.9401536677974592e-05, "loss": 0.0141, "step": 25190 }, { "epoch": 5.244501957316201, "grad_norm": 0.1734047532081604, "learning_rate": 1.9388061935724444e-05, "loss": 0.0153, "step": 25200 }, { "epoch": 5.246582825038041, "grad_norm": 0.22277452051639557, "learning_rate": 1.937458747150084e-05, "loss": 0.0147, "step": 25210 }, { "epoch": 5.248663692759881, "grad_norm": 0.17141196131706238, "learning_rate": 1.9361113291425747e-05, "loss": 0.0144, "step": 25220 }, { "epoch": 5.250744560481721, "grad_norm": 0.16552603244781494, "learning_rate": 1.9347639401620978e-05, "loss": 0.0158, "step": 25230 }, { "epoch": 5.252825428203561, "grad_norm": 0.20854952931404114, "learning_rate": 1.933416580820824e-05, "loss": 0.0153, "step": 25240 }, { "epoch": 5.254906295925401, "grad_norm": 0.15089458227157593, "learning_rate": 1.9320692517309087e-05, "loss": 0.0188, "step": 25250 }, { "epoch": 5.256987163647241, "grad_norm": 0.16022448241710663, "learning_rate": 1.930721953504495e-05, "loss": 0.0144, "step": 25260 }, { "epoch": 5.259068031369081, "grad_norm": 0.15377101302146912, "learning_rate": 1.929374686753711e-05, "loss": 0.0193, "step": 25270 }, { "epoch": 5.261148899090921, "grad_norm": 0.17997407913208008, "learning_rate": 1.9280274520906704e-05, "loss": 0.0131, "step": 25280 }, { "epoch": 5.263229766812761, "grad_norm": 0.17616941034793854, "learning_rate": 1.926680250127474e-05, "loss": 0.0145, "step": 25290 }, { "epoch": 5.265310634534601, "grad_norm": 0.17031162977218628, "learning_rate": 1.9253330814762042e-05, "loss": 0.0204, "step": 25300 }, { "epoch": 5.267391502256441, "grad_norm": 0.160043403506279, "learning_rate": 1.9239859467489325e-05, "loss": 0.0149, "step": 25310 }, { "epoch": 5.269472369978281, "grad_norm": 0.1253969967365265, "learning_rate": 1.922638846557713e-05, "loss": 0.0146, "step": 25320 }, { "epoch": 5.271553237700121, "grad_norm": 0.18791693449020386, "learning_rate": 1.921291781514583e-05, "loss": 0.0184, "step": 25330 }, { "epoch": 5.273634105421961, "grad_norm": 0.14052584767341614, "learning_rate": 1.9199447522315657e-05, "loss": 0.0132, "step": 25340 }, { "epoch": 5.275714973143801, "grad_norm": 0.15439161658287048, "learning_rate": 1.9185977593206676e-05, "loss": 0.0137, "step": 25350 }, { "epoch": 5.277795840865641, "grad_norm": 0.19920651614665985, "learning_rate": 1.9172508033938776e-05, "loss": 0.016, "step": 25360 }, { "epoch": 5.279876708587481, "grad_norm": 0.13279683887958527, "learning_rate": 1.9159038850631687e-05, "loss": 0.0123, "step": 25370 }, { "epoch": 5.281957576309321, "grad_norm": 0.18041202425956726, "learning_rate": 1.9145570049404972e-05, "loss": 0.0113, "step": 25380 }, { "epoch": 5.284038444031161, "grad_norm": 0.23813143372535706, "learning_rate": 1.913210163637801e-05, "loss": 0.0144, "step": 25390 }, { "epoch": 5.286119311753001, "grad_norm": 0.15847551822662354, "learning_rate": 1.911863361767002e-05, "loss": 0.0164, "step": 25400 }, { "epoch": 5.288200179474841, "grad_norm": 0.20158298313617706, "learning_rate": 1.910516599940001e-05, "loss": 0.0128, "step": 25410 }, { "epoch": 5.290281047196681, "grad_norm": 0.23492570221424103, "learning_rate": 1.909169878768685e-05, "loss": 0.017, "step": 25420 }, { "epoch": 5.292361914918521, "grad_norm": 0.16576650738716125, "learning_rate": 1.9078231988649187e-05, "loss": 0.0154, "step": 25430 }, { "epoch": 5.2944427826403615, "grad_norm": 0.24577254056930542, "learning_rate": 1.90647656084055e-05, "loss": 0.0138, "step": 25440 }, { "epoch": 5.296523650362201, "grad_norm": 0.13747747242450714, "learning_rate": 1.9051299653074077e-05, "loss": 0.0155, "step": 25450 }, { "epoch": 5.298604518084041, "grad_norm": 0.13734903931617737, "learning_rate": 1.9037834128773e-05, "loss": 0.0128, "step": 25460 }, { "epoch": 5.300685385805881, "grad_norm": 0.1756000518798828, "learning_rate": 1.9024369041620186e-05, "loss": 0.0178, "step": 25470 }, { "epoch": 5.302766253527721, "grad_norm": 0.1590210646390915, "learning_rate": 1.9010904397733305e-05, "loss": 0.0148, "step": 25480 }, { "epoch": 5.304847121249561, "grad_norm": 0.13026957213878632, "learning_rate": 1.8997440203229867e-05, "loss": 0.0174, "step": 25490 }, { "epoch": 5.306927988971401, "grad_norm": 0.20190174877643585, "learning_rate": 1.898397646422716e-05, "loss": 0.0153, "step": 25500 }, { "epoch": 5.309008856693241, "grad_norm": 0.2334361970424652, "learning_rate": 1.8970513186842278e-05, "loss": 0.0129, "step": 25510 }, { "epoch": 5.311089724415081, "grad_norm": 0.18989676237106323, "learning_rate": 1.8957050377192077e-05, "loss": 0.0131, "step": 25520 }, { "epoch": 5.313170592136921, "grad_norm": 0.17178963124752045, "learning_rate": 1.894358804139324e-05, "loss": 0.0135, "step": 25530 }, { "epoch": 5.315251459858761, "grad_norm": 0.1860206574201584, "learning_rate": 1.8930126185562196e-05, "loss": 0.0179, "step": 25540 }, { "epoch": 5.317332327580601, "grad_norm": 0.14649298787117004, "learning_rate": 1.8916664815815185e-05, "loss": 0.0142, "step": 25550 }, { "epoch": 5.319413195302441, "grad_norm": 0.19328202307224274, "learning_rate": 1.8903203938268208e-05, "loss": 0.017, "step": 25560 }, { "epoch": 5.321494063024281, "grad_norm": 0.13057826459407806, "learning_rate": 1.8889743559037053e-05, "loss": 0.0144, "step": 25570 }, { "epoch": 5.323574930746121, "grad_norm": 0.18297547101974487, "learning_rate": 1.8876283684237285e-05, "loss": 0.0155, "step": 25580 }, { "epoch": 5.325655798467961, "grad_norm": 0.15102644264698029, "learning_rate": 1.886282431998422e-05, "loss": 0.0202, "step": 25590 }, { "epoch": 5.327736666189801, "grad_norm": 0.15959212183952332, "learning_rate": 1.8849365472392965e-05, "loss": 0.0143, "step": 25600 }, { "epoch": 5.329817533911641, "grad_norm": 0.1834830641746521, "learning_rate": 1.8835907147578374e-05, "loss": 0.0161, "step": 25610 }, { "epoch": 5.331898401633481, "grad_norm": 0.17716051638126373, "learning_rate": 1.8822449351655078e-05, "loss": 0.0124, "step": 25620 }, { "epoch": 5.333979269355321, "grad_norm": 0.14724001288414001, "learning_rate": 1.880899209073746e-05, "loss": 0.0141, "step": 25630 }, { "epoch": 5.336060137077161, "grad_norm": 0.1466480791568756, "learning_rate": 1.8795535370939657e-05, "loss": 0.0162, "step": 25640 }, { "epoch": 5.338141004799001, "grad_norm": 0.17417165637016296, "learning_rate": 1.8782079198375564e-05, "loss": 0.0111, "step": 25650 }, { "epoch": 5.340221872520841, "grad_norm": 0.17235589027404785, "learning_rate": 1.8768623579158844e-05, "loss": 0.014, "step": 25660 }, { "epoch": 5.3423027402426815, "grad_norm": 0.28552109003067017, "learning_rate": 1.8755168519402865e-05, "loss": 0.0144, "step": 25670 }, { "epoch": 5.344383607964521, "grad_norm": 0.14576227962970734, "learning_rate": 1.8741714025220794e-05, "loss": 0.0135, "step": 25680 }, { "epoch": 5.3464644756863615, "grad_norm": 0.18343323469161987, "learning_rate": 1.87282601027255e-05, "loss": 0.0116, "step": 25690 }, { "epoch": 5.348545343408201, "grad_norm": 0.20623348653316498, "learning_rate": 1.8714806758029604e-05, "loss": 0.016, "step": 25700 }, { "epoch": 5.3506262111300416, "grad_norm": 0.12583796679973602, "learning_rate": 1.8701353997245487e-05, "loss": 0.0131, "step": 25710 }, { "epoch": 5.352707078851881, "grad_norm": 0.15215487778186798, "learning_rate": 1.8687901826485226e-05, "loss": 0.0156, "step": 25720 }, { "epoch": 5.354787946573722, "grad_norm": 0.16510960459709167, "learning_rate": 1.8674450251860665e-05, "loss": 0.013, "step": 25730 }, { "epoch": 5.356868814295561, "grad_norm": 0.22802871465682983, "learning_rate": 1.8660999279483353e-05, "loss": 0.0131, "step": 25740 }, { "epoch": 5.358949682017402, "grad_norm": 0.15858136117458344, "learning_rate": 1.8647548915464568e-05, "loss": 0.0137, "step": 25750 }, { "epoch": 5.361030549739241, "grad_norm": 0.14375296235084534, "learning_rate": 1.8634099165915335e-05, "loss": 0.0165, "step": 25760 }, { "epoch": 5.363111417461082, "grad_norm": 0.17402341961860657, "learning_rate": 1.8620650036946367e-05, "loss": 0.0169, "step": 25770 }, { "epoch": 5.365192285182921, "grad_norm": 0.18957015872001648, "learning_rate": 1.8607201534668114e-05, "loss": 0.015, "step": 25780 }, { "epoch": 5.367273152904762, "grad_norm": 0.15647269785404205, "learning_rate": 1.8593753665190748e-05, "loss": 0.0113, "step": 25790 }, { "epoch": 5.369354020626601, "grad_norm": 0.153586283326149, "learning_rate": 1.8580306434624127e-05, "loss": 0.0133, "step": 25800 }, { "epoch": 5.371434888348441, "grad_norm": 0.18048329651355743, "learning_rate": 1.8566859849077855e-05, "loss": 0.0168, "step": 25810 }, { "epoch": 5.373515756070281, "grad_norm": 0.17350393533706665, "learning_rate": 1.85534139146612e-05, "loss": 0.0163, "step": 25820 }, { "epoch": 5.375596623792122, "grad_norm": 0.20901452004909515, "learning_rate": 1.8539968637483174e-05, "loss": 0.0137, "step": 25830 }, { "epoch": 5.377677491513961, "grad_norm": 0.1793689876794815, "learning_rate": 1.852652402365247e-05, "loss": 0.0148, "step": 25840 }, { "epoch": 5.379758359235801, "grad_norm": 0.15162524580955505, "learning_rate": 1.8513080079277482e-05, "loss": 0.0134, "step": 25850 }, { "epoch": 5.381839226957641, "grad_norm": 0.17104865610599518, "learning_rate": 1.8499636810466298e-05, "loss": 0.0128, "step": 25860 }, { "epoch": 5.383920094679481, "grad_norm": 0.18259766697883606, "learning_rate": 1.8486194223326707e-05, "loss": 0.02, "step": 25870 }, { "epoch": 5.386000962401321, "grad_norm": 0.21646730601787567, "learning_rate": 1.8472752323966178e-05, "loss": 0.0156, "step": 25880 }, { "epoch": 5.388081830123161, "grad_norm": 0.14616301655769348, "learning_rate": 1.8459311118491884e-05, "loss": 0.0145, "step": 25890 }, { "epoch": 5.3901626978450015, "grad_norm": 0.14948515594005585, "learning_rate": 1.8445870613010653e-05, "loss": 0.0176, "step": 25900 }, { "epoch": 5.392243565566841, "grad_norm": 0.18757057189941406, "learning_rate": 1.8432430813629026e-05, "loss": 0.0152, "step": 25910 }, { "epoch": 5.3943244332886815, "grad_norm": 0.2461482584476471, "learning_rate": 1.841899172645322e-05, "loss": 0.0164, "step": 25920 }, { "epoch": 5.396405301010521, "grad_norm": 0.137393981218338, "learning_rate": 1.8405553357589094e-05, "loss": 0.0169, "step": 25930 }, { "epoch": 5.3984861687323615, "grad_norm": 0.2690237760543823, "learning_rate": 1.8392115713142233e-05, "loss": 0.0173, "step": 25940 }, { "epoch": 5.400567036454201, "grad_norm": 0.15332847833633423, "learning_rate": 1.8378678799217846e-05, "loss": 0.0135, "step": 25950 }, { "epoch": 5.402647904176042, "grad_norm": 0.15519404411315918, "learning_rate": 1.836524262192084e-05, "loss": 0.013, "step": 25960 }, { "epoch": 5.404728771897881, "grad_norm": 0.15050667524337769, "learning_rate": 1.8351807187355772e-05, "loss": 0.0161, "step": 25970 }, { "epoch": 5.406809639619722, "grad_norm": 0.1547541469335556, "learning_rate": 1.833837250162687e-05, "loss": 0.0141, "step": 25980 }, { "epoch": 5.408890507341561, "grad_norm": 0.1615159958600998, "learning_rate": 1.8324938570838023e-05, "loss": 0.0133, "step": 25990 }, { "epoch": 5.410971375063402, "grad_norm": 0.22096851468086243, "learning_rate": 1.8311505401092756e-05, "loss": 0.0181, "step": 26000 }, { "epoch": 5.413052242785241, "grad_norm": 0.17382697761058807, "learning_rate": 1.8298072998494277e-05, "loss": 0.0146, "step": 26010 }, { "epoch": 5.415133110507082, "grad_norm": 0.18964888155460358, "learning_rate": 1.828464136914544e-05, "loss": 0.0144, "step": 26020 }, { "epoch": 5.417213978228921, "grad_norm": 0.2393834888935089, "learning_rate": 1.8271210519148722e-05, "loss": 0.0155, "step": 26030 }, { "epoch": 5.419294845950762, "grad_norm": 0.16043643653392792, "learning_rate": 1.8257780454606274e-05, "loss": 0.0146, "step": 26040 }, { "epoch": 5.421375713672601, "grad_norm": 0.13404083251953125, "learning_rate": 1.8244351181619893e-05, "loss": 0.0121, "step": 26050 }, { "epoch": 5.423456581394442, "grad_norm": 0.2020963877439499, "learning_rate": 1.8230922706290988e-05, "loss": 0.0167, "step": 26060 }, { "epoch": 5.425537449116281, "grad_norm": 0.1649845391511917, "learning_rate": 1.8217495034720632e-05, "loss": 0.0137, "step": 26070 }, { "epoch": 5.427618316838122, "grad_norm": 0.14278879761695862, "learning_rate": 1.8204068173009512e-05, "loss": 0.0143, "step": 26080 }, { "epoch": 5.429699184559961, "grad_norm": 0.20041024684906006, "learning_rate": 1.8190642127257966e-05, "loss": 0.0152, "step": 26090 }, { "epoch": 5.431780052281802, "grad_norm": 0.17277541756629944, "learning_rate": 1.817721690356596e-05, "loss": 0.0188, "step": 26100 }, { "epoch": 5.433860920003641, "grad_norm": 0.16568568348884583, "learning_rate": 1.8163792508033054e-05, "loss": 0.013, "step": 26110 }, { "epoch": 5.435941787725482, "grad_norm": 0.15028099715709686, "learning_rate": 1.8150368946758484e-05, "loss": 0.0135, "step": 26120 }, { "epoch": 5.438022655447321, "grad_norm": 0.22061441838741302, "learning_rate": 1.8136946225841062e-05, "loss": 0.0136, "step": 26130 }, { "epoch": 5.440103523169162, "grad_norm": 0.12264414876699448, "learning_rate": 1.8123524351379247e-05, "loss": 0.0134, "step": 26140 }, { "epoch": 5.4421843908910015, "grad_norm": 0.19976705312728882, "learning_rate": 1.8110103329471106e-05, "loss": 0.0163, "step": 26150 }, { "epoch": 5.444265258612842, "grad_norm": 0.18691785633563995, "learning_rate": 1.80966831662143e-05, "loss": 0.013, "step": 26160 }, { "epoch": 5.4463461263346815, "grad_norm": 0.17124806344509125, "learning_rate": 1.8083263867706127e-05, "loss": 0.0129, "step": 26170 }, { "epoch": 5.448426994056522, "grad_norm": 0.1522866040468216, "learning_rate": 1.8069845440043487e-05, "loss": 0.013, "step": 26180 }, { "epoch": 5.4505078617783616, "grad_norm": 0.17798860371112823, "learning_rate": 1.8056427889322862e-05, "loss": 0.0143, "step": 26190 }, { "epoch": 5.452588729500201, "grad_norm": 0.1549074351787567, "learning_rate": 1.8043011221640363e-05, "loss": 0.013, "step": 26200 }, { "epoch": 5.454669597222042, "grad_norm": 0.15663816034793854, "learning_rate": 1.802959544309168e-05, "loss": 0.0102, "step": 26210 }, { "epoch": 5.456750464943882, "grad_norm": 0.11726734042167664, "learning_rate": 1.801618055977211e-05, "loss": 0.0133, "step": 26220 }, { "epoch": 5.458831332665722, "grad_norm": 0.12179184705018997, "learning_rate": 1.8002766577776545e-05, "loss": 0.0123, "step": 26230 }, { "epoch": 5.460912200387561, "grad_norm": 0.16338825225830078, "learning_rate": 1.7989353503199457e-05, "loss": 0.0173, "step": 26240 }, { "epoch": 5.462993068109402, "grad_norm": 0.16513825953006744, "learning_rate": 1.797594134213492e-05, "loss": 0.0162, "step": 26250 }, { "epoch": 5.465073935831241, "grad_norm": 0.2000940889120102, "learning_rate": 1.7962530100676573e-05, "loss": 0.0139, "step": 26260 }, { "epoch": 5.467154803553082, "grad_norm": 0.15730133652687073, "learning_rate": 1.794911978491765e-05, "loss": 0.0134, "step": 26270 }, { "epoch": 5.469235671274921, "grad_norm": 0.1721474528312683, "learning_rate": 1.793571040095098e-05, "loss": 0.0125, "step": 26280 }, { "epoch": 5.471316538996762, "grad_norm": 0.17023514211177826, "learning_rate": 1.792230195486893e-05, "loss": 0.0141, "step": 26290 }, { "epoch": 5.473397406718601, "grad_norm": 0.16219599545001984, "learning_rate": 1.7908894452763482e-05, "loss": 0.0192, "step": 26300 }, { "epoch": 5.475478274440442, "grad_norm": 0.14091834425926208, "learning_rate": 1.7895487900726152e-05, "loss": 0.0105, "step": 26310 }, { "epoch": 5.477559142162281, "grad_norm": 0.15971678495407104, "learning_rate": 1.7882082304848056e-05, "loss": 0.014, "step": 26320 }, { "epoch": 5.479640009884122, "grad_norm": 0.21315868198871613, "learning_rate": 1.7868677671219856e-05, "loss": 0.0145, "step": 26330 }, { "epoch": 5.481720877605961, "grad_norm": 0.16145722568035126, "learning_rate": 1.7855274005931786e-05, "loss": 0.0141, "step": 26340 }, { "epoch": 5.483801745327802, "grad_norm": 0.152473583817482, "learning_rate": 1.7841871315073627e-05, "loss": 0.0149, "step": 26350 }, { "epoch": 5.485882613049641, "grad_norm": 0.14547979831695557, "learning_rate": 1.7828469604734747e-05, "loss": 0.0142, "step": 26360 }, { "epoch": 5.487963480771482, "grad_norm": 0.15764367580413818, "learning_rate": 1.7815068881004024e-05, "loss": 0.0141, "step": 26370 }, { "epoch": 5.4900443484933215, "grad_norm": 0.18667419254779816, "learning_rate": 1.7801669149969932e-05, "loss": 0.0144, "step": 26380 }, { "epoch": 5.492125216215162, "grad_norm": 0.16331875324249268, "learning_rate": 1.7788270417720463e-05, "loss": 0.0131, "step": 26390 }, { "epoch": 5.4942060839370015, "grad_norm": 0.2171240746974945, "learning_rate": 1.7774872690343163e-05, "loss": 0.0128, "step": 26400 }, { "epoch": 5.496286951658842, "grad_norm": 0.23127605020999908, "learning_rate": 1.7761475973925145e-05, "loss": 0.0151, "step": 26410 }, { "epoch": 5.4983678193806815, "grad_norm": 0.15595181286334991, "learning_rate": 1.774808027455302e-05, "loss": 0.0122, "step": 26420 }, { "epoch": 5.500448687102522, "grad_norm": 0.16292855143547058, "learning_rate": 1.7734685598312975e-05, "loss": 0.0152, "step": 26430 }, { "epoch": 5.502529554824362, "grad_norm": 0.16100478172302246, "learning_rate": 1.7721291951290704e-05, "loss": 0.0148, "step": 26440 }, { "epoch": 5.504610422546202, "grad_norm": 0.14058032631874084, "learning_rate": 1.7707899339571457e-05, "loss": 0.0144, "step": 26450 }, { "epoch": 5.506691290268042, "grad_norm": 0.17623800039291382, "learning_rate": 1.7694507769240002e-05, "loss": 0.0125, "step": 26460 }, { "epoch": 5.508772157989882, "grad_norm": 0.17084510624408722, "learning_rate": 1.7681117246380623e-05, "loss": 0.0154, "step": 26470 }, { "epoch": 5.510853025711722, "grad_norm": 0.22612826526165009, "learning_rate": 1.7667727777077147e-05, "loss": 0.0126, "step": 26480 }, { "epoch": 5.512933893433562, "grad_norm": 0.2520594298839569, "learning_rate": 1.7654339367412926e-05, "loss": 0.0143, "step": 26490 }, { "epoch": 5.515014761155402, "grad_norm": 0.1830461025238037, "learning_rate": 1.7640952023470796e-05, "loss": 0.0133, "step": 26500 }, { "epoch": 5.517095628877242, "grad_norm": 0.16306522488594055, "learning_rate": 1.7627565751333157e-05, "loss": 0.0127, "step": 26510 }, { "epoch": 5.519176496599082, "grad_norm": 0.16155828535556793, "learning_rate": 1.7614180557081877e-05, "loss": 0.0133, "step": 26520 }, { "epoch": 5.521257364320922, "grad_norm": 0.1491851657629013, "learning_rate": 1.7600796446798362e-05, "loss": 0.0135, "step": 26530 }, { "epoch": 5.523338232042762, "grad_norm": 0.1854010969400406, "learning_rate": 1.7587413426563528e-05, "loss": 0.0109, "step": 26540 }, { "epoch": 5.525419099764602, "grad_norm": 0.18520958721637726, "learning_rate": 1.7574031502457772e-05, "loss": 0.0125, "step": 26550 }, { "epoch": 5.527499967486442, "grad_norm": 0.19445949792861938, "learning_rate": 1.7560650680561018e-05, "loss": 0.0118, "step": 26560 }, { "epoch": 5.529580835208282, "grad_norm": 0.19678428769111633, "learning_rate": 1.754727096695267e-05, "loss": 0.0162, "step": 26570 }, { "epoch": 5.531661702930122, "grad_norm": 0.15198567509651184, "learning_rate": 1.7533892367711632e-05, "loss": 0.0175, "step": 26580 }, { "epoch": 5.533742570651961, "grad_norm": 0.13987578451633453, "learning_rate": 1.752051488891632e-05, "loss": 0.0132, "step": 26590 }, { "epoch": 5.535823438373802, "grad_norm": 0.18121865391731262, "learning_rate": 1.7507138536644613e-05, "loss": 0.0139, "step": 26600 }, { "epoch": 5.537904306095642, "grad_norm": 0.1751001626253128, "learning_rate": 1.7493763316973905e-05, "loss": 0.016, "step": 26610 }, { "epoch": 5.539985173817482, "grad_norm": 0.15952643752098083, "learning_rate": 1.748038923598106e-05, "loss": 0.0119, "step": 26620 }, { "epoch": 5.5420660415393215, "grad_norm": 0.15826338529586792, "learning_rate": 1.746701629974242e-05, "loss": 0.0125, "step": 26630 }, { "epoch": 5.544146909261162, "grad_norm": 0.18028958141803741, "learning_rate": 1.7453644514333827e-05, "loss": 0.012, "step": 26640 }, { "epoch": 5.546227776983002, "grad_norm": 0.20213620364665985, "learning_rate": 1.744027388583057e-05, "loss": 0.0146, "step": 26650 }, { "epoch": 5.548308644704842, "grad_norm": 0.20918355882167816, "learning_rate": 1.7426904420307447e-05, "loss": 0.0144, "step": 26660 }, { "epoch": 5.5503895124266815, "grad_norm": 0.1553598791360855, "learning_rate": 1.74135361238387e-05, "loss": 0.0131, "step": 26670 }, { "epoch": 5.552470380148522, "grad_norm": 0.14079266786575317, "learning_rate": 1.7400169002498054e-05, "loss": 0.0129, "step": 26680 }, { "epoch": 5.554551247870362, "grad_norm": 0.17915190756320953, "learning_rate": 1.73868030623587e-05, "loss": 0.0173, "step": 26690 }, { "epoch": 5.556632115592202, "grad_norm": 0.24318507313728333, "learning_rate": 1.7373438309493282e-05, "loss": 0.0135, "step": 26700 }, { "epoch": 5.558712983314042, "grad_norm": 0.20411914587020874, "learning_rate": 1.7360074749973907e-05, "loss": 0.0119, "step": 26710 }, { "epoch": 5.560793851035882, "grad_norm": 0.19238615036010742, "learning_rate": 1.7346712389872163e-05, "loss": 0.019, "step": 26720 }, { "epoch": 5.562874718757722, "grad_norm": 0.1993524581193924, "learning_rate": 1.7333351235259056e-05, "loss": 0.0129, "step": 26730 }, { "epoch": 5.564955586479562, "grad_norm": 0.1668633371591568, "learning_rate": 1.7319991292205065e-05, "loss": 0.014, "step": 26740 }, { "epoch": 5.567036454201402, "grad_norm": 0.18557026982307434, "learning_rate": 1.7306632566780132e-05, "loss": 0.0119, "step": 26750 }, { "epoch": 5.569117321923242, "grad_norm": 0.1815817952156067, "learning_rate": 1.729327506505361e-05, "loss": 0.015, "step": 26760 }, { "epoch": 5.571198189645082, "grad_norm": 0.16499896347522736, "learning_rate": 1.727991879309433e-05, "loss": 0.0134, "step": 26770 }, { "epoch": 5.573279057366922, "grad_norm": 0.15485869348049164, "learning_rate": 1.7266563756970533e-05, "loss": 0.0165, "step": 26780 }, { "epoch": 5.575359925088762, "grad_norm": 0.1341298371553421, "learning_rate": 1.7253209962749934e-05, "loss": 0.0122, "step": 26790 }, { "epoch": 5.577440792810602, "grad_norm": 0.14069029688835144, "learning_rate": 1.723985741649966e-05, "loss": 0.015, "step": 26800 }, { "epoch": 5.579521660532442, "grad_norm": 0.16641351580619812, "learning_rate": 1.722650612428627e-05, "loss": 0.0126, "step": 26810 }, { "epoch": 5.581602528254282, "grad_norm": 0.1702808141708374, "learning_rate": 1.7213156092175775e-05, "loss": 0.0129, "step": 26820 }, { "epoch": 5.583683395976122, "grad_norm": 0.16125041246414185, "learning_rate": 1.7199807326233574e-05, "loss": 0.0153, "step": 26830 }, { "epoch": 5.585764263697962, "grad_norm": 0.14981257915496826, "learning_rate": 1.7186459832524527e-05, "loss": 0.0146, "step": 26840 }, { "epoch": 5.587845131419802, "grad_norm": 0.17125584185123444, "learning_rate": 1.7173113617112917e-05, "loss": 0.0132, "step": 26850 }, { "epoch": 5.589925999141642, "grad_norm": 0.1750120222568512, "learning_rate": 1.715976868606241e-05, "loss": 0.0177, "step": 26860 }, { "epoch": 5.592006866863482, "grad_norm": 0.2422703355550766, "learning_rate": 1.7146425045436122e-05, "loss": 0.0173, "step": 26870 }, { "epoch": 5.594087734585322, "grad_norm": 0.19178728759288788, "learning_rate": 1.7133082701296586e-05, "loss": 0.0129, "step": 26880 }, { "epoch": 5.596168602307162, "grad_norm": 0.20184099674224854, "learning_rate": 1.711974165970571e-05, "loss": 0.016, "step": 26890 }, { "epoch": 5.598249470029002, "grad_norm": 0.1563086211681366, "learning_rate": 1.7106401926724848e-05, "loss": 0.0168, "step": 26900 }, { "epoch": 5.600330337750842, "grad_norm": 0.16706781089305878, "learning_rate": 1.7093063508414728e-05, "loss": 0.012, "step": 26910 }, { "epoch": 5.6024112054726825, "grad_norm": 0.19936776161193848, "learning_rate": 1.7079726410835512e-05, "loss": 0.0192, "step": 26920 }, { "epoch": 5.604492073194522, "grad_norm": 0.18508806824684143, "learning_rate": 1.7066390640046747e-05, "loss": 0.0148, "step": 26930 }, { "epoch": 5.6065729409163625, "grad_norm": 0.12774758040905, "learning_rate": 1.7053056202107358e-05, "loss": 0.0124, "step": 26940 }, { "epoch": 5.608653808638202, "grad_norm": 0.175897479057312, "learning_rate": 1.7039723103075705e-05, "loss": 0.0162, "step": 26950 }, { "epoch": 5.6107346763600425, "grad_norm": 0.1810985654592514, "learning_rate": 1.70263913490095e-05, "loss": 0.0124, "step": 26960 }, { "epoch": 5.612815544081882, "grad_norm": 0.13632546365261078, "learning_rate": 1.701306094596587e-05, "loss": 0.0116, "step": 26970 }, { "epoch": 5.614896411803722, "grad_norm": 0.14301156997680664, "learning_rate": 1.6999731900001327e-05, "loss": 0.0134, "step": 26980 }, { "epoch": 5.616977279525562, "grad_norm": 0.16345807909965515, "learning_rate": 1.6986404217171744e-05, "loss": 0.0126, "step": 26990 }, { "epoch": 5.619058147247403, "grad_norm": 0.21610203385353088, "learning_rate": 1.69730779035324e-05, "loss": 0.012, "step": 27000 }, { "epoch": 5.621139014969242, "grad_norm": 0.2447291910648346, "learning_rate": 1.6959752965137948e-05, "loss": 0.0136, "step": 27010 }, { "epoch": 5.623219882691082, "grad_norm": 0.15198738873004913, "learning_rate": 1.6946429408042398e-05, "loss": 0.0139, "step": 27020 }, { "epoch": 5.625300750412922, "grad_norm": 0.14067602157592773, "learning_rate": 1.693310723829915e-05, "loss": 0.0128, "step": 27030 }, { "epoch": 5.627381618134763, "grad_norm": 0.2304868996143341, "learning_rate": 1.6919786461960967e-05, "loss": 0.0123, "step": 27040 }, { "epoch": 5.629462485856602, "grad_norm": 0.2044217586517334, "learning_rate": 1.6906467085079975e-05, "loss": 0.0137, "step": 27050 }, { "epoch": 5.631543353578442, "grad_norm": 0.19294941425323486, "learning_rate": 1.689314911370768e-05, "loss": 0.0125, "step": 27060 }, { "epoch": 5.633624221300282, "grad_norm": 0.17100247740745544, "learning_rate": 1.687983255389493e-05, "loss": 0.0131, "step": 27070 }, { "epoch": 5.635705089022122, "grad_norm": 0.22401319444179535, "learning_rate": 1.6866517411691953e-05, "loss": 0.0157, "step": 27080 }, { "epoch": 5.637785956743962, "grad_norm": 0.1934623420238495, "learning_rate": 1.68532036931483e-05, "loss": 0.0141, "step": 27090 }, { "epoch": 5.639866824465802, "grad_norm": 0.16677945852279663, "learning_rate": 1.6839891404312905e-05, "loss": 0.0128, "step": 27100 }, { "epoch": 5.641947692187642, "grad_norm": 0.20893751084804535, "learning_rate": 1.6826580551234054e-05, "loss": 0.014, "step": 27110 }, { "epoch": 5.644028559909482, "grad_norm": 0.1798425316810608, "learning_rate": 1.6813271139959355e-05, "loss": 0.012, "step": 27120 }, { "epoch": 5.646109427631322, "grad_norm": 0.17650124430656433, "learning_rate": 1.679996317653579e-05, "loss": 0.0137, "step": 27130 }, { "epoch": 5.648190295353162, "grad_norm": 0.1495649218559265, "learning_rate": 1.6786656667009643e-05, "loss": 0.0155, "step": 27140 }, { "epoch": 5.650271163075002, "grad_norm": 0.16214725375175476, "learning_rate": 1.6773351617426588e-05, "loss": 0.0131, "step": 27150 }, { "epoch": 5.652352030796842, "grad_norm": 0.15603801608085632, "learning_rate": 1.6760048033831607e-05, "loss": 0.0115, "step": 27160 }, { "epoch": 5.6544328985186825, "grad_norm": 0.2043343335390091, "learning_rate": 1.6746745922269012e-05, "loss": 0.0137, "step": 27170 }, { "epoch": 5.656513766240522, "grad_norm": 0.1279674619436264, "learning_rate": 1.6733445288782454e-05, "loss": 0.0111, "step": 27180 }, { "epoch": 5.6585946339623625, "grad_norm": 0.1462768167257309, "learning_rate": 1.6720146139414923e-05, "loss": 0.0119, "step": 27190 }, { "epoch": 5.660675501684202, "grad_norm": 0.1382192224264145, "learning_rate": 1.670684848020871e-05, "loss": 0.0128, "step": 27200 }, { "epoch": 5.6627563694060425, "grad_norm": 0.25600358843803406, "learning_rate": 1.669355231720546e-05, "loss": 0.0124, "step": 27210 }, { "epoch": 5.664837237127882, "grad_norm": 0.15197736024856567, "learning_rate": 1.66802576564461e-05, "loss": 0.0165, "step": 27220 }, { "epoch": 5.666918104849723, "grad_norm": 0.1725168526172638, "learning_rate": 1.6666964503970912e-05, "loss": 0.0178, "step": 27230 }, { "epoch": 5.668998972571562, "grad_norm": 0.13734278082847595, "learning_rate": 1.665367286581948e-05, "loss": 0.0134, "step": 27240 }, { "epoch": 5.671079840293403, "grad_norm": 0.1467522382736206, "learning_rate": 1.6640382748030677e-05, "loss": 0.0128, "step": 27250 }, { "epoch": 5.673160708015242, "grad_norm": 0.14379510283470154, "learning_rate": 1.662709415664273e-05, "loss": 0.0121, "step": 27260 }, { "epoch": 5.675241575737083, "grad_norm": 0.14049966633319855, "learning_rate": 1.6613807097693124e-05, "loss": 0.0115, "step": 27270 }, { "epoch": 5.677322443458922, "grad_norm": 0.16125714778900146, "learning_rate": 1.660052157721868e-05, "loss": 0.0123, "step": 27280 }, { "epoch": 5.679403311180763, "grad_norm": 0.1403745412826538, "learning_rate": 1.6587237601255523e-05, "loss": 0.014, "step": 27290 }, { "epoch": 5.681484178902602, "grad_norm": 0.18207435309886932, "learning_rate": 1.6573955175839043e-05, "loss": 0.0163, "step": 27300 }, { "epoch": 5.683565046624443, "grad_norm": 0.14870856702327728, "learning_rate": 1.6560674307003955e-05, "loss": 0.0127, "step": 27310 }, { "epoch": 5.685645914346282, "grad_norm": 0.1443280130624771, "learning_rate": 1.6547395000784274e-05, "loss": 0.0139, "step": 27320 }, { "epoch": 5.687726782068123, "grad_norm": 0.17119017243385315, "learning_rate": 1.6534117263213264e-05, "loss": 0.0175, "step": 27330 }, { "epoch": 5.689807649789962, "grad_norm": 0.20114289224147797, "learning_rate": 1.6520841100323526e-05, "loss": 0.0188, "step": 27340 }, { "epoch": 5.691888517511803, "grad_norm": 0.19607225060462952, "learning_rate": 1.65075665181469e-05, "loss": 0.0115, "step": 27350 }, { "epoch": 5.693969385233642, "grad_norm": 0.18298476934432983, "learning_rate": 1.649429352271454e-05, "loss": 0.0128, "step": 27360 }, { "epoch": 5.696050252955482, "grad_norm": 0.17387518286705017, "learning_rate": 1.6481022120056878e-05, "loss": 0.0141, "step": 27370 }, { "epoch": 5.698131120677322, "grad_norm": 0.21034584939479828, "learning_rate": 1.6467752316203597e-05, "loss": 0.0151, "step": 27380 }, { "epoch": 5.700211988399163, "grad_norm": 0.15725426375865936, "learning_rate": 1.6454484117183677e-05, "loss": 0.012, "step": 27390 }, { "epoch": 5.7022928561210025, "grad_norm": 0.1365690976381302, "learning_rate": 1.644121752902536e-05, "loss": 0.0104, "step": 27400 }, { "epoch": 5.704373723842842, "grad_norm": 0.15971851348876953, "learning_rate": 1.6427952557756154e-05, "loss": 0.0145, "step": 27410 }, { "epoch": 5.7064545915646825, "grad_norm": 0.1783229410648346, "learning_rate": 1.6414689209402845e-05, "loss": 0.0149, "step": 27420 }, { "epoch": 5.708535459286523, "grad_norm": 0.14364172518253326, "learning_rate": 1.640142748999146e-05, "loss": 0.0148, "step": 27430 }, { "epoch": 5.7106163270083625, "grad_norm": 0.1743333339691162, "learning_rate": 1.6388167405547303e-05, "loss": 0.0117, "step": 27440 }, { "epoch": 5.712697194730202, "grad_norm": 0.17671310901641846, "learning_rate": 1.637490896209494e-05, "loss": 0.0122, "step": 27450 }, { "epoch": 5.714778062452043, "grad_norm": 0.13782772421836853, "learning_rate": 1.6361652165658165e-05, "loss": 0.0125, "step": 27460 }, { "epoch": 5.716858930173882, "grad_norm": 0.17235812544822693, "learning_rate": 1.6348397022260062e-05, "loss": 0.0154, "step": 27470 }, { "epoch": 5.718939797895723, "grad_norm": 0.16088475286960602, "learning_rate": 1.6335143537922923e-05, "loss": 0.0136, "step": 27480 }, { "epoch": 5.721020665617562, "grad_norm": 0.1834164261817932, "learning_rate": 1.6321891718668315e-05, "loss": 0.0127, "step": 27490 }, { "epoch": 5.723101533339403, "grad_norm": 0.16363103687763214, "learning_rate": 1.630864157051704e-05, "loss": 0.011, "step": 27500 }, { "epoch": 5.725182401061242, "grad_norm": 0.16735148429870605, "learning_rate": 1.6295393099489133e-05, "loss": 0.0125, "step": 27510 }, { "epoch": 5.727263268783083, "grad_norm": 0.1620349884033203, "learning_rate": 1.6282146311603882e-05, "loss": 0.0126, "step": 27520 }, { "epoch": 5.729344136504922, "grad_norm": 0.16501326858997345, "learning_rate": 1.6268901212879802e-05, "loss": 0.0148, "step": 27530 }, { "epoch": 5.731425004226763, "grad_norm": 0.17359191179275513, "learning_rate": 1.625565780933463e-05, "loss": 0.0113, "step": 27540 }, { "epoch": 5.733505871948602, "grad_norm": 0.28823262453079224, "learning_rate": 1.6242416106985363e-05, "loss": 0.0138, "step": 27550 }, { "epoch": 5.735586739670443, "grad_norm": 0.17087078094482422, "learning_rate": 1.6229176111848185e-05, "loss": 0.0143, "step": 27560 }, { "epoch": 5.737667607392282, "grad_norm": 0.18916720151901245, "learning_rate": 1.6215937829938538e-05, "loss": 0.0184, "step": 27570 }, { "epoch": 5.739748475114123, "grad_norm": 0.16288258135318756, "learning_rate": 1.6202701267271078e-05, "loss": 0.015, "step": 27580 }, { "epoch": 5.741829342835962, "grad_norm": 0.16018757224082947, "learning_rate": 1.6189466429859658e-05, "loss": 0.0106, "step": 27590 }, { "epoch": 5.743910210557803, "grad_norm": 0.15021273493766785, "learning_rate": 1.6176233323717378e-05, "loss": 0.015, "step": 27600 }, { "epoch": 5.745991078279642, "grad_norm": 0.2543156147003174, "learning_rate": 1.6163001954856525e-05, "loss": 0.0157, "step": 27610 }, { "epoch": 5.748071946001483, "grad_norm": 0.15809518098831177, "learning_rate": 1.614977232928862e-05, "loss": 0.0128, "step": 27620 }, { "epoch": 5.750152813723322, "grad_norm": 0.14129571616649628, "learning_rate": 1.613654445302438e-05, "loss": 0.0143, "step": 27630 }, { "epoch": 5.752233681445163, "grad_norm": 0.12422384321689606, "learning_rate": 1.6123318332073722e-05, "loss": 0.0123, "step": 27640 }, { "epoch": 5.7543145491670025, "grad_norm": 0.25339367985725403, "learning_rate": 1.6110093972445778e-05, "loss": 0.0125, "step": 27650 }, { "epoch": 5.756395416888843, "grad_norm": 0.17219804227352142, "learning_rate": 1.6096871380148863e-05, "loss": 0.0122, "step": 27660 }, { "epoch": 5.7584762846106825, "grad_norm": 0.15103276073932648, "learning_rate": 1.6083650561190508e-05, "loss": 0.0122, "step": 27670 }, { "epoch": 5.760557152332523, "grad_norm": 0.14440734684467316, "learning_rate": 1.6070431521577433e-05, "loss": 0.0125, "step": 27680 }, { "epoch": 5.7626380200543625, "grad_norm": 0.17167158424854279, "learning_rate": 1.605721426731554e-05, "loss": 0.0105, "step": 27690 }, { "epoch": 5.764718887776203, "grad_norm": 0.1807912439107895, "learning_rate": 1.604399880440992e-05, "loss": 0.0124, "step": 27700 }, { "epoch": 5.766799755498043, "grad_norm": 0.17433440685272217, "learning_rate": 1.6030785138864873e-05, "loss": 0.0098, "step": 27710 }, { "epoch": 5.768880623219883, "grad_norm": 0.14872406423091888, "learning_rate": 1.601757327668385e-05, "loss": 0.0174, "step": 27720 }, { "epoch": 5.770961490941723, "grad_norm": 0.1904008984565735, "learning_rate": 1.6004363223869515e-05, "loss": 0.011, "step": 27730 }, { "epoch": 5.773042358663563, "grad_norm": 0.14750374853610992, "learning_rate": 1.5991154986423672e-05, "loss": 0.011, "step": 27740 }, { "epoch": 5.775123226385403, "grad_norm": 0.1384052187204361, "learning_rate": 1.5977948570347337e-05, "loss": 0.0146, "step": 27750 }, { "epoch": 5.777204094107242, "grad_norm": 0.11611384898424149, "learning_rate": 1.5964743981640683e-05, "loss": 0.0142, "step": 27760 }, { "epoch": 5.779284961829083, "grad_norm": 0.15156488120555878, "learning_rate": 1.5951541226303043e-05, "loss": 0.0114, "step": 27770 }, { "epoch": 5.781365829550923, "grad_norm": 0.14014177024364471, "learning_rate": 1.593834031033294e-05, "loss": 0.0142, "step": 27780 }, { "epoch": 5.783446697272763, "grad_norm": 0.14338183403015137, "learning_rate": 1.5925141239728027e-05, "loss": 0.0157, "step": 27790 }, { "epoch": 5.785527564994602, "grad_norm": 0.16528522968292236, "learning_rate": 1.5911944020485156e-05, "loss": 0.0193, "step": 27800 }, { "epoch": 5.787608432716443, "grad_norm": 0.19375288486480713, "learning_rate": 1.589874865860033e-05, "loss": 0.0126, "step": 27810 }, { "epoch": 5.789689300438283, "grad_norm": 0.15224269032478333, "learning_rate": 1.5885555160068674e-05, "loss": 0.0117, "step": 27820 }, { "epoch": 5.791770168160123, "grad_norm": 0.17339782416820526, "learning_rate": 1.587236353088451e-05, "loss": 0.0118, "step": 27830 }, { "epoch": 5.793851035881962, "grad_norm": 0.14371630549430847, "learning_rate": 1.5859173777041295e-05, "loss": 0.01, "step": 27840 }, { "epoch": 5.795931903603803, "grad_norm": 0.18587727844715118, "learning_rate": 1.5845985904531623e-05, "loss": 0.0152, "step": 27850 }, { "epoch": 5.798012771325642, "grad_norm": 0.13634146749973297, "learning_rate": 1.5832799919347243e-05, "loss": 0.0112, "step": 27860 }, { "epoch": 5.800093639047483, "grad_norm": 0.20484867691993713, "learning_rate": 1.581961582747905e-05, "loss": 0.0106, "step": 27870 }, { "epoch": 5.8021745067693224, "grad_norm": 0.1685122698545456, "learning_rate": 1.580643363491706e-05, "loss": 0.013, "step": 27880 }, { "epoch": 5.804255374491163, "grad_norm": 0.2086084932088852, "learning_rate": 1.5793253347650465e-05, "loss": 0.0143, "step": 27890 }, { "epoch": 5.8063362422130025, "grad_norm": 0.13185878098011017, "learning_rate": 1.5780074971667544e-05, "loss": 0.0141, "step": 27900 }, { "epoch": 5.808417109934843, "grad_norm": 0.20750871300697327, "learning_rate": 1.5766898512955747e-05, "loss": 0.0122, "step": 27910 }, { "epoch": 5.8104979776566825, "grad_norm": 0.21227368712425232, "learning_rate": 1.575372397750162e-05, "loss": 0.0143, "step": 27920 }, { "epoch": 5.812578845378523, "grad_norm": 0.15575510263442993, "learning_rate": 1.574055137129086e-05, "loss": 0.0132, "step": 27930 }, { "epoch": 5.814659713100363, "grad_norm": 0.15208134055137634, "learning_rate": 1.572738070030829e-05, "loss": 0.012, "step": 27940 }, { "epoch": 5.816740580822203, "grad_norm": 0.16460222005844116, "learning_rate": 1.571421197053782e-05, "loss": 0.0138, "step": 27950 }, { "epoch": 5.818821448544043, "grad_norm": 0.127112478017807, "learning_rate": 1.5701045187962524e-05, "loss": 0.0123, "step": 27960 }, { "epoch": 5.820902316265883, "grad_norm": 0.13260018825531006, "learning_rate": 1.5687880358564542e-05, "loss": 0.0154, "step": 27970 }, { "epoch": 5.822983183987723, "grad_norm": 0.17800591886043549, "learning_rate": 1.5674717488325174e-05, "loss": 0.0127, "step": 27980 }, { "epoch": 5.825064051709563, "grad_norm": 0.15590058267116547, "learning_rate": 1.5661556583224802e-05, "loss": 0.014, "step": 27990 }, { "epoch": 5.827144919431403, "grad_norm": 0.14070867002010345, "learning_rate": 1.564839764924292e-05, "loss": 0.0115, "step": 28000 }, { "epoch": 5.829225787153243, "grad_norm": 0.14465837180614471, "learning_rate": 1.5635240692358126e-05, "loss": 0.0119, "step": 28010 }, { "epoch": 5.831306654875083, "grad_norm": 0.25036168098449707, "learning_rate": 1.562208571854813e-05, "loss": 0.013, "step": 28020 }, { "epoch": 5.833387522596923, "grad_norm": 0.1344471126794815, "learning_rate": 1.560893273378972e-05, "loss": 0.0128, "step": 28030 }, { "epoch": 5.835468390318763, "grad_norm": 0.14449132978916168, "learning_rate": 1.5595781744058812e-05, "loss": 0.0126, "step": 28040 }, { "epoch": 5.837549258040603, "grad_norm": 0.11597193032503128, "learning_rate": 1.558263275533037e-05, "loss": 0.0115, "step": 28050 }, { "epoch": 5.839630125762443, "grad_norm": 0.2120255082845688, "learning_rate": 1.5569485773578497e-05, "loss": 0.0127, "step": 28060 }, { "epoch": 5.841710993484283, "grad_norm": 0.15455083549022675, "learning_rate": 1.5556340804776366e-05, "loss": 0.0112, "step": 28070 }, { "epoch": 5.843791861206123, "grad_norm": 0.13524559140205383, "learning_rate": 1.5543197854896207e-05, "loss": 0.014, "step": 28080 }, { "epoch": 5.845872728927963, "grad_norm": 0.15760929882526398, "learning_rate": 1.5530056929909387e-05, "loss": 0.0133, "step": 28090 }, { "epoch": 5.847953596649803, "grad_norm": 0.1630082130432129, "learning_rate": 1.55169180357863e-05, "loss": 0.0102, "step": 28100 }, { "epoch": 5.850034464371643, "grad_norm": 0.22374379634857178, "learning_rate": 1.550378117849646e-05, "loss": 0.0152, "step": 28110 }, { "epoch": 5.852115332093483, "grad_norm": 0.1886298954486847, "learning_rate": 1.5490646364008428e-05, "loss": 0.0121, "step": 28120 }, { "epoch": 5.854196199815323, "grad_norm": 0.16611097753047943, "learning_rate": 1.547751359828984e-05, "loss": 0.0135, "step": 28130 }, { "epoch": 5.856277067537163, "grad_norm": 0.14571601152420044, "learning_rate": 1.5464382887307414e-05, "loss": 0.0132, "step": 28140 }, { "epoch": 5.858357935259003, "grad_norm": 0.19899259507656097, "learning_rate": 1.545125423702693e-05, "loss": 0.0118, "step": 28150 }, { "epoch": 5.860438802980843, "grad_norm": 0.14635099470615387, "learning_rate": 1.543812765341322e-05, "loss": 0.0109, "step": 28160 }, { "epoch": 5.862519670702683, "grad_norm": 0.17257273197174072, "learning_rate": 1.5425003142430203e-05, "loss": 0.0107, "step": 28170 }, { "epoch": 6.000078977356064, "grad_norm": 0.15613587200641632, "learning_rate": 1.5411880710040814e-05, "loss": 0.0146, "step": 28180 }, { "epoch": 6.002485906302769, "grad_norm": 0.1255194991827011, "learning_rate": 1.5398760362207083e-05, "loss": 0.0078, "step": 28190 }, { "epoch": 6.004892835249475, "grad_norm": 0.1152849942445755, "learning_rate": 1.538564210489009e-05, "loss": 0.007, "step": 28200 }, { "epoch": 6.00729976419618, "grad_norm": 0.13668769598007202, "learning_rate": 1.537252594404994e-05, "loss": 0.0067, "step": 28210 }, { "epoch": 6.009706693142885, "grad_norm": 0.12412962317466736, "learning_rate": 1.53594118856458e-05, "loss": 0.0068, "step": 28220 }, { "epoch": 6.012113622089591, "grad_norm": 0.13419994711875916, "learning_rate": 1.534629993563588e-05, "loss": 0.0064, "step": 28230 }, { "epoch": 6.014520551036296, "grad_norm": 0.13010086119174957, "learning_rate": 1.5333190099977437e-05, "loss": 0.0072, "step": 28240 }, { "epoch": 6.016927479983001, "grad_norm": 0.10583556443452835, "learning_rate": 1.5320082384626772e-05, "loss": 0.0058, "step": 28250 }, { "epoch": 6.019334408929707, "grad_norm": 0.11712039262056351, "learning_rate": 1.5306976795539197e-05, "loss": 0.0067, "step": 28260 }, { "epoch": 6.021741337876412, "grad_norm": 0.13665583729743958, "learning_rate": 1.5293873338669087e-05, "loss": 0.0073, "step": 28270 }, { "epoch": 6.024148266823117, "grad_norm": 0.1105131059885025, "learning_rate": 1.5280772019969843e-05, "loss": 0.0058, "step": 28280 }, { "epoch": 6.026555195769823, "grad_norm": 0.11365997791290283, "learning_rate": 1.5267672845393874e-05, "loss": 0.0067, "step": 28290 }, { "epoch": 6.028962124716528, "grad_norm": 0.11977025866508484, "learning_rate": 1.525457582089264e-05, "loss": 0.006, "step": 28300 }, { "epoch": 6.031369053663233, "grad_norm": 0.1446986198425293, "learning_rate": 1.5241480952416599e-05, "loss": 0.006, "step": 28310 }, { "epoch": 6.033775982609939, "grad_norm": 0.11339996010065079, "learning_rate": 1.5228388245915259e-05, "loss": 0.0073, "step": 28320 }, { "epoch": 6.036182911556644, "grad_norm": 0.12562619149684906, "learning_rate": 1.5215297707337126e-05, "loss": 0.0075, "step": 28330 }, { "epoch": 6.038589840503349, "grad_norm": 0.12016166001558304, "learning_rate": 1.520220934262972e-05, "loss": 0.0068, "step": 28340 }, { "epoch": 6.040996769450055, "grad_norm": 0.1000683382153511, "learning_rate": 1.518912315773959e-05, "loss": 0.0067, "step": 28350 }, { "epoch": 6.04340369839676, "grad_norm": 0.11005350947380066, "learning_rate": 1.5176039158612271e-05, "loss": 0.0061, "step": 28360 }, { "epoch": 6.045810627343465, "grad_norm": 0.12876953184604645, "learning_rate": 1.5162957351192317e-05, "loss": 0.0066, "step": 28370 }, { "epoch": 6.0482175562901705, "grad_norm": 0.13340313732624054, "learning_rate": 1.5149877741423305e-05, "loss": 0.0088, "step": 28380 }, { "epoch": 6.050624485236876, "grad_norm": 0.11029195040464401, "learning_rate": 1.513680033524777e-05, "loss": 0.006, "step": 28390 }, { "epoch": 6.053031414183581, "grad_norm": 0.17134049534797668, "learning_rate": 1.5123725138607283e-05, "loss": 0.0066, "step": 28400 }, { "epoch": 6.0554383431302865, "grad_norm": 0.13022054731845856, "learning_rate": 1.5110652157442411e-05, "loss": 0.0071, "step": 28410 }, { "epoch": 6.0578452720769915, "grad_norm": 0.17232736945152283, "learning_rate": 1.509758139769268e-05, "loss": 0.0063, "step": 28420 }, { "epoch": 6.0602522010236965, "grad_norm": 0.13608357310295105, "learning_rate": 1.5084512865296648e-05, "loss": 0.0069, "step": 28430 }, { "epoch": 6.0626591299704025, "grad_norm": 0.11623223125934601, "learning_rate": 1.5071446566191825e-05, "loss": 0.0071, "step": 28440 }, { "epoch": 6.0650660589171075, "grad_norm": 0.1226058080792427, "learning_rate": 1.5058382506314738e-05, "loss": 0.008, "step": 28450 }, { "epoch": 6.0674729878638125, "grad_norm": 0.11047961562871933, "learning_rate": 1.5045320691600875e-05, "loss": 0.0068, "step": 28460 }, { "epoch": 6.069879916810518, "grad_norm": 0.13235126435756683, "learning_rate": 1.5032261127984715e-05, "loss": 0.0082, "step": 28470 }, { "epoch": 6.072286845757223, "grad_norm": 0.1068829819560051, "learning_rate": 1.5019203821399711e-05, "loss": 0.0067, "step": 28480 }, { "epoch": 6.074693774703929, "grad_norm": 0.10409795492887497, "learning_rate": 1.500614877777828e-05, "loss": 0.0064, "step": 28490 }, { "epoch": 6.077100703650634, "grad_norm": 0.16206391155719757, "learning_rate": 1.4993096003051825e-05, "loss": 0.0074, "step": 28500 }, { "epoch": 6.079507632597339, "grad_norm": 0.11954206973314285, "learning_rate": 1.4980045503150727e-05, "loss": 0.0061, "step": 28510 }, { "epoch": 6.081914561544045, "grad_norm": 0.17430974543094635, "learning_rate": 1.4966997284004304e-05, "loss": 0.0066, "step": 28520 }, { "epoch": 6.08432149049075, "grad_norm": 0.1636756956577301, "learning_rate": 1.4953951351540856e-05, "loss": 0.0086, "step": 28530 }, { "epoch": 6.086728419437455, "grad_norm": 0.10523508489131927, "learning_rate": 1.4940907711687657e-05, "loss": 0.0066, "step": 28540 }, { "epoch": 6.089135348384161, "grad_norm": 0.1360417753458023, "learning_rate": 1.4927866370370904e-05, "loss": 0.0074, "step": 28550 }, { "epoch": 6.091542277330866, "grad_norm": 0.15727216005325317, "learning_rate": 1.4914827333515791e-05, "loss": 0.0064, "step": 28560 }, { "epoch": 6.093949206277571, "grad_norm": 0.09291977435350418, "learning_rate": 1.4901790607046429e-05, "loss": 0.0082, "step": 28570 }, { "epoch": 6.096356135224277, "grad_norm": 0.1092514768242836, "learning_rate": 1.4888756196885897e-05, "loss": 0.006, "step": 28580 }, { "epoch": 6.098763064170982, "grad_norm": 0.12129055708646774, "learning_rate": 1.4875724108956226e-05, "loss": 0.0063, "step": 28590 }, { "epoch": 6.101169993117687, "grad_norm": 0.12863333523273468, "learning_rate": 1.4862694349178373e-05, "loss": 0.0072, "step": 28600 }, { "epoch": 6.103576922064393, "grad_norm": 0.09611213952302933, "learning_rate": 1.4849666923472266e-05, "loss": 0.0066, "step": 28610 }, { "epoch": 6.105983851011098, "grad_norm": 0.13360875844955444, "learning_rate": 1.4836641837756735e-05, "loss": 0.0062, "step": 28620 }, { "epoch": 6.108390779957803, "grad_norm": 0.1518712192773819, "learning_rate": 1.4823619097949584e-05, "loss": 0.0064, "step": 28630 }, { "epoch": 6.110797708904509, "grad_norm": 0.14202848076820374, "learning_rate": 1.4810598709967536e-05, "loss": 0.0062, "step": 28640 }, { "epoch": 6.113204637851214, "grad_norm": 0.10504938662052155, "learning_rate": 1.4797580679726233e-05, "loss": 0.0058, "step": 28650 }, { "epoch": 6.115611566797919, "grad_norm": 0.14838822185993195, "learning_rate": 1.4784565013140263e-05, "loss": 0.0069, "step": 28660 }, { "epoch": 6.118018495744625, "grad_norm": 0.13102465867996216, "learning_rate": 1.477155171612314e-05, "loss": 0.0085, "step": 28670 }, { "epoch": 6.12042542469133, "grad_norm": 0.09242929518222809, "learning_rate": 1.4758540794587287e-05, "loss": 0.0068, "step": 28680 }, { "epoch": 6.122832353638035, "grad_norm": 0.11663112789392471, "learning_rate": 1.4745532254444061e-05, "loss": 0.0068, "step": 28690 }, { "epoch": 6.125239282584741, "grad_norm": 0.1055082380771637, "learning_rate": 1.4732526101603727e-05, "loss": 0.0069, "step": 28700 }, { "epoch": 6.127646211531446, "grad_norm": 0.11307238042354584, "learning_rate": 1.4719522341975474e-05, "loss": 0.0064, "step": 28710 }, { "epoch": 6.130053140478151, "grad_norm": 0.12484130263328552, "learning_rate": 1.470652098146741e-05, "loss": 0.0071, "step": 28720 }, { "epoch": 6.132460069424857, "grad_norm": 0.10844410955905914, "learning_rate": 1.4693522025986525e-05, "loss": 0.0063, "step": 28730 }, { "epoch": 6.134866998371562, "grad_norm": 0.11674626916646957, "learning_rate": 1.468052548143875e-05, "loss": 0.0059, "step": 28740 }, { "epoch": 6.137273927318267, "grad_norm": 0.13142065703868866, "learning_rate": 1.4667531353728895e-05, "loss": 0.0066, "step": 28750 }, { "epoch": 6.139680856264973, "grad_norm": 0.11345147341489792, "learning_rate": 1.4654539648760686e-05, "loss": 0.0086, "step": 28760 }, { "epoch": 6.142087785211678, "grad_norm": 0.11375192552804947, "learning_rate": 1.4641550372436755e-05, "loss": 0.0072, "step": 28770 }, { "epoch": 6.144494714158384, "grad_norm": 0.10187787562608719, "learning_rate": 1.46285635306586e-05, "loss": 0.0059, "step": 28780 }, { "epoch": 6.146901643105089, "grad_norm": 0.090660460293293, "learning_rate": 1.4615579129326656e-05, "loss": 0.0063, "step": 28790 }, { "epoch": 6.149308572051794, "grad_norm": 0.1179288700222969, "learning_rate": 1.4602597174340206e-05, "loss": 0.0061, "step": 28800 }, { "epoch": 6.1517155009985, "grad_norm": 0.15253697335720062, "learning_rate": 1.4589617671597452e-05, "loss": 0.0065, "step": 28810 }, { "epoch": 6.154122429945205, "grad_norm": 0.12935036420822144, "learning_rate": 1.4576640626995471e-05, "loss": 0.0062, "step": 28820 }, { "epoch": 6.15652935889191, "grad_norm": 0.1259865015745163, "learning_rate": 1.4563666046430226e-05, "loss": 0.0074, "step": 28830 }, { "epoch": 6.158936287838616, "grad_norm": 0.1501617580652237, "learning_rate": 1.4550693935796546e-05, "loss": 0.0074, "step": 28840 }, { "epoch": 6.161343216785321, "grad_norm": 0.10202889144420624, "learning_rate": 1.453772430098817e-05, "loss": 0.0064, "step": 28850 }, { "epoch": 6.163750145732026, "grad_norm": 0.13675838708877563, "learning_rate": 1.4524757147897673e-05, "loss": 0.0072, "step": 28860 }, { "epoch": 6.166157074678732, "grad_norm": 0.10344427824020386, "learning_rate": 1.4511792482416538e-05, "loss": 0.0068, "step": 28870 }, { "epoch": 6.168564003625437, "grad_norm": 0.11133851855993271, "learning_rate": 1.4498830310435086e-05, "loss": 0.0077, "step": 28880 }, { "epoch": 6.170970932572142, "grad_norm": 0.12133053690195084, "learning_rate": 1.4485870637842529e-05, "loss": 0.0073, "step": 28890 }, { "epoch": 6.173377861518848, "grad_norm": 0.11256692558526993, "learning_rate": 1.4472913470526942e-05, "loss": 0.0073, "step": 28900 }, { "epoch": 6.175784790465553, "grad_norm": 0.16283775866031647, "learning_rate": 1.4459958814375242e-05, "loss": 0.0068, "step": 28910 }, { "epoch": 6.178191719412258, "grad_norm": 0.13112396001815796, "learning_rate": 1.444700667527323e-05, "loss": 0.0073, "step": 28920 }, { "epoch": 6.180598648358964, "grad_norm": 0.10064147412776947, "learning_rate": 1.4434057059105536e-05, "loss": 0.0068, "step": 28930 }, { "epoch": 6.183005577305669, "grad_norm": 0.08932317048311234, "learning_rate": 1.4421109971755672e-05, "loss": 0.006, "step": 28940 }, { "epoch": 6.185412506252374, "grad_norm": 0.14308884739875793, "learning_rate": 1.4408165419105991e-05, "loss": 0.0081, "step": 28950 }, { "epoch": 6.18781943519908, "grad_norm": 0.08948417752981186, "learning_rate": 1.4395223407037673e-05, "loss": 0.0059, "step": 28960 }, { "epoch": 6.190226364145785, "grad_norm": 0.11830602586269379, "learning_rate": 1.4382283941430774e-05, "loss": 0.007, "step": 28970 }, { "epoch": 6.19263329309249, "grad_norm": 0.11446068435907364, "learning_rate": 1.4369347028164193e-05, "loss": 0.0082, "step": 28980 }, { "epoch": 6.195040222039196, "grad_norm": 0.1256624311208725, "learning_rate": 1.4356412673115636e-05, "loss": 0.0067, "step": 28990 }, { "epoch": 6.197447150985901, "grad_norm": 0.12023365497589111, "learning_rate": 1.4343480882161687e-05, "loss": 0.0069, "step": 29000 }, { "epoch": 6.199854079932606, "grad_norm": 0.11053222417831421, "learning_rate": 1.4330551661177727e-05, "loss": 0.0061, "step": 29010 }, { "epoch": 6.2022610088793115, "grad_norm": 0.12565594911575317, "learning_rate": 1.4317625016038e-05, "loss": 0.0063, "step": 29020 }, { "epoch": 6.2046679378260166, "grad_norm": 0.10866978019475937, "learning_rate": 1.4304700952615577e-05, "loss": 0.0066, "step": 29030 }, { "epoch": 6.207074866772722, "grad_norm": 0.18096689879894257, "learning_rate": 1.4291779476782336e-05, "loss": 0.0065, "step": 29040 }, { "epoch": 6.2094817957194275, "grad_norm": 0.13079270720481873, "learning_rate": 1.4278860594408991e-05, "loss": 0.0064, "step": 29050 }, { "epoch": 6.2118887246661325, "grad_norm": 0.12700584530830383, "learning_rate": 1.426594431136508e-05, "loss": 0.0074, "step": 29060 }, { "epoch": 6.2142956536128375, "grad_norm": 0.14309082925319672, "learning_rate": 1.4253030633518956e-05, "loss": 0.0077, "step": 29070 }, { "epoch": 6.216702582559543, "grad_norm": 0.14738257229328156, "learning_rate": 1.4240119566737798e-05, "loss": 0.0067, "step": 29080 }, { "epoch": 6.2191095115062485, "grad_norm": 0.1436142772436142, "learning_rate": 1.4227211116887575e-05, "loss": 0.0074, "step": 29090 }, { "epoch": 6.2215164404529535, "grad_norm": 0.12507010996341705, "learning_rate": 1.4214305289833093e-05, "loss": 0.0077, "step": 29100 }, { "epoch": 6.223923369399659, "grad_norm": 0.1263379603624344, "learning_rate": 1.4201402091437962e-05, "loss": 0.0085, "step": 29110 }, { "epoch": 6.226330298346364, "grad_norm": 0.14287671446800232, "learning_rate": 1.4188501527564577e-05, "loss": 0.0065, "step": 29120 }, { "epoch": 6.22873722729307, "grad_norm": 0.1402895450592041, "learning_rate": 1.4175603604074166e-05, "loss": 0.0072, "step": 29130 }, { "epoch": 6.231144156239775, "grad_norm": 0.14834769070148468, "learning_rate": 1.4162708326826727e-05, "loss": 0.007, "step": 29140 }, { "epoch": 6.23355108518648, "grad_norm": 0.12212863564491272, "learning_rate": 1.4149815701681083e-05, "loss": 0.0085, "step": 29150 }, { "epoch": 6.235958014133186, "grad_norm": 0.1234765276312828, "learning_rate": 1.4136925734494834e-05, "loss": 0.0069, "step": 29160 }, { "epoch": 6.238364943079891, "grad_norm": 0.13585186004638672, "learning_rate": 1.412403843112438e-05, "loss": 0.0067, "step": 29170 }, { "epoch": 6.240771872026596, "grad_norm": 0.11608826369047165, "learning_rate": 1.4111153797424909e-05, "loss": 0.0068, "step": 29180 }, { "epoch": 6.243178800973302, "grad_norm": 0.10690636187791824, "learning_rate": 1.4098271839250394e-05, "loss": 0.0067, "step": 29190 }, { "epoch": 6.245585729920007, "grad_norm": 0.10542066395282745, "learning_rate": 1.4085392562453594e-05, "loss": 0.0064, "step": 29200 }, { "epoch": 6.247992658866712, "grad_norm": 0.10864251106977463, "learning_rate": 1.407251597288606e-05, "loss": 0.0066, "step": 29210 }, { "epoch": 6.250399587813418, "grad_norm": 0.12398894876241684, "learning_rate": 1.4059642076398096e-05, "loss": 0.0059, "step": 29220 }, { "epoch": 6.252806516760123, "grad_norm": 0.14574456214904785, "learning_rate": 1.404677087883881e-05, "loss": 0.0063, "step": 29230 }, { "epoch": 6.255213445706828, "grad_norm": 0.12353115528821945, "learning_rate": 1.4033902386056076e-05, "loss": 0.0072, "step": 29240 }, { "epoch": 6.257620374653534, "grad_norm": 0.1360079050064087, "learning_rate": 1.4021036603896523e-05, "loss": 0.0079, "step": 29250 }, { "epoch": 6.260027303600239, "grad_norm": 0.1361265927553177, "learning_rate": 1.4008173538205574e-05, "loss": 0.0077, "step": 29260 }, { "epoch": 6.262434232546944, "grad_norm": 0.12454346567392349, "learning_rate": 1.3995313194827392e-05, "loss": 0.008, "step": 29270 }, { "epoch": 6.26484116149365, "grad_norm": 0.10199309885501862, "learning_rate": 1.3982455579604928e-05, "loss": 0.007, "step": 29280 }, { "epoch": 6.267248090440355, "grad_norm": 0.11184428632259369, "learning_rate": 1.3969600698379875e-05, "loss": 0.0072, "step": 29290 }, { "epoch": 6.26965501938706, "grad_norm": 0.10244923084974289, "learning_rate": 1.3956748556992692e-05, "loss": 0.0073, "step": 29300 }, { "epoch": 6.272061948333766, "grad_norm": 0.14311464130878448, "learning_rate": 1.3943899161282598e-05, "loss": 0.0073, "step": 29310 }, { "epoch": 6.274468877280471, "grad_norm": 0.120786152780056, "learning_rate": 1.3931052517087539e-05, "loss": 0.0066, "step": 29320 }, { "epoch": 6.276875806227176, "grad_norm": 0.14966121315956116, "learning_rate": 1.3918208630244245e-05, "loss": 0.0074, "step": 29330 }, { "epoch": 6.279282735173882, "grad_norm": 0.1309308260679245, "learning_rate": 1.3905367506588187e-05, "loss": 0.0063, "step": 29340 }, { "epoch": 6.281689664120587, "grad_norm": 0.1446753889322281, "learning_rate": 1.389252915195355e-05, "loss": 0.0073, "step": 29350 }, { "epoch": 6.284096593067292, "grad_norm": 0.1415097713470459, "learning_rate": 1.3879693572173295e-05, "loss": 0.0087, "step": 29360 }, { "epoch": 6.286503522013998, "grad_norm": 0.12922193109989166, "learning_rate": 1.3866860773079115e-05, "loss": 0.0082, "step": 29370 }, { "epoch": 6.288910450960703, "grad_norm": 0.1795874983072281, "learning_rate": 1.3854030760501422e-05, "loss": 0.0066, "step": 29380 }, { "epoch": 6.291317379907408, "grad_norm": 0.15139588713645935, "learning_rate": 1.3841203540269388e-05, "loss": 0.0085, "step": 29390 }, { "epoch": 6.293724308854114, "grad_norm": 0.13909609615802765, "learning_rate": 1.3828379118210892e-05, "loss": 0.008, "step": 29400 }, { "epoch": 6.296131237800819, "grad_norm": 0.1469791680574417, "learning_rate": 1.3815557500152555e-05, "loss": 0.0062, "step": 29410 }, { "epoch": 6.298538166747525, "grad_norm": 0.11309733986854553, "learning_rate": 1.380273869191973e-05, "loss": 0.0125, "step": 29420 }, { "epoch": 6.30094509569423, "grad_norm": 0.14141128957271576, "learning_rate": 1.378992269933647e-05, "loss": 0.0073, "step": 29430 }, { "epoch": 6.303352024640935, "grad_norm": 0.1637071967124939, "learning_rate": 1.3777109528225582e-05, "loss": 0.0066, "step": 29440 }, { "epoch": 6.305758953587641, "grad_norm": 0.13841910660266876, "learning_rate": 1.3764299184408555e-05, "loss": 0.0101, "step": 29450 }, { "epoch": 6.308165882534346, "grad_norm": 0.10225043445825577, "learning_rate": 1.3751491673705618e-05, "loss": 0.0061, "step": 29460 }, { "epoch": 6.310572811481051, "grad_norm": 0.1629945933818817, "learning_rate": 1.3738687001935715e-05, "loss": 0.0065, "step": 29470 }, { "epoch": 6.312979740427757, "grad_norm": 0.12634702026844025, "learning_rate": 1.372588517491648e-05, "loss": 0.0066, "step": 29480 }, { "epoch": 6.315386669374462, "grad_norm": 0.10895011574029922, "learning_rate": 1.3713086198464267e-05, "loss": 0.0083, "step": 29490 }, { "epoch": 6.317793598321167, "grad_norm": 0.12015581130981445, "learning_rate": 1.3700290078394147e-05, "loss": 0.0076, "step": 29500 }, { "epoch": 6.320200527267873, "grad_norm": 0.12871696054935455, "learning_rate": 1.3687496820519867e-05, "loss": 0.0069, "step": 29510 }, { "epoch": 6.322607456214578, "grad_norm": 0.11257293820381165, "learning_rate": 1.367470643065389e-05, "loss": 0.0072, "step": 29520 }, { "epoch": 6.325014385161283, "grad_norm": 0.14067964255809784, "learning_rate": 1.366191891460737e-05, "loss": 0.0062, "step": 29530 }, { "epoch": 6.327421314107989, "grad_norm": 0.1089010238647461, "learning_rate": 1.3649134278190154e-05, "loss": 0.0078, "step": 29540 }, { "epoch": 6.329828243054694, "grad_norm": 0.1317889541387558, "learning_rate": 1.3636352527210805e-05, "loss": 0.007, "step": 29550 }, { "epoch": 6.332235172001399, "grad_norm": 0.11756113916635513, "learning_rate": 1.3623573667476527e-05, "loss": 0.0067, "step": 29560 }, { "epoch": 6.334642100948105, "grad_norm": 0.17563989758491516, "learning_rate": 1.3610797704793261e-05, "loss": 0.01, "step": 29570 }, { "epoch": 6.33704902989481, "grad_norm": 0.11599980294704437, "learning_rate": 1.3598024644965592e-05, "loss": 0.0075, "step": 29580 }, { "epoch": 6.339455958841515, "grad_norm": 0.11853928864002228, "learning_rate": 1.3585254493796809e-05, "loss": 0.0084, "step": 29590 }, { "epoch": 6.341862887788221, "grad_norm": 0.11190905421972275, "learning_rate": 1.3572487257088884e-05, "loss": 0.0071, "step": 29600 }, { "epoch": 6.344269816734926, "grad_norm": 0.12298472225666046, "learning_rate": 1.3559722940642433e-05, "loss": 0.0066, "step": 29610 }, { "epoch": 6.346676745681631, "grad_norm": 0.12040404975414276, "learning_rate": 1.3546961550256783e-05, "loss": 0.0064, "step": 29620 }, { "epoch": 6.349083674628337, "grad_norm": 0.11730237305164337, "learning_rate": 1.3534203091729912e-05, "loss": 0.0074, "step": 29630 }, { "epoch": 6.351490603575042, "grad_norm": 0.1223556324839592, "learning_rate": 1.3521447570858466e-05, "loss": 0.0086, "step": 29640 }, { "epoch": 6.353897532521747, "grad_norm": 0.1272115558385849, "learning_rate": 1.3508694993437761e-05, "loss": 0.0063, "step": 29650 }, { "epoch": 6.3563044614684525, "grad_norm": 0.10248813033103943, "learning_rate": 1.3495945365261771e-05, "loss": 0.008, "step": 29660 }, { "epoch": 6.3587113904151575, "grad_norm": 0.09754990041255951, "learning_rate": 1.3483198692123135e-05, "loss": 0.0074, "step": 29670 }, { "epoch": 6.361118319361863, "grad_norm": 0.1024959534406662, "learning_rate": 1.347045497981315e-05, "loss": 0.0107, "step": 29680 }, { "epoch": 6.3635252483085685, "grad_norm": 0.11912322044372559, "learning_rate": 1.345771423412176e-05, "loss": 0.0063, "step": 29690 }, { "epoch": 6.3659321772552735, "grad_norm": 0.12611404061317444, "learning_rate": 1.3444976460837574e-05, "loss": 0.0083, "step": 29700 }, { "epoch": 6.368339106201979, "grad_norm": 0.12932543456554413, "learning_rate": 1.3432241665747832e-05, "loss": 0.0066, "step": 29710 }, { "epoch": 6.370746035148684, "grad_norm": 0.13739502429962158, "learning_rate": 1.3419509854638435e-05, "loss": 0.007, "step": 29720 }, { "epoch": 6.3731529640953894, "grad_norm": 0.13709531724452972, "learning_rate": 1.3406781033293936e-05, "loss": 0.0088, "step": 29730 }, { "epoch": 6.3755598930420945, "grad_norm": 0.1705649346113205, "learning_rate": 1.3394055207497505e-05, "loss": 0.0075, "step": 29740 }, { "epoch": 6.3779668219888, "grad_norm": 0.14479073882102966, "learning_rate": 1.3381332383030978e-05, "loss": 0.0071, "step": 29750 }, { "epoch": 6.380373750935505, "grad_norm": 0.11669053882360458, "learning_rate": 1.3368612565674796e-05, "loss": 0.0075, "step": 29760 }, { "epoch": 6.382780679882211, "grad_norm": 0.13038866221904755, "learning_rate": 1.3355895761208069e-05, "loss": 0.006, "step": 29770 }, { "epoch": 6.385187608828916, "grad_norm": 0.11281511932611465, "learning_rate": 1.3343181975408515e-05, "loss": 0.006, "step": 29780 }, { "epoch": 6.387594537775621, "grad_norm": 0.16532301902770996, "learning_rate": 1.3330471214052478e-05, "loss": 0.0072, "step": 29790 }, { "epoch": 6.390001466722327, "grad_norm": 0.16739559173583984, "learning_rate": 1.3317763482914946e-05, "loss": 0.0084, "step": 29800 }, { "epoch": 6.392408395669032, "grad_norm": 0.10594046860933304, "learning_rate": 1.3305058787769522e-05, "loss": 0.0073, "step": 29810 }, { "epoch": 6.394815324615737, "grad_norm": 0.12443137913942337, "learning_rate": 1.329235713438842e-05, "loss": 0.0058, "step": 29820 }, { "epoch": 6.397222253562443, "grad_norm": 0.09216293692588806, "learning_rate": 1.3279658528542492e-05, "loss": 0.0091, "step": 29830 }, { "epoch": 6.399629182509148, "grad_norm": 0.14686523377895355, "learning_rate": 1.3266962976001177e-05, "loss": 0.0066, "step": 29840 }, { "epoch": 6.402036111455853, "grad_norm": 0.12949252128601074, "learning_rate": 1.3254270482532556e-05, "loss": 0.0075, "step": 29850 }, { "epoch": 6.404443040402559, "grad_norm": 0.10157175362110138, "learning_rate": 1.324158105390331e-05, "loss": 0.0069, "step": 29860 }, { "epoch": 6.406849969349264, "grad_norm": 0.1271977722644806, "learning_rate": 1.3228894695878715e-05, "loss": 0.0086, "step": 29870 }, { "epoch": 6.409256898295969, "grad_norm": 0.10511290282011032, "learning_rate": 1.3216211414222668e-05, "loss": 0.0063, "step": 29880 }, { "epoch": 6.411663827242675, "grad_norm": 0.12209538370370865, "learning_rate": 1.320353121469766e-05, "loss": 0.0067, "step": 29890 }, { "epoch": 6.41407075618938, "grad_norm": 0.10407979041337967, "learning_rate": 1.319085410306478e-05, "loss": 0.0079, "step": 29900 }, { "epoch": 6.416477685136085, "grad_norm": 0.12643036246299744, "learning_rate": 1.3178180085083732e-05, "loss": 0.0069, "step": 29910 }, { "epoch": 6.418884614082791, "grad_norm": 0.09942235052585602, "learning_rate": 1.3165509166512784e-05, "loss": 0.0082, "step": 29920 }, { "epoch": 6.421291543029496, "grad_norm": 0.12255717813968658, "learning_rate": 1.3152841353108818e-05, "loss": 0.0074, "step": 29930 }, { "epoch": 6.423698471976201, "grad_norm": 0.13905179500579834, "learning_rate": 1.3140176650627307e-05, "loss": 0.0069, "step": 29940 }, { "epoch": 6.426105400922907, "grad_norm": 0.12488560378551483, "learning_rate": 1.3127515064822292e-05, "loss": 0.0073, "step": 29950 }, { "epoch": 6.428512329869612, "grad_norm": 0.12422879785299301, "learning_rate": 1.311485660144642e-05, "loss": 0.0066, "step": 29960 }, { "epoch": 6.430919258816317, "grad_norm": 0.12068314850330353, "learning_rate": 1.3102201266250893e-05, "loss": 0.0067, "step": 29970 }, { "epoch": 6.433326187763023, "grad_norm": 0.1274469494819641, "learning_rate": 1.3089549064985523e-05, "loss": 0.007, "step": 29980 }, { "epoch": 6.435733116709728, "grad_norm": 0.11559482663869858, "learning_rate": 1.3076900003398674e-05, "loss": 0.0076, "step": 29990 }, { "epoch": 6.438140045656433, "grad_norm": 0.09918392449617386, "learning_rate": 1.3064254087237296e-05, "loss": 0.0071, "step": 30000 }, { "epoch": 6.440546974603139, "grad_norm": 0.11271458864212036, "learning_rate": 1.30516113222469e-05, "loss": 0.0065, "step": 30010 }, { "epoch": 6.442953903549844, "grad_norm": 0.12222353368997574, "learning_rate": 1.3038971714171575e-05, "loss": 0.0065, "step": 30020 }, { "epoch": 6.445360832496549, "grad_norm": 0.1538599282503128, "learning_rate": 1.302633526875397e-05, "loss": 0.0067, "step": 30030 }, { "epoch": 6.447767761443255, "grad_norm": 0.14753763377666473, "learning_rate": 1.3013701991735304e-05, "loss": 0.0069, "step": 30040 }, { "epoch": 6.45017469038996, "grad_norm": 0.1241428479552269, "learning_rate": 1.3001071888855339e-05, "loss": 0.0079, "step": 30050 }, { "epoch": 6.452581619336666, "grad_norm": 0.1460520178079605, "learning_rate": 1.2988444965852418e-05, "loss": 0.0075, "step": 30060 }, { "epoch": 6.454988548283371, "grad_norm": 0.12046828120946884, "learning_rate": 1.2975821228463432e-05, "loss": 0.0083, "step": 30070 }, { "epoch": 6.457395477230076, "grad_norm": 0.1455804407596588, "learning_rate": 1.2963200682423805e-05, "loss": 0.0067, "step": 30080 }, { "epoch": 6.459802406176782, "grad_norm": 0.1381777673959732, "learning_rate": 1.2950583333467548e-05, "loss": 0.007, "step": 30090 }, { "epoch": 6.462209335123487, "grad_norm": 0.15541286766529083, "learning_rate": 1.2937969187327175e-05, "loss": 0.0075, "step": 30100 }, { "epoch": 6.464616264070192, "grad_norm": 0.11675727367401123, "learning_rate": 1.2925358249733788e-05, "loss": 0.0074, "step": 30110 }, { "epoch": 6.467023193016898, "grad_norm": 0.12469244003295898, "learning_rate": 1.2912750526417003e-05, "loss": 0.0076, "step": 30120 }, { "epoch": 6.469430121963603, "grad_norm": 0.13153168559074402, "learning_rate": 1.2900146023104986e-05, "loss": 0.0089, "step": 30130 }, { "epoch": 6.471837050910308, "grad_norm": 0.16285301744937897, "learning_rate": 1.2887544745524445e-05, "loss": 0.0074, "step": 30140 }, { "epoch": 6.474243979857014, "grad_norm": 0.11980532109737396, "learning_rate": 1.2874946699400597e-05, "loss": 0.0081, "step": 30150 }, { "epoch": 6.476650908803719, "grad_norm": 0.16683639585971832, "learning_rate": 1.2862351890457221e-05, "loss": 0.0077, "step": 30160 }, { "epoch": 6.479057837750424, "grad_norm": 0.09164316207170486, "learning_rate": 1.2849760324416623e-05, "loss": 0.0071, "step": 30170 }, { "epoch": 6.48146476669713, "grad_norm": 0.1193753331899643, "learning_rate": 1.2837172006999609e-05, "loss": 0.0069, "step": 30180 }, { "epoch": 6.483871695643835, "grad_norm": 0.1374012529850006, "learning_rate": 1.2824586943925535e-05, "loss": 0.0073, "step": 30190 }, { "epoch": 6.48627862459054, "grad_norm": 0.09730469435453415, "learning_rate": 1.2812005140912278e-05, "loss": 0.0072, "step": 30200 }, { "epoch": 6.488685553537246, "grad_norm": 0.1698528230190277, "learning_rate": 1.2799426603676208e-05, "loss": 0.0068, "step": 30210 }, { "epoch": 6.491092482483951, "grad_norm": 0.16689284145832062, "learning_rate": 1.2786851337932248e-05, "loss": 0.0068, "step": 30220 }, { "epoch": 6.493499411430656, "grad_norm": 0.13599401712417603, "learning_rate": 1.2774279349393801e-05, "loss": 0.006, "step": 30230 }, { "epoch": 6.495906340377362, "grad_norm": 0.15718208253383636, "learning_rate": 1.2761710643772798e-05, "loss": 0.0073, "step": 30240 }, { "epoch": 6.498313269324067, "grad_norm": 0.11374863237142563, "learning_rate": 1.2749145226779684e-05, "loss": 0.0065, "step": 30250 }, { "epoch": 6.500720198270772, "grad_norm": 0.1257520616054535, "learning_rate": 1.2736583104123393e-05, "loss": 0.0071, "step": 30260 }, { "epoch": 6.5031271272174775, "grad_norm": 0.11542712152004242, "learning_rate": 1.2724024281511377e-05, "loss": 0.0068, "step": 30270 }, { "epoch": 6.505534056164183, "grad_norm": 0.12480369210243225, "learning_rate": 1.2711468764649575e-05, "loss": 0.0068, "step": 30280 }, { "epoch": 6.507940985110888, "grad_norm": 0.14011956751346588, "learning_rate": 1.2698916559242438e-05, "loss": 0.0078, "step": 30290 }, { "epoch": 6.5103479140575935, "grad_norm": 0.11466432362794876, "learning_rate": 1.2686367670992912e-05, "loss": 0.0068, "step": 30300 }, { "epoch": 6.5127548430042985, "grad_norm": 0.11419127881526947, "learning_rate": 1.2673822105602417e-05, "loss": 0.0075, "step": 30310 }, { "epoch": 6.5151617719510035, "grad_norm": 0.16122683882713318, "learning_rate": 1.2661279868770883e-05, "loss": 0.0068, "step": 30320 }, { "epoch": 6.5175687008977095, "grad_norm": 0.12307937443256378, "learning_rate": 1.2648740966196731e-05, "loss": 0.0069, "step": 30330 }, { "epoch": 6.5199756298444145, "grad_norm": 0.15853425860404968, "learning_rate": 1.2636205403576845e-05, "loss": 0.0066, "step": 30340 }, { "epoch": 6.52238255879112, "grad_norm": 0.14024965465068817, "learning_rate": 1.2623673186606605e-05, "loss": 0.0065, "step": 30350 }, { "epoch": 6.524789487737825, "grad_norm": 0.1216452419757843, "learning_rate": 1.2611144320979873e-05, "loss": 0.0065, "step": 30360 }, { "epoch": 6.52719641668453, "grad_norm": 0.10053020715713501, "learning_rate": 1.2598618812388979e-05, "loss": 0.0069, "step": 30370 }, { "epoch": 6.5296033456312355, "grad_norm": 0.13394960761070251, "learning_rate": 1.258609666652475e-05, "loss": 0.0067, "step": 30380 }, { "epoch": 6.532010274577941, "grad_norm": 0.11304625123739243, "learning_rate": 1.2573577889076449e-05, "loss": 0.0073, "step": 30390 }, { "epoch": 6.534417203524646, "grad_norm": 0.12130361795425415, "learning_rate": 1.2561062485731847e-05, "loss": 0.0078, "step": 30400 }, { "epoch": 6.536824132471352, "grad_norm": 0.14622332155704498, "learning_rate": 1.2548550462177146e-05, "loss": 0.0066, "step": 30410 }, { "epoch": 6.539231061418057, "grad_norm": 0.13642936944961548, "learning_rate": 1.253604182409704e-05, "loss": 0.0066, "step": 30420 }, { "epoch": 6.541637990364762, "grad_norm": 0.12123441696166992, "learning_rate": 1.2523536577174684e-05, "loss": 0.0062, "step": 30430 }, { "epoch": 6.544044919311468, "grad_norm": 0.13181082904338837, "learning_rate": 1.2511034727091661e-05, "loss": 0.0063, "step": 30440 }, { "epoch": 6.546451848258173, "grad_norm": 0.1371023803949356, "learning_rate": 1.2498536279528058e-05, "loss": 0.0062, "step": 30450 }, { "epoch": 6.548858777204878, "grad_norm": 0.146981343626976, "learning_rate": 1.2486041240162373e-05, "loss": 0.0068, "step": 30460 }, { "epoch": 6.551265706151584, "grad_norm": 0.11684882640838623, "learning_rate": 1.247354961467158e-05, "loss": 0.0077, "step": 30470 }, { "epoch": 6.553672635098289, "grad_norm": 0.14541758596897125, "learning_rate": 1.2461061408731098e-05, "loss": 0.0072, "step": 30480 }, { "epoch": 6.556079564044994, "grad_norm": 0.1456659436225891, "learning_rate": 1.2448576628014788e-05, "loss": 0.0074, "step": 30490 }, { "epoch": 6.5584864929917, "grad_norm": 0.09911232441663742, "learning_rate": 1.243609527819495e-05, "loss": 0.007, "step": 30500 }, { "epoch": 6.560893421938405, "grad_norm": 0.10277780890464783, "learning_rate": 1.2423617364942349e-05, "loss": 0.0073, "step": 30510 }, { "epoch": 6.56330035088511, "grad_norm": 0.10862764716148376, "learning_rate": 1.2411142893926152e-05, "loss": 0.006, "step": 30520 }, { "epoch": 6.565707279831816, "grad_norm": 0.12641672790050507, "learning_rate": 1.2398671870813996e-05, "loss": 0.0063, "step": 30530 }, { "epoch": 6.568114208778521, "grad_norm": 0.12647296488285065, "learning_rate": 1.2386204301271928e-05, "loss": 0.0062, "step": 30540 }, { "epoch": 6.570521137725226, "grad_norm": 0.14826977252960205, "learning_rate": 1.2373740190964436e-05, "loss": 0.0067, "step": 30550 }, { "epoch": 6.572928066671932, "grad_norm": 0.1677585244178772, "learning_rate": 1.2361279545554447e-05, "loss": 0.0068, "step": 30560 }, { "epoch": 6.575334995618637, "grad_norm": 0.13485656678676605, "learning_rate": 1.2348822370703288e-05, "loss": 0.0071, "step": 30570 }, { "epoch": 6.577741924565342, "grad_norm": 0.1556442826986313, "learning_rate": 1.2336368672070735e-05, "loss": 0.0069, "step": 30580 }, { "epoch": 6.580148853512048, "grad_norm": 0.1200638860464096, "learning_rate": 1.2323918455314965e-05, "loss": 0.0065, "step": 30590 }, { "epoch": 6.582555782458753, "grad_norm": 0.1291312724351883, "learning_rate": 1.2311471726092578e-05, "loss": 0.0069, "step": 30600 }, { "epoch": 6.584962711405458, "grad_norm": 0.12210019677877426, "learning_rate": 1.2299028490058606e-05, "loss": 0.0076, "step": 30610 }, { "epoch": 6.587369640352164, "grad_norm": 0.1270192712545395, "learning_rate": 1.2286588752866466e-05, "loss": 0.006, "step": 30620 }, { "epoch": 6.589776569298869, "grad_norm": 0.13713517785072327, "learning_rate": 1.2274152520168004e-05, "loss": 0.0074, "step": 30630 }, { "epoch": 6.592183498245575, "grad_norm": 0.1571984589099884, "learning_rate": 1.2261719797613481e-05, "loss": 0.0073, "step": 30640 }, { "epoch": 6.59459042719228, "grad_norm": 0.1309814453125, "learning_rate": 1.2249290590851533e-05, "loss": 0.006, "step": 30650 }, { "epoch": 6.596997356138985, "grad_norm": 0.15809127688407898, "learning_rate": 1.2236864905529235e-05, "loss": 0.0074, "step": 30660 }, { "epoch": 6.59940428508569, "grad_norm": 0.12663303315639496, "learning_rate": 1.2224442747292032e-05, "loss": 0.0072, "step": 30670 }, { "epoch": 6.601811214032396, "grad_norm": 0.15092812478542328, "learning_rate": 1.2212024121783781e-05, "loss": 0.0068, "step": 30680 }, { "epoch": 6.604218142979101, "grad_norm": 0.12358996272087097, "learning_rate": 1.2199609034646741e-05, "loss": 0.0073, "step": 30690 }, { "epoch": 6.606625071925807, "grad_norm": 0.1245322972536087, "learning_rate": 1.218719749152155e-05, "loss": 0.007, "step": 30700 }, { "epoch": 6.609032000872512, "grad_norm": 0.1225384771823883, "learning_rate": 1.2174789498047232e-05, "loss": 0.0073, "step": 30710 }, { "epoch": 6.611438929819217, "grad_norm": 0.07556332647800446, "learning_rate": 1.2162385059861217e-05, "loss": 0.007, "step": 30720 }, { "epoch": 6.613845858765922, "grad_norm": 0.09950641542673111, "learning_rate": 1.2149984182599299e-05, "loss": 0.0066, "step": 30730 }, { "epoch": 6.616252787712628, "grad_norm": 0.13274747133255005, "learning_rate": 1.2137586871895679e-05, "loss": 0.0066, "step": 30740 }, { "epoch": 6.618659716659333, "grad_norm": 0.11561068147420883, "learning_rate": 1.2125193133382905e-05, "loss": 0.0069, "step": 30750 }, { "epoch": 6.621066645606039, "grad_norm": 0.12740804255008698, "learning_rate": 1.211280297269193e-05, "loss": 0.0074, "step": 30760 }, { "epoch": 6.623473574552744, "grad_norm": 0.1285659819841385, "learning_rate": 1.2100416395452079e-05, "loss": 0.0089, "step": 30770 }, { "epoch": 6.625880503499449, "grad_norm": 0.13778181374073029, "learning_rate": 1.2088033407291024e-05, "loss": 0.0069, "step": 30780 }, { "epoch": 6.628287432446155, "grad_norm": 0.13337333500385284, "learning_rate": 1.2075654013834841e-05, "loss": 0.0075, "step": 30790 }, { "epoch": 6.63069436139286, "grad_norm": 0.10848786681890488, "learning_rate": 1.2063278220707936e-05, "loss": 0.0069, "step": 30800 }, { "epoch": 6.633101290339565, "grad_norm": 0.08036765456199646, "learning_rate": 1.2050906033533116e-05, "loss": 0.0061, "step": 30810 }, { "epoch": 6.635508219286271, "grad_norm": 0.10248853266239166, "learning_rate": 1.2038537457931523e-05, "loss": 0.007, "step": 30820 }, { "epoch": 6.637915148232976, "grad_norm": 0.09860971570014954, "learning_rate": 1.202617249952267e-05, "loss": 0.0084, "step": 30830 }, { "epoch": 6.640322077179681, "grad_norm": 0.12025798857212067, "learning_rate": 1.2013811163924426e-05, "loss": 0.0071, "step": 30840 }, { "epoch": 6.642729006126387, "grad_norm": 0.12150115519762039, "learning_rate": 1.200145345675301e-05, "loss": 0.0074, "step": 30850 }, { "epoch": 6.645135935073092, "grad_norm": 0.10867945849895477, "learning_rate": 1.1989099383622991e-05, "loss": 0.007, "step": 30860 }, { "epoch": 6.647542864019797, "grad_norm": 0.1035371720790863, "learning_rate": 1.19767489501473e-05, "loss": 0.0072, "step": 30870 }, { "epoch": 6.649949792966503, "grad_norm": 0.09422024339437485, "learning_rate": 1.1964402161937194e-05, "loss": 0.0077, "step": 30880 }, { "epoch": 6.652356721913208, "grad_norm": 0.11086302995681763, "learning_rate": 1.1952059024602285e-05, "loss": 0.0074, "step": 30890 }, { "epoch": 6.654763650859913, "grad_norm": 0.12020327150821686, "learning_rate": 1.1939719543750541e-05, "loss": 0.0071, "step": 30900 }, { "epoch": 6.6571705798066185, "grad_norm": 0.09512214362621307, "learning_rate": 1.1927383724988234e-05, "loss": 0.0063, "step": 30910 }, { "epoch": 6.6595775087533235, "grad_norm": 0.12939925491809845, "learning_rate": 1.191505157392001e-05, "loss": 0.0074, "step": 30920 }, { "epoch": 6.661984437700029, "grad_norm": 0.11506154388189316, "learning_rate": 1.1902723096148812e-05, "loss": 0.0065, "step": 30930 }, { "epoch": 6.6643913666467345, "grad_norm": 0.11508338898420334, "learning_rate": 1.1890398297275945e-05, "loss": 0.0072, "step": 30940 }, { "epoch": 6.6667982955934395, "grad_norm": 0.13030001521110535, "learning_rate": 1.1878077182901028e-05, "loss": 0.0062, "step": 30950 }, { "epoch": 6.6692052245401445, "grad_norm": 0.11342653632164001, "learning_rate": 1.1865759758622005e-05, "loss": 0.0076, "step": 30960 }, { "epoch": 6.67161215348685, "grad_norm": 0.10492388904094696, "learning_rate": 1.1853446030035155e-05, "loss": 0.0072, "step": 30970 }, { "epoch": 6.6740190824335555, "grad_norm": 0.1433475762605667, "learning_rate": 1.184113600273505e-05, "loss": 0.0088, "step": 30980 }, { "epoch": 6.676426011380261, "grad_norm": 0.13367632031440735, "learning_rate": 1.1828829682314616e-05, "loss": 0.0066, "step": 30990 }, { "epoch": 6.678832940326966, "grad_norm": 0.17372313141822815, "learning_rate": 1.1816527074365078e-05, "loss": 0.0075, "step": 31000 }, { "epoch": 6.681239869273671, "grad_norm": 0.10676586627960205, "learning_rate": 1.1804228184475967e-05, "loss": 0.0075, "step": 31010 }, { "epoch": 6.683646798220376, "grad_norm": 0.10216367989778519, "learning_rate": 1.1791933018235135e-05, "loss": 0.0067, "step": 31020 }, { "epoch": 6.686053727167082, "grad_norm": 0.12986217439174652, "learning_rate": 1.1779641581228754e-05, "loss": 0.0064, "step": 31030 }, { "epoch": 6.688460656113787, "grad_norm": 0.15533986687660217, "learning_rate": 1.1767353879041263e-05, "loss": 0.0084, "step": 31040 }, { "epoch": 6.690867585060493, "grad_norm": 0.10127491503953934, "learning_rate": 1.1755069917255452e-05, "loss": 0.0076, "step": 31050 }, { "epoch": 6.693274514007198, "grad_norm": 0.10999008268117905, "learning_rate": 1.1742789701452373e-05, "loss": 0.0065, "step": 31060 }, { "epoch": 6.695681442953903, "grad_norm": 0.1367650330066681, "learning_rate": 1.173051323721139e-05, "loss": 0.0064, "step": 31070 }, { "epoch": 6.698088371900609, "grad_norm": 0.13774476945400238, "learning_rate": 1.1718240530110176e-05, "loss": 0.0058, "step": 31080 }, { "epoch": 6.700495300847314, "grad_norm": 0.10290727019309998, "learning_rate": 1.1705971585724672e-05, "loss": 0.0072, "step": 31090 }, { "epoch": 6.702902229794019, "grad_norm": 0.13085822761058807, "learning_rate": 1.1693706409629136e-05, "loss": 0.0083, "step": 31100 }, { "epoch": 6.705309158740725, "grad_norm": 0.12971019744873047, "learning_rate": 1.1681445007396084e-05, "loss": 0.0065, "step": 31110 }, { "epoch": 6.70771608768743, "grad_norm": 0.11573072522878647, "learning_rate": 1.1669187384596346e-05, "loss": 0.0063, "step": 31120 }, { "epoch": 6.710123016634135, "grad_norm": 0.11128953844308853, "learning_rate": 1.1656933546799014e-05, "loss": 0.0064, "step": 31130 }, { "epoch": 6.712529945580841, "grad_norm": 0.13428068161010742, "learning_rate": 1.1644683499571481e-05, "loss": 0.007, "step": 31140 }, { "epoch": 6.714936874527546, "grad_norm": 0.15620073676109314, "learning_rate": 1.1632437248479389e-05, "loss": 0.0061, "step": 31150 }, { "epoch": 6.717343803474251, "grad_norm": 0.11722344160079956, "learning_rate": 1.1620194799086687e-05, "loss": 0.0078, "step": 31160 }, { "epoch": 6.719750732420957, "grad_norm": 0.11818581074476242, "learning_rate": 1.160795615695557e-05, "loss": 0.0059, "step": 31170 }, { "epoch": 6.722157661367662, "grad_norm": 0.1166880875825882, "learning_rate": 1.1595721327646527e-05, "loss": 0.0065, "step": 31180 }, { "epoch": 6.724564590314367, "grad_norm": 0.11496548354625702, "learning_rate": 1.1583490316718287e-05, "loss": 0.0059, "step": 31190 }, { "epoch": 6.726971519261073, "grad_norm": 0.10649397224187851, "learning_rate": 1.1571263129727867e-05, "loss": 0.0059, "step": 31200 }, { "epoch": 6.729378448207778, "grad_norm": 0.15688644349575043, "learning_rate": 1.1559039772230548e-05, "loss": 0.0065, "step": 31210 }, { "epoch": 6.731785377154483, "grad_norm": 0.1315668374300003, "learning_rate": 1.1546820249779849e-05, "loss": 0.0075, "step": 31220 }, { "epoch": 6.734192306101189, "grad_norm": 0.10188055783510208, "learning_rate": 1.153460456792757e-05, "loss": 0.008, "step": 31230 }, { "epoch": 6.736599235047894, "grad_norm": 0.11675544083118439, "learning_rate": 1.1522392732223743e-05, "loss": 0.007, "step": 31240 }, { "epoch": 6.739006163994599, "grad_norm": 0.1007097065448761, "learning_rate": 1.1510184748216676e-05, "loss": 0.0071, "step": 31250 }, { "epoch": 6.741413092941305, "grad_norm": 0.11527372896671295, "learning_rate": 1.1497980621452923e-05, "loss": 0.0066, "step": 31260 }, { "epoch": 6.74382002188801, "grad_norm": 0.12700603902339935, "learning_rate": 1.1485780357477258e-05, "loss": 0.0072, "step": 31270 }, { "epoch": 6.746226950834716, "grad_norm": 0.08584686368703842, "learning_rate": 1.1473583961832739e-05, "loss": 0.0057, "step": 31280 }, { "epoch": 6.748633879781421, "grad_norm": 0.11406141519546509, "learning_rate": 1.146139144006065e-05, "loss": 0.0071, "step": 31290 }, { "epoch": 6.751040808728126, "grad_norm": 0.12516281008720398, "learning_rate": 1.1449202797700499e-05, "loss": 0.0069, "step": 31300 }, { "epoch": 6.753447737674831, "grad_norm": 0.1048053726553917, "learning_rate": 1.143701804029006e-05, "loss": 0.0087, "step": 31310 }, { "epoch": 6.755854666621537, "grad_norm": 0.155784472823143, "learning_rate": 1.1424837173365315e-05, "loss": 0.0076, "step": 31320 }, { "epoch": 6.758261595568242, "grad_norm": 0.11166238784790039, "learning_rate": 1.1412660202460497e-05, "loss": 0.0063, "step": 31330 }, { "epoch": 6.760668524514948, "grad_norm": 0.13192152976989746, "learning_rate": 1.1400487133108076e-05, "loss": 0.0063, "step": 31340 }, { "epoch": 6.763075453461653, "grad_norm": 0.09710825234651566, "learning_rate": 1.1388317970838722e-05, "loss": 0.0072, "step": 31350 }, { "epoch": 6.765482382408358, "grad_norm": 0.15359237790107727, "learning_rate": 1.137615272118134e-05, "loss": 0.0072, "step": 31360 }, { "epoch": 6.767889311355064, "grad_norm": 0.09879883378744125, "learning_rate": 1.1363991389663078e-05, "loss": 0.006, "step": 31370 }, { "epoch": 6.770296240301769, "grad_norm": 0.13747024536132812, "learning_rate": 1.1351833981809271e-05, "loss": 0.0065, "step": 31380 }, { "epoch": 6.772703169248474, "grad_norm": 0.12754079699516296, "learning_rate": 1.13396805031435e-05, "loss": 0.0063, "step": 31390 }, { "epoch": 6.77511009819518, "grad_norm": 0.192085400223732, "learning_rate": 1.132753095918754e-05, "loss": 0.0072, "step": 31400 }, { "epoch": 6.777517027141885, "grad_norm": 0.1236446350812912, "learning_rate": 1.1315385355461399e-05, "loss": 0.0074, "step": 31410 }, { "epoch": 6.77992395608859, "grad_norm": 0.11395534127950668, "learning_rate": 1.1303243697483266e-05, "loss": 0.0068, "step": 31420 }, { "epoch": 6.782330885035296, "grad_norm": 0.10358556360006332, "learning_rate": 1.1291105990769566e-05, "loss": 0.0071, "step": 31430 }, { "epoch": 6.784737813982001, "grad_norm": 0.10740833729505539, "learning_rate": 1.1278972240834923e-05, "loss": 0.006, "step": 31440 }, { "epoch": 6.787144742928706, "grad_norm": 0.11831473559141159, "learning_rate": 1.126684245319214e-05, "loss": 0.006, "step": 31450 }, { "epoch": 6.789551671875412, "grad_norm": 0.12090504169464111, "learning_rate": 1.1254716633352245e-05, "loss": 0.0068, "step": 31460 }, { "epoch": 6.791958600822117, "grad_norm": 0.08941730856895447, "learning_rate": 1.1242594786824466e-05, "loss": 0.0063, "step": 31470 }, { "epoch": 6.794365529768822, "grad_norm": 0.1306385099887848, "learning_rate": 1.1230476919116198e-05, "loss": 0.0062, "step": 31480 }, { "epoch": 6.796772458715528, "grad_norm": 0.14290544390678406, "learning_rate": 1.1218363035733057e-05, "loss": 0.0072, "step": 31490 }, { "epoch": 6.799179387662233, "grad_norm": 0.09304504841566086, "learning_rate": 1.1206253142178828e-05, "loss": 0.0077, "step": 31500 }, { "epoch": 6.801586316608938, "grad_norm": 0.11267529428005219, "learning_rate": 1.11941472439555e-05, "loss": 0.0059, "step": 31510 }, { "epoch": 6.8039932455556436, "grad_norm": 0.10773956030607224, "learning_rate": 1.1182045346563241e-05, "loss": 0.0063, "step": 31520 }, { "epoch": 6.806400174502349, "grad_norm": 0.09507082402706146, "learning_rate": 1.116994745550039e-05, "loss": 0.0066, "step": 31530 }, { "epoch": 6.808807103449054, "grad_norm": 0.18311725556850433, "learning_rate": 1.1157853576263484e-05, "loss": 0.0075, "step": 31540 }, { "epoch": 6.8112140323957595, "grad_norm": 0.09211025387048721, "learning_rate": 1.1145763714347221e-05, "loss": 0.0072, "step": 31550 }, { "epoch": 6.8136209613424645, "grad_norm": 0.12012846022844315, "learning_rate": 1.1133677875244484e-05, "loss": 0.0056, "step": 31560 }, { "epoch": 6.81602789028917, "grad_norm": 0.11510230600833893, "learning_rate": 1.1121596064446334e-05, "loss": 0.0085, "step": 31570 }, { "epoch": 6.8184348192358755, "grad_norm": 0.1472606211900711, "learning_rate": 1.1109518287441981e-05, "loss": 0.0071, "step": 31580 }, { "epoch": 6.8208417481825805, "grad_norm": 0.12890589237213135, "learning_rate": 1.1097444549718825e-05, "loss": 0.0074, "step": 31590 }, { "epoch": 6.8232486771292855, "grad_norm": 0.10625536739826202, "learning_rate": 1.1085374856762416e-05, "loss": 0.0073, "step": 31600 }, { "epoch": 6.825655606075991, "grad_norm": 0.14324185252189636, "learning_rate": 1.1073309214056462e-05, "loss": 0.0068, "step": 31610 }, { "epoch": 6.8280625350226964, "grad_norm": 0.11072063446044922, "learning_rate": 1.1061247627082859e-05, "loss": 0.0072, "step": 31620 }, { "epoch": 6.830469463969402, "grad_norm": 0.1308569312095642, "learning_rate": 1.1049190101321615e-05, "loss": 0.0069, "step": 31630 }, { "epoch": 6.832876392916107, "grad_norm": 0.19401833415031433, "learning_rate": 1.1037136642250941e-05, "loss": 0.008, "step": 31640 }, { "epoch": 6.835283321862812, "grad_norm": 0.09883515536785126, "learning_rate": 1.1025087255347174e-05, "loss": 0.0064, "step": 31650 }, { "epoch": 6.837690250809517, "grad_norm": 0.12931247055530548, "learning_rate": 1.1013041946084792e-05, "loss": 0.006, "step": 31660 }, { "epoch": 6.840097179756223, "grad_norm": 0.12627190351486206, "learning_rate": 1.1001000719936455e-05, "loss": 0.0082, "step": 31670 }, { "epoch": 6.842504108702928, "grad_norm": 0.10981853306293488, "learning_rate": 1.0988963582372922e-05, "loss": 0.0063, "step": 31680 }, { "epoch": 6.844911037649634, "grad_norm": 0.11639920622110367, "learning_rate": 1.0976930538863134e-05, "loss": 0.0066, "step": 31690 }, { "epoch": 6.847317966596339, "grad_norm": 0.11532722413539886, "learning_rate": 1.096490159487416e-05, "loss": 0.0065, "step": 31700 }, { "epoch": 6.849724895543044, "grad_norm": 0.11190548539161682, "learning_rate": 1.095287675587119e-05, "loss": 0.0076, "step": 31710 }, { "epoch": 6.85213182448975, "grad_norm": 0.14550797641277313, "learning_rate": 1.0940856027317565e-05, "loss": 0.0057, "step": 31720 }, { "epoch": 6.854538753436455, "grad_norm": 0.12157774716615677, "learning_rate": 1.0928839414674767e-05, "loss": 0.0055, "step": 31730 }, { "epoch": 6.85694568238316, "grad_norm": 0.1659715324640274, "learning_rate": 1.0916826923402381e-05, "loss": 0.008, "step": 31740 }, { "epoch": 6.859352611329866, "grad_norm": 0.1534738540649414, "learning_rate": 1.0904818558958152e-05, "loss": 0.0061, "step": 31750 }, { "epoch": 6.861759540276571, "grad_norm": 0.12304066121578217, "learning_rate": 1.0892814326797913e-05, "loss": 0.0057, "step": 31760 }, { "epoch": 6.864166469223276, "grad_norm": 0.15329678356647491, "learning_rate": 1.088081423237565e-05, "loss": 0.0077, "step": 31770 }, { "epoch": 6.866573398169982, "grad_norm": 0.09837432205677032, "learning_rate": 1.0868818281143465e-05, "loss": 0.0076, "step": 31780 }, { "epoch": 6.868980327116687, "grad_norm": 0.10938508063554764, "learning_rate": 1.0856826478551554e-05, "loss": 0.0073, "step": 31790 }, { "epoch": 6.871387256063392, "grad_norm": 0.10451966524124146, "learning_rate": 1.0844838830048265e-05, "loss": 0.006, "step": 31800 }, { "epoch": 6.873794185010098, "grad_norm": 0.12543565034866333, "learning_rate": 1.0832855341080021e-05, "loss": 0.0068, "step": 31810 }, { "epoch": 6.876201113956803, "grad_norm": 0.12739752233028412, "learning_rate": 1.0820876017091391e-05, "loss": 0.0064, "step": 31820 }, { "epoch": 6.878608042903508, "grad_norm": 0.12196753174066544, "learning_rate": 1.0808900863525013e-05, "loss": 0.0062, "step": 31830 }, { "epoch": 6.881014971850214, "grad_norm": 0.10717349499464035, "learning_rate": 1.0796929885821672e-05, "loss": 0.0072, "step": 31840 }, { "epoch": 6.883421900796919, "grad_norm": 0.10465966910123825, "learning_rate": 1.0784963089420216e-05, "loss": 0.0068, "step": 31850 }, { "epoch": 6.885828829743624, "grad_norm": 0.12439031898975372, "learning_rate": 1.0773000479757627e-05, "loss": 0.0063, "step": 31860 }, { "epoch": 6.88823575869033, "grad_norm": 0.11679153889417648, "learning_rate": 1.0761042062268959e-05, "loss": 0.0077, "step": 31870 }, { "epoch": 6.890642687637035, "grad_norm": 0.12250272184610367, "learning_rate": 1.0749087842387383e-05, "loss": 0.0078, "step": 31880 }, { "epoch": 6.89304961658374, "grad_norm": 0.09260718524456024, "learning_rate": 1.0737137825544135e-05, "loss": 0.007, "step": 31890 }, { "epoch": 6.895456545530446, "grad_norm": 0.14270034432411194, "learning_rate": 1.072519201716857e-05, "loss": 0.0068, "step": 31900 }, { "epoch": 6.897863474477151, "grad_norm": 0.1761486977338791, "learning_rate": 1.0713250422688129e-05, "loss": 0.0071, "step": 31910 }, { "epoch": 6.900270403423857, "grad_norm": 0.12259601056575775, "learning_rate": 1.0701313047528305e-05, "loss": 0.0082, "step": 31920 }, { "epoch": 6.902677332370562, "grad_norm": 0.13061495125293732, "learning_rate": 1.0689379897112724e-05, "loss": 0.0062, "step": 31930 }, { "epoch": 6.905084261317267, "grad_norm": 0.11843175441026688, "learning_rate": 1.0677450976863044e-05, "loss": 0.0083, "step": 31940 }, { "epoch": 6.907491190263972, "grad_norm": 0.09125055372714996, "learning_rate": 1.066552629219903e-05, "loss": 0.0087, "step": 31950 }, { "epoch": 6.909898119210678, "grad_norm": 0.10152092576026917, "learning_rate": 1.065360584853853e-05, "loss": 0.0077, "step": 31960 }, { "epoch": 6.912305048157383, "grad_norm": 0.12234465777873993, "learning_rate": 1.064168965129743e-05, "loss": 0.0058, "step": 31970 }, { "epoch": 6.914711977104089, "grad_norm": 0.11756631731987, "learning_rate": 1.062977770588972e-05, "loss": 0.0069, "step": 31980 }, { "epoch": 6.917118906050794, "grad_norm": 0.14469516277313232, "learning_rate": 1.061787001772745e-05, "loss": 0.0067, "step": 31990 }, { "epoch": 6.919525834997499, "grad_norm": 0.11330592632293701, "learning_rate": 1.0605966592220717e-05, "loss": 0.0081, "step": 32000 }, { "epoch": 6.921932763944205, "grad_norm": 0.11056458950042725, "learning_rate": 1.0594067434777714e-05, "loss": 0.0059, "step": 32010 }, { "epoch": 6.92433969289091, "grad_norm": 0.09184174984693527, "learning_rate": 1.0582172550804662e-05, "loss": 0.0057, "step": 32020 }, { "epoch": 6.926746621837615, "grad_norm": 0.09056900441646576, "learning_rate": 1.0570281945705857e-05, "loss": 0.0069, "step": 32030 }, { "epoch": 6.929153550784321, "grad_norm": 0.09738947451114655, "learning_rate": 1.0558395624883668e-05, "loss": 0.0063, "step": 32040 }, { "epoch": 6.931560479731026, "grad_norm": 0.10064485669136047, "learning_rate": 1.0546513593738472e-05, "loss": 0.0078, "step": 32050 }, { "epoch": 6.933967408677731, "grad_norm": 0.10308422893285751, "learning_rate": 1.0534635857668747e-05, "loss": 0.0058, "step": 32060 }, { "epoch": 6.936374337624437, "grad_norm": 0.1058502271771431, "learning_rate": 1.0522762422070984e-05, "loss": 0.0072, "step": 32070 }, { "epoch": 6.938781266571142, "grad_norm": 0.13962320983409882, "learning_rate": 1.0510893292339732e-05, "loss": 0.0056, "step": 32080 }, { "epoch": 6.941188195517847, "grad_norm": 0.10456061363220215, "learning_rate": 1.0499028473867589e-05, "loss": 0.0067, "step": 32090 }, { "epoch": 6.943595124464553, "grad_norm": 0.10954656451940536, "learning_rate": 1.0487167972045185e-05, "loss": 0.0066, "step": 32100 }, { "epoch": 6.946002053411258, "grad_norm": 0.09542009234428406, "learning_rate": 1.0475311792261196e-05, "loss": 0.0056, "step": 32110 }, { "epoch": 6.948408982357963, "grad_norm": 0.13731886446475983, "learning_rate": 1.0463459939902338e-05, "loss": 0.0064, "step": 32120 }, { "epoch": 6.950815911304669, "grad_norm": 0.07201135903596878, "learning_rate": 1.0451612420353339e-05, "loss": 0.0065, "step": 32130 }, { "epoch": 6.953222840251374, "grad_norm": 0.11858013272285461, "learning_rate": 1.0439769238996993e-05, "loss": 0.0062, "step": 32140 }, { "epoch": 6.955629769198079, "grad_norm": 0.14307337999343872, "learning_rate": 1.0427930401214089e-05, "loss": 0.0064, "step": 32150 }, { "epoch": 6.9580366981447845, "grad_norm": 0.11977291852235794, "learning_rate": 1.0416095912383462e-05, "loss": 0.0061, "step": 32160 }, { "epoch": 6.96044362709149, "grad_norm": 0.13256435096263885, "learning_rate": 1.0404265777881974e-05, "loss": 0.0064, "step": 32170 }, { "epoch": 7.002419705328442, "grad_norm": 0.08275268226861954, "learning_rate": 1.0392440003084487e-05, "loss": 0.0093, "step": 32180 }, { "epoch": 7.0049255490342714, "grad_norm": 0.10907074064016342, "learning_rate": 1.0380618593363914e-05, "loss": 0.0089, "step": 32190 }, { "epoch": 7.007431392740101, "grad_norm": 0.10637936741113663, "learning_rate": 1.036880155409115e-05, "loss": 0.0074, "step": 32200 }, { "epoch": 7.00993723644593, "grad_norm": 0.11159010231494904, "learning_rate": 1.0356988890635132e-05, "loss": 0.0095, "step": 32210 }, { "epoch": 7.01244308015176, "grad_norm": 0.14084985852241516, "learning_rate": 1.0345180608362799e-05, "loss": 0.0103, "step": 32220 }, { "epoch": 7.01494892385759, "grad_norm": 0.10382339358329773, "learning_rate": 1.0333376712639094e-05, "loss": 0.0094, "step": 32230 }, { "epoch": 7.01745476756342, "grad_norm": 0.11610200256109238, "learning_rate": 1.0321577208826978e-05, "loss": 0.0077, "step": 32240 }, { "epoch": 7.019960611269249, "grad_norm": 0.20033811032772064, "learning_rate": 1.03097821022874e-05, "loss": 0.0109, "step": 32250 }, { "epoch": 7.022466454975079, "grad_norm": 0.12374366819858551, "learning_rate": 1.029799139837933e-05, "loss": 0.0086, "step": 32260 }, { "epoch": 7.0249722986809084, "grad_norm": 0.2552463710308075, "learning_rate": 1.0286205102459736e-05, "loss": 0.011, "step": 32270 }, { "epoch": 7.027478142386737, "grad_norm": 0.16911646723747253, "learning_rate": 1.0274423219883562e-05, "loss": 0.01, "step": 32280 }, { "epoch": 7.029983986092567, "grad_norm": 0.12384067475795746, "learning_rate": 1.0262645756003777e-05, "loss": 0.0091, "step": 32290 }, { "epoch": 7.032489829798397, "grad_norm": 0.10108627378940582, "learning_rate": 1.0250872716171313e-05, "loss": 0.0079, "step": 32300 }, { "epoch": 7.034995673504227, "grad_norm": 0.18959659337997437, "learning_rate": 1.0239104105735115e-05, "loss": 0.0119, "step": 32310 }, { "epoch": 7.037501517210056, "grad_norm": 0.17757345736026764, "learning_rate": 1.02273399300421e-05, "loss": 0.0096, "step": 32320 }, { "epoch": 7.040007360915886, "grad_norm": 0.10611298680305481, "learning_rate": 1.0215580194437185e-05, "loss": 0.0096, "step": 32330 }, { "epoch": 7.042513204621716, "grad_norm": 0.16491034626960754, "learning_rate": 1.0203824904263251e-05, "loss": 0.0092, "step": 32340 }, { "epoch": 7.045019048327545, "grad_norm": 0.12039139121770859, "learning_rate": 1.0192074064861184e-05, "loss": 0.0084, "step": 32350 }, { "epoch": 7.047524892033374, "grad_norm": 0.10670739412307739, "learning_rate": 1.0180327681569817e-05, "loss": 0.0082, "step": 32360 }, { "epoch": 7.050030735739204, "grad_norm": 0.12741023302078247, "learning_rate": 1.016858575972599e-05, "loss": 0.0093, "step": 32370 }, { "epoch": 7.052536579445034, "grad_norm": 0.10160192102193832, "learning_rate": 1.0156848304664489e-05, "loss": 0.0073, "step": 32380 }, { "epoch": 7.055042423150864, "grad_norm": 0.11047062277793884, "learning_rate": 1.0145115321718087e-05, "loss": 0.0139, "step": 32390 }, { "epoch": 7.057548266856693, "grad_norm": 0.17702612280845642, "learning_rate": 1.0133386816217528e-05, "loss": 0.0101, "step": 32400 }, { "epoch": 7.060054110562523, "grad_norm": 0.14322137832641602, "learning_rate": 1.01216627934915e-05, "loss": 0.0098, "step": 32410 }, { "epoch": 7.062559954268353, "grad_norm": 0.14895103871822357, "learning_rate": 1.0109943258866676e-05, "loss": 0.01, "step": 32420 }, { "epoch": 7.0650657979741815, "grad_norm": 0.15172810852527618, "learning_rate": 1.0098228217667695e-05, "loss": 0.0074, "step": 32430 }, { "epoch": 7.067571641680011, "grad_norm": 0.22893881797790527, "learning_rate": 1.008651767521712e-05, "loss": 0.0107, "step": 32440 }, { "epoch": 7.070077485385841, "grad_norm": 0.11061058193445206, "learning_rate": 1.0074811636835512e-05, "loss": 0.008, "step": 32450 }, { "epoch": 7.072583329091671, "grad_norm": 0.13337905704975128, "learning_rate": 1.0063110107841347e-05, "loss": 0.0113, "step": 32460 }, { "epoch": 7.0750891727975, "grad_norm": 0.10288804024457932, "learning_rate": 1.0051413093551081e-05, "loss": 0.0119, "step": 32470 }, { "epoch": 7.07759501650333, "grad_norm": 0.11078733205795288, "learning_rate": 1.0039720599279116e-05, "loss": 0.0128, "step": 32480 }, { "epoch": 7.08010086020916, "grad_norm": 0.1336498260498047, "learning_rate": 1.0028032630337778e-05, "loss": 0.0092, "step": 32490 }, { "epoch": 7.08260670391499, "grad_norm": 0.1215728148818016, "learning_rate": 1.0016349192037368e-05, "loss": 0.0089, "step": 32500 }, { "epoch": 7.0851125476208185, "grad_norm": 0.1656973958015442, "learning_rate": 1.0004670289686095e-05, "loss": 0.0085, "step": 32510 }, { "epoch": 7.087618391326648, "grad_norm": 0.09157652407884598, "learning_rate": 9.992995928590137e-06, "loss": 0.007, "step": 32520 }, { "epoch": 7.090124235032478, "grad_norm": 0.10599406063556671, "learning_rate": 9.9813261140536e-06, "loss": 0.0089, "step": 32530 }, { "epoch": 7.092630078738308, "grad_norm": 0.3414015769958496, "learning_rate": 9.969660851378514e-06, "loss": 0.0097, "step": 32540 }, { "epoch": 7.095135922444137, "grad_norm": 0.10681771486997604, "learning_rate": 9.958000145864842e-06, "loss": 0.0081, "step": 32550 }, { "epoch": 7.097641766149967, "grad_norm": 0.14559826254844666, "learning_rate": 9.946344002810495e-06, "loss": 0.0073, "step": 32560 }, { "epoch": 7.100147609855797, "grad_norm": 0.209620401263237, "learning_rate": 9.934692427511286e-06, "loss": 0.011, "step": 32570 }, { "epoch": 7.102653453561626, "grad_norm": 0.12145911157131195, "learning_rate": 9.92304542526098e-06, "loss": 0.0081, "step": 32580 }, { "epoch": 7.1051592972674555, "grad_norm": 0.13488909602165222, "learning_rate": 9.911403001351232e-06, "loss": 0.0115, "step": 32590 }, { "epoch": 7.107665140973285, "grad_norm": 0.14880861341953278, "learning_rate": 9.899765161071644e-06, "loss": 0.0091, "step": 32600 }, { "epoch": 7.110170984679115, "grad_norm": 0.1434558928012848, "learning_rate": 9.888131909709732e-06, "loss": 0.01, "step": 32610 }, { "epoch": 7.112676828384944, "grad_norm": 0.11037392914295197, "learning_rate": 9.876503252550909e-06, "loss": 0.0085, "step": 32620 }, { "epoch": 7.115182672090774, "grad_norm": 0.0989370048046112, "learning_rate": 9.864879194878522e-06, "loss": 0.0095, "step": 32630 }, { "epoch": 7.117688515796604, "grad_norm": 0.1275223046541214, "learning_rate": 9.853259741973808e-06, "loss": 0.0082, "step": 32640 }, { "epoch": 7.120194359502434, "grad_norm": 0.09895727038383484, "learning_rate": 9.84164489911593e-06, "loss": 0.0096, "step": 32650 }, { "epoch": 7.122700203208263, "grad_norm": 0.13453346490859985, "learning_rate": 9.830034671581957e-06, "loss": 0.0114, "step": 32660 }, { "epoch": 7.1252060469140925, "grad_norm": 0.0861908420920372, "learning_rate": 9.818429064646835e-06, "loss": 0.0078, "step": 32670 }, { "epoch": 7.127711890619922, "grad_norm": 0.09316643327474594, "learning_rate": 9.80682808358344e-06, "loss": 0.0102, "step": 32680 }, { "epoch": 7.130217734325752, "grad_norm": 0.12472070753574371, "learning_rate": 9.79523173366254e-06, "loss": 0.0067, "step": 32690 }, { "epoch": 7.132723578031581, "grad_norm": 0.10978880524635315, "learning_rate": 9.783640020152782e-06, "loss": 0.0111, "step": 32700 }, { "epoch": 7.135229421737411, "grad_norm": 0.11579641699790955, "learning_rate": 9.772052948320733e-06, "loss": 0.0083, "step": 32710 }, { "epoch": 7.137735265443241, "grad_norm": 0.13896386325359344, "learning_rate": 9.760470523430822e-06, "loss": 0.01, "step": 32720 }, { "epoch": 7.14024110914907, "grad_norm": 0.12832750380039215, "learning_rate": 9.748892750745387e-06, "loss": 0.0114, "step": 32730 }, { "epoch": 7.1427469528549, "grad_norm": 0.10729813575744629, "learning_rate": 9.737319635524656e-06, "loss": 0.01, "step": 32740 }, { "epoch": 7.1452527965607295, "grad_norm": 0.08214232325553894, "learning_rate": 9.725751183026715e-06, "loss": 0.009, "step": 32750 }, { "epoch": 7.147758640266559, "grad_norm": 0.13085341453552246, "learning_rate": 9.714187398507567e-06, "loss": 0.0101, "step": 32760 }, { "epoch": 7.150264483972388, "grad_norm": 0.22223958373069763, "learning_rate": 9.702628287221057e-06, "loss": 0.0099, "step": 32770 }, { "epoch": 7.152770327678218, "grad_norm": 0.0844661295413971, "learning_rate": 9.691073854418941e-06, "loss": 0.0088, "step": 32780 }, { "epoch": 7.155276171384048, "grad_norm": 0.09846645593643188, "learning_rate": 9.679524105350827e-06, "loss": 0.0077, "step": 32790 }, { "epoch": 7.157782015089878, "grad_norm": 0.1299034208059311, "learning_rate": 9.667979045264196e-06, "loss": 0.0086, "step": 32800 }, { "epoch": 7.160287858795707, "grad_norm": 0.13190308213233948, "learning_rate": 9.656438679404407e-06, "loss": 0.0115, "step": 32810 }, { "epoch": 7.162793702501537, "grad_norm": 0.26075318455696106, "learning_rate": 9.644903013014695e-06, "loss": 0.0129, "step": 32820 }, { "epoch": 7.1652995462073665, "grad_norm": 0.13382042944431305, "learning_rate": 9.63337205133613e-06, "loss": 0.0108, "step": 32830 }, { "epoch": 7.167805389913196, "grad_norm": 0.15058472752571106, "learning_rate": 9.62184579960768e-06, "loss": 0.0087, "step": 32840 }, { "epoch": 7.170311233619025, "grad_norm": 0.10307680815458298, "learning_rate": 9.610324263066144e-06, "loss": 0.0075, "step": 32850 }, { "epoch": 7.172817077324855, "grad_norm": 0.14202366769313812, "learning_rate": 9.598807446946192e-06, "loss": 0.0103, "step": 32860 }, { "epoch": 7.175322921030685, "grad_norm": 0.10129663348197937, "learning_rate": 9.58729535648036e-06, "loss": 0.0102, "step": 32870 }, { "epoch": 7.177828764736515, "grad_norm": 0.15794244408607483, "learning_rate": 9.575787996899007e-06, "loss": 0.009, "step": 32880 }, { "epoch": 7.180334608442344, "grad_norm": 0.11800691485404968, "learning_rate": 9.564285373430377e-06, "loss": 0.0089, "step": 32890 }, { "epoch": 7.182840452148174, "grad_norm": 0.09836656600236893, "learning_rate": 9.55278749130053e-06, "loss": 0.0063, "step": 32900 }, { "epoch": 7.1853462958540035, "grad_norm": 0.14659704267978668, "learning_rate": 9.541294355733397e-06, "loss": 0.0081, "step": 32910 }, { "epoch": 7.1878521395598325, "grad_norm": 0.08832225948572159, "learning_rate": 9.529805971950749e-06, "loss": 0.0087, "step": 32920 }, { "epoch": 7.190357983265662, "grad_norm": 0.110434390604496, "learning_rate": 9.518322345172177e-06, "loss": 0.0079, "step": 32930 }, { "epoch": 7.192863826971492, "grad_norm": 0.07788343727588654, "learning_rate": 9.506843480615133e-06, "loss": 0.007, "step": 32940 }, { "epoch": 7.195369670677322, "grad_norm": 0.15672564506530762, "learning_rate": 9.495369383494906e-06, "loss": 0.0086, "step": 32950 }, { "epoch": 7.197875514383151, "grad_norm": 0.187017023563385, "learning_rate": 9.483900059024597e-06, "loss": 0.0087, "step": 32960 }, { "epoch": 7.200381358088981, "grad_norm": 0.08220938593149185, "learning_rate": 9.472435512415168e-06, "loss": 0.0095, "step": 32970 }, { "epoch": 7.202887201794811, "grad_norm": 0.24451841413974762, "learning_rate": 9.460975748875376e-06, "loss": 0.0102, "step": 32980 }, { "epoch": 7.2053930455006405, "grad_norm": 0.20249949395656586, "learning_rate": 9.449520773611838e-06, "loss": 0.0084, "step": 32990 }, { "epoch": 7.2078988892064695, "grad_norm": 0.12940168380737305, "learning_rate": 9.438070591828983e-06, "loss": 0.0084, "step": 33000 }, { "epoch": 7.210404732912299, "grad_norm": 0.11851653456687927, "learning_rate": 9.426625208729054e-06, "loss": 0.0092, "step": 33010 }, { "epoch": 7.212910576618129, "grad_norm": 0.13651615381240845, "learning_rate": 9.415184629512115e-06, "loss": 0.0075, "step": 33020 }, { "epoch": 7.215416420323959, "grad_norm": 0.09958738833665848, "learning_rate": 9.403748859376068e-06, "loss": 0.009, "step": 33030 }, { "epoch": 7.217922264029788, "grad_norm": 0.10803381353616714, "learning_rate": 9.392317903516597e-06, "loss": 0.0092, "step": 33040 }, { "epoch": 7.220428107735618, "grad_norm": 0.08225560933351517, "learning_rate": 9.380891767127234e-06, "loss": 0.0078, "step": 33050 }, { "epoch": 7.222933951441448, "grad_norm": 0.10973416268825531, "learning_rate": 9.369470455399285e-06, "loss": 0.0074, "step": 33060 }, { "epoch": 7.225439795147277, "grad_norm": 0.2327316254377365, "learning_rate": 9.358053973521903e-06, "loss": 0.012, "step": 33070 }, { "epoch": 7.2279456388531065, "grad_norm": 0.13876350224018097, "learning_rate": 9.346642326682006e-06, "loss": 0.0102, "step": 33080 }, { "epoch": 7.230451482558936, "grad_norm": 0.11266385763883591, "learning_rate": 9.335235520064347e-06, "loss": 0.0086, "step": 33090 }, { "epoch": 7.232957326264766, "grad_norm": 0.11523076891899109, "learning_rate": 9.323833558851477e-06, "loss": 0.0106, "step": 33100 }, { "epoch": 7.235463169970595, "grad_norm": 0.09748681634664536, "learning_rate": 9.312436448223716e-06, "loss": 0.0086, "step": 33110 }, { "epoch": 7.237969013676425, "grad_norm": 0.10768880695104599, "learning_rate": 9.301044193359214e-06, "loss": 0.0072, "step": 33120 }, { "epoch": 7.240474857382255, "grad_norm": 0.08023136109113693, "learning_rate": 9.289656799433909e-06, "loss": 0.0102, "step": 33130 }, { "epoch": 7.242980701088085, "grad_norm": 0.19802959263324738, "learning_rate": 9.278274271621505e-06, "loss": 0.0111, "step": 33140 }, { "epoch": 7.245486544793914, "grad_norm": 0.11024095863103867, "learning_rate": 9.266896615093534e-06, "loss": 0.0077, "step": 33150 }, { "epoch": 7.2479923884997435, "grad_norm": 0.12072983384132385, "learning_rate": 9.255523835019278e-06, "loss": 0.0067, "step": 33160 }, { "epoch": 7.250498232205573, "grad_norm": 0.18105760216712952, "learning_rate": 9.244155936565825e-06, "loss": 0.0099, "step": 33170 }, { "epoch": 7.253004075911403, "grad_norm": 0.12197783589363098, "learning_rate": 9.232792924898049e-06, "loss": 0.0072, "step": 33180 }, { "epoch": 7.255509919617232, "grad_norm": 0.09523019194602966, "learning_rate": 9.22143480517858e-06, "loss": 0.0087, "step": 33190 }, { "epoch": 7.258015763323062, "grad_norm": 0.11747388541698456, "learning_rate": 9.210081582567853e-06, "loss": 0.0113, "step": 33200 }, { "epoch": 7.260521607028892, "grad_norm": 0.09926802664995193, "learning_rate": 9.198733262224056e-06, "loss": 0.0088, "step": 33210 }, { "epoch": 7.263027450734722, "grad_norm": 0.14337152242660522, "learning_rate": 9.18738984930316e-06, "loss": 0.0099, "step": 33220 }, { "epoch": 7.265533294440551, "grad_norm": 0.09669367223978043, "learning_rate": 9.176051348958917e-06, "loss": 0.0059, "step": 33230 }, { "epoch": 7.2680391381463805, "grad_norm": 0.09951623529195786, "learning_rate": 9.164717766342819e-06, "loss": 0.0065, "step": 33240 }, { "epoch": 7.27054498185221, "grad_norm": 0.22143366932868958, "learning_rate": 9.153389106604154e-06, "loss": 0.0081, "step": 33250 }, { "epoch": 7.273050825558039, "grad_norm": 0.09472747892141342, "learning_rate": 9.142065374889955e-06, "loss": 0.0099, "step": 33260 }, { "epoch": 7.275556669263869, "grad_norm": 0.10592927783727646, "learning_rate": 9.130746576345008e-06, "loss": 0.0101, "step": 33270 }, { "epoch": 7.278062512969699, "grad_norm": 0.1743759959936142, "learning_rate": 9.119432716111891e-06, "loss": 0.0077, "step": 33280 }, { "epoch": 7.280568356675529, "grad_norm": 0.10429559648036957, "learning_rate": 9.108123799330899e-06, "loss": 0.0073, "step": 33290 }, { "epoch": 7.283074200381358, "grad_norm": 0.11359341442584991, "learning_rate": 9.096819831140106e-06, "loss": 0.0106, "step": 33300 }, { "epoch": 7.285580044087188, "grad_norm": 0.10288742929697037, "learning_rate": 9.085520816675343e-06, "loss": 0.009, "step": 33310 }, { "epoch": 7.2880858877930175, "grad_norm": 0.1497664898633957, "learning_rate": 9.07422676107016e-06, "loss": 0.0087, "step": 33320 }, { "epoch": 7.290591731498847, "grad_norm": 0.10095776617527008, "learning_rate": 9.062937669455887e-06, "loss": 0.0092, "step": 33330 }, { "epoch": 7.293097575204676, "grad_norm": 0.1483621597290039, "learning_rate": 9.051653546961571e-06, "loss": 0.0114, "step": 33340 }, { "epoch": 7.295603418910506, "grad_norm": 0.10325617343187332, "learning_rate": 9.040374398714022e-06, "loss": 0.0076, "step": 33350 }, { "epoch": 7.298109262616336, "grad_norm": 0.10374870896339417, "learning_rate": 9.029100229837789e-06, "loss": 0.0105, "step": 33360 }, { "epoch": 7.300615106322166, "grad_norm": 0.13275834918022156, "learning_rate": 9.017831045455137e-06, "loss": 0.0073, "step": 33370 }, { "epoch": 7.303120950027995, "grad_norm": 0.08612576872110367, "learning_rate": 9.00656685068609e-06, "loss": 0.0093, "step": 33380 }, { "epoch": 7.305626793733825, "grad_norm": 0.16284030675888062, "learning_rate": 8.995307650648403e-06, "loss": 0.0082, "step": 33390 }, { "epoch": 7.3081326374396545, "grad_norm": 0.12432314455509186, "learning_rate": 8.984053450457541e-06, "loss": 0.0074, "step": 33400 }, { "epoch": 7.310638481145483, "grad_norm": 0.11109273135662079, "learning_rate": 8.972804255226728e-06, "loss": 0.0069, "step": 33410 }, { "epoch": 7.313144324851313, "grad_norm": 0.10570468008518219, "learning_rate": 8.961560070066878e-06, "loss": 0.0089, "step": 33420 }, { "epoch": 7.315650168557143, "grad_norm": 0.0941881462931633, "learning_rate": 8.950320900086664e-06, "loss": 0.0066, "step": 33430 }, { "epoch": 7.318156012262973, "grad_norm": 0.08343647420406342, "learning_rate": 8.939086750392465e-06, "loss": 0.0062, "step": 33440 }, { "epoch": 7.320661855968802, "grad_norm": 0.1151280552148819, "learning_rate": 8.927857626088366e-06, "loss": 0.0091, "step": 33450 }, { "epoch": 7.323167699674632, "grad_norm": 0.09701399505138397, "learning_rate": 8.916633532276198e-06, "loss": 0.0089, "step": 33460 }, { "epoch": 7.325673543380462, "grad_norm": 0.09329188615083694, "learning_rate": 8.905414474055473e-06, "loss": 0.0079, "step": 33470 }, { "epoch": 7.3281793870862915, "grad_norm": 0.16877926886081696, "learning_rate": 8.894200456523452e-06, "loss": 0.0072, "step": 33480 }, { "epoch": 7.33068523079212, "grad_norm": 0.09634123742580414, "learning_rate": 8.882991484775065e-06, "loss": 0.0093, "step": 33490 }, { "epoch": 7.33319107449795, "grad_norm": 0.07741771638393402, "learning_rate": 8.871787563902989e-06, "loss": 0.0085, "step": 33500 }, { "epoch": 7.33569691820378, "grad_norm": 0.09824532270431519, "learning_rate": 8.860588698997575e-06, "loss": 0.0072, "step": 33510 }, { "epoch": 7.33820276190961, "grad_norm": 0.12373263388872147, "learning_rate": 8.8493948951469e-06, "loss": 0.007, "step": 33520 }, { "epoch": 7.340708605615439, "grad_norm": 0.08195700496435165, "learning_rate": 8.838206157436719e-06, "loss": 0.008, "step": 33530 }, { "epoch": 7.343214449321269, "grad_norm": 0.12329854816198349, "learning_rate": 8.827022490950512e-06, "loss": 0.0081, "step": 33540 }, { "epoch": 7.345720293027099, "grad_norm": 0.11577993631362915, "learning_rate": 8.815843900769426e-06, "loss": 0.0099, "step": 33550 }, { "epoch": 7.348226136732928, "grad_norm": 0.14934180676937103, "learning_rate": 8.804670391972323e-06, "loss": 0.0075, "step": 33560 }, { "epoch": 7.350731980438757, "grad_norm": 0.12858150899410248, "learning_rate": 8.793501969635758e-06, "loss": 0.0084, "step": 33570 }, { "epoch": 7.353237824144587, "grad_norm": 0.1440422236919403, "learning_rate": 8.78233863883395e-06, "loss": 0.0057, "step": 33580 }, { "epoch": 7.355743667850417, "grad_norm": 0.08193643391132355, "learning_rate": 8.771180404638837e-06, "loss": 0.0092, "step": 33590 }, { "epoch": 7.358249511556246, "grad_norm": 0.1714475005865097, "learning_rate": 8.760027272120013e-06, "loss": 0.0072, "step": 33600 }, { "epoch": 7.360755355262076, "grad_norm": 0.1272462159395218, "learning_rate": 8.74887924634477e-06, "loss": 0.0075, "step": 33610 }, { "epoch": 7.363261198967906, "grad_norm": 0.1100604459643364, "learning_rate": 8.737736332378086e-06, "loss": 0.0072, "step": 33620 }, { "epoch": 7.365767042673736, "grad_norm": 0.16165503859519958, "learning_rate": 8.726598535282593e-06, "loss": 0.0093, "step": 33630 }, { "epoch": 7.368272886379565, "grad_norm": 0.18200775980949402, "learning_rate": 8.715465860118621e-06, "loss": 0.0112, "step": 33640 }, { "epoch": 7.370778730085394, "grad_norm": 0.17006978392601013, "learning_rate": 8.704338311944169e-06, "loss": 0.0121, "step": 33650 }, { "epoch": 7.373284573791224, "grad_norm": 0.08479057997465134, "learning_rate": 8.693215895814885e-06, "loss": 0.0073, "step": 33660 }, { "epoch": 7.375790417497054, "grad_norm": 0.09676782041788101, "learning_rate": 8.682098616784124e-06, "loss": 0.0081, "step": 33670 }, { "epoch": 7.378296261202883, "grad_norm": 0.1109863892197609, "learning_rate": 8.670986479902865e-06, "loss": 0.0109, "step": 33680 }, { "epoch": 7.380802104908713, "grad_norm": 0.10433271527290344, "learning_rate": 8.65987949021978e-06, "loss": 0.0092, "step": 33690 }, { "epoch": 7.383307948614543, "grad_norm": 0.10235805809497833, "learning_rate": 8.648777652781199e-06, "loss": 0.0086, "step": 33700 }, { "epoch": 7.385813792320372, "grad_norm": 0.1190604567527771, "learning_rate": 8.637680972631095e-06, "loss": 0.0054, "step": 33710 }, { "epoch": 7.388319636026202, "grad_norm": 0.12044576555490494, "learning_rate": 8.626589454811119e-06, "loss": 0.0079, "step": 33720 }, { "epoch": 7.390825479732031, "grad_norm": 0.10107846558094025, "learning_rate": 8.615503104360558e-06, "loss": 0.0079, "step": 33730 }, { "epoch": 7.393331323437861, "grad_norm": 0.14831507205963135, "learning_rate": 8.604421926316355e-06, "loss": 0.0075, "step": 33740 }, { "epoch": 7.39583716714369, "grad_norm": 0.0862073004245758, "learning_rate": 8.593345925713122e-06, "loss": 0.0097, "step": 33750 }, { "epoch": 7.39834301084952, "grad_norm": 0.10816335678100586, "learning_rate": 8.582275107583085e-06, "loss": 0.01, "step": 33760 }, { "epoch": 7.40084885455535, "grad_norm": 0.08622723817825317, "learning_rate": 8.571209476956146e-06, "loss": 0.009, "step": 33770 }, { "epoch": 7.40335469826118, "grad_norm": 0.1463107019662857, "learning_rate": 8.560149038859844e-06, "loss": 0.0098, "step": 33780 }, { "epoch": 7.405860541967009, "grad_norm": 0.1521511673927307, "learning_rate": 8.549093798319339e-06, "loss": 0.0081, "step": 33790 }, { "epoch": 7.408366385672839, "grad_norm": 0.14453649520874023, "learning_rate": 8.538043760357458e-06, "loss": 0.0079, "step": 33800 }, { "epoch": 7.410872229378668, "grad_norm": 0.09304424375295639, "learning_rate": 8.526998929994635e-06, "loss": 0.0071, "step": 33810 }, { "epoch": 7.413378073084498, "grad_norm": 0.12866733968257904, "learning_rate": 8.515959312248961e-06, "loss": 0.0082, "step": 33820 }, { "epoch": 7.415883916790327, "grad_norm": 0.1186399981379509, "learning_rate": 8.504924912136157e-06, "loss": 0.0092, "step": 33830 }, { "epoch": 7.418389760496157, "grad_norm": 0.12872697412967682, "learning_rate": 8.493895734669553e-06, "loss": 0.0106, "step": 33840 }, { "epoch": 7.420895604201987, "grad_norm": 0.10628846287727356, "learning_rate": 8.482871784860138e-06, "loss": 0.0079, "step": 33850 }, { "epoch": 7.423401447907816, "grad_norm": 0.11438819766044617, "learning_rate": 8.471853067716488e-06, "loss": 0.0056, "step": 33860 }, { "epoch": 7.425907291613646, "grad_norm": 0.20625124871730804, "learning_rate": 8.460839588244836e-06, "loss": 0.0082, "step": 33870 }, { "epoch": 7.428413135319476, "grad_norm": 0.1351015567779541, "learning_rate": 8.44983135144902e-06, "loss": 0.0071, "step": 33880 }, { "epoch": 7.430918979025305, "grad_norm": 0.11129476875066757, "learning_rate": 8.43882836233049e-06, "loss": 0.0086, "step": 33890 }, { "epoch": 7.433424822731134, "grad_norm": 0.09639249742031097, "learning_rate": 8.427830625888325e-06, "loss": 0.0059, "step": 33900 }, { "epoch": 7.435930666436964, "grad_norm": 0.10626310855150223, "learning_rate": 8.416838147119216e-06, "loss": 0.0064, "step": 33910 }, { "epoch": 7.438436510142794, "grad_norm": 0.1643921285867691, "learning_rate": 8.405850931017447e-06, "loss": 0.0103, "step": 33920 }, { "epoch": 7.440942353848624, "grad_norm": 0.1364458203315735, "learning_rate": 8.39486898257494e-06, "loss": 0.0086, "step": 33930 }, { "epoch": 7.443448197554453, "grad_norm": 0.11035924404859543, "learning_rate": 8.383892306781194e-06, "loss": 0.0065, "step": 33940 }, { "epoch": 7.445954041260283, "grad_norm": 0.10727917402982712, "learning_rate": 8.372920908623339e-06, "loss": 0.0059, "step": 33950 }, { "epoch": 7.4484598849661126, "grad_norm": 0.07808483392000198, "learning_rate": 8.361954793086084e-06, "loss": 0.0086, "step": 33960 }, { "epoch": 7.450965728671942, "grad_norm": 0.10915587097406387, "learning_rate": 8.350993965151764e-06, "loss": 0.0058, "step": 33970 }, { "epoch": 7.453471572377771, "grad_norm": 0.08689790964126587, "learning_rate": 8.340038429800283e-06, "loss": 0.0092, "step": 33980 }, { "epoch": 7.455977416083601, "grad_norm": 0.10985678434371948, "learning_rate": 8.329088192009154e-06, "loss": 0.0112, "step": 33990 }, { "epoch": 7.458483259789431, "grad_norm": 0.13425056636333466, "learning_rate": 8.318143256753488e-06, "loss": 0.0116, "step": 34000 }, { "epoch": 7.46098910349526, "grad_norm": 0.18912728130817413, "learning_rate": 8.307203629005986e-06, "loss": 0.0112, "step": 34010 }, { "epoch": 7.46349494720109, "grad_norm": 0.09185069799423218, "learning_rate": 8.296269313736925e-06, "loss": 0.0084, "step": 34020 }, { "epoch": 7.46600079090692, "grad_norm": 0.09444573521614075, "learning_rate": 8.285340315914183e-06, "loss": 0.0092, "step": 34030 }, { "epoch": 7.4685066346127496, "grad_norm": 0.13623741269111633, "learning_rate": 8.274416640503211e-06, "loss": 0.0059, "step": 34040 }, { "epoch": 7.4710124783185785, "grad_norm": 0.11547993123531342, "learning_rate": 8.263498292467047e-06, "loss": 0.0061, "step": 34050 }, { "epoch": 7.473518322024408, "grad_norm": 0.10128726065158844, "learning_rate": 8.252585276766318e-06, "loss": 0.0071, "step": 34060 }, { "epoch": 7.476024165730238, "grad_norm": 0.10647986829280853, "learning_rate": 8.241677598359205e-06, "loss": 0.0051, "step": 34070 }, { "epoch": 7.478530009436068, "grad_norm": 0.09667297452688217, "learning_rate": 8.230775262201484e-06, "loss": 0.0088, "step": 34080 }, { "epoch": 7.481035853141897, "grad_norm": 0.08394874632358551, "learning_rate": 8.219878273246504e-06, "loss": 0.0066, "step": 34090 }, { "epoch": 7.483541696847727, "grad_norm": 0.12883320450782776, "learning_rate": 8.208986636445167e-06, "loss": 0.0073, "step": 34100 }, { "epoch": 7.486047540553557, "grad_norm": 0.14150315523147583, "learning_rate": 8.198100356745964e-06, "loss": 0.0088, "step": 34110 }, { "epoch": 7.4885533842593865, "grad_norm": 0.11078334599733353, "learning_rate": 8.187219439094931e-06, "loss": 0.006, "step": 34120 }, { "epoch": 7.4910592279652155, "grad_norm": 0.23968781530857086, "learning_rate": 8.176343888435687e-06, "loss": 0.0111, "step": 34130 }, { "epoch": 7.493565071671045, "grad_norm": 0.10612232238054276, "learning_rate": 8.16547370970941e-06, "loss": 0.0081, "step": 34140 }, { "epoch": 7.496070915376875, "grad_norm": 0.10158432275056839, "learning_rate": 8.154608907854819e-06, "loss": 0.0069, "step": 34150 }, { "epoch": 7.498576759082704, "grad_norm": 0.10327087342739105, "learning_rate": 8.14374948780822e-06, "loss": 0.0113, "step": 34160 }, { "epoch": 7.501082602788534, "grad_norm": 0.09483197331428528, "learning_rate": 8.132895454503439e-06, "loss": 0.0088, "step": 34170 }, { "epoch": 7.503588446494364, "grad_norm": 0.18011574447155, "learning_rate": 8.12204681287188e-06, "loss": 0.0101, "step": 34180 }, { "epoch": 7.506094290200194, "grad_norm": 0.12782864272594452, "learning_rate": 8.1112035678425e-06, "loss": 0.0102, "step": 34190 }, { "epoch": 7.508600133906023, "grad_norm": 0.0986800491809845, "learning_rate": 8.100365724341786e-06, "loss": 0.0064, "step": 34200 }, { "epoch": 7.5111059776118525, "grad_norm": 0.09669991582632065, "learning_rate": 8.08953328729377e-06, "loss": 0.0067, "step": 34210 }, { "epoch": 7.513611821317682, "grad_norm": 0.097514308989048, "learning_rate": 8.078706261620055e-06, "loss": 0.0067, "step": 34220 }, { "epoch": 7.516117665023512, "grad_norm": 0.08270708471536636, "learning_rate": 8.067884652239748e-06, "loss": 0.0072, "step": 34230 }, { "epoch": 7.518623508729341, "grad_norm": 0.08764758706092834, "learning_rate": 8.057068464069533e-06, "loss": 0.007, "step": 34240 }, { "epoch": 7.521129352435171, "grad_norm": 0.14616964757442474, "learning_rate": 8.046257702023597e-06, "loss": 0.0102, "step": 34250 }, { "epoch": 7.523635196141001, "grad_norm": 0.14217549562454224, "learning_rate": 8.035452371013682e-06, "loss": 0.0078, "step": 34260 }, { "epoch": 7.526141039846831, "grad_norm": 0.07426004856824875, "learning_rate": 8.024652475949067e-06, "loss": 0.0114, "step": 34270 }, { "epoch": 7.52864688355266, "grad_norm": 0.1066439226269722, "learning_rate": 8.013858021736534e-06, "loss": 0.0077, "step": 34280 }, { "epoch": 7.5311527272584895, "grad_norm": 0.09636229276657104, "learning_rate": 8.00306901328043e-06, "loss": 0.0103, "step": 34290 }, { "epoch": 7.533658570964319, "grad_norm": 0.13530969619750977, "learning_rate": 7.99228545548259e-06, "loss": 0.0094, "step": 34300 }, { "epoch": 7.536164414670148, "grad_norm": 0.10871104151010513, "learning_rate": 7.9815073532424e-06, "loss": 0.0082, "step": 34310 }, { "epoch": 7.538670258375978, "grad_norm": 0.1190742701292038, "learning_rate": 7.970734711456765e-06, "loss": 0.008, "step": 34320 }, { "epoch": 7.541176102081808, "grad_norm": 0.07089721411466599, "learning_rate": 7.959967535020087e-06, "loss": 0.008, "step": 34330 }, { "epoch": 7.543681945787638, "grad_norm": 0.10812743753194809, "learning_rate": 7.949205828824314e-06, "loss": 0.0092, "step": 34340 }, { "epoch": 7.546187789493467, "grad_norm": 0.1356368362903595, "learning_rate": 7.938449597758892e-06, "loss": 0.0075, "step": 34350 }, { "epoch": 7.548693633199297, "grad_norm": 0.09013943374156952, "learning_rate": 7.927698846710776e-06, "loss": 0.008, "step": 34360 }, { "epoch": 7.5511994769051265, "grad_norm": 0.16715790331363678, "learning_rate": 7.916953580564451e-06, "loss": 0.0076, "step": 34370 }, { "epoch": 7.553705320610956, "grad_norm": 0.14561499655246735, "learning_rate": 7.906213804201879e-06, "loss": 0.0082, "step": 34380 }, { "epoch": 7.556211164316785, "grad_norm": 0.0815485492348671, "learning_rate": 7.895479522502553e-06, "loss": 0.0059, "step": 34390 }, { "epoch": 7.558717008022615, "grad_norm": 0.09464819729328156, "learning_rate": 7.884750740343473e-06, "loss": 0.0056, "step": 34400 }, { "epoch": 7.561222851728445, "grad_norm": 0.09176971018314362, "learning_rate": 7.874027462599113e-06, "loss": 0.0079, "step": 34410 }, { "epoch": 7.563728695434275, "grad_norm": 0.09357970952987671, "learning_rate": 7.863309694141477e-06, "loss": 0.0065, "step": 34420 }, { "epoch": 7.566234539140104, "grad_norm": 0.11058258265256882, "learning_rate": 7.852597439840035e-06, "loss": 0.0069, "step": 34430 }, { "epoch": 7.568740382845934, "grad_norm": 0.07406598329544067, "learning_rate": 7.841890704561785e-06, "loss": 0.0064, "step": 34440 }, { "epoch": 7.5712462265517635, "grad_norm": 0.12673982977867126, "learning_rate": 7.831189493171193e-06, "loss": 0.0069, "step": 34450 }, { "epoch": 7.5737520702575925, "grad_norm": 0.10019352287054062, "learning_rate": 7.820493810530218e-06, "loss": 0.0078, "step": 34460 }, { "epoch": 7.576257913963422, "grad_norm": 0.26656898856163025, "learning_rate": 7.809803661498316e-06, "loss": 0.0085, "step": 34470 }, { "epoch": 7.578763757669252, "grad_norm": 0.10399617999792099, "learning_rate": 7.799119050932432e-06, "loss": 0.008, "step": 34480 }, { "epoch": 7.581269601375082, "grad_norm": 0.10634967684745789, "learning_rate": 7.788439983686975e-06, "loss": 0.0088, "step": 34490 }, { "epoch": 7.583775445080912, "grad_norm": 0.18276095390319824, "learning_rate": 7.777766464613862e-06, "loss": 0.0084, "step": 34500 }, { "epoch": 7.586281288786741, "grad_norm": 0.13486731052398682, "learning_rate": 7.767098498562461e-06, "loss": 0.0094, "step": 34510 }, { "epoch": 7.588787132492571, "grad_norm": 0.12322235107421875, "learning_rate": 7.756436090379638e-06, "loss": 0.0088, "step": 34520 }, { "epoch": 7.5912929761984005, "grad_norm": 0.07382646948099136, "learning_rate": 7.745779244909735e-06, "loss": 0.0072, "step": 34530 }, { "epoch": 7.5937988199042294, "grad_norm": 0.12313121557235718, "learning_rate": 7.735127966994544e-06, "loss": 0.0082, "step": 34540 }, { "epoch": 7.596304663610059, "grad_norm": 0.09292915463447571, "learning_rate": 7.724482261473358e-06, "loss": 0.0087, "step": 34550 }, { "epoch": 7.598810507315889, "grad_norm": 0.09510049223899841, "learning_rate": 7.713842133182907e-06, "loss": 0.0084, "step": 34560 }, { "epoch": 7.601316351021719, "grad_norm": 0.11872823536396027, "learning_rate": 7.703207586957409e-06, "loss": 0.008, "step": 34570 }, { "epoch": 7.603822194727548, "grad_norm": 0.1334230750799179, "learning_rate": 7.692578627628549e-06, "loss": 0.0068, "step": 34580 }, { "epoch": 7.606328038433378, "grad_norm": 0.07420721650123596, "learning_rate": 7.681955260025446e-06, "loss": 0.0083, "step": 34590 }, { "epoch": 7.608833882139208, "grad_norm": 0.10070428252220154, "learning_rate": 7.671337488974709e-06, "loss": 0.0102, "step": 34600 }, { "epoch": 7.611339725845037, "grad_norm": 0.09709013253450394, "learning_rate": 7.660725319300395e-06, "loss": 0.0097, "step": 34610 }, { "epoch": 7.613845569550866, "grad_norm": 0.06644345074892044, "learning_rate": 7.650118755823998e-06, "loss": 0.0067, "step": 34620 }, { "epoch": 7.616351413256696, "grad_norm": 0.08552383631467819, "learning_rate": 7.639517803364501e-06, "loss": 0.0094, "step": 34630 }, { "epoch": 7.618857256962526, "grad_norm": 0.08965963870286942, "learning_rate": 7.628922466738293e-06, "loss": 0.0057, "step": 34640 }, { "epoch": 7.621363100668356, "grad_norm": 0.0859944298863411, "learning_rate": 7.618332750759254e-06, "loss": 0.0081, "step": 34650 }, { "epoch": 7.623868944374185, "grad_norm": 0.11740031093358994, "learning_rate": 7.6077486602386786e-06, "loss": 0.0089, "step": 34660 }, { "epoch": 7.626374788080015, "grad_norm": 0.09350020438432693, "learning_rate": 7.597170199985331e-06, "loss": 0.0113, "step": 34670 }, { "epoch": 7.628880631785845, "grad_norm": 0.09229455143213272, "learning_rate": 7.5865973748053915e-06, "loss": 0.0073, "step": 34680 }, { "epoch": 7.631386475491674, "grad_norm": 0.11459354311227798, "learning_rate": 7.576030189502503e-06, "loss": 0.0068, "step": 34690 }, { "epoch": 7.633892319197503, "grad_norm": 0.08529884368181229, "learning_rate": 7.565468648877727e-06, "loss": 0.0078, "step": 34700 }, { "epoch": 7.636398162903333, "grad_norm": 0.07371398061513901, "learning_rate": 7.554912757729584e-06, "loss": 0.007, "step": 34710 }, { "epoch": 7.638904006609163, "grad_norm": 0.16367895901203156, "learning_rate": 7.544362520853999e-06, "loss": 0.0093, "step": 34720 }, { "epoch": 7.641409850314992, "grad_norm": 0.18244487047195435, "learning_rate": 7.533817943044346e-06, "loss": 0.0071, "step": 34730 }, { "epoch": 7.643915694020822, "grad_norm": 0.10545351356267929, "learning_rate": 7.523279029091435e-06, "loss": 0.0067, "step": 34740 }, { "epoch": 7.646421537726652, "grad_norm": 0.09277526289224625, "learning_rate": 7.512745783783477e-06, "loss": 0.0093, "step": 34750 }, { "epoch": 7.648927381432481, "grad_norm": 0.15939445793628693, "learning_rate": 7.502218211906138e-06, "loss": 0.0081, "step": 34760 }, { "epoch": 7.651433225138311, "grad_norm": 0.10894455015659332, "learning_rate": 7.491696318242474e-06, "loss": 0.0068, "step": 34770 }, { "epoch": 7.65393906884414, "grad_norm": 0.1068325862288475, "learning_rate": 7.481180107572989e-06, "loss": 0.0099, "step": 34780 }, { "epoch": 7.65644491254997, "grad_norm": 0.08630590885877609, "learning_rate": 7.470669584675599e-06, "loss": 0.0074, "step": 34790 }, { "epoch": 7.6589507562558, "grad_norm": 0.1997733861207962, "learning_rate": 7.460164754325616e-06, "loss": 0.0123, "step": 34800 }, { "epoch": 7.661456599961629, "grad_norm": 0.11344775557518005, "learning_rate": 7.449665621295796e-06, "loss": 0.0068, "step": 34810 }, { "epoch": 7.663962443667459, "grad_norm": 0.12312676012516022, "learning_rate": 7.439172190356279e-06, "loss": 0.0068, "step": 34820 }, { "epoch": 7.666468287373289, "grad_norm": 0.15625189244747162, "learning_rate": 7.428684466274632e-06, "loss": 0.0074, "step": 34830 }, { "epoch": 7.668974131079118, "grad_norm": 0.10021638125181198, "learning_rate": 7.418202453815828e-06, "loss": 0.0077, "step": 34840 }, { "epoch": 7.671479974784948, "grad_norm": 0.0765567272901535, "learning_rate": 7.40772615774223e-06, "loss": 0.0075, "step": 34850 }, { "epoch": 7.673985818490777, "grad_norm": 0.07942689955234528, "learning_rate": 7.397255582813625e-06, "loss": 0.0067, "step": 34860 }, { "epoch": 7.676491662196607, "grad_norm": 0.08753988146781921, "learning_rate": 7.386790733787181e-06, "loss": 0.0082, "step": 34870 }, { "epoch": 7.678997505902436, "grad_norm": 0.2264910489320755, "learning_rate": 7.376331615417478e-06, "loss": 0.0089, "step": 34880 }, { "epoch": 7.681503349608266, "grad_norm": 0.15682053565979004, "learning_rate": 7.365878232456494e-06, "loss": 0.0085, "step": 34890 }, { "epoch": 7.684009193314096, "grad_norm": 0.11799176037311554, "learning_rate": 7.355430589653585e-06, "loss": 0.0072, "step": 34900 }, { "epoch": 7.686515037019925, "grad_norm": 0.10733599215745926, "learning_rate": 7.344988691755517e-06, "loss": 0.0064, "step": 34910 }, { "epoch": 7.689020880725755, "grad_norm": 0.18159238994121552, "learning_rate": 7.334552543506437e-06, "loss": 0.0103, "step": 34920 }, { "epoch": 7.691526724431585, "grad_norm": 0.11346950381994247, "learning_rate": 7.32412214964787e-06, "loss": 0.0082, "step": 34930 }, { "epoch": 7.694032568137414, "grad_norm": 0.13324564695358276, "learning_rate": 7.313697514918754e-06, "loss": 0.0085, "step": 34940 }, { "epoch": 7.696538411843244, "grad_norm": 0.1105237752199173, "learning_rate": 7.303278644055378e-06, "loss": 0.0073, "step": 34950 }, { "epoch": 7.699044255549073, "grad_norm": 0.10006535053253174, "learning_rate": 7.292865541791437e-06, "loss": 0.0063, "step": 34960 }, { "epoch": 7.701550099254903, "grad_norm": 0.07903921604156494, "learning_rate": 7.282458212857999e-06, "loss": 0.0086, "step": 34970 }, { "epoch": 7.704055942960733, "grad_norm": 0.11281129717826843, "learning_rate": 7.2720566619834955e-06, "loss": 0.0059, "step": 34980 }, { "epoch": 7.706561786666562, "grad_norm": 0.09755261987447739, "learning_rate": 7.2616608938937584e-06, "loss": 0.0075, "step": 34990 }, { "epoch": 7.709067630372392, "grad_norm": 0.1010575219988823, "learning_rate": 7.251270913311961e-06, "loss": 0.0071, "step": 35000 }, { "epoch": 7.711573474078222, "grad_norm": 0.1093018501996994, "learning_rate": 7.240886724958669e-06, "loss": 0.0086, "step": 35010 }, { "epoch": 7.714079317784051, "grad_norm": 0.07258759438991547, "learning_rate": 7.230508333551823e-06, "loss": 0.0081, "step": 35020 }, { "epoch": 7.71658516148988, "grad_norm": 0.09047434478998184, "learning_rate": 7.2201357438066975e-06, "loss": 0.009, "step": 35030 }, { "epoch": 7.71909100519571, "grad_norm": 0.2066592574119568, "learning_rate": 7.209768960435964e-06, "loss": 0.0095, "step": 35040 }, { "epoch": 7.72159684890154, "grad_norm": 0.10210789740085602, "learning_rate": 7.199407988149647e-06, "loss": 0.0061, "step": 35050 }, { "epoch": 7.724102692607369, "grad_norm": 0.12078050523996353, "learning_rate": 7.189052831655112e-06, "loss": 0.0082, "step": 35060 }, { "epoch": 7.726608536313199, "grad_norm": 0.20263127982616425, "learning_rate": 7.1787034956571155e-06, "loss": 0.0087, "step": 35070 }, { "epoch": 7.729114380019029, "grad_norm": 0.0823783278465271, "learning_rate": 7.168359984857731e-06, "loss": 0.0079, "step": 35080 }, { "epoch": 7.731620223724859, "grad_norm": 0.0898759514093399, "learning_rate": 7.15802230395642e-06, "loss": 0.0069, "step": 35090 }, { "epoch": 7.734126067430688, "grad_norm": 0.07781212031841278, "learning_rate": 7.147690457649981e-06, "loss": 0.0076, "step": 35100 }, { "epoch": 7.736631911136517, "grad_norm": 0.11412646621465683, "learning_rate": 7.1373644506325516e-06, "loss": 0.0076, "step": 35110 }, { "epoch": 7.739137754842347, "grad_norm": 0.17949891090393066, "learning_rate": 7.1270442875956394e-06, "loss": 0.0067, "step": 35120 }, { "epoch": 7.741643598548177, "grad_norm": 0.09140664339065552, "learning_rate": 7.116729973228069e-06, "loss": 0.0072, "step": 35130 }, { "epoch": 7.744149442254006, "grad_norm": 0.20969602465629578, "learning_rate": 7.106421512216037e-06, "loss": 0.0088, "step": 35140 }, { "epoch": 7.746655285959836, "grad_norm": 0.12347796559333801, "learning_rate": 7.096118909243052e-06, "loss": 0.0083, "step": 35150 }, { "epoch": 7.749161129665666, "grad_norm": 0.09729664027690887, "learning_rate": 7.085822168989988e-06, "loss": 0.0074, "step": 35160 }, { "epoch": 7.751666973371496, "grad_norm": 0.08552992343902588, "learning_rate": 7.075531296135032e-06, "loss": 0.0069, "step": 35170 }, { "epoch": 7.7541728170773245, "grad_norm": 0.109068363904953, "learning_rate": 7.065246295353727e-06, "loss": 0.0085, "step": 35180 }, { "epoch": 7.756678660783154, "grad_norm": 0.1155996173620224, "learning_rate": 7.054967171318923e-06, "loss": 0.0061, "step": 35190 }, { "epoch": 7.759184504488984, "grad_norm": 0.06965380162000656, "learning_rate": 7.044693928700828e-06, "loss": 0.0095, "step": 35200 }, { "epoch": 7.761690348194814, "grad_norm": 0.08038663119077682, "learning_rate": 7.03442657216695e-06, "loss": 0.0064, "step": 35210 }, { "epoch": 7.764196191900643, "grad_norm": 0.10086628049612045, "learning_rate": 7.024165106382141e-06, "loss": 0.0066, "step": 35220 }, { "epoch": 7.766702035606473, "grad_norm": 0.09153997898101807, "learning_rate": 7.013909536008583e-06, "loss": 0.0086, "step": 35230 }, { "epoch": 7.769207879312303, "grad_norm": 0.13078121840953827, "learning_rate": 7.003659865705752e-06, "loss": 0.0068, "step": 35240 }, { "epoch": 7.771713723018133, "grad_norm": 0.23037368059158325, "learning_rate": 6.993416100130475e-06, "loss": 0.0066, "step": 35250 }, { "epoch": 7.7742195667239615, "grad_norm": 0.1067538782954216, "learning_rate": 6.983178243936864e-06, "loss": 0.0088, "step": 35260 }, { "epoch": 7.776725410429791, "grad_norm": 0.13877935707569122, "learning_rate": 6.972946301776373e-06, "loss": 0.008, "step": 35270 }, { "epoch": 7.779231254135621, "grad_norm": 0.09955627471208572, "learning_rate": 6.962720278297768e-06, "loss": 0.0097, "step": 35280 }, { "epoch": 7.78173709784145, "grad_norm": 0.07979287207126617, "learning_rate": 6.9525001781470995e-06, "loss": 0.0082, "step": 35290 }, { "epoch": 7.78424294154728, "grad_norm": 0.08607818931341171, "learning_rate": 6.942286005967753e-06, "loss": 0.0081, "step": 35300 }, { "epoch": 7.78674878525311, "grad_norm": 0.09661680459976196, "learning_rate": 6.9320777664004205e-06, "loss": 0.0078, "step": 35310 }, { "epoch": 7.78925462895894, "grad_norm": 0.1115199401974678, "learning_rate": 6.921875464083074e-06, "loss": 0.0076, "step": 35320 }, { "epoch": 7.791760472664769, "grad_norm": 0.08787154406309128, "learning_rate": 6.911679103651019e-06, "loss": 0.0069, "step": 35330 }, { "epoch": 7.7942663163705985, "grad_norm": 0.0779227688908577, "learning_rate": 6.901488689736835e-06, "loss": 0.0062, "step": 35340 }, { "epoch": 7.796772160076428, "grad_norm": 0.10677357763051987, "learning_rate": 6.891304226970415e-06, "loss": 0.0083, "step": 35350 }, { "epoch": 7.799278003782258, "grad_norm": 0.1087464988231659, "learning_rate": 6.8811257199789514e-06, "loss": 0.0063, "step": 35360 }, { "epoch": 7.801783847488087, "grad_norm": 0.10866356641054153, "learning_rate": 6.870953173386914e-06, "loss": 0.0071, "step": 35370 }, { "epoch": 7.804289691193917, "grad_norm": 0.0993465855717659, "learning_rate": 6.860786591816085e-06, "loss": 0.0069, "step": 35380 }, { "epoch": 7.806795534899747, "grad_norm": 0.07485151290893555, "learning_rate": 6.850625979885519e-06, "loss": 0.0058, "step": 35390 }, { "epoch": 7.809301378605577, "grad_norm": 0.16288413107395172, "learning_rate": 6.840471342211561e-06, "loss": 0.0068, "step": 35400 }, { "epoch": 7.811807222311406, "grad_norm": 0.09213276207447052, "learning_rate": 6.830322683407855e-06, "loss": 0.0068, "step": 35410 }, { "epoch": 7.8143130660172355, "grad_norm": 0.13675647974014282, "learning_rate": 6.820180008085311e-06, "loss": 0.0055, "step": 35420 }, { "epoch": 7.816818909723065, "grad_norm": 0.10834959149360657, "learning_rate": 6.810043320852133e-06, "loss": 0.0078, "step": 35430 }, { "epoch": 7.819324753428894, "grad_norm": 0.10568954050540924, "learning_rate": 6.799912626313807e-06, "loss": 0.0063, "step": 35440 }, { "epoch": 7.821830597134724, "grad_norm": 0.09933456778526306, "learning_rate": 6.789787929073077e-06, "loss": 0.0081, "step": 35450 }, { "epoch": 7.824336440840554, "grad_norm": 0.11701945215463638, "learning_rate": 6.779669233729988e-06, "loss": 0.007, "step": 35460 }, { "epoch": 7.826842284546384, "grad_norm": 0.13091018795967102, "learning_rate": 6.769556544881833e-06, "loss": 0.0096, "step": 35470 }, { "epoch": 7.829348128252213, "grad_norm": 0.06694869697093964, "learning_rate": 6.759449867123191e-06, "loss": 0.0068, "step": 35480 }, { "epoch": 7.831853971958043, "grad_norm": 0.1058906763792038, "learning_rate": 6.749349205045916e-06, "loss": 0.0067, "step": 35490 }, { "epoch": 7.8343598156638725, "grad_norm": 0.07821207493543625, "learning_rate": 6.7392545632391085e-06, "loss": 0.0049, "step": 35500 }, { "epoch": 7.836865659369702, "grad_norm": 0.08587947487831116, "learning_rate": 6.729165946289153e-06, "loss": 0.006, "step": 35510 }, { "epoch": 7.839371503075531, "grad_norm": 0.06356310844421387, "learning_rate": 6.719083358779681e-06, "loss": 0.006, "step": 35520 }, { "epoch": 7.841877346781361, "grad_norm": 0.15048982203006744, "learning_rate": 6.709006805291598e-06, "loss": 0.007, "step": 35530 }, { "epoch": 7.844383190487191, "grad_norm": 0.06852659583091736, "learning_rate": 6.6989362904030645e-06, "loss": 0.0077, "step": 35540 }, { "epoch": 7.846889034193021, "grad_norm": 0.08083212375640869, "learning_rate": 6.688871818689486e-06, "loss": 0.0088, "step": 35550 }, { "epoch": 7.84939487789885, "grad_norm": 0.08553776144981384, "learning_rate": 6.678813394723538e-06, "loss": 0.0068, "step": 35560 }, { "epoch": 7.85190072160468, "grad_norm": 0.09969456493854523, "learning_rate": 6.668761023075147e-06, "loss": 0.0076, "step": 35570 }, { "epoch": 7.8544065653105095, "grad_norm": 0.13315513730049133, "learning_rate": 6.658714708311473e-06, "loss": 0.0081, "step": 35580 }, { "epoch": 7.8569124090163385, "grad_norm": 0.13204751908779144, "learning_rate": 6.648674454996949e-06, "loss": 0.0067, "step": 35590 }, { "epoch": 7.859418252722168, "grad_norm": 0.13832972943782806, "learning_rate": 6.638640267693226e-06, "loss": 0.0086, "step": 35600 }, { "epoch": 7.861924096427998, "grad_norm": 0.0871187373995781, "learning_rate": 6.628612150959228e-06, "loss": 0.0076, "step": 35610 }, { "epoch": 7.864429940133828, "grad_norm": 0.08827085793018341, "learning_rate": 6.618590109351093e-06, "loss": 0.0068, "step": 35620 }, { "epoch": 7.866935783839657, "grad_norm": 0.14271672070026398, "learning_rate": 6.6085741474222265e-06, "loss": 0.007, "step": 35630 }, { "epoch": 7.869441627545487, "grad_norm": 0.09750137478113174, "learning_rate": 6.598564269723249e-06, "loss": 0.0088, "step": 35640 }, { "epoch": 7.871947471251317, "grad_norm": 0.0803549662232399, "learning_rate": 6.588560480802022e-06, "loss": 0.0075, "step": 35650 }, { "epoch": 7.8744533149571465, "grad_norm": 0.13643014430999756, "learning_rate": 6.5785627852036504e-06, "loss": 0.0053, "step": 35660 }, { "epoch": 7.8769591586629755, "grad_norm": 0.121835857629776, "learning_rate": 6.568571187470467e-06, "loss": 0.0083, "step": 35670 }, { "epoch": 7.879465002368805, "grad_norm": 0.10487689822912216, "learning_rate": 6.558585692142026e-06, "loss": 0.0096, "step": 35680 }, { "epoch": 7.881970846074635, "grad_norm": 0.08149490505456924, "learning_rate": 6.548606303755119e-06, "loss": 0.0066, "step": 35690 }, { "epoch": 7.884476689780465, "grad_norm": 0.1439167708158493, "learning_rate": 6.53863302684375e-06, "loss": 0.0076, "step": 35700 }, { "epoch": 7.886982533486294, "grad_norm": 0.09946610033512115, "learning_rate": 6.528665865939163e-06, "loss": 0.0068, "step": 35710 }, { "epoch": 7.889488377192124, "grad_norm": 0.24598373472690582, "learning_rate": 6.518704825569819e-06, "loss": 0.0067, "step": 35720 }, { "epoch": 7.891994220897954, "grad_norm": 0.08349896222352982, "learning_rate": 6.5087499102613825e-06, "loss": 0.0076, "step": 35730 }, { "epoch": 7.894500064603783, "grad_norm": 0.09016942977905273, "learning_rate": 6.498801124536751e-06, "loss": 0.0085, "step": 35740 }, { "epoch": 7.8970059083096125, "grad_norm": 0.09191475063562393, "learning_rate": 6.488858472916044e-06, "loss": 0.0061, "step": 35750 }, { "epoch": 7.899511752015442, "grad_norm": 0.06638669967651367, "learning_rate": 6.478921959916566e-06, "loss": 0.0111, "step": 35760 }, { "epoch": 7.902017595721272, "grad_norm": 0.09396471083164215, "learning_rate": 6.468991590052866e-06, "loss": 0.0081, "step": 35770 }, { "epoch": 7.904523439427101, "grad_norm": 0.06750231981277466, "learning_rate": 6.459067367836671e-06, "loss": 0.0056, "step": 35780 }, { "epoch": 7.907029283132931, "grad_norm": 0.06152792274951935, "learning_rate": 6.449149297776935e-06, "loss": 0.0068, "step": 35790 }, { "epoch": 7.909535126838761, "grad_norm": 0.0743464007973671, "learning_rate": 6.439237384379819e-06, "loss": 0.0054, "step": 35800 }, { "epoch": 7.912040970544591, "grad_norm": 0.09955011308193207, "learning_rate": 6.429331632148668e-06, "loss": 0.0078, "step": 35810 }, { "epoch": 7.91454681425042, "grad_norm": 0.1589689403772354, "learning_rate": 6.419432045584049e-06, "loss": 0.0075, "step": 35820 }, { "epoch": 7.9170526579562495, "grad_norm": 0.18222308158874512, "learning_rate": 6.409538629183705e-06, "loss": 0.0091, "step": 35830 }, { "epoch": 7.919558501662079, "grad_norm": 0.10069799423217773, "learning_rate": 6.3996513874426e-06, "loss": 0.0093, "step": 35840 }, { "epoch": 7.922064345367909, "grad_norm": 0.11357899755239487, "learning_rate": 6.389770324852873e-06, "loss": 0.0053, "step": 35850 }, { "epoch": 7.924570189073738, "grad_norm": 0.11506646126508713, "learning_rate": 6.379895445903872e-06, "loss": 0.0079, "step": 35860 }, { "epoch": 7.927076032779568, "grad_norm": 0.11021401733160019, "learning_rate": 6.3700267550821145e-06, "loss": 0.0067, "step": 35870 }, { "epoch": 7.929581876485398, "grad_norm": 0.07882853597402573, "learning_rate": 6.360164256871333e-06, "loss": 0.0055, "step": 35880 }, { "epoch": 7.932087720191227, "grad_norm": 0.05710717663168907, "learning_rate": 6.3503079557524195e-06, "loss": 0.0067, "step": 35890 }, { "epoch": 7.934593563897057, "grad_norm": 0.07145318388938904, "learning_rate": 6.340457856203477e-06, "loss": 0.0078, "step": 35900 }, { "epoch": 7.9370994076028865, "grad_norm": 0.06779986619949341, "learning_rate": 6.330613962699761e-06, "loss": 0.0047, "step": 35910 }, { "epoch": 7.939605251308716, "grad_norm": 0.08547310531139374, "learning_rate": 6.320776279713732e-06, "loss": 0.0072, "step": 35920 }, { "epoch": 7.942111095014545, "grad_norm": 0.1033957228064537, "learning_rate": 6.31094481171503e-06, "loss": 0.0072, "step": 35930 }, { "epoch": 7.944616938720375, "grad_norm": 0.08793795853853226, "learning_rate": 6.301119563170444e-06, "loss": 0.0081, "step": 35940 }, { "epoch": 7.947122782426205, "grad_norm": 0.11462560296058655, "learning_rate": 6.29130053854397e-06, "loss": 0.0062, "step": 35950 }, { "epoch": 7.949628626132035, "grad_norm": 0.09097141772508621, "learning_rate": 6.28148774229675e-06, "loss": 0.007, "step": 35960 }, { "epoch": 7.952134469837864, "grad_norm": 0.19802051782608032, "learning_rate": 6.27168117888711e-06, "loss": 0.0076, "step": 35970 }, { "epoch": 7.954640313543694, "grad_norm": 0.08013864606618881, "learning_rate": 6.261880852770552e-06, "loss": 0.0067, "step": 35980 }, { "epoch": 7.9571461572495235, "grad_norm": 0.08561041951179504, "learning_rate": 6.252086768399719e-06, "loss": 0.0062, "step": 35990 }, { "epoch": 7.959652000955353, "grad_norm": 0.11327535659074783, "learning_rate": 6.24229893022444e-06, "loss": 0.0068, "step": 36000 }, { "epoch": 7.962157844661182, "grad_norm": 0.13210701942443848, "learning_rate": 6.232517342691706e-06, "loss": 0.006, "step": 36010 }, { "epoch": 7.964663688367012, "grad_norm": 0.07166098058223724, "learning_rate": 6.222742010245646e-06, "loss": 0.0057, "step": 36020 }, { "epoch": 7.967169532072842, "grad_norm": 0.06817661970853806, "learning_rate": 6.212972937327579e-06, "loss": 0.0074, "step": 36030 }, { "epoch": 7.969675375778671, "grad_norm": 0.15704169869422913, "learning_rate": 6.203210128375945e-06, "loss": 0.0082, "step": 36040 }, { "epoch": 7.972181219484501, "grad_norm": 0.11284251511096954, "learning_rate": 6.193453587826368e-06, "loss": 0.0075, "step": 36050 }, { "epoch": 7.974687063190331, "grad_norm": 0.11321806907653809, "learning_rate": 6.183703320111618e-06, "loss": 0.0064, "step": 36060 }, { "epoch": 7.9771929068961605, "grad_norm": 0.16512060165405273, "learning_rate": 6.173959329661592e-06, "loss": 0.0105, "step": 36070 }, { "epoch": 7.979698750601989, "grad_norm": 0.12027587741613388, "learning_rate": 6.164221620903372e-06, "loss": 0.0062, "step": 36080 }, { "epoch": 7.982204594307819, "grad_norm": 0.15376897156238556, "learning_rate": 6.154490198261147e-06, "loss": 0.0069, "step": 36090 }, { "epoch": 7.984710438013649, "grad_norm": 0.15769003331661224, "learning_rate": 6.144765066156289e-06, "loss": 0.0066, "step": 36100 }, { "epoch": 7.987216281719479, "grad_norm": 0.06796184182167053, "learning_rate": 6.135046229007277e-06, "loss": 0.006, "step": 36110 }, { "epoch": 7.989722125425308, "grad_norm": 0.07103321701288223, "learning_rate": 6.125333691229749e-06, "loss": 0.0047, "step": 36120 }, { "epoch": 7.992227969131138, "grad_norm": 0.10138924419879913, "learning_rate": 6.115627457236477e-06, "loss": 0.0069, "step": 36130 }, { "epoch": 7.994733812836968, "grad_norm": 0.09277825802564621, "learning_rate": 6.105927531437379e-06, "loss": 0.0071, "step": 36140 }, { "epoch": 7.9972396565427974, "grad_norm": 0.08437768369913101, "learning_rate": 6.0962339182394805e-06, "loss": 0.0074, "step": 36150 }, { "epoch": 7.999745500248626, "grad_norm": 0.1049957200884819, "learning_rate": 6.086546622046971e-06, "loss": 0.0074, "step": 36160 }, { "epoch": 8.002251343954455, "grad_norm": 0.1135839894413948, "learning_rate": 6.0768656472611434e-06, "loss": 0.0076, "step": 36170 }, { "epoch": 8.004757187660285, "grad_norm": 0.06280838698148727, "learning_rate": 6.067190998280432e-06, "loss": 0.0051, "step": 36180 }, { "epoch": 8.007263031366115, "grad_norm": 0.11980070173740387, "learning_rate": 6.057522679500405e-06, "loss": 0.0057, "step": 36190 }, { "epoch": 8.009768875071945, "grad_norm": 0.10461264103651047, "learning_rate": 6.0478606953137295e-06, "loss": 0.0069, "step": 36200 }, { "epoch": 8.012274718777775, "grad_norm": 0.21044063568115234, "learning_rate": 6.038205050110224e-06, "loss": 0.0077, "step": 36210 }, { "epoch": 8.014780562483605, "grad_norm": 0.21274645626544952, "learning_rate": 6.028555748276796e-06, "loss": 0.0072, "step": 36220 }, { "epoch": 8.017286406189434, "grad_norm": 0.07138682156801224, "learning_rate": 6.018912794197498e-06, "loss": 0.0067, "step": 36230 }, { "epoch": 8.019792249895264, "grad_norm": 0.08897475153207779, "learning_rate": 6.00927619225349e-06, "loss": 0.0053, "step": 36240 }, { "epoch": 8.022298093601094, "grad_norm": 0.14679205417633057, "learning_rate": 5.999645946823034e-06, "loss": 0.0065, "step": 36250 }, { "epoch": 8.024803937306922, "grad_norm": 0.08847475051879883, "learning_rate": 5.990022062281517e-06, "loss": 0.008, "step": 36260 }, { "epoch": 8.027309781012752, "grad_norm": 0.0746171772480011, "learning_rate": 5.980404543001444e-06, "loss": 0.0074, "step": 36270 }, { "epoch": 8.029815624718582, "grad_norm": 0.10499439388513565, "learning_rate": 5.970793393352401e-06, "loss": 0.0068, "step": 36280 }, { "epoch": 8.032321468424412, "grad_norm": 0.08818477392196655, "learning_rate": 5.961188617701108e-06, "loss": 0.0064, "step": 36290 }, { "epoch": 8.034827312130242, "grad_norm": 0.09847619384527206, "learning_rate": 5.951590220411365e-06, "loss": 0.0059, "step": 36300 }, { "epoch": 8.037333155836071, "grad_norm": 0.12127798050642014, "learning_rate": 5.9419982058441e-06, "loss": 0.0076, "step": 36310 }, { "epoch": 8.0398389995419, "grad_norm": 0.10391832143068314, "learning_rate": 5.93241257835731e-06, "loss": 0.0065, "step": 36320 }, { "epoch": 8.04234484324773, "grad_norm": 0.10504184663295746, "learning_rate": 5.922833342306125e-06, "loss": 0.0063, "step": 36330 }, { "epoch": 8.04485068695356, "grad_norm": 0.11094427108764648, "learning_rate": 5.913260502042737e-06, "loss": 0.0078, "step": 36340 }, { "epoch": 8.047356530659389, "grad_norm": 0.08850150555372238, "learning_rate": 5.903694061916465e-06, "loss": 0.0067, "step": 36350 }, { "epoch": 8.049862374365219, "grad_norm": 0.09967691451311111, "learning_rate": 5.894134026273686e-06, "loss": 0.0056, "step": 36360 }, { "epoch": 8.052368218071049, "grad_norm": 0.12476783990859985, "learning_rate": 5.8845803994579e-06, "loss": 0.0053, "step": 36370 }, { "epoch": 8.054874061776879, "grad_norm": 0.14671072363853455, "learning_rate": 5.875033185809669e-06, "loss": 0.0066, "step": 36380 }, { "epoch": 8.057379905482708, "grad_norm": 0.09469853341579437, "learning_rate": 5.865492389666654e-06, "loss": 0.0075, "step": 36390 }, { "epoch": 8.059885749188538, "grad_norm": 0.08023320138454437, "learning_rate": 5.855958015363609e-06, "loss": 0.0046, "step": 36400 }, { "epoch": 8.062391592894366, "grad_norm": 0.0899844691157341, "learning_rate": 5.8464300672323474e-06, "loss": 0.0089, "step": 36410 }, { "epoch": 8.064897436600196, "grad_norm": 0.07275997847318649, "learning_rate": 5.836908549601785e-06, "loss": 0.0078, "step": 36420 }, { "epoch": 8.067403280306026, "grad_norm": 0.08382225781679153, "learning_rate": 5.827393466797897e-06, "loss": 0.0061, "step": 36430 }, { "epoch": 8.069909124011856, "grad_norm": 0.11877277493476868, "learning_rate": 5.817884823143749e-06, "loss": 0.0098, "step": 36440 }, { "epoch": 8.072414967717686, "grad_norm": 0.10599182546138763, "learning_rate": 5.808382622959481e-06, "loss": 0.0062, "step": 36450 }, { "epoch": 8.074920811423516, "grad_norm": 0.06825459748506546, "learning_rate": 5.798886870562292e-06, "loss": 0.0082, "step": 36460 }, { "epoch": 8.001506747608857, "grad_norm": 0.11879447847604752, "learning_rate": 5.789397570266471e-06, "loss": 0.0101, "step": 36470 }, { "epoch": 8.003836019274724, "grad_norm": 0.12246419489383698, "learning_rate": 5.779914726383353e-06, "loss": 0.0114, "step": 36480 }, { "epoch": 8.006165290940588, "grad_norm": 0.12006139755249023, "learning_rate": 5.7704383432213565e-06, "loss": 0.0131, "step": 36490 }, { "epoch": 8.008494562606455, "grad_norm": 0.08577100932598114, "learning_rate": 5.7609684250859685e-06, "loss": 0.012, "step": 36500 }, { "epoch": 8.01082383427232, "grad_norm": 0.25951510667800903, "learning_rate": 5.751504976279716e-06, "loss": 0.0149, "step": 36510 }, { "epoch": 8.013153105938187, "grad_norm": 0.10508269816637039, "learning_rate": 5.742048001102216e-06, "loss": 0.0114, "step": 36520 }, { "epoch": 8.015482377604053, "grad_norm": 0.1154923141002655, "learning_rate": 5.732597503850113e-06, "loss": 0.01, "step": 36530 }, { "epoch": 8.01781164926992, "grad_norm": 0.12685953080654144, "learning_rate": 5.7231534888171305e-06, "loss": 0.0143, "step": 36540 }, { "epoch": 8.020140920935784, "grad_norm": 0.08257848769426346, "learning_rate": 5.713715960294049e-06, "loss": 0.014, "step": 36550 }, { "epoch": 8.02247019260165, "grad_norm": 0.07002828270196915, "learning_rate": 5.704284922568679e-06, "loss": 0.0146, "step": 36560 }, { "epoch": 8.024799464267517, "grad_norm": 0.17091166973114014, "learning_rate": 5.694860379925913e-06, "loss": 0.0103, "step": 36570 }, { "epoch": 8.027128735933383, "grad_norm": 0.1278984546661377, "learning_rate": 5.685442336647666e-06, "loss": 0.0092, "step": 36580 }, { "epoch": 8.02945800759925, "grad_norm": 0.10749363154172897, "learning_rate": 5.676030797012908e-06, "loss": 0.0152, "step": 36590 }, { "epoch": 8.031787279265115, "grad_norm": 0.13892629742622375, "learning_rate": 5.666625765297665e-06, "loss": 0.016, "step": 36600 }, { "epoch": 8.03411655093098, "grad_norm": 0.18454644083976746, "learning_rate": 5.657227245774988e-06, "loss": 0.0139, "step": 36610 }, { "epoch": 8.036445822596846, "grad_norm": 0.08750554919242859, "learning_rate": 5.647835242714983e-06, "loss": 0.0128, "step": 36620 }, { "epoch": 8.038775094262713, "grad_norm": 0.14589829742908478, "learning_rate": 5.638449760384799e-06, "loss": 0.0157, "step": 36630 }, { "epoch": 8.041104365928579, "grad_norm": 0.09567399322986603, "learning_rate": 5.629070803048602e-06, "loss": 0.0131, "step": 36640 }, { "epoch": 8.043433637594445, "grad_norm": 0.11196539551019669, "learning_rate": 5.619698374967617e-06, "loss": 0.0141, "step": 36650 }, { "epoch": 8.045762909260311, "grad_norm": 0.13703450560569763, "learning_rate": 5.610332480400083e-06, "loss": 0.0174, "step": 36660 }, { "epoch": 8.048092180926176, "grad_norm": 0.07332141697406769, "learning_rate": 5.600973123601281e-06, "loss": 0.013, "step": 36670 }, { "epoch": 8.050421452592042, "grad_norm": 0.187479630112648, "learning_rate": 5.5916203088235265e-06, "loss": 0.0123, "step": 36680 }, { "epoch": 8.052750724257908, "grad_norm": 0.1683785617351532, "learning_rate": 5.582274040316147e-06, "loss": 0.0131, "step": 36690 }, { "epoch": 8.055079995923775, "grad_norm": 0.13342136144638062, "learning_rate": 5.572934322325509e-06, "loss": 0.0149, "step": 36700 }, { "epoch": 8.057409267589641, "grad_norm": 0.15211082994937897, "learning_rate": 5.563601159095004e-06, "loss": 0.0108, "step": 36710 }, { "epoch": 8.059738539255507, "grad_norm": 0.3627464175224304, "learning_rate": 5.554274554865029e-06, "loss": 0.0125, "step": 36720 }, { "epoch": 8.062067810921372, "grad_norm": 0.0994172990322113, "learning_rate": 5.544954513873024e-06, "loss": 0.0118, "step": 36730 }, { "epoch": 8.064397082587238, "grad_norm": 0.1593247503042221, "learning_rate": 5.5356410403534234e-06, "loss": 0.0182, "step": 36740 }, { "epoch": 8.066726354253104, "grad_norm": 0.142432302236557, "learning_rate": 5.526334138537696e-06, "loss": 0.0143, "step": 36750 }, { "epoch": 8.06905562591897, "grad_norm": 0.2176980823278427, "learning_rate": 5.517033812654322e-06, "loss": 0.0144, "step": 36760 }, { "epoch": 8.071384897584837, "grad_norm": 0.1857328861951828, "learning_rate": 5.507740066928782e-06, "loss": 0.0119, "step": 36770 }, { "epoch": 8.073714169250703, "grad_norm": 0.11951420456171036, "learning_rate": 5.4984529055835825e-06, "loss": 0.0105, "step": 36780 }, { "epoch": 8.076043440916568, "grad_norm": 0.17162741720676422, "learning_rate": 5.489172332838224e-06, "loss": 0.0103, "step": 36790 }, { "epoch": 8.078372712582434, "grad_norm": 0.14743854105472565, "learning_rate": 5.4798983529092295e-06, "loss": 0.0086, "step": 36800 }, { "epoch": 8.0807019842483, "grad_norm": 0.07326424866914749, "learning_rate": 5.470630970010107e-06, "loss": 0.0097, "step": 36810 }, { "epoch": 8.083031255914166, "grad_norm": 0.19134899973869324, "learning_rate": 5.461370188351389e-06, "loss": 0.0111, "step": 36820 }, { "epoch": 8.085360527580033, "grad_norm": 0.1377040594816208, "learning_rate": 5.452116012140589e-06, "loss": 0.0182, "step": 36830 }, { "epoch": 8.087689799245899, "grad_norm": 0.13084763288497925, "learning_rate": 5.442868445582237e-06, "loss": 0.0164, "step": 36840 }, { "epoch": 8.090019070911763, "grad_norm": 0.38048675656318665, "learning_rate": 5.433627492877842e-06, "loss": 0.0162, "step": 36850 }, { "epoch": 8.09234834257763, "grad_norm": 0.1344769448041916, "learning_rate": 5.424393158225927e-06, "loss": 0.0093, "step": 36860 }, { "epoch": 8.094677614243496, "grad_norm": 0.23385648429393768, "learning_rate": 5.415165445821986e-06, "loss": 0.012, "step": 36870 }, { "epoch": 8.097006885909362, "grad_norm": 0.09961674362421036, "learning_rate": 5.405944359858526e-06, "loss": 0.0125, "step": 36880 }, { "epoch": 8.099336157575229, "grad_norm": 0.10665493458509445, "learning_rate": 5.39672990452504e-06, "loss": 0.0134, "step": 36890 }, { "epoch": 8.101665429241095, "grad_norm": 0.17882850766181946, "learning_rate": 5.387522084007988e-06, "loss": 0.0147, "step": 36900 }, { "epoch": 8.10399470090696, "grad_norm": 0.1597972810268402, "learning_rate": 5.378320902490841e-06, "loss": 0.0091, "step": 36910 }, { "epoch": 8.106323972572826, "grad_norm": 0.16173647344112396, "learning_rate": 5.369126364154036e-06, "loss": 0.0151, "step": 36920 }, { "epoch": 8.108653244238692, "grad_norm": 0.1553219109773636, "learning_rate": 5.3599384731750015e-06, "loss": 0.0089, "step": 36930 }, { "epoch": 8.110982515904558, "grad_norm": 0.07567547261714935, "learning_rate": 5.3507572337281475e-06, "loss": 0.0114, "step": 36940 }, { "epoch": 8.113311787570424, "grad_norm": 0.12192735821008682, "learning_rate": 5.3415826499848485e-06, "loss": 0.011, "step": 36950 }, { "epoch": 8.11564105923629, "grad_norm": 0.0932469591498375, "learning_rate": 5.332414726113469e-06, "loss": 0.0104, "step": 36960 }, { "epoch": 8.117970330902157, "grad_norm": 0.09423872828483582, "learning_rate": 5.323253466279348e-06, "loss": 0.0143, "step": 36970 }, { "epoch": 8.120299602568021, "grad_norm": 0.1303485482931137, "learning_rate": 5.314098874644777e-06, "loss": 0.0126, "step": 36980 }, { "epoch": 8.122628874233888, "grad_norm": 0.07448077946901321, "learning_rate": 5.304950955369051e-06, "loss": 0.0152, "step": 36990 }, { "epoch": 8.124958145899754, "grad_norm": 0.0800182968378067, "learning_rate": 5.295809712608397e-06, "loss": 0.013, "step": 37000 }, { "epoch": 8.12728741756562, "grad_norm": 0.09941169619560242, "learning_rate": 5.286675150516034e-06, "loss": 0.0095, "step": 37010 }, { "epoch": 8.129616689231487, "grad_norm": 0.09216099232435226, "learning_rate": 5.277547273242143e-06, "loss": 0.0116, "step": 37020 }, { "epoch": 8.131945960897353, "grad_norm": 0.08617964386940002, "learning_rate": 5.26842608493386e-06, "loss": 0.0104, "step": 37030 }, { "epoch": 8.134275232563217, "grad_norm": 0.14095810055732727, "learning_rate": 5.2593115897352765e-06, "loss": 0.0103, "step": 37040 }, { "epoch": 8.136604504229084, "grad_norm": 0.09287010878324509, "learning_rate": 5.250203791787463e-06, "loss": 0.0075, "step": 37050 }, { "epoch": 8.13893377589495, "grad_norm": 0.15587285161018372, "learning_rate": 5.241102695228426e-06, "loss": 0.0141, "step": 37060 }, { "epoch": 8.141263047560816, "grad_norm": 0.10587888956069946, "learning_rate": 5.232008304193146e-06, "loss": 0.0136, "step": 37070 }, { "epoch": 8.143592319226682, "grad_norm": 0.11686287820339203, "learning_rate": 5.222920622813539e-06, "loss": 0.0142, "step": 37080 }, { "epoch": 8.145921590892549, "grad_norm": 0.14373035728931427, "learning_rate": 5.2138396552184866e-06, "loss": 0.0137, "step": 37090 }, { "epoch": 8.148250862558413, "grad_norm": 0.08771449327468872, "learning_rate": 5.204765405533818e-06, "loss": 0.0079, "step": 37100 }, { "epoch": 8.15058013422428, "grad_norm": 0.09355900436639786, "learning_rate": 5.1956978778823e-06, "loss": 0.009, "step": 37110 }, { "epoch": 8.152909405890146, "grad_norm": 0.15771321952342987, "learning_rate": 5.18663707638366e-06, "loss": 0.0144, "step": 37120 }, { "epoch": 8.155238677556012, "grad_norm": 0.17721092700958252, "learning_rate": 5.177583005154552e-06, "loss": 0.0108, "step": 37130 }, { "epoch": 8.157567949221878, "grad_norm": 0.14875639975070953, "learning_rate": 5.168535668308588e-06, "loss": 0.0125, "step": 37140 }, { "epoch": 8.159897220887744, "grad_norm": 0.10182561725378036, "learning_rate": 5.15949506995632e-06, "loss": 0.0129, "step": 37150 }, { "epoch": 8.162226492553609, "grad_norm": 0.21566340327262878, "learning_rate": 5.15046121420522e-06, "loss": 0.0122, "step": 37160 }, { "epoch": 8.164555764219475, "grad_norm": 0.09904821217060089, "learning_rate": 5.141434105159724e-06, "loss": 0.0128, "step": 37170 }, { "epoch": 8.166885035885342, "grad_norm": 0.11203067004680634, "learning_rate": 5.1324137469211745e-06, "loss": 0.0101, "step": 37180 }, { "epoch": 8.169214307551208, "grad_norm": 0.10090601444244385, "learning_rate": 5.123400143587866e-06, "loss": 0.0117, "step": 37190 }, { "epoch": 8.171543579217074, "grad_norm": 0.0839562714099884, "learning_rate": 5.114393299255027e-06, "loss": 0.0133, "step": 37200 }, { "epoch": 8.17387285088294, "grad_norm": 0.15589044988155365, "learning_rate": 5.105393218014792e-06, "loss": 0.012, "step": 37210 }, { "epoch": 8.176202122548805, "grad_norm": 0.11106444895267487, "learning_rate": 5.096399903956242e-06, "loss": 0.013, "step": 37220 }, { "epoch": 8.178531394214671, "grad_norm": 0.29027873277664185, "learning_rate": 5.087413361165388e-06, "loss": 0.0106, "step": 37230 }, { "epoch": 8.180860665880537, "grad_norm": 0.10504327714443207, "learning_rate": 5.078433593725143e-06, "loss": 0.013, "step": 37240 }, { "epoch": 8.183189937546404, "grad_norm": 0.17297294735908508, "learning_rate": 5.069460605715364e-06, "loss": 0.0101, "step": 37250 }, { "epoch": 8.18551920921227, "grad_norm": 0.1347963809967041, "learning_rate": 5.06049440121281e-06, "loss": 0.0129, "step": 37260 }, { "epoch": 8.187848480878136, "grad_norm": 0.11745607852935791, "learning_rate": 5.051534984291173e-06, "loss": 0.0119, "step": 37270 }, { "epoch": 8.190177752544, "grad_norm": 0.06386442482471466, "learning_rate": 5.042582359021044e-06, "loss": 0.0131, "step": 37280 }, { "epoch": 8.192507024209867, "grad_norm": 0.10630090534687042, "learning_rate": 5.033636529469949e-06, "loss": 0.0123, "step": 37290 }, { "epoch": 8.194836295875733, "grad_norm": 0.11526919156312943, "learning_rate": 5.0246974997023155e-06, "loss": 0.014, "step": 37300 }, { "epoch": 8.1971655675416, "grad_norm": 0.06162027269601822, "learning_rate": 5.015765273779469e-06, "loss": 0.008, "step": 37310 }, { "epoch": 8.199494839207466, "grad_norm": 0.07618357241153717, "learning_rate": 5.006839855759666e-06, "loss": 0.0136, "step": 37320 }, { "epoch": 8.201824110873332, "grad_norm": 0.08331146836280823, "learning_rate": 4.997921249698068e-06, "loss": 0.0105, "step": 37330 }, { "epoch": 8.204153382539197, "grad_norm": 0.09012570232152939, "learning_rate": 4.989009459646716e-06, "loss": 0.0123, "step": 37340 }, { "epoch": 8.206482654205063, "grad_norm": 0.0668463185429573, "learning_rate": 4.980104489654592e-06, "loss": 0.0121, "step": 37350 }, { "epoch": 8.20881192587093, "grad_norm": 0.1363411247730255, "learning_rate": 4.971206343767543e-06, "loss": 0.01, "step": 37360 }, { "epoch": 8.211141197536795, "grad_norm": 0.1022375226020813, "learning_rate": 4.962315026028339e-06, "loss": 0.0119, "step": 37370 }, { "epoch": 8.213470469202662, "grad_norm": 0.13801468908786774, "learning_rate": 4.953430540476649e-06, "loss": 0.0108, "step": 37380 }, { "epoch": 8.215799740868528, "grad_norm": 0.07171899825334549, "learning_rate": 4.944552891149017e-06, "loss": 0.0158, "step": 37390 }, { "epoch": 8.218129012534392, "grad_norm": 0.20009857416152954, "learning_rate": 4.935682082078899e-06, "loss": 0.0133, "step": 37400 }, { "epoch": 8.220458284200259, "grad_norm": 0.2048514038324356, "learning_rate": 4.926818117296643e-06, "loss": 0.0124, "step": 37410 }, { "epoch": 8.222787555866125, "grad_norm": 0.08166893571615219, "learning_rate": 4.917961000829474e-06, "loss": 0.0105, "step": 37420 }, { "epoch": 8.225116827531991, "grad_norm": 0.15065047144889832, "learning_rate": 4.909110736701523e-06, "loss": 0.0131, "step": 37430 }, { "epoch": 8.227446099197858, "grad_norm": 0.07617972791194916, "learning_rate": 4.90026732893379e-06, "loss": 0.0096, "step": 37440 }, { "epoch": 8.229775370863724, "grad_norm": 0.07835905998945236, "learning_rate": 4.891430781544173e-06, "loss": 0.0118, "step": 37450 }, { "epoch": 8.232104642529588, "grad_norm": 0.10101970285177231, "learning_rate": 4.882601098547453e-06, "loss": 0.0137, "step": 37460 }, { "epoch": 8.234433914195455, "grad_norm": 0.13296152651309967, "learning_rate": 4.873778283955277e-06, "loss": 0.009, "step": 37470 }, { "epoch": 8.23676318586132, "grad_norm": 0.21453188359737396, "learning_rate": 4.864962341776198e-06, "loss": 0.0119, "step": 37480 }, { "epoch": 8.239092457527187, "grad_norm": 0.11174515634775162, "learning_rate": 4.856153276015614e-06, "loss": 0.0123, "step": 37490 }, { "epoch": 8.241421729193053, "grad_norm": 0.11146574467420578, "learning_rate": 4.847351090675831e-06, "loss": 0.0106, "step": 37500 }, { "epoch": 8.24375100085892, "grad_norm": 0.1027582511305809, "learning_rate": 4.838555789756001e-06, "loss": 0.0105, "step": 37510 }, { "epoch": 8.246080272524784, "grad_norm": 0.10646221786737442, "learning_rate": 4.829767377252172e-06, "loss": 0.0098, "step": 37520 }, { "epoch": 8.24840954419065, "grad_norm": 0.152821883559227, "learning_rate": 4.820985857157241e-06, "loss": 0.0157, "step": 37530 }, { "epoch": 8.250738815856517, "grad_norm": 0.12725357711315155, "learning_rate": 4.812211233460995e-06, "loss": 0.0165, "step": 37540 }, { "epoch": 8.253068087522383, "grad_norm": 0.11219295859336853, "learning_rate": 4.8034435101500655e-06, "loss": 0.0176, "step": 37550 }, { "epoch": 8.25539735918825, "grad_norm": 0.09755301475524902, "learning_rate": 4.794682691207968e-06, "loss": 0.0109, "step": 37560 }, { "epoch": 8.257726630854116, "grad_norm": 0.10923080891370773, "learning_rate": 4.785928780615068e-06, "loss": 0.0156, "step": 37570 }, { "epoch": 8.26005590251998, "grad_norm": 0.12990723550319672, "learning_rate": 4.777181782348596e-06, "loss": 0.0109, "step": 37580 }, { "epoch": 8.262385174185846, "grad_norm": 0.07622852921485901, "learning_rate": 4.768441700382653e-06, "loss": 0.013, "step": 37590 }, { "epoch": 8.264714445851713, "grad_norm": 0.07600579410791397, "learning_rate": 4.759708538688177e-06, "loss": 0.0111, "step": 37600 }, { "epoch": 8.267043717517579, "grad_norm": 0.08386851847171783, "learning_rate": 4.750982301232982e-06, "loss": 0.0111, "step": 37610 }, { "epoch": 8.269372989183445, "grad_norm": 0.07044485956430435, "learning_rate": 4.742262991981717e-06, "loss": 0.0122, "step": 37620 }, { "epoch": 8.271702260849311, "grad_norm": 0.14278657734394073, "learning_rate": 4.733550614895896e-06, "loss": 0.0137, "step": 37630 }, { "epoch": 8.274031532515176, "grad_norm": 0.09531226009130478, "learning_rate": 4.72484517393389e-06, "loss": 0.0082, "step": 37640 }, { "epoch": 8.276360804181042, "grad_norm": 0.2593216896057129, "learning_rate": 4.716146673050896e-06, "loss": 0.0116, "step": 37650 }, { "epoch": 8.278690075846908, "grad_norm": 0.10296206921339035, "learning_rate": 4.707455116198978e-06, "loss": 0.0122, "step": 37660 }, { "epoch": 8.281019347512775, "grad_norm": 0.21677230298519135, "learning_rate": 4.69877050732704e-06, "loss": 0.0125, "step": 37670 }, { "epoch": 8.283348619178641, "grad_norm": 0.08269315212965012, "learning_rate": 4.69009285038082e-06, "loss": 0.0118, "step": 37680 }, { "epoch": 8.285677890844507, "grad_norm": 0.22145964205265045, "learning_rate": 4.681422149302916e-06, "loss": 0.013, "step": 37690 }, { "epoch": 8.288007162510372, "grad_norm": 0.11899950355291367, "learning_rate": 4.672758408032742e-06, "loss": 0.0162, "step": 37700 }, { "epoch": 8.290336434176238, "grad_norm": 0.07505667954683304, "learning_rate": 4.664101630506568e-06, "loss": 0.0112, "step": 37710 }, { "epoch": 8.292665705842104, "grad_norm": 0.0742870420217514, "learning_rate": 4.6554518206575e-06, "loss": 0.0078, "step": 37720 }, { "epoch": 8.29499497750797, "grad_norm": 0.14768987894058228, "learning_rate": 4.646808982415463e-06, "loss": 0.0104, "step": 37730 }, { "epoch": 8.297324249173837, "grad_norm": 0.12418001145124435, "learning_rate": 4.638173119707235e-06, "loss": 0.0122, "step": 37740 }, { "epoch": 8.299653520839703, "grad_norm": 0.10706186294555664, "learning_rate": 4.6295442364564e-06, "loss": 0.0113, "step": 37750 }, { "epoch": 8.301982792505568, "grad_norm": 0.07371841371059418, "learning_rate": 4.620922336583402e-06, "loss": 0.0093, "step": 37760 }, { "epoch": 8.304312064171434, "grad_norm": 0.11275073140859604, "learning_rate": 4.612307424005484e-06, "loss": 0.0102, "step": 37770 }, { "epoch": 8.3066413358373, "grad_norm": 0.07504423707723618, "learning_rate": 4.603699502636727e-06, "loss": 0.0137, "step": 37780 }, { "epoch": 8.308970607503166, "grad_norm": 0.11118856817483902, "learning_rate": 4.595098576388033e-06, "loss": 0.0084, "step": 37790 }, { "epoch": 8.311299879169033, "grad_norm": 0.08760987222194672, "learning_rate": 4.586504649167136e-06, "loss": 0.013, "step": 37800 }, { "epoch": 8.313629150834899, "grad_norm": 0.08741356432437897, "learning_rate": 4.577917724878571e-06, "loss": 0.01, "step": 37810 }, { "epoch": 8.315958422500763, "grad_norm": 0.09229130297899246, "learning_rate": 4.5693378074237105e-06, "loss": 0.0138, "step": 37820 }, { "epoch": 8.31828769416663, "grad_norm": 0.11826887726783752, "learning_rate": 4.560764900700725e-06, "loss": 0.0099, "step": 37830 }, { "epoch": 8.320616965832496, "grad_norm": 0.1224520206451416, "learning_rate": 4.552199008604616e-06, "loss": 0.0101, "step": 37840 }, { "epoch": 8.322946237498362, "grad_norm": 0.10124773532152176, "learning_rate": 4.543640135027194e-06, "loss": 0.0113, "step": 37850 }, { "epoch": 8.325275509164229, "grad_norm": 0.0734691247344017, "learning_rate": 4.535088283857069e-06, "loss": 0.0094, "step": 37860 }, { "epoch": 8.327604780830095, "grad_norm": 0.10463391989469528, "learning_rate": 4.52654345897968e-06, "loss": 0.0134, "step": 37870 }, { "epoch": 8.329934052495961, "grad_norm": 0.046486470848321915, "learning_rate": 4.518005664277252e-06, "loss": 0.0098, "step": 37880 }, { "epoch": 8.332263324161826, "grad_norm": 0.10948356986045837, "learning_rate": 4.509474903628835e-06, "loss": 0.0094, "step": 37890 }, { "epoch": 8.334592595827692, "grad_norm": 0.09358666837215424, "learning_rate": 4.500951180910276e-06, "loss": 0.0083, "step": 37900 }, { "epoch": 8.336921867493558, "grad_norm": 0.1250772774219513, "learning_rate": 4.492434499994218e-06, "loss": 0.011, "step": 37910 }, { "epoch": 8.339251139159424, "grad_norm": 0.12322565913200378, "learning_rate": 4.483924864750113e-06, "loss": 0.0149, "step": 37920 }, { "epoch": 8.34158041082529, "grad_norm": 0.13675831258296967, "learning_rate": 4.475422279044215e-06, "loss": 0.0081, "step": 37930 }, { "epoch": 8.343909682491155, "grad_norm": 0.12592709064483643, "learning_rate": 4.4669267467395595e-06, "loss": 0.0136, "step": 37940 }, { "epoch": 8.346238954157021, "grad_norm": 0.10418612509965897, "learning_rate": 4.458438271695995e-06, "loss": 0.0101, "step": 37950 }, { "epoch": 8.348568225822888, "grad_norm": 0.14104723930358887, "learning_rate": 4.4499568577701506e-06, "loss": 0.0109, "step": 37960 }, { "epoch": 8.350897497488754, "grad_norm": 0.23583807051181793, "learning_rate": 4.4414825088154575e-06, "loss": 0.0099, "step": 37970 }, { "epoch": 8.35322676915462, "grad_norm": 0.12068284302949905, "learning_rate": 4.433015228682127e-06, "loss": 0.011, "step": 37980 }, { "epoch": 8.355556040820487, "grad_norm": 0.1149199828505516, "learning_rate": 4.424555021217169e-06, "loss": 0.0109, "step": 37990 }, { "epoch": 8.357885312486353, "grad_norm": 0.09298490732908249, "learning_rate": 4.416101890264368e-06, "loss": 0.0135, "step": 38000 }, { "epoch": 8.360214584152217, "grad_norm": 0.08467907458543777, "learning_rate": 4.407655839664311e-06, "loss": 0.0123, "step": 38010 }, { "epoch": 8.362543855818084, "grad_norm": 0.1811617910861969, "learning_rate": 4.3992168732543435e-06, "loss": 0.0096, "step": 38020 }, { "epoch": 8.36487312748395, "grad_norm": 0.0655340924859047, "learning_rate": 4.3907849948686195e-06, "loss": 0.0114, "step": 38030 }, { "epoch": 8.367202399149816, "grad_norm": 0.1623382866382599, "learning_rate": 4.38236020833805e-06, "loss": 0.0126, "step": 38040 }, { "epoch": 8.369531670815682, "grad_norm": 0.16154521703720093, "learning_rate": 4.373942517490335e-06, "loss": 0.0106, "step": 38050 }, { "epoch": 8.371860942481547, "grad_norm": 0.10698480904102325, "learning_rate": 4.365531926149955e-06, "loss": 0.0092, "step": 38060 }, { "epoch": 8.374190214147413, "grad_norm": 0.09431950002908707, "learning_rate": 4.357128438138152e-06, "loss": 0.0092, "step": 38070 }, { "epoch": 8.37651948581328, "grad_norm": 0.08799221366643906, "learning_rate": 4.348732057272953e-06, "loss": 0.0082, "step": 38080 }, { "epoch": 8.378848757479146, "grad_norm": 0.08785348385572433, "learning_rate": 4.340342787369145e-06, "loss": 0.0086, "step": 38090 }, { "epoch": 8.381178029145012, "grad_norm": 0.09684218466281891, "learning_rate": 4.33196063223829e-06, "loss": 0.0103, "step": 38100 }, { "epoch": 8.383507300810878, "grad_norm": 0.164109468460083, "learning_rate": 4.323585595688727e-06, "loss": 0.0089, "step": 38110 }, { "epoch": 8.385836572476745, "grad_norm": 0.18844327330589294, "learning_rate": 4.315217681525536e-06, "loss": 0.0113, "step": 38120 }, { "epoch": 8.388165844142609, "grad_norm": 0.12166304141283035, "learning_rate": 4.306856893550593e-06, "loss": 0.0093, "step": 38130 }, { "epoch": 8.390495115808475, "grad_norm": 0.10941187292337418, "learning_rate": 4.2985032355625014e-06, "loss": 0.0113, "step": 38140 }, { "epoch": 8.392824387474342, "grad_norm": 0.07419411092996597, "learning_rate": 4.2901567113566545e-06, "loss": 0.0105, "step": 38150 }, { "epoch": 8.395153659140208, "grad_norm": 0.07725989818572998, "learning_rate": 4.281817324725195e-06, "loss": 0.0075, "step": 38160 }, { "epoch": 8.397482930806074, "grad_norm": 0.09227379411458969, "learning_rate": 4.2734850794570094e-06, "loss": 0.0101, "step": 38170 }, { "epoch": 8.39981220247194, "grad_norm": 0.12847280502319336, "learning_rate": 4.265159979337759e-06, "loss": 0.0105, "step": 38180 }, { "epoch": 8.402141474137805, "grad_norm": 0.10236822813749313, "learning_rate": 4.2568420281498546e-06, "loss": 0.0083, "step": 38190 }, { "epoch": 8.404470745803671, "grad_norm": 0.08738196641206741, "learning_rate": 4.248531229672446e-06, "loss": 0.0098, "step": 38200 }, { "epoch": 8.406800017469537, "grad_norm": 0.1750573068857193, "learning_rate": 4.2402275876814514e-06, "loss": 0.0105, "step": 38210 }, { "epoch": 8.409129289135404, "grad_norm": 0.07588647305965424, "learning_rate": 4.231931105949525e-06, "loss": 0.0109, "step": 38220 }, { "epoch": 8.41145856080127, "grad_norm": 0.07072395831346512, "learning_rate": 4.223641788246062e-06, "loss": 0.0127, "step": 38230 }, { "epoch": 8.413787832467136, "grad_norm": 0.08830422908067703, "learning_rate": 4.215359638337228e-06, "loss": 0.0103, "step": 38240 }, { "epoch": 8.416117104133, "grad_norm": 0.07676402479410172, "learning_rate": 4.207084659985903e-06, "loss": 0.009, "step": 38250 }, { "epoch": 8.418446375798867, "grad_norm": 0.10720198601484299, "learning_rate": 4.198816856951733e-06, "loss": 0.0067, "step": 38260 }, { "epoch": 8.420775647464733, "grad_norm": 0.22229591012001038, "learning_rate": 4.190556232991081e-06, "loss": 0.0099, "step": 38270 }, { "epoch": 8.4231049191306, "grad_norm": 0.17615778744220734, "learning_rate": 4.182302791857067e-06, "loss": 0.0127, "step": 38280 }, { "epoch": 8.425434190796466, "grad_norm": 0.18549568951129913, "learning_rate": 4.1740565372995445e-06, "loss": 0.0117, "step": 38290 }, { "epoch": 8.427763462462332, "grad_norm": 0.09072698652744293, "learning_rate": 4.165817473065088e-06, "loss": 0.0098, "step": 38300 }, { "epoch": 8.430092734128197, "grad_norm": 0.09980200231075287, "learning_rate": 4.1575856028970256e-06, "loss": 0.0071, "step": 38310 }, { "epoch": 8.432422005794063, "grad_norm": 0.11127863079309464, "learning_rate": 4.149360930535395e-06, "loss": 0.0112, "step": 38320 }, { "epoch": 8.43475127745993, "grad_norm": 0.10821820050477982, "learning_rate": 4.141143459716983e-06, "loss": 0.0121, "step": 38330 }, { "epoch": 8.437080549125795, "grad_norm": 0.12421075999736786, "learning_rate": 4.132933194175299e-06, "loss": 0.0096, "step": 38340 }, { "epoch": 8.439409820791662, "grad_norm": 0.13131538033485413, "learning_rate": 4.124730137640569e-06, "loss": 0.0113, "step": 38350 }, { "epoch": 8.441739092457528, "grad_norm": 0.10658397525548935, "learning_rate": 4.116534293839751e-06, "loss": 0.0103, "step": 38360 }, { "epoch": 8.444068364123392, "grad_norm": 0.17261965572834015, "learning_rate": 4.108345666496538e-06, "loss": 0.0109, "step": 38370 }, { "epoch": 8.446397635789259, "grad_norm": 0.16871008276939392, "learning_rate": 4.100164259331316e-06, "loss": 0.0108, "step": 38380 }, { "epoch": 8.448726907455125, "grad_norm": 0.06752187758684158, "learning_rate": 4.091990076061221e-06, "loss": 0.01, "step": 38390 }, { "epoch": 8.451056179120991, "grad_norm": 0.16480953991413116, "learning_rate": 4.083823120400079e-06, "loss": 0.0072, "step": 38400 }, { "epoch": 8.453385450786858, "grad_norm": 0.19792857766151428, "learning_rate": 4.075663396058453e-06, "loss": 0.01, "step": 38410 }, { "epoch": 8.455714722452724, "grad_norm": 0.2038603574037552, "learning_rate": 4.067510906743621e-06, "loss": 0.0135, "step": 38420 }, { "epoch": 8.458043994118588, "grad_norm": 0.15148340165615082, "learning_rate": 4.059365656159553e-06, "loss": 0.0091, "step": 38430 }, { "epoch": 8.460373265784455, "grad_norm": 0.08877284079790115, "learning_rate": 4.051227648006955e-06, "loss": 0.011, "step": 38440 }, { "epoch": 8.46270253745032, "grad_norm": 0.07518044859170914, "learning_rate": 4.043096885983222e-06, "loss": 0.0101, "step": 38450 }, { "epoch": 8.465031809116187, "grad_norm": 0.09434840083122253, "learning_rate": 4.034973373782474e-06, "loss": 0.0099, "step": 38460 }, { "epoch": 8.467361080782053, "grad_norm": 0.09662331640720367, "learning_rate": 4.026857115095517e-06, "loss": 0.0083, "step": 38470 }, { "epoch": 8.46969035244792, "grad_norm": 0.11509634554386139, "learning_rate": 4.01874811360989e-06, "loss": 0.0127, "step": 38480 }, { "epoch": 8.472019624113784, "grad_norm": 0.08532702922821045, "learning_rate": 4.010646373009801e-06, "loss": 0.0119, "step": 38490 }, { "epoch": 8.47434889577965, "grad_norm": 0.11313314735889435, "learning_rate": 4.002551896976192e-06, "loss": 0.0095, "step": 38500 }, { "epoch": 8.476678167445517, "grad_norm": 0.11968868225812912, "learning_rate": 3.9944646891866765e-06, "loss": 0.0103, "step": 38510 }, { "epoch": 8.479007439111383, "grad_norm": 0.1281035989522934, "learning_rate": 3.986384753315586e-06, "loss": 0.0094, "step": 38520 }, { "epoch": 8.48133671077725, "grad_norm": 0.1381305456161499, "learning_rate": 3.978312093033933e-06, "loss": 0.0115, "step": 38530 }, { "epoch": 8.483665982443116, "grad_norm": 0.20146633684635162, "learning_rate": 3.970246712009438e-06, "loss": 0.013, "step": 38540 }, { "epoch": 8.48599525410898, "grad_norm": 0.08612987399101257, "learning_rate": 3.96218861390651e-06, "loss": 0.0083, "step": 38550 }, { "epoch": 8.488324525774846, "grad_norm": 0.08961211144924164, "learning_rate": 3.954137802386236e-06, "loss": 0.0095, "step": 38560 }, { "epoch": 8.490653797440713, "grad_norm": 0.11083094775676727, "learning_rate": 3.946094281106416e-06, "loss": 0.0088, "step": 38570 }, { "epoch": 8.492983069106579, "grad_norm": 0.08478772640228271, "learning_rate": 3.938058053721512e-06, "loss": 0.0095, "step": 38580 }, { "epoch": 8.495312340772445, "grad_norm": 0.08116518706083298, "learning_rate": 3.930029123882695e-06, "loss": 0.0117, "step": 38590 }, { "epoch": 8.497641612438311, "grad_norm": 0.14028479158878326, "learning_rate": 3.922007495237812e-06, "loss": 0.0092, "step": 38600 }, { "epoch": 8.499970884104176, "grad_norm": 0.08299793303012848, "learning_rate": 3.91399317143138e-06, "loss": 0.0089, "step": 38610 }, { "epoch": 8.502300155770042, "grad_norm": 0.18364012241363525, "learning_rate": 3.90598615610462e-06, "loss": 0.0089, "step": 38620 }, { "epoch": 8.504629427435908, "grad_norm": 0.09950145334005356, "learning_rate": 3.897986452895419e-06, "loss": 0.0091, "step": 38630 }, { "epoch": 8.506958699101775, "grad_norm": 0.0789848044514656, "learning_rate": 3.889994065438339e-06, "loss": 0.0113, "step": 38640 }, { "epoch": 8.509287970767641, "grad_norm": 0.10152124613523483, "learning_rate": 3.8820089973646305e-06, "loss": 0.009, "step": 38650 }, { "epoch": 8.511617242433507, "grad_norm": 0.18167467415332794, "learning_rate": 3.874031252302202e-06, "loss": 0.0069, "step": 38660 }, { "epoch": 8.513946514099372, "grad_norm": 0.10490066558122635, "learning_rate": 3.866060833875646e-06, "loss": 0.0076, "step": 38670 }, { "epoch": 8.516275785765238, "grad_norm": 0.08484331518411636, "learning_rate": 3.858097745706235e-06, "loss": 0.008, "step": 38680 }, { "epoch": 8.518605057431104, "grad_norm": 0.09764789044857025, "learning_rate": 3.850141991411893e-06, "loss": 0.0085, "step": 38690 }, { "epoch": 8.52093432909697, "grad_norm": 0.09953711926937103, "learning_rate": 3.842193574607212e-06, "loss": 0.0089, "step": 38700 }, { "epoch": 8.523263600762837, "grad_norm": 0.08627825230360031, "learning_rate": 3.834252498903468e-06, "loss": 0.0102, "step": 38710 }, { "epoch": 8.525592872428703, "grad_norm": 0.09826690703630447, "learning_rate": 3.826318767908585e-06, "loss": 0.0062, "step": 38720 }, { "epoch": 8.527922144094568, "grad_norm": 0.08482235670089722, "learning_rate": 3.818392385227161e-06, "loss": 0.0103, "step": 38730 }, { "epoch": 8.530251415760434, "grad_norm": 0.12070261687040329, "learning_rate": 3.8104733544604445e-06, "loss": 0.0107, "step": 38740 }, { "epoch": 8.5325806874263, "grad_norm": 0.10835502296686172, "learning_rate": 3.8025616792063536e-06, "loss": 0.0094, "step": 38750 }, { "epoch": 8.534909959092166, "grad_norm": 0.10025788843631744, "learning_rate": 3.7946573630594662e-06, "loss": 0.0114, "step": 38760 }, { "epoch": 8.537239230758033, "grad_norm": 0.1381930708885193, "learning_rate": 3.786760409611001e-06, "loss": 0.0116, "step": 38770 }, { "epoch": 8.539568502423899, "grad_norm": 0.10932258516550064, "learning_rate": 3.77887082244885e-06, "loss": 0.0078, "step": 38780 }, { "epoch": 8.541897774089765, "grad_norm": 0.05206163227558136, "learning_rate": 3.770988605157544e-06, "loss": 0.0084, "step": 38790 }, { "epoch": 8.54422704575563, "grad_norm": 0.06809955090284348, "learning_rate": 3.7631137613182733e-06, "loss": 0.0109, "step": 38800 }, { "epoch": 8.546556317421496, "grad_norm": 0.07114329934120178, "learning_rate": 3.755246294508883e-06, "loss": 0.008, "step": 38810 }, { "epoch": 8.548885589087362, "grad_norm": 0.12385844439268112, "learning_rate": 3.7473862083038513e-06, "loss": 0.0117, "step": 38820 }, { "epoch": 8.551214860753229, "grad_norm": 0.11855867505073547, "learning_rate": 3.7395335062743177e-06, "loss": 0.0102, "step": 38830 }, { "epoch": 8.553544132419095, "grad_norm": 0.07976050674915314, "learning_rate": 3.7316881919880566e-06, "loss": 0.0108, "step": 38840 }, { "epoch": 8.55587340408496, "grad_norm": 0.07922257483005524, "learning_rate": 3.72385026900949e-06, "loss": 0.0102, "step": 38850 }, { "epoch": 8.558202675750826, "grad_norm": 0.12118888646364212, "learning_rate": 3.71601974089969e-06, "loss": 0.0123, "step": 38860 }, { "epoch": 8.560531947416692, "grad_norm": 0.09550791233778, "learning_rate": 3.7081966112163504e-06, "loss": 0.0095, "step": 38870 }, { "epoch": 8.562861219082558, "grad_norm": 0.1227182000875473, "learning_rate": 3.7003808835138165e-06, "loss": 0.0103, "step": 38880 }, { "epoch": 8.565190490748424, "grad_norm": 0.0703873559832573, "learning_rate": 3.6925725613430775e-06, "loss": 0.0108, "step": 38890 }, { "epoch": 8.56751976241429, "grad_norm": 0.0769476518034935, "learning_rate": 3.684771648251737e-06, "loss": 0.0086, "step": 38900 }, { "epoch": 8.569849034080157, "grad_norm": 0.08700913935899734, "learning_rate": 3.6769781477840537e-06, "loss": 0.0087, "step": 38910 }, { "epoch": 8.572178305746021, "grad_norm": 0.11807127296924591, "learning_rate": 3.6691920634808975e-06, "loss": 0.0079, "step": 38920 }, { "epoch": 8.10196251837016, "grad_norm": 0.108480304479599, "learning_rate": 3.661413398879794e-06, "loss": 0.007, "step": 38930 }, { "epoch": 8.104043386092, "grad_norm": 0.1187518760561943, "learning_rate": 3.6536421575148694e-06, "loss": 0.0089, "step": 38940 }, { "epoch": 8.10612425381384, "grad_norm": 0.08972402662038803, "learning_rate": 3.6458783429169044e-06, "loss": 0.0096, "step": 38950 }, { "epoch": 8.108205121535681, "grad_norm": 0.178303062915802, "learning_rate": 3.6381219586132876e-06, "loss": 0.0124, "step": 38960 }, { "epoch": 8.11028598925752, "grad_norm": 0.0844849944114685, "learning_rate": 3.630373008128032e-06, "loss": 0.0081, "step": 38970 }, { "epoch": 8.11236685697936, "grad_norm": 0.10633397847414017, "learning_rate": 3.622631494981781e-06, "loss": 0.0111, "step": 38980 }, { "epoch": 8.1144477247012, "grad_norm": 0.12151327729225159, "learning_rate": 3.6148974226918032e-06, "loss": 0.0132, "step": 38990 }, { "epoch": 8.116528592423041, "grad_norm": 0.12500080466270447, "learning_rate": 3.6071707947719703e-06, "loss": 0.012, "step": 39000 }, { "epoch": 8.11860946014488, "grad_norm": 0.10087081789970398, "learning_rate": 3.599451614732785e-06, "loss": 0.0099, "step": 39010 }, { "epoch": 8.12069032786672, "grad_norm": 0.06971297413110733, "learning_rate": 3.5917398860813646e-06, "loss": 0.0098, "step": 39020 }, { "epoch": 8.12277119558856, "grad_norm": 0.07277083396911621, "learning_rate": 3.5840356123214303e-06, "loss": 0.0099, "step": 39030 }, { "epoch": 8.124852063310401, "grad_norm": 0.08586494624614716, "learning_rate": 3.5763387969533357e-06, "loss": 0.009, "step": 39040 }, { "epoch": 8.12693293103224, "grad_norm": 0.1360335499048233, "learning_rate": 3.5686494434740216e-06, "loss": 0.0106, "step": 39050 }, { "epoch": 8.12901379875408, "grad_norm": 0.05976837873458862, "learning_rate": 3.560967555377057e-06, "loss": 0.01, "step": 39060 }, { "epoch": 8.13109466647592, "grad_norm": 0.08704391866922379, "learning_rate": 3.55329313615262e-06, "loss": 0.0119, "step": 39070 }, { "epoch": 8.133175534197761, "grad_norm": 0.09598387032747269, "learning_rate": 3.5456261892874766e-06, "loss": 0.0088, "step": 39080 }, { "epoch": 8.1352564019196, "grad_norm": 0.07760238647460938, "learning_rate": 3.5379667182650222e-06, "loss": 0.0079, "step": 39090 }, { "epoch": 8.13733726964144, "grad_norm": 0.07039795070886612, "learning_rate": 3.5303147265652316e-06, "loss": 0.0104, "step": 39100 }, { "epoch": 8.13941813736328, "grad_norm": 0.07495097070932388, "learning_rate": 3.522670217664701e-06, "loss": 0.0071, "step": 39110 }, { "epoch": 8.141499005085121, "grad_norm": 0.06956373900175095, "learning_rate": 3.515033195036619e-06, "loss": 0.0058, "step": 39120 }, { "epoch": 8.14357987280696, "grad_norm": 0.07311958074569702, "learning_rate": 3.507403662150768e-06, "loss": 0.0099, "step": 39130 }, { "epoch": 8.1456607405288, "grad_norm": 0.13693368434906006, "learning_rate": 3.4997816224735392e-06, "loss": 0.0076, "step": 39140 }, { "epoch": 8.14774160825064, "grad_norm": 0.08581458777189255, "learning_rate": 3.492167079467905e-06, "loss": 0.0104, "step": 39150 }, { "epoch": 8.149822475972481, "grad_norm": 0.12251931428909302, "learning_rate": 3.4845600365934473e-06, "loss": 0.0084, "step": 39160 }, { "epoch": 8.15190334369432, "grad_norm": 0.08758221566677094, "learning_rate": 3.4769604973063253e-06, "loss": 0.0101, "step": 39170 }, { "epoch": 8.15398421141616, "grad_norm": 0.10404974222183228, "learning_rate": 3.4693684650593063e-06, "loss": 0.0071, "step": 39180 }, { "epoch": 8.156065079138001, "grad_norm": 0.1388026773929596, "learning_rate": 3.461783943301724e-06, "loss": 0.011, "step": 39190 }, { "epoch": 8.158145946859841, "grad_norm": 0.05257052555680275, "learning_rate": 3.454206935479525e-06, "loss": 0.0081, "step": 39200 }, { "epoch": 8.16022681458168, "grad_norm": 0.17370820045471191, "learning_rate": 3.446637445035219e-06, "loss": 0.0087, "step": 39210 }, { "epoch": 8.16230768230352, "grad_norm": 0.07340814173221588, "learning_rate": 3.4390754754079204e-06, "loss": 0.0075, "step": 39220 }, { "epoch": 8.164388550025361, "grad_norm": 0.19034267961978912, "learning_rate": 3.431521030033309e-06, "loss": 0.0114, "step": 39230 }, { "epoch": 8.166469417747201, "grad_norm": 0.1110968142747879, "learning_rate": 3.4239741123436598e-06, "loss": 0.0122, "step": 39240 }, { "epoch": 8.16855028546904, "grad_norm": 0.0689028799533844, "learning_rate": 3.416434725767823e-06, "loss": 0.009, "step": 39250 }, { "epoch": 8.17063115319088, "grad_norm": 0.11848253756761551, "learning_rate": 3.4089028737312213e-06, "loss": 0.0104, "step": 39260 }, { "epoch": 8.172712020912721, "grad_norm": 0.12904392182826996, "learning_rate": 3.4013785596558657e-06, "loss": 0.0093, "step": 39270 }, { "epoch": 8.17479288863456, "grad_norm": 0.06788501143455505, "learning_rate": 3.3938617869603265e-06, "loss": 0.008, "step": 39280 }, { "epoch": 8.1768737563564, "grad_norm": 0.09239306300878525, "learning_rate": 3.3863525590597623e-06, "loss": 0.0068, "step": 39290 }, { "epoch": 8.17895462407824, "grad_norm": 0.0753227025270462, "learning_rate": 3.378850879365905e-06, "loss": 0.0076, "step": 39300 }, { "epoch": 8.181035491800081, "grad_norm": 0.11120042204856873, "learning_rate": 3.3713567512870404e-06, "loss": 0.0087, "step": 39310 }, { "epoch": 8.18311635952192, "grad_norm": 0.07831035554409027, "learning_rate": 3.363870178228037e-06, "loss": 0.0083, "step": 39320 }, { "epoch": 8.18519722724376, "grad_norm": 0.10565684735774994, "learning_rate": 3.3563911635903337e-06, "loss": 0.0092, "step": 39330 }, { "epoch": 8.1872780949656, "grad_norm": 0.11926842480897903, "learning_rate": 3.3489197107719163e-06, "loss": 0.0083, "step": 39340 }, { "epoch": 8.189358962687441, "grad_norm": 0.07081342488527298, "learning_rate": 3.341455823167361e-06, "loss": 0.0093, "step": 39350 }, { "epoch": 8.19143983040928, "grad_norm": 0.10901276022195816, "learning_rate": 3.333999504167782e-06, "loss": 0.0087, "step": 39360 }, { "epoch": 8.19352069813112, "grad_norm": 0.25644591450691223, "learning_rate": 3.3265507571608736e-06, "loss": 0.0106, "step": 39370 }, { "epoch": 8.19560156585296, "grad_norm": 0.18657177686691284, "learning_rate": 3.3191095855308818e-06, "loss": 0.0089, "step": 39380 }, { "epoch": 8.197682433574801, "grad_norm": 0.1823728382587433, "learning_rate": 3.3116759926586072e-06, "loss": 0.01, "step": 39390 }, { "epoch": 8.19976330129664, "grad_norm": 0.13576056063175201, "learning_rate": 3.304249981921419e-06, "loss": 0.0084, "step": 39400 }, { "epoch": 8.20184416901848, "grad_norm": 0.0789337083697319, "learning_rate": 3.2968315566932296e-06, "loss": 0.0093, "step": 39410 }, { "epoch": 8.203925036740321, "grad_norm": 0.18120600283145905, "learning_rate": 3.289420720344505e-06, "loss": 0.0081, "step": 39420 }, { "epoch": 8.206005904462161, "grad_norm": 0.07626734673976898, "learning_rate": 3.2820174762422742e-06, "loss": 0.0081, "step": 39430 }, { "epoch": 8.208086772184, "grad_norm": 0.06745655089616776, "learning_rate": 3.274621827750106e-06, "loss": 0.0095, "step": 39440 }, { "epoch": 8.21016763990584, "grad_norm": 0.1232304647564888, "learning_rate": 3.2672337782281227e-06, "loss": 0.0083, "step": 39450 }, { "epoch": 8.212248507627681, "grad_norm": 0.19050776958465576, "learning_rate": 3.2598533310330005e-06, "loss": 0.011, "step": 39460 }, { "epoch": 8.214329375349521, "grad_norm": 0.219247967004776, "learning_rate": 3.2524804895179463e-06, "loss": 0.0113, "step": 39470 }, { "epoch": 8.21641024307136, "grad_norm": 0.07498574256896973, "learning_rate": 3.245115257032727e-06, "loss": 0.0097, "step": 39480 }, { "epoch": 8.2184911107932, "grad_norm": 0.20692689716815948, "learning_rate": 3.2377576369236376e-06, "loss": 0.0082, "step": 39490 }, { "epoch": 8.220571978515041, "grad_norm": 0.13649766147136688, "learning_rate": 3.2304076325335276e-06, "loss": 0.0083, "step": 39500 }, { "epoch": 8.222652846236882, "grad_norm": 0.13850030303001404, "learning_rate": 3.2230652472017865e-06, "loss": 0.0085, "step": 39510 }, { "epoch": 8.22473371395872, "grad_norm": 0.15147647261619568, "learning_rate": 3.2157304842643255e-06, "loss": 0.0088, "step": 39520 }, { "epoch": 8.22681458168056, "grad_norm": 0.1927219182252884, "learning_rate": 3.2084033470536147e-06, "loss": 0.0113, "step": 39530 }, { "epoch": 8.228895449402401, "grad_norm": 0.0855102464556694, "learning_rate": 3.20108383889864e-06, "loss": 0.0105, "step": 39540 }, { "epoch": 8.230976317124242, "grad_norm": 0.12316803634166718, "learning_rate": 3.1937719631249343e-06, "loss": 0.0101, "step": 39550 }, { "epoch": 8.23305718484608, "grad_norm": 0.07993479818105698, "learning_rate": 3.1864677230545627e-06, "loss": 0.0068, "step": 39560 }, { "epoch": 8.23513805256792, "grad_norm": 0.08533801138401031, "learning_rate": 3.1791711220061063e-06, "loss": 0.0113, "step": 39570 }, { "epoch": 8.237218920289761, "grad_norm": 0.13122907280921936, "learning_rate": 3.171882163294695e-06, "loss": 0.009, "step": 39580 }, { "epoch": 8.239299788011602, "grad_norm": 0.07828789204359055, "learning_rate": 3.1646008502319783e-06, "loss": 0.0096, "step": 39590 }, { "epoch": 8.24138065573344, "grad_norm": 0.09481694549322128, "learning_rate": 3.1573271861261244e-06, "loss": 0.0103, "step": 39600 }, { "epoch": 8.24346152345528, "grad_norm": 0.1552610695362091, "learning_rate": 3.1500611742818397e-06, "loss": 0.013, "step": 39610 }, { "epoch": 8.245542391177121, "grad_norm": 0.0785442665219307, "learning_rate": 3.1428028180003435e-06, "loss": 0.0142, "step": 39620 }, { "epoch": 8.247623258898962, "grad_norm": 0.09240670502185822, "learning_rate": 3.1355521205793837e-06, "loss": 0.0082, "step": 39630 }, { "epoch": 8.2497041266208, "grad_norm": 0.0899609699845314, "learning_rate": 3.1283090853132214e-06, "loss": 0.0076, "step": 39640 }, { "epoch": 8.251784994342641, "grad_norm": 0.3175401985645294, "learning_rate": 3.1210737154926485e-06, "loss": 0.0123, "step": 39650 }, { "epoch": 8.253865862064481, "grad_norm": 0.14399005472660065, "learning_rate": 3.1138460144049554e-06, "loss": 0.0076, "step": 39660 }, { "epoch": 8.25594672978632, "grad_norm": 0.1289641261100769, "learning_rate": 3.106625985333971e-06, "loss": 0.0111, "step": 39670 }, { "epoch": 8.25802759750816, "grad_norm": 0.07762686908245087, "learning_rate": 3.0994136315600174e-06, "loss": 0.011, "step": 39680 }, { "epoch": 8.260108465230001, "grad_norm": 0.12456537038087845, "learning_rate": 3.0922089563599455e-06, "loss": 0.0096, "step": 39690 }, { "epoch": 8.262189332951841, "grad_norm": 0.07417621463537216, "learning_rate": 3.0850119630071053e-06, "loss": 0.0106, "step": 39700 }, { "epoch": 8.26427020067368, "grad_norm": 0.09287869185209274, "learning_rate": 3.077822654771363e-06, "loss": 0.0066, "step": 39710 }, { "epoch": 8.26635106839552, "grad_norm": 0.11014819890260696, "learning_rate": 3.0706410349191018e-06, "loss": 0.0086, "step": 39720 }, { "epoch": 8.268431936117361, "grad_norm": 0.09801095724105835, "learning_rate": 3.0634671067131893e-06, "loss": 0.009, "step": 39730 }, { "epoch": 8.270512803839202, "grad_norm": 0.10956773906946182, "learning_rate": 3.056300873413025e-06, "loss": 0.0073, "step": 39740 }, { "epoch": 8.27259367156104, "grad_norm": 0.09401309490203857, "learning_rate": 3.0491423382744846e-06, "loss": 0.0098, "step": 39750 }, { "epoch": 8.27467453928288, "grad_norm": 0.08576271682977676, "learning_rate": 3.0419915045499704e-06, "loss": 0.009, "step": 39760 }, { "epoch": 8.276755407004721, "grad_norm": 0.07668405026197433, "learning_rate": 3.034848375488375e-06, "loss": 0.0149, "step": 39770 }, { "epoch": 8.278836274726562, "grad_norm": 0.10046751797199249, "learning_rate": 3.027712954335087e-06, "loss": 0.0093, "step": 39780 }, { "epoch": 8.2809171424484, "grad_norm": 0.1255587488412857, "learning_rate": 3.020585244332006e-06, "loss": 0.0082, "step": 39790 }, { "epoch": 8.28299801017024, "grad_norm": 0.052804674953222275, "learning_rate": 3.0134652487175087e-06, "loss": 0.0103, "step": 39800 }, { "epoch": 8.285078877892081, "grad_norm": 0.09668810665607452, "learning_rate": 3.0063529707264807e-06, "loss": 0.0099, "step": 39810 }, { "epoch": 8.287159745613922, "grad_norm": 0.09276807308197021, "learning_rate": 2.9992484135903055e-06, "loss": 0.011, "step": 39820 }, { "epoch": 8.28924061333576, "grad_norm": 0.09371642023324966, "learning_rate": 2.992151580536842e-06, "loss": 0.0059, "step": 39830 }, { "epoch": 8.2913214810576, "grad_norm": 0.10554361343383789, "learning_rate": 2.9850624747904546e-06, "loss": 0.0076, "step": 39840 }, { "epoch": 8.293402348779441, "grad_norm": 0.09306617081165314, "learning_rate": 2.9779810995719915e-06, "loss": 0.0061, "step": 39850 }, { "epoch": 8.295483216501282, "grad_norm": 0.20978741347789764, "learning_rate": 2.970907458098784e-06, "loss": 0.0093, "step": 39860 }, { "epoch": 8.29756408422312, "grad_norm": 0.10992341488599777, "learning_rate": 2.9638415535846634e-06, "loss": 0.0096, "step": 39870 }, { "epoch": 8.29964495194496, "grad_norm": 0.0682918131351471, "learning_rate": 2.956783389239928e-06, "loss": 0.0093, "step": 39880 }, { "epoch": 8.301725819666801, "grad_norm": 0.06333421915769577, "learning_rate": 2.9497329682713682e-06, "loss": 0.0057, "step": 39890 }, { "epoch": 8.303806687388642, "grad_norm": 0.07306152582168579, "learning_rate": 2.942690293882262e-06, "loss": 0.0071, "step": 39900 }, { "epoch": 8.30588755511048, "grad_norm": 0.09793873131275177, "learning_rate": 2.9356553692723544e-06, "loss": 0.0074, "step": 39910 }, { "epoch": 8.307968422832321, "grad_norm": 0.11305315047502518, "learning_rate": 2.928628197637886e-06, "loss": 0.0084, "step": 39920 }, { "epoch": 8.310049290554161, "grad_norm": 0.0749354362487793, "learning_rate": 2.9216087821715568e-06, "loss": 0.0058, "step": 39930 }, { "epoch": 8.312130158276002, "grad_norm": 0.06444257497787476, "learning_rate": 2.914597126062557e-06, "loss": 0.0075, "step": 39940 }, { "epoch": 8.31421102599784, "grad_norm": 0.07344274967908859, "learning_rate": 2.9075932324965505e-06, "loss": 0.0073, "step": 39950 }, { "epoch": 8.316291893719681, "grad_norm": 0.07931576669216156, "learning_rate": 2.90059710465566e-06, "loss": 0.0085, "step": 39960 }, { "epoch": 8.318372761441521, "grad_norm": 0.08718324452638626, "learning_rate": 2.893608745718501e-06, "loss": 0.0065, "step": 39970 }, { "epoch": 8.320453629163362, "grad_norm": 0.08653713017702103, "learning_rate": 2.8866281588601386e-06, "loss": 0.0125, "step": 39980 }, { "epoch": 8.3225344968852, "grad_norm": 0.10869032144546509, "learning_rate": 2.8796553472521236e-06, "loss": 0.0082, "step": 39990 }, { "epoch": 8.324615364607041, "grad_norm": 0.05520116537809372, "learning_rate": 2.8726903140624673e-06, "loss": 0.0077, "step": 40000 }, { "epoch": 8.326696232328882, "grad_norm": 0.05552654340863228, "learning_rate": 2.865733062455642e-06, "loss": 0.0067, "step": 40010 }, { "epoch": 8.328777100050722, "grad_norm": 0.06271768361330032, "learning_rate": 2.8587835955925935e-06, "loss": 0.0074, "step": 40020 }, { "epoch": 8.33085796777256, "grad_norm": 0.09871833026409149, "learning_rate": 2.8518419166307308e-06, "loss": 0.0093, "step": 40030 }, { "epoch": 8.332938835494401, "grad_norm": 0.11982183158397675, "learning_rate": 2.8449080287239095e-06, "loss": 0.0067, "step": 40040 }, { "epoch": 8.335019703216242, "grad_norm": 0.10472586750984192, "learning_rate": 2.8379819350224667e-06, "loss": 0.0073, "step": 40050 }, { "epoch": 8.33710057093808, "grad_norm": 0.10039252787828445, "learning_rate": 2.8310636386731815e-06, "loss": 0.0088, "step": 40060 }, { "epoch": 8.33918143865992, "grad_norm": 0.09157045185565948, "learning_rate": 2.824153142819297e-06, "loss": 0.0082, "step": 40070 }, { "epoch": 8.341262306381761, "grad_norm": 0.11413531750440598, "learning_rate": 2.8172504506005194e-06, "loss": 0.0081, "step": 40080 }, { "epoch": 8.343343174103602, "grad_norm": 0.23642921447753906, "learning_rate": 2.8103555651529935e-06, "loss": 0.0116, "step": 40090 }, { "epoch": 8.34542404182544, "grad_norm": 0.07879970222711563, "learning_rate": 2.8034684896093333e-06, "loss": 0.007, "step": 40100 }, { "epoch": 8.34750490954728, "grad_norm": 0.2336500883102417, "learning_rate": 2.7965892270985896e-06, "loss": 0.0093, "step": 40110 }, { "epoch": 8.349585777269121, "grad_norm": 0.1591164618730545, "learning_rate": 2.7897177807462774e-06, "loss": 0.0085, "step": 40120 }, { "epoch": 8.351666644990962, "grad_norm": 0.17389264702796936, "learning_rate": 2.782854153674348e-06, "loss": 0.0092, "step": 40130 }, { "epoch": 8.3537475127128, "grad_norm": 0.0714581161737442, "learning_rate": 2.775998349001212e-06, "loss": 0.0084, "step": 40140 }, { "epoch": 8.355828380434641, "grad_norm": 0.08429581671953201, "learning_rate": 2.769150369841713e-06, "loss": 0.0049, "step": 40150 }, { "epoch": 8.357909248156481, "grad_norm": 0.10383068025112152, "learning_rate": 2.762310219307156e-06, "loss": 0.01, "step": 40160 }, { "epoch": 8.359990115878322, "grad_norm": 0.0737941563129425, "learning_rate": 2.755477900505268e-06, "loss": 0.0092, "step": 40170 }, { "epoch": 8.36207098360016, "grad_norm": 0.08407005667686462, "learning_rate": 2.748653416540239e-06, "loss": 0.0095, "step": 40180 }, { "epoch": 8.364151851322001, "grad_norm": 0.12894904613494873, "learning_rate": 2.7418367705126823e-06, "loss": 0.0086, "step": 40190 }, { "epoch": 8.366232719043841, "grad_norm": 0.14932626485824585, "learning_rate": 2.7350279655196587e-06, "loss": 0.01, "step": 40200 }, { "epoch": 8.368313586765682, "grad_norm": 0.0711284950375557, "learning_rate": 2.728227004654673e-06, "loss": 0.0068, "step": 40210 }, { "epoch": 8.37039445448752, "grad_norm": 0.1314389407634735, "learning_rate": 2.7214338910076476e-06, "loss": 0.0073, "step": 40220 }, { "epoch": 8.372475322209361, "grad_norm": 0.11747179180383682, "learning_rate": 2.7146486276649574e-06, "loss": 0.007, "step": 40230 }, { "epoch": 8.374556189931202, "grad_norm": 0.06717334687709808, "learning_rate": 2.707871217709399e-06, "loss": 0.0085, "step": 40240 }, { "epoch": 8.376637057653042, "grad_norm": 0.14525175094604492, "learning_rate": 2.701101664220205e-06, "loss": 0.0083, "step": 40250 }, { "epoch": 8.37871792537488, "grad_norm": 0.13546673953533173, "learning_rate": 2.6943399702730476e-06, "loss": 0.0084, "step": 40260 }, { "epoch": 8.380798793096721, "grad_norm": 0.11752260476350784, "learning_rate": 2.687586138940006e-06, "loss": 0.0084, "step": 40270 }, { "epoch": 8.382879660818562, "grad_norm": 0.10313417762517929, "learning_rate": 2.6808401732896096e-06, "loss": 0.0104, "step": 40280 }, { "epoch": 8.384960528540402, "grad_norm": 0.11375397443771362, "learning_rate": 2.6741020763868063e-06, "loss": 0.009, "step": 40290 }, { "epoch": 8.38704139626224, "grad_norm": 0.04842818155884743, "learning_rate": 2.667371851292959e-06, "loss": 0.0063, "step": 40300 }, { "epoch": 8.389122263984081, "grad_norm": 0.14587636291980743, "learning_rate": 2.660649501065875e-06, "loss": 0.0078, "step": 40310 }, { "epoch": 8.391203131705922, "grad_norm": 0.08283071964979172, "learning_rate": 2.6539350287597578e-06, "loss": 0.0086, "step": 40320 }, { "epoch": 8.393283999427762, "grad_norm": 0.1122986376285553, "learning_rate": 2.6472284374252533e-06, "loss": 0.0085, "step": 40330 }, { "epoch": 8.3953648671496, "grad_norm": 0.12925155460834503, "learning_rate": 2.6405297301094223e-06, "loss": 0.0093, "step": 40340 }, { "epoch": 8.397445734871441, "grad_norm": 0.12389827519655228, "learning_rate": 2.633838909855737e-06, "loss": 0.0103, "step": 40350 }, { "epoch": 8.399526602593282, "grad_norm": 0.06549548357725143, "learning_rate": 2.6271559797040814e-06, "loss": 0.0073, "step": 40360 }, { "epoch": 8.401607470315122, "grad_norm": 0.10706428438425064, "learning_rate": 2.620480942690775e-06, "loss": 0.0082, "step": 40370 }, { "epoch": 8.403688338036961, "grad_norm": 0.0621766559779644, "learning_rate": 2.6138138018485303e-06, "loss": 0.0124, "step": 40380 }, { "epoch": 8.405769205758801, "grad_norm": 0.11627908796072006, "learning_rate": 2.6071545602064885e-06, "loss": 0.0079, "step": 40390 }, { "epoch": 8.407850073480642, "grad_norm": 0.08130341023206711, "learning_rate": 2.600503220790185e-06, "loss": 0.0053, "step": 40400 }, { "epoch": 8.409930941202482, "grad_norm": 0.09405633062124252, "learning_rate": 2.5938597866215776e-06, "loss": 0.009, "step": 40410 }, { "epoch": 8.412011808924321, "grad_norm": 0.0996982678771019, "learning_rate": 2.587224260719037e-06, "loss": 0.0082, "step": 40420 }, { "epoch": 8.414092676646161, "grad_norm": 0.10272222012281418, "learning_rate": 2.5805966460973197e-06, "loss": 0.0105, "step": 40430 }, { "epoch": 8.416173544368002, "grad_norm": 0.07711566984653473, "learning_rate": 2.5739769457676155e-06, "loss": 0.0075, "step": 40440 }, { "epoch": 8.41825441208984, "grad_norm": 0.09598489105701447, "learning_rate": 2.56736516273749e-06, "loss": 0.0073, "step": 40450 }, { "epoch": 8.420335279811681, "grad_norm": 0.1796882301568985, "learning_rate": 2.56076130001093e-06, "loss": 0.0079, "step": 40460 }, { "epoch": 8.422416147533522, "grad_norm": 0.058884989470243454, "learning_rate": 2.5541653605883277e-06, "loss": 0.0083, "step": 40470 }, { "epoch": 8.424497015255362, "grad_norm": 0.09972178190946579, "learning_rate": 2.5475773474664543e-06, "loss": 0.0101, "step": 40480 }, { "epoch": 8.4265778829772, "grad_norm": 0.0719456896185875, "learning_rate": 2.540997263638505e-06, "loss": 0.0065, "step": 40490 }, { "epoch": 8.428658750699041, "grad_norm": 0.08667018264532089, "learning_rate": 2.53442511209405e-06, "loss": 0.0082, "step": 40500 }, { "epoch": 8.430739618420882, "grad_norm": 0.05312857776880264, "learning_rate": 2.52786089581907e-06, "loss": 0.0073, "step": 40510 }, { "epoch": 8.432820486142722, "grad_norm": 0.07896831631660461, "learning_rate": 2.5213046177959387e-06, "loss": 0.0064, "step": 40520 }, { "epoch": 8.43490135386456, "grad_norm": 0.08082757890224457, "learning_rate": 2.5147562810034167e-06, "loss": 0.0063, "step": 40530 }, { "epoch": 8.436982221586401, "grad_norm": 0.09281459450721741, "learning_rate": 2.5082158884166585e-06, "loss": 0.0076, "step": 40540 }, { "epoch": 8.439063089308242, "grad_norm": 0.0706305056810379, "learning_rate": 2.501683443007219e-06, "loss": 0.0073, "step": 40550 }, { "epoch": 8.441143957030082, "grad_norm": 0.15011131763458252, "learning_rate": 2.4951589477430238e-06, "loss": 0.0091, "step": 40560 }, { "epoch": 8.44322482475192, "grad_norm": 0.14027974009513855, "learning_rate": 2.4886424055884086e-06, "loss": 0.0079, "step": 40570 }, { "epoch": 8.445305692473761, "grad_norm": 0.21203620731830597, "learning_rate": 2.4821338195040755e-06, "loss": 0.0086, "step": 40580 }, { "epoch": 8.447386560195602, "grad_norm": 0.06453978270292282, "learning_rate": 2.4756331924471264e-06, "loss": 0.0089, "step": 40590 }, { "epoch": 8.449467427917442, "grad_norm": 0.0694454163312912, "learning_rate": 2.4691405273710347e-06, "loss": 0.0078, "step": 40600 }, { "epoch": 8.451548295639281, "grad_norm": 0.05762457475066185, "learning_rate": 2.462655827225673e-06, "loss": 0.0101, "step": 40610 }, { "epoch": 8.453629163361121, "grad_norm": 0.09763813763856888, "learning_rate": 2.456179094957278e-06, "loss": 0.0068, "step": 40620 }, { "epoch": 8.455710031082962, "grad_norm": 0.15035505592823029, "learning_rate": 2.4497103335084706e-06, "loss": 0.0066, "step": 40630 }, { "epoch": 8.457790898804802, "grad_norm": 0.06331322342157364, "learning_rate": 2.443249545818258e-06, "loss": 0.0078, "step": 40640 }, { "epoch": 8.459871766526641, "grad_norm": 0.09937255084514618, "learning_rate": 2.436796734822022e-06, "loss": 0.0078, "step": 40650 }, { "epoch": 8.461952634248481, "grad_norm": 0.07226204127073288, "learning_rate": 2.430351903451513e-06, "loss": 0.0071, "step": 40660 }, { "epoch": 8.464033501970322, "grad_norm": 0.06745560467243195, "learning_rate": 2.423915054634862e-06, "loss": 0.008, "step": 40670 }, { "epoch": 8.466114369692162, "grad_norm": 0.06684581190347672, "learning_rate": 2.4174861912965764e-06, "loss": 0.0067, "step": 40680 }, { "epoch": 8.468195237414001, "grad_norm": 0.10477949678897858, "learning_rate": 2.411065316357526e-06, "loss": 0.0073, "step": 40690 }, { "epoch": 8.470276105135842, "grad_norm": 0.1731996089220047, "learning_rate": 2.404652432734962e-06, "loss": 0.0113, "step": 40700 }, { "epoch": 8.472356972857682, "grad_norm": 0.11389993131160736, "learning_rate": 2.3982475433424914e-06, "loss": 0.0089, "step": 40710 }, { "epoch": 8.474437840579522, "grad_norm": 0.2804078757762909, "learning_rate": 2.391850651090104e-06, "loss": 0.0095, "step": 40720 }, { "epoch": 8.476518708301361, "grad_norm": 0.10071039199829102, "learning_rate": 2.385461758884149e-06, "loss": 0.0094, "step": 40730 }, { "epoch": 8.478599576023202, "grad_norm": 0.09840194135904312, "learning_rate": 2.379080869627335e-06, "loss": 0.0063, "step": 40740 }, { "epoch": 8.480680443745042, "grad_norm": 0.07386711984872818, "learning_rate": 2.3727079862187498e-06, "loss": 0.0061, "step": 40750 }, { "epoch": 8.482761311466883, "grad_norm": 0.06741689890623093, "learning_rate": 2.366343111553826e-06, "loss": 0.0068, "step": 40760 }, { "epoch": 8.484842179188721, "grad_norm": 0.10638216882944107, "learning_rate": 2.3599862485243684e-06, "loss": 0.0089, "step": 40770 }, { "epoch": 8.486923046910562, "grad_norm": 0.1352587789297104, "learning_rate": 2.353637400018545e-06, "loss": 0.0069, "step": 40780 }, { "epoch": 8.489003914632402, "grad_norm": 0.08132541924715042, "learning_rate": 2.3472965689208716e-06, "loss": 0.0064, "step": 40790 }, { "epoch": 8.491084782354243, "grad_norm": 0.05374515801668167, "learning_rate": 2.340963758112231e-06, "loss": 0.0073, "step": 40800 }, { "epoch": 8.493165650076081, "grad_norm": 0.08244877308607101, "learning_rate": 2.3346389704698515e-06, "loss": 0.0071, "step": 40810 }, { "epoch": 8.495246517797922, "grad_norm": 0.09605415165424347, "learning_rate": 2.3283222088673307e-06, "loss": 0.0077, "step": 40820 }, { "epoch": 8.497327385519762, "grad_norm": 0.05672159790992737, "learning_rate": 2.322013476174605e-06, "loss": 0.0063, "step": 40830 }, { "epoch": 8.4994082532416, "grad_norm": 0.07651606947183609, "learning_rate": 2.3157127752579766e-06, "loss": 0.0068, "step": 40840 }, { "epoch": 8.501489120963441, "grad_norm": 0.08675338327884674, "learning_rate": 2.3094201089800827e-06, "loss": 0.0076, "step": 40850 }, { "epoch": 8.503569988685282, "grad_norm": 0.1365157514810562, "learning_rate": 2.3031354801999273e-06, "loss": 0.01, "step": 40860 }, { "epoch": 8.505650856407122, "grad_norm": 0.16331860423088074, "learning_rate": 2.2968588917728465e-06, "loss": 0.0089, "step": 40870 }, { "epoch": 8.507731724128961, "grad_norm": 0.20390261709690094, "learning_rate": 2.290590346550541e-06, "loss": 0.0079, "step": 40880 }, { "epoch": 8.509812591850801, "grad_norm": 0.044017575681209564, "learning_rate": 2.284329847381037e-06, "loss": 0.0073, "step": 40890 }, { "epoch": 8.511893459572642, "grad_norm": 0.15155915915966034, "learning_rate": 2.27807739710872e-06, "loss": 0.0107, "step": 40900 }, { "epoch": 8.513974327294482, "grad_norm": 0.07231338322162628, "learning_rate": 2.2718329985743193e-06, "loss": 0.0089, "step": 40910 }, { "epoch": 8.516055195016321, "grad_norm": 0.09412063658237457, "learning_rate": 2.26559665461489e-06, "loss": 0.0099, "step": 40920 }, { "epoch": 8.518136062738161, "grad_norm": 0.06618320941925049, "learning_rate": 2.259368368063848e-06, "loss": 0.0082, "step": 40930 }, { "epoch": 8.520216930460002, "grad_norm": 0.10535834729671478, "learning_rate": 2.2531481417509317e-06, "loss": 0.0088, "step": 40940 }, { "epoch": 8.522297798181842, "grad_norm": 0.10601697862148285, "learning_rate": 2.246935978502227e-06, "loss": 0.0082, "step": 40950 }, { "epoch": 8.524378665903681, "grad_norm": 0.09583815187215805, "learning_rate": 2.240731881140159e-06, "loss": 0.0076, "step": 40960 }, { "epoch": 8.526459533625522, "grad_norm": 0.12508156895637512, "learning_rate": 2.234535852483475e-06, "loss": 0.0104, "step": 40970 }, { "epoch": 8.528540401347362, "grad_norm": 0.099318727850914, "learning_rate": 2.228347895347267e-06, "loss": 0.0071, "step": 40980 }, { "epoch": 8.530621269069202, "grad_norm": 0.08671266585588455, "learning_rate": 2.2221680125429647e-06, "loss": 0.011, "step": 40990 }, { "epoch": 8.532702136791041, "grad_norm": 0.19517448544502258, "learning_rate": 2.21599620687831e-06, "loss": 0.0072, "step": 41000 }, { "epoch": 8.534783004512882, "grad_norm": 0.1333044320344925, "learning_rate": 2.209832481157397e-06, "loss": 0.0061, "step": 41010 }, { "epoch": 8.536863872234722, "grad_norm": 0.06788235902786255, "learning_rate": 2.203676838180633e-06, "loss": 0.0071, "step": 41020 }, { "epoch": 8.538944739956563, "grad_norm": 0.18559671938419342, "learning_rate": 2.1975292807447567e-06, "loss": 0.0114, "step": 41030 }, { "epoch": 8.541025607678401, "grad_norm": 0.27615824341773987, "learning_rate": 2.191389811642846e-06, "loss": 0.0064, "step": 41040 }, { "epoch": 8.543106475400242, "grad_norm": 0.10903999209403992, "learning_rate": 2.1852584336642813e-06, "loss": 0.0064, "step": 41050 }, { "epoch": 8.545187343122082, "grad_norm": 0.053131330758333206, "learning_rate": 2.1791351495947886e-06, "loss": 0.0079, "step": 41060 }, { "epoch": 8.547268210843923, "grad_norm": 0.06186584383249283, "learning_rate": 2.173019962216403e-06, "loss": 0.0102, "step": 41070 }, { "epoch": 8.549349078565761, "grad_norm": 0.09164402633905411, "learning_rate": 2.1669128743074787e-06, "loss": 0.0081, "step": 41080 }, { "epoch": 8.551429946287602, "grad_norm": 0.12240957468748093, "learning_rate": 2.1608138886427054e-06, "loss": 0.0114, "step": 41090 }, { "epoch": 8.553510814009442, "grad_norm": 0.15577732026576996, "learning_rate": 2.154723007993076e-06, "loss": 0.0076, "step": 41100 }, { "epoch": 8.555591681731283, "grad_norm": 0.11075333505868912, "learning_rate": 2.1486402351259072e-06, "loss": 0.0074, "step": 41110 }, { "epoch": 8.557672549453121, "grad_norm": 0.12143658101558685, "learning_rate": 2.14256557280484e-06, "loss": 0.0087, "step": 41120 }, { "epoch": 8.559753417174962, "grad_norm": 0.1493082344532013, "learning_rate": 2.136499023789813e-06, "loss": 0.0079, "step": 41130 }, { "epoch": 8.561834284896802, "grad_norm": 0.07645668834447861, "learning_rate": 2.1304405908370973e-06, "loss": 0.0057, "step": 41140 }, { "epoch": 8.563915152618643, "grad_norm": 0.09106865525245667, "learning_rate": 2.1243902766992573e-06, "loss": 0.0098, "step": 41150 }, { "epoch": 8.565996020340481, "grad_norm": 0.11108054220676422, "learning_rate": 2.1183480841251856e-06, "loss": 0.0072, "step": 41160 }, { "epoch": 8.568076888062322, "grad_norm": 0.07673870027065277, "learning_rate": 2.1123140158600817e-06, "loss": 0.0067, "step": 41170 }, { "epoch": 8.570157755784162, "grad_norm": 0.14297185838222504, "learning_rate": 2.1062880746454393e-06, "loss": 0.0087, "step": 41180 }, { "epoch": 8.572238623506003, "grad_norm": 0.17582501471042633, "learning_rate": 2.100270263219084e-06, "loss": 0.0089, "step": 41190 }, { "epoch": 8.574319491227842, "grad_norm": 0.06590855121612549, "learning_rate": 2.094260584315122e-06, "loss": 0.0083, "step": 41200 }, { "epoch": 8.576400358949682, "grad_norm": 0.1475789099931717, "learning_rate": 2.0882590406639824e-06, "loss": 0.0061, "step": 41210 }, { "epoch": 8.578481226671522, "grad_norm": 0.09401775151491165, "learning_rate": 2.0822656349923974e-06, "loss": 0.0088, "step": 41220 }, { "epoch": 8.580562094393361, "grad_norm": 0.05265146866440773, "learning_rate": 2.0762803700233913e-06, "loss": 0.0057, "step": 41230 }, { "epoch": 8.582642962115202, "grad_norm": 0.1627068966627121, "learning_rate": 2.0703032484762953e-06, "loss": 0.0063, "step": 41240 }, { "epoch": 8.584723829837042, "grad_norm": 0.060133613646030426, "learning_rate": 2.0643342730667483e-06, "loss": 0.0081, "step": 41250 }, { "epoch": 8.586804697558883, "grad_norm": 0.10560832917690277, "learning_rate": 2.058373446506672e-06, "loss": 0.0072, "step": 41260 }, { "epoch": 8.588885565280723, "grad_norm": 0.11641678214073181, "learning_rate": 2.052420771504302e-06, "loss": 0.0056, "step": 41270 }, { "epoch": 8.590966433002562, "grad_norm": 0.06506258249282837, "learning_rate": 2.046476250764158e-06, "loss": 0.0074, "step": 41280 }, { "epoch": 8.593047300724402, "grad_norm": 0.15498830378055573, "learning_rate": 2.040539886987063e-06, "loss": 0.0092, "step": 41290 }, { "epoch": 8.595128168446243, "grad_norm": 0.12259676307439804, "learning_rate": 2.034611682870129e-06, "loss": 0.0117, "step": 41300 }, { "epoch": 8.597209036168081, "grad_norm": 0.09440529346466064, "learning_rate": 2.0286916411067658e-06, "loss": 0.0058, "step": 41310 }, { "epoch": 8.599289903889922, "grad_norm": 0.09719114005565643, "learning_rate": 2.0227797643866665e-06, "loss": 0.0073, "step": 41320 }, { "epoch": 8.601370771611762, "grad_norm": 0.08566975593566895, "learning_rate": 2.0168760553958267e-06, "loss": 0.0061, "step": 41330 }, { "epoch": 8.603451639333603, "grad_norm": 0.18056173622608185, "learning_rate": 2.0109805168165164e-06, "loss": 0.0097, "step": 41340 }, { "epoch": 8.605532507055441, "grad_norm": 0.1338055282831192, "learning_rate": 2.0050931513273087e-06, "loss": 0.008, "step": 41350 }, { "epoch": 8.607613374777282, "grad_norm": 0.123575858771801, "learning_rate": 1.999213961603046e-06, "loss": 0.0103, "step": 41360 }, { "epoch": 8.609694242499122, "grad_norm": 0.07464218139648438, "learning_rate": 1.9933429503148737e-06, "loss": 0.0064, "step": 41370 }, { "epoch": 8.611775110220963, "grad_norm": 0.08831335604190826, "learning_rate": 1.9874801201302164e-06, "loss": 0.0071, "step": 41380 }, { "epoch": 8.613855977942801, "grad_norm": 0.19154630601406097, "learning_rate": 1.9816254737127716e-06, "loss": 0.0067, "step": 41390 }, { "epoch": 8.615936845664642, "grad_norm": 0.12459193170070648, "learning_rate": 1.975779013722532e-06, "loss": 0.0065, "step": 41400 }, { "epoch": 8.618017713386482, "grad_norm": 0.07260539382696152, "learning_rate": 1.969940742815757e-06, "loss": 0.0071, "step": 41410 }, { "epoch": 8.620098581108323, "grad_norm": 0.097539022564888, "learning_rate": 1.964110663644998e-06, "loss": 0.0096, "step": 41420 }, { "epoch": 8.622179448830162, "grad_norm": 0.18206317722797394, "learning_rate": 1.9582887788590855e-06, "loss": 0.0072, "step": 41430 }, { "epoch": 8.624260316552002, "grad_norm": 0.10486558079719543, "learning_rate": 1.9524750911031096e-06, "loss": 0.0085, "step": 41440 }, { "epoch": 8.626341184273842, "grad_norm": 0.0826316699385643, "learning_rate": 1.9466696030184605e-06, "loss": 0.0087, "step": 41450 }, { "epoch": 8.628422051995683, "grad_norm": 0.08642084896564484, "learning_rate": 1.940872317242779e-06, "loss": 0.0057, "step": 41460 }, { "epoch": 8.630502919717522, "grad_norm": 0.0781787857413292, "learning_rate": 1.935083236409998e-06, "loss": 0.006, "step": 41470 }, { "epoch": 8.632583787439362, "grad_norm": 0.10378873348236084, "learning_rate": 1.9293023631503137e-06, "loss": 0.0089, "step": 41480 }, { "epoch": 8.634664655161203, "grad_norm": 0.06862279027700424, "learning_rate": 1.923529700090192e-06, "loss": 0.0071, "step": 41490 }, { "epoch": 8.636745522883043, "grad_norm": 0.0881941169500351, "learning_rate": 1.9177652498523745e-06, "loss": 0.0064, "step": 41500 }, { "epoch": 8.638826390604882, "grad_norm": 0.22228160500526428, "learning_rate": 1.9120090150558714e-06, "loss": 0.0082, "step": 41510 }, { "epoch": 8.640907258326722, "grad_norm": 0.09641992300748825, "learning_rate": 1.9062609983159497e-06, "loss": 0.0057, "step": 41520 }, { "epoch": 8.642988126048563, "grad_norm": 0.0736495703458786, "learning_rate": 1.9005212022441565e-06, "loss": 0.005, "step": 41530 }, { "epoch": 8.645068993770403, "grad_norm": 0.10853618383407593, "learning_rate": 1.894789629448297e-06, "loss": 0.0075, "step": 41540 }, { "epoch": 8.647149861492242, "grad_norm": 0.12420989573001862, "learning_rate": 1.8890662825324347e-06, "loss": 0.0094, "step": 41550 }, { "epoch": 8.649230729214082, "grad_norm": 0.061760902404785156, "learning_rate": 1.8833511640969115e-06, "loss": 0.0075, "step": 41560 }, { "epoch": 8.651311596935923, "grad_norm": 0.06877193599939346, "learning_rate": 1.8776442767383107e-06, "loss": 0.0071, "step": 41570 }, { "epoch": 8.653392464657763, "grad_norm": 0.1790354698896408, "learning_rate": 1.871945623049496e-06, "loss": 0.0063, "step": 41580 }, { "epoch": 8.655473332379602, "grad_norm": 0.07675588130950928, "learning_rate": 1.8662552056195737e-06, "loss": 0.0073, "step": 41590 }, { "epoch": 8.657554200101442, "grad_norm": 0.11598487943410873, "learning_rate": 1.8605730270339185e-06, "loss": 0.0094, "step": 41600 }, { "epoch": 8.659635067823283, "grad_norm": 0.10247883945703506, "learning_rate": 1.8548990898741604e-06, "loss": 0.0065, "step": 41610 }, { "epoch": 8.661715935545121, "grad_norm": 0.07387291640043259, "learning_rate": 1.8492333967181775e-06, "loss": 0.009, "step": 41620 }, { "epoch": 8.663796803266962, "grad_norm": 0.0641070082783699, "learning_rate": 1.8435759501401152e-06, "loss": 0.0062, "step": 41630 }, { "epoch": 8.665877670988802, "grad_norm": 0.1566159576177597, "learning_rate": 1.8379267527103573e-06, "loss": 0.0084, "step": 41640 }, { "epoch": 8.667958538710643, "grad_norm": 0.1007685735821724, "learning_rate": 1.8322858069955506e-06, "loss": 0.0114, "step": 41650 }, { "epoch": 8.670039406432483, "grad_norm": 0.09421567618846893, "learning_rate": 1.8266531155585942e-06, "loss": 0.007, "step": 41660 }, { "epoch": 8.672120274154322, "grad_norm": 0.08023203909397125, "learning_rate": 1.821028680958623e-06, "loss": 0.0083, "step": 41670 }, { "epoch": 8.674201141876162, "grad_norm": 0.07845676690340042, "learning_rate": 1.8154125057510353e-06, "loss": 0.0074, "step": 41680 }, { "epoch": 8.676282009598003, "grad_norm": 0.06003197282552719, "learning_rate": 1.8098045924874742e-06, "loss": 0.0104, "step": 41690 }, { "epoch": 8.678362877319842, "grad_norm": 0.1390623301267624, "learning_rate": 1.8042049437158193e-06, "loss": 0.0076, "step": 41700 }, { "epoch": 8.680443745041682, "grad_norm": 0.06988146901130676, "learning_rate": 1.7986135619802092e-06, "loss": 0.0082, "step": 41710 }, { "epoch": 8.682524612763522, "grad_norm": 0.09577075392007828, "learning_rate": 1.7930304498210115e-06, "loss": 0.0075, "step": 41720 }, { "epoch": 8.684605480485363, "grad_norm": 0.059443872421979904, "learning_rate": 1.7874556097748508e-06, "loss": 0.0086, "step": 41730 }, { "epoch": 8.686686348207202, "grad_norm": 0.09795813262462616, "learning_rate": 1.7818890443745873e-06, "loss": 0.0072, "step": 41740 }, { "epoch": 8.688767215929042, "grad_norm": 0.05750931426882744, "learning_rate": 1.7763307561493182e-06, "loss": 0.0084, "step": 41750 }, { "epoch": 8.690848083650883, "grad_norm": 0.09297998249530792, "learning_rate": 1.7707807476243876e-06, "loss": 0.0075, "step": 41760 }, { "epoch": 8.692928951372723, "grad_norm": 0.11899355053901672, "learning_rate": 1.7652390213213677e-06, "loss": 0.0086, "step": 41770 }, { "epoch": 8.695009819094562, "grad_norm": 0.07591723650693893, "learning_rate": 1.759705579758082e-06, "loss": 0.0064, "step": 41780 }, { "epoch": 8.697090686816402, "grad_norm": 0.08790618926286697, "learning_rate": 1.7541804254485729e-06, "loss": 0.0054, "step": 41790 }, { "epoch": 8.699171554538243, "grad_norm": 0.09477187693119049, "learning_rate": 1.7486635609031343e-06, "loss": 0.006, "step": 41800 }, { "epoch": 8.701252422260083, "grad_norm": 0.0925530418753624, "learning_rate": 1.7431549886282796e-06, "loss": 0.0047, "step": 41810 }, { "epoch": 8.703333289981922, "grad_norm": 0.07875482738018036, "learning_rate": 1.7376547111267661e-06, "loss": 0.0074, "step": 41820 }, { "epoch": 8.705414157703762, "grad_norm": 0.10560823976993561, "learning_rate": 1.7321627308975708e-06, "loss": 0.0085, "step": 41830 }, { "epoch": 8.707495025425603, "grad_norm": 0.11930020153522491, "learning_rate": 1.7266790504359155e-06, "loss": 0.0073, "step": 41840 }, { "epoch": 8.709575893147443, "grad_norm": 0.056278228759765625, "learning_rate": 1.721203672233236e-06, "loss": 0.006, "step": 41850 }, { "epoch": 8.711656760869282, "grad_norm": 0.16447854042053223, "learning_rate": 1.7157365987772045e-06, "loss": 0.0093, "step": 41860 }, { "epoch": 8.713737628591122, "grad_norm": 0.11358098685741425, "learning_rate": 1.7102778325517255e-06, "loss": 0.0079, "step": 41870 }, { "epoch": 8.715818496312963, "grad_norm": 0.07564477622509003, "learning_rate": 1.704827376036915e-06, "loss": 0.0072, "step": 41880 }, { "epoch": 8.717899364034803, "grad_norm": 0.1048271507024765, "learning_rate": 1.6993852317091253e-06, "loss": 0.0092, "step": 41890 }, { "epoch": 8.719980231756642, "grad_norm": 0.2238674908876419, "learning_rate": 1.693951402040921e-06, "loss": 0.0101, "step": 41900 }, { "epoch": 8.722061099478482, "grad_norm": 0.0643467828631401, "learning_rate": 1.6885258895011026e-06, "loss": 0.0086, "step": 41910 }, { "epoch": 8.724141967200323, "grad_norm": 0.11013390868902206, "learning_rate": 1.6831086965546872e-06, "loss": 0.0081, "step": 41920 }, { "epoch": 8.726222834922163, "grad_norm": 0.07110068947076797, "learning_rate": 1.6776998256628996e-06, "loss": 0.0044, "step": 41930 }, { "epoch": 8.728303702644002, "grad_norm": 0.11043071746826172, "learning_rate": 1.6722992792832026e-06, "loss": 0.0061, "step": 41940 }, { "epoch": 8.730384570365842, "grad_norm": 0.17162451148033142, "learning_rate": 1.6669070598692694e-06, "loss": 0.0102, "step": 41950 }, { "epoch": 8.732465438087683, "grad_norm": 0.16676510870456696, "learning_rate": 1.66152316987098e-06, "loss": 0.0094, "step": 41960 }, { "epoch": 8.734546305809523, "grad_norm": 0.14399085938930511, "learning_rate": 1.6561476117344489e-06, "loss": 0.0085, "step": 41970 }, { "epoch": 8.736627173531362, "grad_norm": 0.23785454034805298, "learning_rate": 1.6507803879019847e-06, "loss": 0.0068, "step": 41980 }, { "epoch": 8.738708041253203, "grad_norm": 0.16950832307338715, "learning_rate": 1.6454215008121255e-06, "loss": 0.0084, "step": 41990 }, { "epoch": 8.740788908975043, "grad_norm": 0.11092496663331985, "learning_rate": 1.6400709528996172e-06, "loss": 0.0076, "step": 42000 }, { "epoch": 8.742869776696882, "grad_norm": 0.09148725122213364, "learning_rate": 1.6347287465954153e-06, "loss": 0.0093, "step": 42010 }, { "epoch": 8.744950644418722, "grad_norm": 0.09107708185911179, "learning_rate": 1.6293948843266783e-06, "loss": 0.007, "step": 42020 }, { "epoch": 8.747031512140563, "grad_norm": 0.08973706513643265, "learning_rate": 1.6240693685167918e-06, "loss": 0.008, "step": 42030 }, { "epoch": 8.749112379862403, "grad_norm": 0.11577080190181732, "learning_rate": 1.6187522015853296e-06, "loss": 0.0111, "step": 42040 }, { "epoch": 8.751193247584244, "grad_norm": 0.18958882987499237, "learning_rate": 1.613443385948086e-06, "loss": 0.0086, "step": 42050 }, { "epoch": 8.753274115306082, "grad_norm": 0.09488187730312347, "learning_rate": 1.6081429240170533e-06, "loss": 0.0059, "step": 42060 }, { "epoch": 8.755354983027923, "grad_norm": 0.13379855453968048, "learning_rate": 1.6028508182004322e-06, "loss": 0.0101, "step": 42070 }, { "epoch": 8.757435850749763, "grad_norm": 0.06651078909635544, "learning_rate": 1.5975670709026302e-06, "loss": 0.0052, "step": 42080 }, { "epoch": 8.759516718471602, "grad_norm": 0.09290680289268494, "learning_rate": 1.5922916845242453e-06, "loss": 0.0093, "step": 42090 }, { "epoch": 8.761597586193442, "grad_norm": 0.09460045397281647, "learning_rate": 1.5870246614620933e-06, "loss": 0.0081, "step": 42100 }, { "epoch": 8.763678453915283, "grad_norm": 0.08468155562877655, "learning_rate": 1.5817660041091732e-06, "loss": 0.0097, "step": 42110 }, { "epoch": 8.765759321637123, "grad_norm": 0.08865047246217728, "learning_rate": 1.5765157148546962e-06, "loss": 0.0069, "step": 42120 }, { "epoch": 8.767840189358962, "grad_norm": 0.09081028401851654, "learning_rate": 1.5712737960840673e-06, "loss": 0.0091, "step": 42130 }, { "epoch": 8.769921057080802, "grad_norm": 0.08232511579990387, "learning_rate": 1.566040250178884e-06, "loss": 0.0109, "step": 42140 }, { "epoch": 8.772001924802643, "grad_norm": 0.17235016822814941, "learning_rate": 1.5608150795169486e-06, "loss": 0.0081, "step": 42150 }, { "epoch": 8.774082792524483, "grad_norm": 0.12372034043073654, "learning_rate": 1.5555982864722464e-06, "loss": 0.0081, "step": 42160 }, { "epoch": 8.776163660246322, "grad_norm": 0.10314176976680756, "learning_rate": 1.550389873414968e-06, "loss": 0.0052, "step": 42170 }, { "epoch": 8.778244527968162, "grad_norm": 0.05731961876153946, "learning_rate": 1.5451898427114942e-06, "loss": 0.0072, "step": 42180 }, { "epoch": 8.780325395690003, "grad_norm": 0.08026111871004105, "learning_rate": 1.5399981967243882e-06, "loss": 0.008, "step": 42190 }, { "epoch": 8.782406263411843, "grad_norm": 0.06104070693254471, "learning_rate": 1.5348149378124166e-06, "loss": 0.0079, "step": 42200 }, { "epoch": 8.784487131133682, "grad_norm": 0.06121620908379555, "learning_rate": 1.529640068330529e-06, "loss": 0.0055, "step": 42210 }, { "epoch": 8.786567998855523, "grad_norm": 0.17214667797088623, "learning_rate": 1.5244735906298601e-06, "loss": 0.0072, "step": 42220 }, { "epoch": 8.788648866577363, "grad_norm": 0.13823054730892181, "learning_rate": 1.5193155070577393e-06, "loss": 0.0072, "step": 42230 }, { "epoch": 8.790729734299203, "grad_norm": 0.07654181867837906, "learning_rate": 1.5141658199576758e-06, "loss": 0.0049, "step": 42240 }, { "epoch": 8.792810602021042, "grad_norm": 0.1548680067062378, "learning_rate": 1.5090245316693697e-06, "loss": 0.0079, "step": 42250 }, { "epoch": 8.794891469742883, "grad_norm": 0.062001775950193405, "learning_rate": 1.5038916445287011e-06, "loss": 0.006, "step": 42260 }, { "epoch": 8.796972337464723, "grad_norm": 0.07857709378004074, "learning_rate": 1.498767160867729e-06, "loss": 0.0077, "step": 42270 }, { "epoch": 8.799053205186564, "grad_norm": 0.2556517720222473, "learning_rate": 1.4936510830147088e-06, "loss": 0.0074, "step": 42280 }, { "epoch": 8.801134072908402, "grad_norm": 0.13853494822978973, "learning_rate": 1.488543413294059e-06, "loss": 0.0088, "step": 42290 }, { "epoch": 8.803214940630243, "grad_norm": 0.08110374957323074, "learning_rate": 1.4834441540263899e-06, "loss": 0.006, "step": 42300 }, { "epoch": 8.805295808352083, "grad_norm": 0.11035579442977905, "learning_rate": 1.4783533075284906e-06, "loss": 0.0085, "step": 42310 }, { "epoch": 8.807376676073924, "grad_norm": 0.10306422412395477, "learning_rate": 1.4732708761133196e-06, "loss": 0.0074, "step": 42320 }, { "epoch": 8.809457543795762, "grad_norm": 0.09861325472593307, "learning_rate": 1.468196862090021e-06, "loss": 0.0083, "step": 42330 }, { "epoch": 8.811538411517603, "grad_norm": 0.08744680881500244, "learning_rate": 1.463131267763911e-06, "loss": 0.0066, "step": 42340 }, { "epoch": 8.813619279239443, "grad_norm": 0.05732414871454239, "learning_rate": 1.4580740954364748e-06, "loss": 0.0065, "step": 42350 }, { "epoch": 8.815700146961284, "grad_norm": 0.16665713489055634, "learning_rate": 1.4530253474053858e-06, "loss": 0.0067, "step": 42360 }, { "epoch": 8.817781014683122, "grad_norm": 0.14788818359375, "learning_rate": 1.4479850259644712e-06, "loss": 0.0076, "step": 42370 }, { "epoch": 8.819861882404963, "grad_norm": 0.09716673940420151, "learning_rate": 1.4429531334037416e-06, "loss": 0.0077, "step": 42380 }, { "epoch": 8.821942750126803, "grad_norm": 0.07579904049634933, "learning_rate": 1.4379296720093816e-06, "loss": 0.009, "step": 42390 }, { "epoch": 8.824023617848642, "grad_norm": 0.05240193009376526, "learning_rate": 1.432914644063732e-06, "loss": 0.0074, "step": 42400 }, { "epoch": 8.826104485570482, "grad_norm": 0.09558863192796707, "learning_rate": 1.4279080518453147e-06, "loss": 0.0088, "step": 42410 }, { "epoch": 8.828185353292323, "grad_norm": 0.04820014536380768, "learning_rate": 1.4229098976288059e-06, "loss": 0.0043, "step": 42420 }, { "epoch": 8.830266221014163, "grad_norm": 0.09843944013118744, "learning_rate": 1.417920183685062e-06, "loss": 0.0066, "step": 42430 }, { "epoch": 8.832347088736004, "grad_norm": 0.1184830591082573, "learning_rate": 1.412938912281101e-06, "loss": 0.0081, "step": 42440 }, { "epoch": 8.834427956457843, "grad_norm": 0.05329704284667969, "learning_rate": 1.4079660856800925e-06, "loss": 0.0063, "step": 42450 }, { "epoch": 8.836508824179683, "grad_norm": 0.05930676311254501, "learning_rate": 1.4030017061413893e-06, "loss": 0.0071, "step": 42460 }, { "epoch": 8.838589691901523, "grad_norm": 0.06427107751369476, "learning_rate": 1.398045775920487e-06, "loss": 0.0061, "step": 42470 }, { "epoch": 8.840670559623362, "grad_norm": 0.05938183143734932, "learning_rate": 1.39309829726906e-06, "loss": 0.006, "step": 42480 }, { "epoch": 8.842751427345203, "grad_norm": 0.08208992332220078, "learning_rate": 1.388159272434928e-06, "loss": 0.0077, "step": 42490 }, { "epoch": 8.844832295067043, "grad_norm": 0.10463625937700272, "learning_rate": 1.3832287036620805e-06, "loss": 0.0089, "step": 42500 }, { "epoch": 8.846913162788884, "grad_norm": 0.09296385943889618, "learning_rate": 1.3783065931906592e-06, "loss": 0.0061, "step": 42510 }, { "epoch": 8.848994030510722, "grad_norm": 0.09360918402671814, "learning_rate": 1.3733929432569658e-06, "loss": 0.0081, "step": 42520 }, { "epoch": 8.851074898232563, "grad_norm": 0.061294399201869965, "learning_rate": 1.3684877560934529e-06, "loss": 0.0099, "step": 42530 }, { "epoch": 8.853155765954403, "grad_norm": 0.0858725979924202, "learning_rate": 1.3635910339287373e-06, "loss": 0.0066, "step": 42540 }, { "epoch": 8.855236633676244, "grad_norm": 0.1267247498035431, "learning_rate": 1.3587027789875796e-06, "loss": 0.0088, "step": 42550 }, { "epoch": 8.857317501398082, "grad_norm": 0.1352294385433197, "learning_rate": 1.3538229934909032e-06, "loss": 0.0103, "step": 42560 }, { "epoch": 8.859398369119923, "grad_norm": 0.08748891949653625, "learning_rate": 1.3489516796557766e-06, "loss": 0.0052, "step": 42570 }, { "epoch": 8.861479236841763, "grad_norm": 0.10883894562721252, "learning_rate": 1.3440888396954187e-06, "loss": 0.009, "step": 42580 }, { "epoch": 8.863560104563604, "grad_norm": 0.10880115628242493, "learning_rate": 1.3392344758192088e-06, "loss": 0.0058, "step": 42590 }, { "epoch": 8.865640972285442, "grad_norm": 0.12859366834163666, "learning_rate": 1.3343885902326581e-06, "loss": 0.0059, "step": 42600 }, { "epoch": 8.867721840007283, "grad_norm": 0.17608888447284698, "learning_rate": 1.3295511851374387e-06, "loss": 0.0077, "step": 42610 }, { "epoch": 8.869802707729123, "grad_norm": 0.07577411085367203, "learning_rate": 1.3247222627313728e-06, "loss": 0.0044, "step": 42620 }, { "epoch": 8.871883575450964, "grad_norm": 0.10501013696193695, "learning_rate": 1.3199018252084095e-06, "loss": 0.0072, "step": 42630 }, { "epoch": 8.873964443172802, "grad_norm": 0.0990971252322197, "learning_rate": 1.315089874758666e-06, "loss": 0.0082, "step": 42640 }, { "epoch": 8.876045310894643, "grad_norm": 0.17287346720695496, "learning_rate": 1.310286413568389e-06, "loss": 0.0097, "step": 42650 }, { "epoch": 8.878126178616483, "grad_norm": 0.0708920806646347, "learning_rate": 1.3054914438199707e-06, "loss": 0.0055, "step": 42660 }, { "epoch": 8.880207046338324, "grad_norm": 0.08970358967781067, "learning_rate": 1.3007049676919503e-06, "loss": 0.0066, "step": 42670 }, { "epoch": 8.882287914060162, "grad_norm": 0.06545579433441162, "learning_rate": 1.2959269873589998e-06, "loss": 0.006, "step": 42680 }, { "epoch": 8.884368781782003, "grad_norm": 0.06792227923870087, "learning_rate": 1.29115750499194e-06, "loss": 0.0074, "step": 42690 }, { "epoch": 8.886449649503843, "grad_norm": 0.27577275037765503, "learning_rate": 1.2863965227577268e-06, "loss": 0.0073, "step": 42700 }, { "epoch": 8.888530517225684, "grad_norm": 0.12700004875659943, "learning_rate": 1.2816440428194477e-06, "loss": 0.0055, "step": 42710 }, { "epoch": 8.890611384947523, "grad_norm": 0.0867462009191513, "learning_rate": 1.276900067336344e-06, "loss": 0.0087, "step": 42720 }, { "epoch": 8.892692252669363, "grad_norm": 0.13346397876739502, "learning_rate": 1.2721645984637742e-06, "loss": 0.0098, "step": 42730 }, { "epoch": 8.894773120391203, "grad_norm": 0.1331183910369873, "learning_rate": 1.2674376383532416e-06, "loss": 0.0082, "step": 42740 }, { "epoch": 8.896853988113044, "grad_norm": 0.10034052282571793, "learning_rate": 1.2627191891523883e-06, "loss": 0.0073, "step": 42750 }, { "epoch": 8.898934855834883, "grad_norm": 0.15299859642982483, "learning_rate": 1.2580092530049747e-06, "loss": 0.0072, "step": 42760 }, { "epoch": 8.901015723556723, "grad_norm": 0.06103363633155823, "learning_rate": 1.2533078320509073e-06, "loss": 0.0078, "step": 42770 }, { "epoch": 8.903096591278564, "grad_norm": 0.06086548790335655, "learning_rate": 1.2486149284262216e-06, "loss": 0.0058, "step": 42780 }, { "epoch": 8.905177459000402, "grad_norm": 0.11461715400218964, "learning_rate": 1.2439305442630767e-06, "loss": 0.0092, "step": 42790 }, { "epoch": 8.907258326722243, "grad_norm": 0.06653105467557907, "learning_rate": 1.2392546816897677e-06, "loss": 0.0072, "step": 42800 }, { "epoch": 8.909339194444083, "grad_norm": 0.09459495544433594, "learning_rate": 1.2345873428307132e-06, "loss": 0.0074, "step": 42810 }, { "epoch": 8.911420062165924, "grad_norm": 0.07659419625997543, "learning_rate": 1.229928529806459e-06, "loss": 0.0086, "step": 42820 }, { "epoch": 8.913500929887764, "grad_norm": 0.07753589004278183, "learning_rate": 1.2252782447336875e-06, "loss": 0.0078, "step": 42830 }, { "epoch": 8.915581797609603, "grad_norm": 0.06948669999837875, "learning_rate": 1.220636489725191e-06, "loss": 0.0104, "step": 42840 }, { "epoch": 8.917662665331443, "grad_norm": 0.052657801657915115, "learning_rate": 1.216003266889898e-06, "loss": 0.0063, "step": 42850 }, { "epoch": 8.919743533053284, "grad_norm": 0.09012342244386673, "learning_rate": 1.2113785783328536e-06, "loss": 0.0074, "step": 42860 }, { "epoch": 8.921824400775122, "grad_norm": 0.06625135987997055, "learning_rate": 1.2067624261552301e-06, "loss": 0.0068, "step": 42870 }, { "epoch": 8.923905268496963, "grad_norm": 0.10835239291191101, "learning_rate": 1.2021548124543215e-06, "loss": 0.0065, "step": 42880 }, { "epoch": 8.925986136218803, "grad_norm": 0.08235184848308563, "learning_rate": 1.1975557393235349e-06, "loss": 0.0073, "step": 42890 }, { "epoch": 8.928067003940644, "grad_norm": 0.18035569787025452, "learning_rate": 1.1929652088524057e-06, "loss": 0.0068, "step": 42900 }, { "epoch": 8.930147871662482, "grad_norm": 0.05821339040994644, "learning_rate": 1.188383223126588e-06, "loss": 0.006, "step": 42910 }, { "epoch": 8.932228739384323, "grad_norm": 0.13284708559513092, "learning_rate": 1.1838097842278473e-06, "loss": 0.0075, "step": 42920 }, { "epoch": 8.934309607106163, "grad_norm": 0.06807632744312286, "learning_rate": 1.1792448942340707e-06, "loss": 0.0052, "step": 42930 }, { "epoch": 8.936390474828004, "grad_norm": 0.07177594304084778, "learning_rate": 1.1746885552192588e-06, "loss": 0.0049, "step": 42940 }, { "epoch": 8.938471342549843, "grad_norm": 0.11085748672485352, "learning_rate": 1.1701407692535295e-06, "loss": 0.009, "step": 42950 }, { "epoch": 8.940552210271683, "grad_norm": 0.19911466538906097, "learning_rate": 1.1656015384031116e-06, "loss": 0.0104, "step": 42960 }, { "epoch": 8.942633077993523, "grad_norm": 0.09142617136240005, "learning_rate": 1.1610708647303514e-06, "loss": 0.0085, "step": 42970 }, { "epoch": 8.944713945715364, "grad_norm": 0.1346694529056549, "learning_rate": 1.1565487502937022e-06, "loss": 0.0073, "step": 42980 }, { "epoch": 8.946794813437203, "grad_norm": 0.10059171169996262, "learning_rate": 1.1520351971477361e-06, "loss": 0.0077, "step": 42990 }, { "epoch": 8.948875681159043, "grad_norm": 0.09567129611968994, "learning_rate": 1.1475302073431237e-06, "loss": 0.007, "step": 43000 }, { "epoch": 8.950956548880884, "grad_norm": 0.08797800540924072, "learning_rate": 1.1430337829266568e-06, "loss": 0.0097, "step": 43010 }, { "epoch": 8.953037416602724, "grad_norm": 0.18621474504470825, "learning_rate": 1.138545925941228e-06, "loss": 0.0104, "step": 43020 }, { "epoch": 8.955118284324563, "grad_norm": 0.08327832818031311, "learning_rate": 1.1340666384258392e-06, "loss": 0.0062, "step": 43030 }, { "epoch": 8.957199152046403, "grad_norm": 0.14716635644435883, "learning_rate": 1.129595922415605e-06, "loss": 0.0106, "step": 43040 }, { "epoch": 8.959280019768244, "grad_norm": 0.1749573051929474, "learning_rate": 1.1251337799417338e-06, "loss": 0.0119, "step": 43050 }, { "epoch": 8.961360887490084, "grad_norm": 0.06453065574169159, "learning_rate": 1.1206802130315507e-06, "loss": 0.0079, "step": 43060 }, { "epoch": 8.963441755211923, "grad_norm": 0.09353671222925186, "learning_rate": 1.116235223708475e-06, "loss": 0.0067, "step": 43070 }, { "epoch": 8.965522622933763, "grad_norm": 0.10127923637628555, "learning_rate": 1.1117988139920333e-06, "loss": 0.0082, "step": 43080 }, { "epoch": 8.967603490655604, "grad_norm": 0.0680418461561203, "learning_rate": 1.1073709858978575e-06, "loss": 0.0086, "step": 43090 }, { "epoch": 8.969684358377444, "grad_norm": 0.265470415353775, "learning_rate": 1.1029517414376744e-06, "loss": 0.0087, "step": 43100 }, { "epoch": 8.971765226099283, "grad_norm": 0.14091138541698456, "learning_rate": 1.0985410826193154e-06, "loss": 0.0081, "step": 43110 }, { "epoch": 8.973846093821123, "grad_norm": 0.16711966693401337, "learning_rate": 1.0941390114467044e-06, "loss": 0.0065, "step": 43120 }, { "epoch": 8.975926961542964, "grad_norm": 0.07491280883550644, "learning_rate": 1.08974552991987e-06, "loss": 0.0076, "step": 43130 }, { "epoch": 8.978007829264804, "grad_norm": 0.09953214228153229, "learning_rate": 1.0853606400349426e-06, "loss": 0.0067, "step": 43140 }, { "epoch": 9.001528340753538, "grad_norm": 0.08585535734891891, "learning_rate": 1.080984343784135e-06, "loss": 0.0072, "step": 43150 }, { "epoch": 9.003654727888897, "grad_norm": 0.03871747478842735, "learning_rate": 1.0766166431557678e-06, "loss": 0.0068, "step": 43160 }, { "epoch": 9.005781115024254, "grad_norm": 0.17430676519870758, "learning_rate": 1.0722575401342539e-06, "loss": 0.0108, "step": 43170 }, { "epoch": 9.007907502159611, "grad_norm": 0.07026193290948868, "learning_rate": 1.0679070367000931e-06, "loss": 0.0077, "step": 43180 }, { "epoch": 9.01003388929497, "grad_norm": 0.08568747341632843, "learning_rate": 1.0635651348298892e-06, "loss": 0.0049, "step": 43190 }, { "epoch": 9.012160276430327, "grad_norm": 0.07394997030496597, "learning_rate": 1.0592318364963283e-06, "loss": 0.0061, "step": 43200 }, { "epoch": 9.014286663565686, "grad_norm": 0.10956282168626785, "learning_rate": 1.054907143668189e-06, "loss": 0.0051, "step": 43210 }, { "epoch": 9.016413050701043, "grad_norm": 0.07789157330989838, "learning_rate": 1.0505910583103485e-06, "loss": 0.0058, "step": 43220 }, { "epoch": 9.0185394378364, "grad_norm": 0.07950172573328018, "learning_rate": 1.0462835823837603e-06, "loss": 0.0068, "step": 43230 }, { "epoch": 9.02066582497176, "grad_norm": 0.07860558480024338, "learning_rate": 1.041984717845479e-06, "loss": 0.0074, "step": 43240 }, { "epoch": 9.022792212107117, "grad_norm": 0.10873427987098694, "learning_rate": 1.0376944666486378e-06, "loss": 0.0064, "step": 43250 }, { "epoch": 9.024918599242474, "grad_norm": 0.1490882933139801, "learning_rate": 1.0334128307424573e-06, "loss": 0.0085, "step": 43260 }, { "epoch": 9.027044986377833, "grad_norm": 0.1354614943265915, "learning_rate": 1.0291398120722551e-06, "loss": 0.008, "step": 43270 }, { "epoch": 9.02917137351319, "grad_norm": 0.1137605682015419, "learning_rate": 1.0248754125794157e-06, "loss": 0.0079, "step": 43280 }, { "epoch": 9.031297760648549, "grad_norm": 0.12577199935913086, "learning_rate": 1.0206196342014186e-06, "loss": 0.007, "step": 43290 }, { "epoch": 9.033424147783906, "grad_norm": 0.08145640790462494, "learning_rate": 1.0163724788718299e-06, "loss": 0.0061, "step": 43300 }, { "epoch": 9.035550534919263, "grad_norm": 0.0568091943860054, "learning_rate": 1.0121339485202842e-06, "loss": 0.0074, "step": 43310 }, { "epoch": 9.037676922054622, "grad_norm": 0.10337910801172256, "learning_rate": 1.0079040450725141e-06, "loss": 0.0075, "step": 43320 }, { "epoch": 9.03980330918998, "grad_norm": 0.1385839432477951, "learning_rate": 1.003682770450316e-06, "loss": 0.0063, "step": 43330 }, { "epoch": 9.041929696325337, "grad_norm": 0.12185501307249069, "learning_rate": 9.994701265715801e-07, "loss": 0.0054, "step": 43340 }, { "epoch": 9.044056083460696, "grad_norm": 0.06970648467540741, "learning_rate": 9.952661153502685e-07, "loss": 0.0046, "step": 43350 }, { "epoch": 9.046182470596053, "grad_norm": 0.12437082082033157, "learning_rate": 9.91070738696418e-07, "loss": 0.0052, "step": 43360 }, { "epoch": 9.04830885773141, "grad_norm": 0.15015363693237305, "learning_rate": 9.86883998516155e-07, "loss": 0.0107, "step": 43370 }, { "epoch": 9.050435244866769, "grad_norm": 0.09563765674829483, "learning_rate": 9.82705896711662e-07, "loss": 0.0082, "step": 43380 }, { "epoch": 9.052561632002126, "grad_norm": 0.13191330432891846, "learning_rate": 9.785364351812165e-07, "loss": 0.009, "step": 43390 }, { "epoch": 9.054688019137485, "grad_norm": 0.0805736854672432, "learning_rate": 9.743756158191631e-07, "loss": 0.0044, "step": 43400 }, { "epoch": 9.056814406272842, "grad_norm": 0.12110349535942078, "learning_rate": 9.702234405159116e-07, "loss": 0.0095, "step": 43410 }, { "epoch": 9.0589407934082, "grad_norm": 0.07589340209960938, "learning_rate": 9.6607991115796e-07, "loss": 0.0095, "step": 43420 }, { "epoch": 9.061067180543558, "grad_norm": 0.12988291680812836, "learning_rate": 9.619450296278643e-07, "loss": 0.0058, "step": 43430 }, { "epoch": 9.063193567678915, "grad_norm": 0.070185087621212, "learning_rate": 9.578187978042598e-07, "loss": 0.0066, "step": 43440 }, { "epoch": 9.065319954814273, "grad_norm": 0.22988833487033844, "learning_rate": 9.537012175618488e-07, "loss": 0.0079, "step": 43450 }, { "epoch": 9.067446341949632, "grad_norm": 0.1612226366996765, "learning_rate": 9.495922907714017e-07, "loss": 0.0093, "step": 43460 }, { "epoch": 9.069572729084989, "grad_norm": 0.046985018998384476, "learning_rate": 9.454920192997586e-07, "loss": 0.0066, "step": 43470 }, { "epoch": 9.071699116220348, "grad_norm": 0.05776224657893181, "learning_rate": 9.414004050098313e-07, "loss": 0.0081, "step": 43480 }, { "epoch": 9.073825503355705, "grad_norm": 0.037076108157634735, "learning_rate": 9.373174497605908e-07, "loss": 0.0056, "step": 43490 }, { "epoch": 9.075951890491062, "grad_norm": 0.08027701824903488, "learning_rate": 9.33243155407082e-07, "loss": 0.0086, "step": 43500 }, { "epoch": 9.078078277626421, "grad_norm": 0.044314201921224594, "learning_rate": 9.291775238004041e-07, "loss": 0.0063, "step": 43510 }, { "epoch": 9.080204664761778, "grad_norm": 0.06215430423617363, "learning_rate": 9.251205567877309e-07, "loss": 0.0061, "step": 43520 }, { "epoch": 9.082331051897135, "grad_norm": 0.08229780197143555, "learning_rate": 9.210722562122986e-07, "loss": 0.008, "step": 43530 }, { "epoch": 9.084457439032494, "grad_norm": 0.14111310243606567, "learning_rate": 9.170326239133942e-07, "loss": 0.006, "step": 43540 }, { "epoch": 9.086583826167852, "grad_norm": 0.056710876524448395, "learning_rate": 9.130016617263848e-07, "loss": 0.0084, "step": 43550 }, { "epoch": 9.088710213303209, "grad_norm": 0.06737388670444489, "learning_rate": 9.089793714826811e-07, "loss": 0.0085, "step": 43560 }, { "epoch": 9.090836600438568, "grad_norm": 0.06676501780748367, "learning_rate": 9.049657550097656e-07, "loss": 0.0043, "step": 43570 }, { "epoch": 9.092962987573925, "grad_norm": 0.11745407432317734, "learning_rate": 9.009608141311732e-07, "loss": 0.0094, "step": 43580 }, { "epoch": 9.095089374709284, "grad_norm": 0.06045607477426529, "learning_rate": 8.969645506665014e-07, "loss": 0.0054, "step": 43590 }, { "epoch": 9.097215761844641, "grad_norm": 0.06917860358953476, "learning_rate": 8.929769664314003e-07, "loss": 0.0046, "step": 43600 }, { "epoch": 9.099342148979998, "grad_norm": 0.12161790579557419, "learning_rate": 8.889980632375872e-07, "loss": 0.0087, "step": 43610 }, { "epoch": 9.101468536115357, "grad_norm": 0.11192425340414047, "learning_rate": 8.850278428928182e-07, "loss": 0.0111, "step": 43620 }, { "epoch": 9.103594923250714, "grad_norm": 0.09394463896751404, "learning_rate": 8.810663072009218e-07, "loss": 0.0046, "step": 43630 }, { "epoch": 9.105721310386071, "grad_norm": 0.17193284630775452, "learning_rate": 8.771134579617669e-07, "loss": 0.0077, "step": 43640 }, { "epoch": 9.10784769752143, "grad_norm": 0.20654839277267456, "learning_rate": 8.731692969712902e-07, "loss": 0.0082, "step": 43650 }, { "epoch": 9.109974084656788, "grad_norm": 0.05853259935975075, "learning_rate": 8.692338260214628e-07, "loss": 0.0055, "step": 43660 }, { "epoch": 9.112100471792147, "grad_norm": 0.06910739839076996, "learning_rate": 8.653070469003255e-07, "loss": 0.0058, "step": 43670 }, { "epoch": 9.114226858927504, "grad_norm": 0.08242575079202652, "learning_rate": 8.613889613919579e-07, "loss": 0.0067, "step": 43680 }, { "epoch": 9.116353246062861, "grad_norm": 0.0485476553440094, "learning_rate": 8.574795712764983e-07, "loss": 0.0102, "step": 43690 }, { "epoch": 9.11847963319822, "grad_norm": 0.06176432594656944, "learning_rate": 8.535788783301236e-07, "loss": 0.0074, "step": 43700 }, { "epoch": 9.120606020333577, "grad_norm": 0.06763320416212082, "learning_rate": 8.496868843250716e-07, "loss": 0.0109, "step": 43710 }, { "epoch": 9.122732407468934, "grad_norm": 0.04887926205992699, "learning_rate": 8.458035910296192e-07, "loss": 0.0072, "step": 43720 }, { "epoch": 9.124858794604293, "grad_norm": 0.07984121888875961, "learning_rate": 8.419290002080905e-07, "loss": 0.0054, "step": 43730 }, { "epoch": 9.12698518173965, "grad_norm": 0.0800546184182167, "learning_rate": 8.380631136208661e-07, "loss": 0.0065, "step": 43740 }, { "epoch": 9.129111568875008, "grad_norm": 0.09197541326284409, "learning_rate": 8.342059330243568e-07, "loss": 0.0075, "step": 43750 }, { "epoch": 9.131237956010366, "grad_norm": 0.0655195564031601, "learning_rate": 8.303574601710296e-07, "loss": 0.0056, "step": 43760 }, { "epoch": 9.133364343145724, "grad_norm": 0.10536655783653259, "learning_rate": 8.26517696809388e-07, "loss": 0.0069, "step": 43770 }, { "epoch": 9.135490730281083, "grad_norm": 0.10433217883110046, "learning_rate": 8.226866446839809e-07, "loss": 0.008, "step": 43780 }, { "epoch": 9.13761711741644, "grad_norm": 0.14779846370220184, "learning_rate": 8.188643055354029e-07, "loss": 0.0085, "step": 43790 }, { "epoch": 9.139743504551797, "grad_norm": 0.1324242204427719, "learning_rate": 8.150506811002845e-07, "loss": 0.0072, "step": 43800 }, { "epoch": 9.141869891687156, "grad_norm": 0.0631752610206604, "learning_rate": 8.112457731113021e-07, "loss": 0.0092, "step": 43810 }, { "epoch": 9.143996278822513, "grad_norm": 0.09891177713871002, "learning_rate": 8.07449583297164e-07, "loss": 0.0063, "step": 43820 }, { "epoch": 9.14612266595787, "grad_norm": 0.061699043959379196, "learning_rate": 8.036621133826239e-07, "loss": 0.0062, "step": 43830 }, { "epoch": 9.14824905309323, "grad_norm": 0.0521279014647007, "learning_rate": 7.998833650884785e-07, "loss": 0.0093, "step": 43840 }, { "epoch": 9.150375440228586, "grad_norm": 0.04294484853744507, "learning_rate": 7.961133401315457e-07, "loss": 0.0043, "step": 43850 }, { "epoch": 9.152501827363944, "grad_norm": 0.10053151100873947, "learning_rate": 7.923520402246976e-07, "loss": 0.0071, "step": 43860 }, { "epoch": 9.154628214499303, "grad_norm": 0.051960524171590805, "learning_rate": 7.885994670768315e-07, "loss": 0.006, "step": 43870 }, { "epoch": 9.15675460163466, "grad_norm": 0.06905228644609451, "learning_rate": 7.848556223928838e-07, "loss": 0.006, "step": 43880 }, { "epoch": 9.158880988770019, "grad_norm": 0.28353390097618103, "learning_rate": 7.811205078738249e-07, "loss": 0.0072, "step": 43890 }, { "epoch": 9.161007375905376, "grad_norm": 0.08262723684310913, "learning_rate": 7.773941252166551e-07, "loss": 0.005, "step": 43900 }, { "epoch": 9.163133763040733, "grad_norm": 0.065423384308815, "learning_rate": 7.736764761144155e-07, "loss": 0.0081, "step": 43910 }, { "epoch": 9.165260150176092, "grad_norm": 0.04460057243704796, "learning_rate": 7.699675622561708e-07, "loss": 0.0056, "step": 43920 }, { "epoch": 9.16738653731145, "grad_norm": 0.05175139755010605, "learning_rate": 7.662673853270198e-07, "loss": 0.0064, "step": 43930 }, { "epoch": 9.169512924446806, "grad_norm": 0.05110732093453407, "learning_rate": 7.625759470080951e-07, "loss": 0.0055, "step": 43940 }, { "epoch": 9.171639311582165, "grad_norm": 0.042186688631772995, "learning_rate": 7.588932489765533e-07, "loss": 0.0076, "step": 43950 }, { "epoch": 9.173765698717522, "grad_norm": 0.03831162303686142, "learning_rate": 7.552192929055868e-07, "loss": 0.0047, "step": 43960 }, { "epoch": 9.175892085852881, "grad_norm": 0.059913519769907, "learning_rate": 7.515540804644117e-07, "loss": 0.0053, "step": 43970 }, { "epoch": 9.178018472988239, "grad_norm": 0.05432553216814995, "learning_rate": 7.478976133182691e-07, "loss": 0.0057, "step": 43980 }, { "epoch": 9.180144860123596, "grad_norm": 0.1273956298828125, "learning_rate": 7.442498931284304e-07, "loss": 0.0075, "step": 43990 }, { "epoch": 9.182271247258955, "grad_norm": 0.08870675414800644, "learning_rate": 7.406109215521984e-07, "loss": 0.0096, "step": 44000 }, { "epoch": 9.184397634394312, "grad_norm": 0.110597625374794, "learning_rate": 7.369807002428864e-07, "loss": 0.0076, "step": 44010 }, { "epoch": 9.186524021529669, "grad_norm": 0.1588108390569687, "learning_rate": 7.333592308498482e-07, "loss": 0.006, "step": 44020 }, { "epoch": 9.188650408665028, "grad_norm": 0.1925429105758667, "learning_rate": 7.297465150184502e-07, "loss": 0.0063, "step": 44030 }, { "epoch": 9.190776795800385, "grad_norm": 0.18151724338531494, "learning_rate": 7.261425543900858e-07, "loss": 0.0077, "step": 44040 }, { "epoch": 9.192903182935742, "grad_norm": 0.07405953854322433, "learning_rate": 7.225473506021718e-07, "loss": 0.006, "step": 44050 }, { "epoch": 9.195029570071101, "grad_norm": 0.10109872370958328, "learning_rate": 7.189609052881441e-07, "loss": 0.0067, "step": 44060 }, { "epoch": 9.197155957206459, "grad_norm": 0.061573851853609085, "learning_rate": 7.153832200774591e-07, "loss": 0.0052, "step": 44070 }, { "epoch": 9.199282344341817, "grad_norm": 0.08058679103851318, "learning_rate": 7.118142965955943e-07, "loss": 0.0075, "step": 44080 }, { "epoch": 9.201408731477175, "grad_norm": 0.08419036120176315, "learning_rate": 7.082541364640461e-07, "loss": 0.0075, "step": 44090 }, { "epoch": 9.203535118612532, "grad_norm": 0.0761040523648262, "learning_rate": 7.047027413003337e-07, "loss": 0.0048, "step": 44100 }, { "epoch": 9.20566150574789, "grad_norm": 0.05536279454827309, "learning_rate": 7.011601127179846e-07, "loss": 0.005, "step": 44110 }, { "epoch": 9.207787892883248, "grad_norm": 0.07678913325071335, "learning_rate": 6.976262523265509e-07, "loss": 0.0051, "step": 44120 }, { "epoch": 9.209914280018605, "grad_norm": 0.060867078602313995, "learning_rate": 6.941011617315973e-07, "loss": 0.01, "step": 44130 }, { "epoch": 9.212040667153964, "grad_norm": 0.1176331490278244, "learning_rate": 6.905848425347073e-07, "loss": 0.0048, "step": 44140 }, { "epoch": 9.214167054289321, "grad_norm": 0.09472845494747162, "learning_rate": 6.870772963334738e-07, "loss": 0.0056, "step": 44150 }, { "epoch": 9.21629344142468, "grad_norm": 0.08918268233537674, "learning_rate": 6.835785247215132e-07, "loss": 0.0078, "step": 44160 }, { "epoch": 9.218419828560037, "grad_norm": 0.11568470299243927, "learning_rate": 6.800885292884452e-07, "loss": 0.0067, "step": 44170 }, { "epoch": 9.220546215695395, "grad_norm": 0.10704737901687622, "learning_rate": 6.766073116199079e-07, "loss": 0.0056, "step": 44180 }, { "epoch": 9.222672602830754, "grad_norm": 0.061901893466711044, "learning_rate": 6.731348732975474e-07, "loss": 0.0042, "step": 44190 }, { "epoch": 9.22479898996611, "grad_norm": 0.07100498676300049, "learning_rate": 6.696712158990282e-07, "loss": 0.0072, "step": 44200 }, { "epoch": 9.226925377101468, "grad_norm": 0.09124291688203812, "learning_rate": 6.662163409980138e-07, "loss": 0.0066, "step": 44210 }, { "epoch": 9.229051764236827, "grad_norm": 0.12594954669475555, "learning_rate": 6.627702501641887e-07, "loss": 0.0061, "step": 44220 }, { "epoch": 9.231178151372184, "grad_norm": 0.05327873304486275, "learning_rate": 6.593329449632447e-07, "loss": 0.006, "step": 44230 }, { "epoch": 9.233304538507541, "grad_norm": 0.07314978539943695, "learning_rate": 6.559044269568704e-07, "loss": 0.0099, "step": 44240 }, { "epoch": 9.2354309256429, "grad_norm": 0.04885036498308182, "learning_rate": 6.524846977027799e-07, "loss": 0.0071, "step": 44250 }, { "epoch": 9.237557312778257, "grad_norm": 0.06665904074907303, "learning_rate": 6.490737587546814e-07, "loss": 0.0062, "step": 44260 }, { "epoch": 9.239683699913616, "grad_norm": 0.04405943676829338, "learning_rate": 6.456716116622908e-07, "loss": 0.0078, "step": 44270 }, { "epoch": 9.241810087048973, "grad_norm": 0.10825002193450928, "learning_rate": 6.422782579713361e-07, "loss": 0.0055, "step": 44280 }, { "epoch": 9.24393647418433, "grad_norm": 0.06892578303813934, "learning_rate": 6.388936992235417e-07, "loss": 0.0081, "step": 44290 }, { "epoch": 9.24606286131969, "grad_norm": 0.08062999695539474, "learning_rate": 6.355179369566445e-07, "loss": 0.0072, "step": 44300 }, { "epoch": 9.248189248455047, "grad_norm": 0.052913058549165726, "learning_rate": 6.321509727043796e-07, "loss": 0.0081, "step": 44310 }, { "epoch": 9.250315635590404, "grad_norm": 0.139417365193367, "learning_rate": 6.287928079964833e-07, "loss": 0.0086, "step": 44320 }, { "epoch": 9.252442022725763, "grad_norm": 0.12297014147043228, "learning_rate": 6.254434443586998e-07, "loss": 0.0063, "step": 44330 }, { "epoch": 9.25456840986112, "grad_norm": 0.0887414887547493, "learning_rate": 6.221028833127674e-07, "loss": 0.006, "step": 44340 }, { "epoch": 9.256694796996479, "grad_norm": 0.2608810067176819, "learning_rate": 6.187711263764318e-07, "loss": 0.0061, "step": 44350 }, { "epoch": 9.258821184131836, "grad_norm": 0.08862505108118057, "learning_rate": 6.154481750634378e-07, "loss": 0.0054, "step": 44360 }, { "epoch": 9.260947571267193, "grad_norm": 0.03829760476946831, "learning_rate": 6.121340308835244e-07, "loss": 0.0064, "step": 44370 }, { "epoch": 9.263073958402552, "grad_norm": 0.09015384316444397, "learning_rate": 6.088286953424316e-07, "loss": 0.0046, "step": 44380 }, { "epoch": 9.26520034553791, "grad_norm": 0.07228472083806992, "learning_rate": 6.055321699419025e-07, "loss": 0.0065, "step": 44390 }, { "epoch": 9.267326732673267, "grad_norm": 0.06892426311969757, "learning_rate": 6.022444561796659e-07, "loss": 0.0063, "step": 44400 }, { "epoch": 9.269453119808626, "grad_norm": 0.060706041753292084, "learning_rate": 5.989655555494622e-07, "loss": 0.0067, "step": 44410 }, { "epoch": 9.271579506943983, "grad_norm": 0.049873508512973785, "learning_rate": 5.956954695410111e-07, "loss": 0.0075, "step": 44420 }, { "epoch": 9.27370589407934, "grad_norm": 0.1070433259010315, "learning_rate": 5.924341996400418e-07, "loss": 0.0058, "step": 44430 }, { "epoch": 9.275832281214699, "grad_norm": 0.08542082458734512, "learning_rate": 5.891817473282735e-07, "loss": 0.0053, "step": 44440 }, { "epoch": 9.277958668350056, "grad_norm": 0.06404203921556473, "learning_rate": 5.859381140834109e-07, "loss": 0.0078, "step": 44450 }, { "epoch": 9.280085055485415, "grad_norm": 0.10317094624042511, "learning_rate": 5.827033013791683e-07, "loss": 0.0051, "step": 44460 }, { "epoch": 9.282211442620772, "grad_norm": 0.16677653789520264, "learning_rate": 5.794773106852347e-07, "loss": 0.0074, "step": 44470 }, { "epoch": 9.28433782975613, "grad_norm": 0.1727048009634018, "learning_rate": 5.762601434673021e-07, "loss": 0.0065, "step": 44480 }, { "epoch": 9.286464216891488, "grad_norm": 0.09208568930625916, "learning_rate": 5.730518011870544e-07, "loss": 0.0055, "step": 44490 }, { "epoch": 9.288590604026846, "grad_norm": 0.0730184018611908, "learning_rate": 5.698522853021593e-07, "loss": 0.0066, "step": 44500 }, { "epoch": 9.290716991162203, "grad_norm": 0.18349424004554749, "learning_rate": 5.666615972662781e-07, "loss": 0.0069, "step": 44510 }, { "epoch": 9.292843378297562, "grad_norm": 0.06328018754720688, "learning_rate": 5.634797385290602e-07, "loss": 0.007, "step": 44520 }, { "epoch": 9.294969765432919, "grad_norm": 0.07041031122207642, "learning_rate": 5.603067105361448e-07, "loss": 0.0056, "step": 44530 }, { "epoch": 9.297096152568278, "grad_norm": 0.08229384571313858, "learning_rate": 5.571425147291587e-07, "loss": 0.0079, "step": 44540 }, { "epoch": 9.299222539703635, "grad_norm": 0.05631183087825775, "learning_rate": 5.539871525457141e-07, "loss": 0.0082, "step": 44550 }, { "epoch": 9.301348926838992, "grad_norm": 0.056544095277786255, "learning_rate": 5.50840625419411e-07, "loss": 0.0067, "step": 44560 }, { "epoch": 9.303475313974351, "grad_norm": 0.08337735384702682, "learning_rate": 5.477029347798368e-07, "loss": 0.0085, "step": 44570 }, { "epoch": 9.305601701109708, "grad_norm": 0.0794767290353775, "learning_rate": 5.445740820525602e-07, "loss": 0.0081, "step": 44580 }, { "epoch": 9.307728088245065, "grad_norm": 0.08466177433729172, "learning_rate": 5.414540686591418e-07, "loss": 0.0039, "step": 44590 }, { "epoch": 9.309854475380424, "grad_norm": 0.08099808543920517, "learning_rate": 5.383428960171167e-07, "loss": 0.006, "step": 44600 }, { "epoch": 9.311980862515782, "grad_norm": 0.06388359516859055, "learning_rate": 5.352405655400095e-07, "loss": 0.004, "step": 44610 }, { "epoch": 9.314107249651139, "grad_norm": 0.0956897959113121, "learning_rate": 5.321470786373262e-07, "loss": 0.0064, "step": 44620 }, { "epoch": 9.316233636786498, "grad_norm": 0.13413359224796295, "learning_rate": 5.290624367145558e-07, "loss": 0.0069, "step": 44630 }, { "epoch": 9.318360023921855, "grad_norm": 0.12828822433948517, "learning_rate": 5.259866411731662e-07, "loss": 0.0084, "step": 44640 }, { "epoch": 9.320486411057214, "grad_norm": 0.13660895824432373, "learning_rate": 5.22919693410604e-07, "loss": 0.0076, "step": 44650 }, { "epoch": 9.322612798192571, "grad_norm": 0.08414515107870102, "learning_rate": 5.198615948203035e-07, "loss": 0.0086, "step": 44660 }, { "epoch": 9.324739185327928, "grad_norm": 0.08705190569162369, "learning_rate": 5.168123467916753e-07, "loss": 0.0067, "step": 44670 }, { "epoch": 9.326865572463287, "grad_norm": 0.0628683790564537, "learning_rate": 5.137719507101024e-07, "loss": 0.0088, "step": 44680 }, { "epoch": 9.328991959598644, "grad_norm": 0.13081005215644836, "learning_rate": 5.107404079569578e-07, "loss": 0.0101, "step": 44690 }, { "epoch": 9.331118346734002, "grad_norm": 0.0814177468419075, "learning_rate": 5.077177199095817e-07, "loss": 0.0058, "step": 44700 }, { "epoch": 9.33324473386936, "grad_norm": 0.10142470896244049, "learning_rate": 5.047038879412958e-07, "loss": 0.0064, "step": 44710 }, { "epoch": 9.335371121004718, "grad_norm": 0.11091962456703186, "learning_rate": 5.016989134214001e-07, "loss": 0.0061, "step": 44720 }, { "epoch": 9.337497508140075, "grad_norm": 0.09737668931484222, "learning_rate": 4.987027977151626e-07, "loss": 0.0122, "step": 44730 }, { "epoch": 9.339623895275434, "grad_norm": 0.07197817414999008, "learning_rate": 4.957155421838367e-07, "loss": 0.0049, "step": 44740 }, { "epoch": 9.341750282410791, "grad_norm": 0.10077591240406036, "learning_rate": 4.927371481846433e-07, "loss": 0.0094, "step": 44750 }, { "epoch": 9.34387666954615, "grad_norm": 0.18047530949115753, "learning_rate": 4.897676170707799e-07, "loss": 0.0066, "step": 44760 }, { "epoch": 9.346003056681507, "grad_norm": 0.07136352360248566, "learning_rate": 4.868069501914164e-07, "loss": 0.0041, "step": 44770 }, { "epoch": 9.348129443816864, "grad_norm": 0.10560116916894913, "learning_rate": 4.838551488916943e-07, "loss": 0.0072, "step": 44780 }, { "epoch": 9.350255830952223, "grad_norm": 0.14926758408546448, "learning_rate": 4.809122145127276e-07, "loss": 0.0053, "step": 44790 }, { "epoch": 9.35238221808758, "grad_norm": 0.1330461949110031, "learning_rate": 4.779781483916046e-07, "loss": 0.0081, "step": 44800 }, { "epoch": 9.354508605222938, "grad_norm": 0.07520770281553268, "learning_rate": 4.750529518613811e-07, "loss": 0.0064, "step": 44810 }, { "epoch": 9.356634992358297, "grad_norm": 0.12127548456192017, "learning_rate": 4.7213662625108516e-07, "loss": 0.0049, "step": 44820 }, { "epoch": 9.358761379493654, "grad_norm": 0.12370481342077255, "learning_rate": 4.6922917288571455e-07, "loss": 0.0075, "step": 44830 }, { "epoch": 9.360887766629013, "grad_norm": 0.09930811822414398, "learning_rate": 4.663305930862305e-07, "loss": 0.0061, "step": 44840 }, { "epoch": 9.36301415376437, "grad_norm": 0.08829520642757416, "learning_rate": 4.634408881695729e-07, "loss": 0.0054, "step": 44850 }, { "epoch": 9.365140540899727, "grad_norm": 0.0646788477897644, "learning_rate": 4.6056005944864036e-07, "loss": 0.0097, "step": 44860 }, { "epoch": 9.367266928035086, "grad_norm": 0.11771947890520096, "learning_rate": 4.5768810823229925e-07, "loss": 0.0082, "step": 44870 }, { "epoch": 9.369393315170443, "grad_norm": 0.08347621560096741, "learning_rate": 4.548250358253925e-07, "loss": 0.0076, "step": 44880 }, { "epoch": 9.3715197023058, "grad_norm": 0.05767732486128807, "learning_rate": 4.5197084352871514e-07, "loss": 0.0064, "step": 44890 }, { "epoch": 9.37364608944116, "grad_norm": 0.090168297290802, "learning_rate": 4.491255326390387e-07, "loss": 0.0051, "step": 44900 }, { "epoch": 9.375772476576516, "grad_norm": 0.06003545597195625, "learning_rate": 4.4628910444909357e-07, "loss": 0.0061, "step": 44910 }, { "epoch": 9.377898863711874, "grad_norm": 0.10912105441093445, "learning_rate": 4.4346156024757335e-07, "loss": 0.0059, "step": 44920 }, { "epoch": 9.380025250847233, "grad_norm": 0.09219443798065186, "learning_rate": 4.406429013191438e-07, "loss": 0.0073, "step": 44930 }, { "epoch": 9.38215163798259, "grad_norm": 0.217375248670578, "learning_rate": 4.3783312894442486e-07, "loss": 0.0078, "step": 44940 }, { "epoch": 9.384278025117949, "grad_norm": 0.11517871916294098, "learning_rate": 4.350322443999999e-07, "loss": 0.008, "step": 44950 }, { "epoch": 9.386404412253306, "grad_norm": 0.10032092034816742, "learning_rate": 4.322402489584221e-07, "loss": 0.0043, "step": 44960 }, { "epoch": 9.388530799388663, "grad_norm": 0.11412176489830017, "learning_rate": 4.294571438881967e-07, "loss": 0.0073, "step": 44970 }, { "epoch": 9.390657186524022, "grad_norm": 0.10638032108545303, "learning_rate": 4.266829304537923e-07, "loss": 0.008, "step": 44980 }, { "epoch": 9.39278357365938, "grad_norm": 0.10961548238992691, "learning_rate": 4.2391760991564056e-07, "loss": 0.0112, "step": 44990 }, { "epoch": 9.394909960794736, "grad_norm": 0.06375439465045929, "learning_rate": 4.211611835301299e-07, "loss": 0.0045, "step": 45000 }, { "epoch": 9.397036347930095, "grad_norm": 0.24993596971035004, "learning_rate": 4.184136525496096e-07, "loss": 0.0082, "step": 45010 }, { "epoch": 9.399162735065453, "grad_norm": 0.07071101665496826, "learning_rate": 4.1567501822238565e-07, "loss": 0.0084, "step": 45020 }, { "epoch": 9.401289122200811, "grad_norm": 0.04190049692988396, "learning_rate": 4.129452817927249e-07, "loss": 0.0074, "step": 45030 }, { "epoch": 9.403415509336169, "grad_norm": 0.0851338729262352, "learning_rate": 4.102244445008485e-07, "loss": 0.0043, "step": 45040 }, { "epoch": 9.405541896471526, "grad_norm": 0.0824405699968338, "learning_rate": 4.0751250758293227e-07, "loss": 0.0063, "step": 45050 }, { "epoch": 9.407668283606885, "grad_norm": 0.07266733795404434, "learning_rate": 4.0480947227111934e-07, "loss": 0.0042, "step": 45060 }, { "epoch": 9.409794670742242, "grad_norm": 0.07018905133008957, "learning_rate": 4.021153397934918e-07, "loss": 0.0058, "step": 45070 }, { "epoch": 9.4119210578776, "grad_norm": 0.08526567369699478, "learning_rate": 3.994301113741017e-07, "loss": 0.0048, "step": 45080 }, { "epoch": 9.414047445012958, "grad_norm": 0.06456899642944336, "learning_rate": 3.9675378823294643e-07, "loss": 0.0111, "step": 45090 }, { "epoch": 9.416173832148315, "grad_norm": 0.10460638254880905, "learning_rate": 3.940863715859822e-07, "loss": 0.0043, "step": 45100 }, { "epoch": 9.418300219283672, "grad_norm": 0.07452580332756042, "learning_rate": 3.914278626451173e-07, "loss": 0.0079, "step": 45110 }, { "epoch": 9.420426606419031, "grad_norm": 0.12330672144889832, "learning_rate": 3.887782626182102e-07, "loss": 0.0063, "step": 45120 }, { "epoch": 9.422552993554389, "grad_norm": 0.06029060482978821, "learning_rate": 3.8613757270907327e-07, "loss": 0.0074, "step": 45130 }, { "epoch": 9.424679380689748, "grad_norm": 0.16174784302711487, "learning_rate": 3.8350579411747803e-07, "loss": 0.0058, "step": 45140 }, { "epoch": 9.426805767825105, "grad_norm": 0.05137509107589722, "learning_rate": 3.808829280391346e-07, "loss": 0.0076, "step": 45150 }, { "epoch": 9.428932154960462, "grad_norm": 0.0792972519993782, "learning_rate": 3.782689756657121e-07, "loss": 0.0058, "step": 45160 }, { "epoch": 9.43105854209582, "grad_norm": 0.16059166193008423, "learning_rate": 3.7566393818482704e-07, "loss": 0.0072, "step": 45170 }, { "epoch": 9.433184929231178, "grad_norm": 0.07495918869972229, "learning_rate": 3.730678167800483e-07, "loss": 0.0105, "step": 45180 }, { "epoch": 9.435311316366535, "grad_norm": 0.25093311071395874, "learning_rate": 3.7048061263089017e-07, "loss": 0.0062, "step": 45190 }, { "epoch": 9.437437703501894, "grad_norm": 0.058328777551651, "learning_rate": 3.679023269128168e-07, "loss": 0.006, "step": 45200 }, { "epoch": 9.439564090637251, "grad_norm": 0.10326693207025528, "learning_rate": 3.6533296079724e-07, "loss": 0.0054, "step": 45210 }, { "epoch": 9.44169047777261, "grad_norm": 0.07045549154281616, "learning_rate": 3.627725154515216e-07, "loss": 0.0073, "step": 45220 }, { "epoch": 9.443816864907967, "grad_norm": 0.05838565155863762, "learning_rate": 3.6022099203896656e-07, "loss": 0.007, "step": 45230 }, { "epoch": 9.445943252043325, "grad_norm": 0.17899996042251587, "learning_rate": 3.57678391718832e-07, "loss": 0.0076, "step": 45240 }, { "epoch": 9.448069639178684, "grad_norm": 0.2640451490879059, "learning_rate": 3.551447156463117e-07, "loss": 0.0088, "step": 45250 }, { "epoch": 9.45019602631404, "grad_norm": 0.11699773371219635, "learning_rate": 3.5261996497255144e-07, "loss": 0.0043, "step": 45260 }, { "epoch": 9.452322413449398, "grad_norm": 0.08538402616977692, "learning_rate": 3.5010414084464704e-07, "loss": 0.0086, "step": 45270 }, { "epoch": 9.454448800584757, "grad_norm": 0.0486709326505661, "learning_rate": 3.4759724440562414e-07, "loss": 0.0069, "step": 45280 }, { "epoch": 9.456575187720114, "grad_norm": 0.07803478091955185, "learning_rate": 3.45099276794465e-07, "loss": 0.0083, "step": 45290 }, { "epoch": 9.458701574855471, "grad_norm": 0.08786503970623016, "learning_rate": 3.426102391460884e-07, "loss": 0.0067, "step": 45300 }, { "epoch": 9.46082796199083, "grad_norm": 0.13762973248958588, "learning_rate": 3.4013013259136083e-07, "loss": 0.0062, "step": 45310 }, { "epoch": 9.462954349126187, "grad_norm": 0.05299348756670952, "learning_rate": 3.376589582570833e-07, "loss": 0.0051, "step": 45320 }, { "epoch": 9.465080736261546, "grad_norm": 0.08063215762376785, "learning_rate": 3.3519671726600867e-07, "loss": 0.0069, "step": 45330 }, { "epoch": 9.467207123396904, "grad_norm": 0.05958941578865051, "learning_rate": 3.3274341073682214e-07, "loss": 0.0074, "step": 45340 }, { "epoch": 9.46933351053226, "grad_norm": 0.05405869707465172, "learning_rate": 3.302990397841566e-07, "loss": 0.0047, "step": 45350 }, { "epoch": 9.47145989766762, "grad_norm": 0.07484244555234909, "learning_rate": 3.278636055185791e-07, "loss": 0.0078, "step": 45360 }, { "epoch": 9.473586284802977, "grad_norm": 0.08778705447912216, "learning_rate": 3.254371090466024e-07, "loss": 0.005, "step": 45370 }, { "epoch": 9.475712671938334, "grad_norm": 0.17443907260894775, "learning_rate": 3.2301955147067133e-07, "loss": 0.0083, "step": 45380 }, { "epoch": 9.477839059073693, "grad_norm": 0.10499085485935211, "learning_rate": 3.2061093388917607e-07, "loss": 0.0061, "step": 45390 }, { "epoch": 9.47996544620905, "grad_norm": 0.058708492666482925, "learning_rate": 3.1821125739644355e-07, "loss": 0.0053, "step": 45400 }, { "epoch": 9.482091833344409, "grad_norm": 0.08637896925210953, "learning_rate": 3.158205230827327e-07, "loss": 0.0048, "step": 45410 }, { "epoch": 9.484218220479766, "grad_norm": 0.09696163982152939, "learning_rate": 3.1343873203425024e-07, "loss": 0.0087, "step": 45420 }, { "epoch": 9.486344607615123, "grad_norm": 0.1303236484527588, "learning_rate": 3.1106588533313053e-07, "loss": 0.0103, "step": 45430 }, { "epoch": 9.488470994750482, "grad_norm": 0.11542632430791855, "learning_rate": 3.087019840574468e-07, "loss": 0.009, "step": 45440 }, { "epoch": 9.49059738188584, "grad_norm": 0.06670914590358734, "learning_rate": 3.06347029281211e-07, "loss": 0.0053, "step": 45450 }, { "epoch": 9.492723769021197, "grad_norm": 0.051899321377277374, "learning_rate": 3.04001022074365e-07, "loss": 0.0036, "step": 45460 }, { "epoch": 9.494850156156556, "grad_norm": 0.0773785337805748, "learning_rate": 3.016639635027896e-07, "loss": 0.0087, "step": 45470 }, { "epoch": 9.496976543291913, "grad_norm": 0.10662957280874252, "learning_rate": 2.9933585462829985e-07, "loss": 0.0061, "step": 45480 }, { "epoch": 9.49910293042727, "grad_norm": 0.06124698743224144, "learning_rate": 2.970166965086407e-07, "loss": 0.0075, "step": 45490 }, { "epoch": 9.501229317562629, "grad_norm": 0.08496557921171188, "learning_rate": 2.947064901974961e-07, "loss": 0.0055, "step": 45500 }, { "epoch": 9.503355704697986, "grad_norm": 0.154542937874794, "learning_rate": 2.924052367444774e-07, "loss": 0.0063, "step": 45510 }, { "epoch": 9.505482091833345, "grad_norm": 0.07278340309858322, "learning_rate": 2.901129371951328e-07, "loss": 0.0083, "step": 45520 }, { "epoch": 9.507608478968702, "grad_norm": 0.08176115900278091, "learning_rate": 2.878295925909402e-07, "loss": 0.0088, "step": 45530 }, { "epoch": 9.50973486610406, "grad_norm": 0.15441691875457764, "learning_rate": 2.855552039693077e-07, "loss": 0.0067, "step": 45540 }, { "epoch": 9.511861253239418, "grad_norm": 0.10187068581581116, "learning_rate": 2.832897723635797e-07, "loss": 0.0063, "step": 45550 }, { "epoch": 9.513987640374776, "grad_norm": 0.08757385611534119, "learning_rate": 2.8103329880302177e-07, "loss": 0.0069, "step": 45560 }, { "epoch": 9.516114027510133, "grad_norm": 0.078859843313694, "learning_rate": 2.787857843128383e-07, "loss": 0.0046, "step": 45570 }, { "epoch": 9.518240414645492, "grad_norm": 0.08451405167579651, "learning_rate": 2.765472299141592e-07, "loss": 0.0059, "step": 45580 }, { "epoch": 9.520366801780849, "grad_norm": 0.11170680075883865, "learning_rate": 2.743176366240441e-07, "loss": 0.0043, "step": 45590 }, { "epoch": 9.522493188916208, "grad_norm": 0.07699549943208694, "learning_rate": 2.720970054554806e-07, "loss": 0.0057, "step": 45600 }, { "epoch": 9.524619576051565, "grad_norm": 0.07080018520355225, "learning_rate": 2.69885337417386e-07, "loss": 0.0073, "step": 45610 }, { "epoch": 9.526745963186922, "grad_norm": 0.05545205995440483, "learning_rate": 2.6768263351460544e-07, "loss": 0.0072, "step": 45620 }, { "epoch": 9.528872350322281, "grad_norm": 0.1792362630367279, "learning_rate": 2.654888947479095e-07, "loss": 0.0062, "step": 45630 }, { "epoch": 9.530998737457638, "grad_norm": 0.1253228485584259, "learning_rate": 2.6330412211399645e-07, "loss": 0.0052, "step": 45640 }, { "epoch": 9.533125124592996, "grad_norm": 0.0792263075709343, "learning_rate": 2.611283166054901e-07, "loss": 0.0079, "step": 45650 }, { "epoch": 9.535251511728355, "grad_norm": 0.07444557547569275, "learning_rate": 2.5896147921094186e-07, "loss": 0.0055, "step": 45660 }, { "epoch": 9.537377898863712, "grad_norm": 0.09768921881914139, "learning_rate": 2.5680361091482644e-07, "loss": 0.0089, "step": 45670 }, { "epoch": 9.539504285999069, "grad_norm": 0.06275828182697296, "learning_rate": 2.546547126975485e-07, "loss": 0.005, "step": 45680 }, { "epoch": 9.541630673134428, "grad_norm": 0.05969492718577385, "learning_rate": 2.525147855354293e-07, "loss": 0.0059, "step": 45690 }, { "epoch": 9.543757060269785, "grad_norm": 0.06423743069171906, "learning_rate": 2.5038383040072003e-07, "loss": 0.0057, "step": 45700 }, { "epoch": 9.545883447405144, "grad_norm": 0.09599459916353226, "learning_rate": 2.4826184826159507e-07, "loss": 0.0079, "step": 45710 }, { "epoch": 9.548009834540501, "grad_norm": 0.07287222146987915, "learning_rate": 2.461488400821499e-07, "loss": 0.0079, "step": 45720 }, { "epoch": 9.550136221675858, "grad_norm": 0.12430170923471451, "learning_rate": 2.4404480682240326e-07, "loss": 0.01, "step": 45730 }, { "epoch": 9.552262608811217, "grad_norm": 0.13455338776111603, "learning_rate": 2.419497494382972e-07, "loss": 0.0063, "step": 45740 }, { "epoch": 9.554388995946574, "grad_norm": 0.13392873108386993, "learning_rate": 2.3986366888169687e-07, "loss": 0.0084, "step": 45750 }, { "epoch": 9.556515383081932, "grad_norm": 0.05634648725390434, "learning_rate": 2.377865661003842e-07, "loss": 0.0046, "step": 45760 }, { "epoch": 9.55864177021729, "grad_norm": 0.04923701658844948, "learning_rate": 2.3571844203806871e-07, "loss": 0.0053, "step": 45770 }, { "epoch": 9.560768157352648, "grad_norm": 0.06166281923651695, "learning_rate": 2.3365929763437657e-07, "loss": 0.0051, "step": 45780 }, { "epoch": 9.562894544488007, "grad_norm": 0.0667545348405838, "learning_rate": 2.316091338248505e-07, "loss": 0.0069, "step": 45790 }, { "epoch": 9.565020931623364, "grad_norm": 0.07288578897714615, "learning_rate": 2.2956795154096544e-07, "loss": 0.0061, "step": 45800 }, { "epoch": 9.567147318758721, "grad_norm": 0.11010805517435074, "learning_rate": 2.2753575171009734e-07, "loss": 0.0073, "step": 45810 }, { "epoch": 9.56927370589408, "grad_norm": 0.14129525423049927, "learning_rate": 2.2551253525556093e-07, "loss": 0.006, "step": 45820 }, { "epoch": 9.571400093029437, "grad_norm": 0.06008567288517952, "learning_rate": 2.2349830309657205e-07, "loss": 0.0094, "step": 45830 }, { "epoch": 9.573526480164794, "grad_norm": 0.1106971800327301, "learning_rate": 2.2149305614827865e-07, "loss": 0.0072, "step": 45840 }, { "epoch": 9.575652867300153, "grad_norm": 0.06404081732034683, "learning_rate": 2.194967953217364e-07, "loss": 0.006, "step": 45850 }, { "epoch": 9.57777925443551, "grad_norm": 0.08383739739656448, "learning_rate": 2.1750952152392202e-07, "loss": 0.0082, "step": 45860 }, { "epoch": 9.579905641570868, "grad_norm": 0.21563154458999634, "learning_rate": 2.1553123565772882e-07, "loss": 0.007, "step": 45870 }, { "epoch": 9.582032028706227, "grad_norm": 0.06725674122571945, "learning_rate": 2.1356193862196671e-07, "loss": 0.0068, "step": 45880 }, { "epoch": 9.584158415841584, "grad_norm": 0.07669797539710999, "learning_rate": 2.1160163131136447e-07, "loss": 0.0077, "step": 45890 }, { "epoch": 9.586284802976943, "grad_norm": 0.10960953682661057, "learning_rate": 2.096503146165607e-07, "loss": 0.0053, "step": 45900 }, { "epoch": 9.5884111901123, "grad_norm": 0.2977711260318756, "learning_rate": 2.0770798942411295e-07, "loss": 0.0082, "step": 45910 }, { "epoch": 9.590537577247657, "grad_norm": 0.0623139813542366, "learning_rate": 2.0577465661649088e-07, "loss": 0.0095, "step": 45920 }, { "epoch": 9.592663964383016, "grad_norm": 0.08231128007173538, "learning_rate": 2.0385031707208513e-07, "loss": 0.0081, "step": 45930 }, { "epoch": 9.594790351518373, "grad_norm": 0.08271031826734543, "learning_rate": 2.0193497166519193e-07, "loss": 0.0092, "step": 45940 }, { "epoch": 9.59691673865373, "grad_norm": 0.05504326894879341, "learning_rate": 2.0002862126602629e-07, "loss": 0.0077, "step": 45950 }, { "epoch": 9.59904312578909, "grad_norm": 0.12333492189645767, "learning_rate": 1.9813126674071536e-07, "loss": 0.0061, "step": 45960 }, { "epoch": 9.601169512924447, "grad_norm": 0.07563965022563934, "learning_rate": 1.962429089512985e-07, "loss": 0.0035, "step": 45970 }, { "epoch": 9.603295900059805, "grad_norm": 0.12060631066560745, "learning_rate": 1.9436354875572717e-07, "loss": 0.0065, "step": 45980 }, { "epoch": 9.605422287195163, "grad_norm": 0.07764338701963425, "learning_rate": 1.9249318700786723e-07, "loss": 0.0064, "step": 45990 }, { "epoch": 9.60754867433052, "grad_norm": 0.06486351042985916, "learning_rate": 1.9063182455749007e-07, "loss": 0.0095, "step": 46000 }, { "epoch": 9.609675061465879, "grad_norm": 0.08004548400640488, "learning_rate": 1.887794622502903e-07, "loss": 0.0055, "step": 46010 }, { "epoch": 9.611801448601236, "grad_norm": 0.12533439695835114, "learning_rate": 1.869361009278614e-07, "loss": 0.0078, "step": 46020 }, { "epoch": 9.613927835736593, "grad_norm": 0.12054367363452911, "learning_rate": 1.851017414277112e-07, "loss": 0.0053, "step": 46030 }, { "epoch": 9.616054222871952, "grad_norm": 0.09260682761669159, "learning_rate": 1.8327638458325968e-07, "loss": 0.0075, "step": 46040 }, { "epoch": 9.61818061000731, "grad_norm": 0.09104914218187332, "learning_rate": 1.814600312238368e-07, "loss": 0.0043, "step": 46050 }, { "epoch": 9.620306997142666, "grad_norm": 0.05148407816886902, "learning_rate": 1.79652682174678e-07, "loss": 0.0066, "step": 46060 }, { "epoch": 9.622433384278025, "grad_norm": 0.1555749773979187, "learning_rate": 1.778543382569331e-07, "loss": 0.006, "step": 46070 }, { "epoch": 9.624559771413383, "grad_norm": 0.05174606665968895, "learning_rate": 1.7606500028765295e-07, "loss": 0.0063, "step": 46080 }, { "epoch": 9.626686158548742, "grad_norm": 0.08490180224180222, "learning_rate": 1.74284669079805e-07, "loss": 0.0049, "step": 46090 }, { "epoch": 9.628812545684099, "grad_norm": 0.06410021334886551, "learning_rate": 1.7251334544226228e-07, "loss": 0.0081, "step": 46100 }, { "epoch": 9.630938932819456, "grad_norm": 0.12706321477890015, "learning_rate": 1.7075103017979878e-07, "loss": 0.0065, "step": 46110 }, { "epoch": 9.633065319954815, "grad_norm": 0.050974130630493164, "learning_rate": 1.689977240931029e-07, "loss": 0.0056, "step": 46120 }, { "epoch": 9.635191707090172, "grad_norm": 0.2763828635215759, "learning_rate": 1.6725342797877075e-07, "loss": 0.0081, "step": 46130 }, { "epoch": 9.63731809422553, "grad_norm": 0.057369962334632874, "learning_rate": 1.6551814262929733e-07, "loss": 0.0064, "step": 46140 }, { "epoch": 9.639444481360888, "grad_norm": 0.0891820415854454, "learning_rate": 1.6379186883309196e-07, "loss": 0.0057, "step": 46150 }, { "epoch": 9.641570868496245, "grad_norm": 0.06789347529411316, "learning_rate": 1.620746073744628e-07, "loss": 0.0061, "step": 46160 }, { "epoch": 9.643697255631604, "grad_norm": 0.10436540096998215, "learning_rate": 1.6036635903362795e-07, "loss": 0.0055, "step": 46170 }, { "epoch": 9.645823642766961, "grad_norm": 0.07430719584226608, "learning_rate": 1.5866712458671108e-07, "loss": 0.0042, "step": 46180 }, { "epoch": 9.647950029902319, "grad_norm": 0.13858430087566376, "learning_rate": 1.5697690480573457e-07, "loss": 0.0068, "step": 46190 }, { "epoch": 9.650076417037678, "grad_norm": 0.07190603762865067, "learning_rate": 1.5529570045863307e-07, "loss": 0.0045, "step": 46200 }, { "epoch": 9.652202804173035, "grad_norm": 0.06925050914287567, "learning_rate": 1.5362351230923778e-07, "loss": 0.0068, "step": 46210 }, { "epoch": 9.654329191308392, "grad_norm": 0.21394863724708557, "learning_rate": 1.5196034111728763e-07, "loss": 0.0076, "step": 46220 }, { "epoch": 9.656455578443751, "grad_norm": 0.11130901426076889, "learning_rate": 1.5030618763842487e-07, "loss": 0.0079, "step": 46230 }, { "epoch": 9.658581965579108, "grad_norm": 0.07772164046764374, "learning_rate": 1.4866105262419273e-07, "loss": 0.0065, "step": 46240 }, { "epoch": 9.660708352714465, "grad_norm": 0.16974255442619324, "learning_rate": 1.470249368220422e-07, "loss": 0.0066, "step": 46250 }, { "epoch": 9.662834739849824, "grad_norm": 0.1470179408788681, "learning_rate": 1.4539784097531428e-07, "loss": 0.0097, "step": 46260 }, { "epoch": 9.664961126985181, "grad_norm": 0.08299179375171661, "learning_rate": 1.437797658232687e-07, "loss": 0.0053, "step": 46270 }, { "epoch": 9.66708751412054, "grad_norm": 0.0542394258081913, "learning_rate": 1.4217071210105293e-07, "loss": 0.0053, "step": 46280 }, { "epoch": 9.669213901255898, "grad_norm": 0.08592654764652252, "learning_rate": 1.4057068053972222e-07, "loss": 0.0052, "step": 46290 }, { "epoch": 9.671340288391255, "grad_norm": 0.07151366770267487, "learning_rate": 1.3897967186623062e-07, "loss": 0.0056, "step": 46300 }, { "epoch": 9.673466675526614, "grad_norm": 0.10919623076915741, "learning_rate": 1.3739768680343325e-07, "loss": 0.0076, "step": 46310 }, { "epoch": 9.67559306266197, "grad_norm": 0.052643608301877975, "learning_rate": 1.3582472607008623e-07, "loss": 0.006, "step": 46320 }, { "epoch": 9.677719449797328, "grad_norm": 0.05597268417477608, "learning_rate": 1.3426079038084461e-07, "loss": 0.0071, "step": 46330 }, { "epoch": 9.679845836932687, "grad_norm": 0.07317246496677399, "learning_rate": 1.3270588044626443e-07, "loss": 0.0053, "step": 46340 }, { "epoch": 9.681972224068044, "grad_norm": 0.08442138880491257, "learning_rate": 1.3115999697279836e-07, "loss": 0.006, "step": 46350 }, { "epoch": 9.684098611203403, "grad_norm": 0.06207236647605896, "learning_rate": 1.2962314066280013e-07, "loss": 0.0067, "step": 46360 }, { "epoch": 9.68622499833876, "grad_norm": 0.11066444218158722, "learning_rate": 1.2809531221452232e-07, "loss": 0.0078, "step": 46370 }, { "epoch": 9.688351385474117, "grad_norm": 0.06266403943300247, "learning_rate": 1.2657651232211632e-07, "loss": 0.0056, "step": 46380 }, { "epoch": 9.690477772609476, "grad_norm": 0.1135714203119278, "learning_rate": 1.2506674167562572e-07, "loss": 0.0082, "step": 46390 }, { "epoch": 9.692604159744834, "grad_norm": 0.09715402871370316, "learning_rate": 1.2356600096099735e-07, "loss": 0.0065, "step": 46400 }, { "epoch": 9.69473054688019, "grad_norm": 0.21207119524478912, "learning_rate": 1.2207429086007917e-07, "loss": 0.0062, "step": 46410 }, { "epoch": 9.69685693401555, "grad_norm": 0.07998056709766388, "learning_rate": 1.2059161205060677e-07, "loss": 0.0071, "step": 46420 }, { "epoch": 9.698983321150907, "grad_norm": 0.20146529376506805, "learning_rate": 1.1911796520621688e-07, "loss": 0.0073, "step": 46430 }, { "epoch": 9.701109708286264, "grad_norm": 0.07590856403112411, "learning_rate": 1.1765335099644503e-07, "loss": 0.0049, "step": 46440 }, { "epoch": 9.703236095421623, "grad_norm": 0.10210797935724258, "learning_rate": 1.1619777008672117e-07, "loss": 0.0093, "step": 46450 }, { "epoch": 9.70536248255698, "grad_norm": 0.0810011476278305, "learning_rate": 1.1475122313836962e-07, "loss": 0.0068, "step": 46460 }, { "epoch": 9.70748886969234, "grad_norm": 0.09990231692790985, "learning_rate": 1.133137108086091e-07, "loss": 0.006, "step": 46470 }, { "epoch": 9.709615256827696, "grad_norm": 0.08201201260089874, "learning_rate": 1.1188523375055715e-07, "loss": 0.0085, "step": 46480 }, { "epoch": 9.711741643963054, "grad_norm": 0.10763274878263474, "learning_rate": 1.1046579261322798e-07, "loss": 0.0086, "step": 46490 }, { "epoch": 9.713868031098412, "grad_norm": 0.09125655144453049, "learning_rate": 1.0905538804152348e-07, "loss": 0.0087, "step": 46500 }, { "epoch": 9.71599441823377, "grad_norm": 0.07773427665233612, "learning_rate": 1.076540206762422e-07, "loss": 0.0058, "step": 46510 }, { "epoch": 9.718120805369127, "grad_norm": 0.08241023123264313, "learning_rate": 1.0626169115408369e-07, "loss": 0.0085, "step": 46520 }, { "epoch": 9.720247192504486, "grad_norm": 0.1010398417711258, "learning_rate": 1.0487840010763084e-07, "loss": 0.0083, "step": 46530 }, { "epoch": 9.722373579639843, "grad_norm": 0.08933252096176147, "learning_rate": 1.0350414816536758e-07, "loss": 0.0073, "step": 46540 }, { "epoch": 9.724499966775202, "grad_norm": 0.09788548946380615, "learning_rate": 1.0213893595166779e-07, "loss": 0.0077, "step": 46550 }, { "epoch": 9.726626353910559, "grad_norm": 0.05475420877337456, "learning_rate": 1.0078276408679754e-07, "loss": 0.008, "step": 46560 }, { "epoch": 9.728752741045916, "grad_norm": 0.1372324824333191, "learning_rate": 9.943563318691951e-08, "loss": 0.0065, "step": 46570 }, { "epoch": 9.730879128181275, "grad_norm": 0.11562158167362213, "learning_rate": 9.80975438640841e-08, "loss": 0.0059, "step": 46580 }, { "epoch": 9.733005515316632, "grad_norm": 0.1915254443883896, "learning_rate": 9.676849672623612e-08, "loss": 0.0079, "step": 46590 }, { "epoch": 9.73513190245199, "grad_norm": 0.12128864228725433, "learning_rate": 9.544849237721033e-08, "loss": 0.0082, "step": 46600 }, { "epoch": 9.737258289587349, "grad_norm": 0.08387965708971024, "learning_rate": 9.413753141673809e-08, "loss": 0.0059, "step": 46610 }, { "epoch": 9.739384676722706, "grad_norm": 0.31639406085014343, "learning_rate": 9.283561444043632e-08, "loss": 0.0074, "step": 46620 }, { "epoch": 9.741511063858063, "grad_norm": 0.1280372440814972, "learning_rate": 9.154274203981406e-08, "loss": 0.0051, "step": 46630 }, { "epoch": 9.743637450993422, "grad_norm": 0.09722702950239182, "learning_rate": 9.025891480227478e-08, "loss": 0.0053, "step": 46640 }, { "epoch": 9.745763838128779, "grad_norm": 0.06377612799406052, "learning_rate": 8.898413331110523e-08, "loss": 0.0055, "step": 46650 }, { "epoch": 9.747890225264138, "grad_norm": 0.053494762629270554, "learning_rate": 8.771839814549099e-08, "loss": 0.005, "step": 46660 }, { "epoch": 9.750016612399495, "grad_norm": 0.0834541767835617, "learning_rate": 8.646170988050318e-08, "loss": 0.0093, "step": 46670 }, { "epoch": 9.752142999534852, "grad_norm": 0.061729446053504944, "learning_rate": 8.52140690870984e-08, "loss": 0.0049, "step": 46680 }, { "epoch": 9.754269386670211, "grad_norm": 0.04219207912683487, "learning_rate": 8.39754763321321e-08, "loss": 0.004, "step": 46690 }, { "epoch": 9.756395773805568, "grad_norm": 0.06596383452415466, "learning_rate": 8.274593217834081e-08, "loss": 0.0061, "step": 46700 }, { "epoch": 9.758522160940926, "grad_norm": 0.10087316483259201, "learning_rate": 8.152543718435323e-08, "loss": 0.0049, "step": 46710 }, { "epoch": 9.760648548076285, "grad_norm": 0.050794608891010284, "learning_rate": 8.03139919046858e-08, "loss": 0.0062, "step": 46720 }, { "epoch": 9.762774935211642, "grad_norm": 0.04957945644855499, "learning_rate": 7.91115968897449e-08, "loss": 0.0054, "step": 46730 }, { "epoch": 9.764901322347, "grad_norm": 0.07113183289766312, "learning_rate": 7.791825268582243e-08, "loss": 0.0049, "step": 46740 }, { "epoch": 9.767027709482358, "grad_norm": 0.06518006324768066, "learning_rate": 7.673395983510246e-08, "loss": 0.0064, "step": 46750 }, { "epoch": 9.769154096617715, "grad_norm": 0.08137210458517075, "learning_rate": 7.555871887565015e-08, "loss": 0.0061, "step": 46760 }, { "epoch": 9.771280483753074, "grad_norm": 0.12134739011526108, "learning_rate": 7.439253034142502e-08, "loss": 0.0076, "step": 46770 }, { "epoch": 9.773406870888431, "grad_norm": 0.1435489058494568, "learning_rate": 7.323539476226993e-08, "loss": 0.0068, "step": 46780 }, { "epoch": 9.775533258023788, "grad_norm": 0.10236471146345139, "learning_rate": 7.208731266391322e-08, "loss": 0.0047, "step": 46790 }, { "epoch": 9.777659645159147, "grad_norm": 0.10558388382196426, "learning_rate": 7.09482845679732e-08, "loss": 0.0082, "step": 46800 }, { "epoch": 9.779786032294504, "grad_norm": 0.08266352862119675, "learning_rate": 6.981831099195369e-08, "loss": 0.0064, "step": 46810 }, { "epoch": 9.781912419429862, "grad_norm": 0.08234688639640808, "learning_rate": 6.8697392449244e-08, "loss": 0.0048, "step": 46820 }, { "epoch": 9.78403880656522, "grad_norm": 0.09173407405614853, "learning_rate": 6.7585529449119e-08, "loss": 0.0063, "step": 46830 }, { "epoch": 9.786165193700578, "grad_norm": 0.06011432036757469, "learning_rate": 6.648272249674125e-08, "loss": 0.0068, "step": 46840 }, { "epoch": 9.788291580835937, "grad_norm": 0.08384426683187485, "learning_rate": 6.538897209315664e-08, "loss": 0.0075, "step": 46850 }, { "epoch": 9.790417967971294, "grad_norm": 0.11884940415620804, "learning_rate": 6.430427873529876e-08, "loss": 0.0063, "step": 46860 }, { "epoch": 9.792544355106651, "grad_norm": 0.05913754925131798, "learning_rate": 6.322864291598229e-08, "loss": 0.0064, "step": 46870 }, { "epoch": 9.79467074224201, "grad_norm": 0.05626747012138367, "learning_rate": 6.216206512390965e-08, "loss": 0.0073, "step": 46880 }, { "epoch": 9.796797129377367, "grad_norm": 0.050740789622068405, "learning_rate": 6.110454584366876e-08, "loss": 0.0076, "step": 46890 }, { "epoch": 9.798923516512724, "grad_norm": 0.08202860504388809, "learning_rate": 6.005608555573083e-08, "loss": 0.0068, "step": 46900 }, { "epoch": 9.801049903648083, "grad_norm": 0.0968312993645668, "learning_rate": 5.901668473645039e-08, "loss": 0.009, "step": 46910 }, { "epoch": 9.80317629078344, "grad_norm": 0.09642332047224045, "learning_rate": 5.798634385806301e-08, "loss": 0.0069, "step": 46920 }, { "epoch": 9.8053026779188, "grad_norm": 0.08227372914552689, "learning_rate": 5.696506338869645e-08, "loss": 0.0059, "step": 46930 }, { "epoch": 9.807429065054157, "grad_norm": 0.10789494216442108, "learning_rate": 5.59528437923551e-08, "loss": 0.0055, "step": 46940 }, { "epoch": 9.809555452189514, "grad_norm": 0.0954144150018692, "learning_rate": 5.4949685528926655e-08, "loss": 0.0051, "step": 46950 }, { "epoch": 9.811681839324873, "grad_norm": 0.13878530263900757, "learning_rate": 5.395558905418652e-08, "loss": 0.0076, "step": 46960 }, { "epoch": 9.81380822646023, "grad_norm": 0.12239578366279602, "learning_rate": 5.297055481978897e-08, "loss": 0.0072, "step": 46970 }, { "epoch": 9.815934613595587, "grad_norm": 0.054265934973955154, "learning_rate": 5.199458327327156e-08, "loss": 0.0067, "step": 46980 }, { "epoch": 9.818061000730946, "grad_norm": 0.13010042905807495, "learning_rate": 5.102767485805737e-08, "loss": 0.0086, "step": 46990 }, { "epoch": 9.820187387866303, "grad_norm": 0.07283120602369308, "learning_rate": 5.006983001344612e-08, "loss": 0.0052, "step": 47000 }, { "epoch": 9.82231377500166, "grad_norm": 0.10412652790546417, "learning_rate": 4.912104917462524e-08, "loss": 0.0064, "step": 47010 }, { "epoch": 9.82444016213702, "grad_norm": 0.21864429116249084, "learning_rate": 4.818133277266102e-08, "loss": 0.01, "step": 47020 }, { "epoch": 9.826566549272377, "grad_norm": 0.05363056808710098, "learning_rate": 4.7250681234500825e-08, "loss": 0.0048, "step": 47030 }, { "epoch": 9.828692936407736, "grad_norm": 0.050713326781988144, "learning_rate": 4.6329094982977554e-08, "loss": 0.0054, "step": 47040 }, { "epoch": 9.830819323543093, "grad_norm": 0.16243422031402588, "learning_rate": 4.541657443679848e-08, "loss": 0.0098, "step": 47050 }, { "epoch": 9.83294571067845, "grad_norm": 0.08455205708742142, "learning_rate": 4.451312001056085e-08, "loss": 0.0069, "step": 47060 }, { "epoch": 9.835072097813809, "grad_norm": 0.1746399849653244, "learning_rate": 4.361873211473411e-08, "loss": 0.0072, "step": 47070 }, { "epoch": 9.837198484949166, "grad_norm": 0.101706363260746, "learning_rate": 4.273341115567542e-08, "loss": 0.0055, "step": 47080 }, { "epoch": 9.839324872084523, "grad_norm": 0.07161462306976318, "learning_rate": 4.1857157535616363e-08, "loss": 0.0051, "step": 47090 }, { "epoch": 9.841451259219882, "grad_norm": 0.07007240504026413, "learning_rate": 4.0989971652674043e-08, "loss": 0.0061, "step": 47100 }, { "epoch": 9.84357764635524, "grad_norm": 0.06397923827171326, "learning_rate": 4.0131853900844396e-08, "loss": 0.0056, "step": 47110 }, { "epoch": 9.845704033490598, "grad_norm": 0.04562964662909508, "learning_rate": 3.9282804670002226e-08, "loss": 0.0044, "step": 47120 }, { "epoch": 9.847830420625955, "grad_norm": 0.03975366801023483, "learning_rate": 3.844282434590119e-08, "loss": 0.0058, "step": 47130 }, { "epoch": 9.849956807761313, "grad_norm": 0.10866449773311615, "learning_rate": 3.761191331017822e-08, "loss": 0.0055, "step": 47140 }, { "epoch": 9.852083194896672, "grad_norm": 0.06693773716688156, "learning_rate": 3.679007194034467e-08, "loss": 0.0079, "step": 47150 }, { "epoch": 9.854209582032029, "grad_norm": 0.10040107369422913, "learning_rate": 3.597730060979743e-08, "loss": 0.0063, "step": 47160 }, { "epoch": 9.856335969167386, "grad_norm": 0.07085244357585907, "learning_rate": 3.517359968780554e-08, "loss": 0.0049, "step": 47170 }, { "epoch": 9.858462356302745, "grad_norm": 0.08059089630842209, "learning_rate": 3.437896953952358e-08, "loss": 0.0053, "step": 47180 }, { "epoch": 9.860588743438102, "grad_norm": 0.12452452629804611, "learning_rate": 3.359341052598053e-08, "loss": 0.0069, "step": 47190 }, { "epoch": 9.86271513057346, "grad_norm": 0.05627785995602608, "learning_rate": 3.281692300408645e-08, "loss": 0.0061, "step": 47200 }, { "epoch": 9.864841517708818, "grad_norm": 0.06759165227413177, "learning_rate": 3.204950732662804e-08, "loss": 0.0065, "step": 47210 }, { "epoch": 9.866967904844175, "grad_norm": 0.09904655069112778, "learning_rate": 3.129116384227082e-08, "loss": 0.0051, "step": 47220 }, { "epoch": 9.869094291979533, "grad_norm": 0.051732465624809265, "learning_rate": 3.054189289556142e-08, "loss": 0.0078, "step": 47230 }, { "epoch": 9.871220679114892, "grad_norm": 0.05680263042449951, "learning_rate": 2.9801694826918637e-08, "loss": 0.0045, "step": 47240 }, { "epoch": 9.873347066250249, "grad_norm": 0.13247358798980713, "learning_rate": 2.907056997264457e-08, "loss": 0.0086, "step": 47250 }, { "epoch": 9.875473453385608, "grad_norm": 0.08386099338531494, "learning_rate": 2.8348518664915725e-08, "loss": 0.0061, "step": 47260 }, { "epoch": 9.877599840520965, "grad_norm": 0.13767658174037933, "learning_rate": 2.7635541231787465e-08, "loss": 0.0076, "step": 47270 }, { "epoch": 9.879726227656322, "grad_norm": 0.11548544466495514, "learning_rate": 2.6931637997191784e-08, "loss": 0.0062, "step": 47280 }, { "epoch": 9.881852614791681, "grad_norm": 0.07511698454618454, "learning_rate": 2.6236809280939524e-08, "loss": 0.0093, "step": 47290 }, { "epoch": 9.883979001927038, "grad_norm": 0.06025674194097519, "learning_rate": 2.555105539871594e-08, "loss": 0.0061, "step": 47300 }, { "epoch": 9.886105389062397, "grad_norm": 0.0640256330370903, "learning_rate": 2.4874376662087362e-08, "loss": 0.0051, "step": 47310 }, { "epoch": 9.888231776197754, "grad_norm": 0.0529705211520195, "learning_rate": 2.4206773378494532e-08, "loss": 0.0045, "step": 47320 }, { "epoch": 9.890358163333111, "grad_norm": 0.0554792657494545, "learning_rate": 2.3548245851250373e-08, "loss": 0.0068, "step": 47330 }, { "epoch": 9.89248455046847, "grad_norm": 0.08335666358470917, "learning_rate": 2.2898794379555556e-08, "loss": 0.0072, "step": 47340 }, { "epoch": 9.894610937603828, "grad_norm": 0.11222485452890396, "learning_rate": 2.225841925847405e-08, "loss": 0.0092, "step": 47350 }, { "epoch": 9.896737324739185, "grad_norm": 0.07755165547132492, "learning_rate": 2.1627120778955345e-08, "loss": 0.0071, "step": 47360 }, { "epoch": 9.898863711874544, "grad_norm": 0.15027864277362823, "learning_rate": 2.1004899227823338e-08, "loss": 0.0052, "step": 47370 }, { "epoch": 9.900990099009901, "grad_norm": 0.09477586299180984, "learning_rate": 2.0391754887774116e-08, "loss": 0.0068, "step": 47380 }, { "epoch": 9.903116486145258, "grad_norm": 0.054509956389665604, "learning_rate": 1.9787688037382623e-08, "loss": 0.0049, "step": 47390 }, { "epoch": 9.905242873280617, "grad_norm": 0.08677219599485397, "learning_rate": 1.919269895110043e-08, "loss": 0.0066, "step": 47400 }, { "epoch": 9.907369260415974, "grad_norm": 0.06825282424688339, "learning_rate": 1.860678789925352e-08, "loss": 0.0053, "step": 47410 }, { "epoch": 9.909495647551331, "grad_norm": 0.11646579951047897, "learning_rate": 1.802995514804451e-08, "loss": 0.0062, "step": 47420 }, { "epoch": 9.91162203468669, "grad_norm": 0.18124999105930328, "learning_rate": 1.7462200959548204e-08, "loss": 0.0078, "step": 47430 }, { "epoch": 9.913748421822048, "grad_norm": 0.11142364144325256, "learning_rate": 1.6903525591716042e-08, "loss": 0.0068, "step": 47440 }, { "epoch": 9.915874808957406, "grad_norm": 0.044157881289720535, "learning_rate": 1.6353929298376093e-08, "loss": 0.0047, "step": 47450 }, { "epoch": 9.918001196092764, "grad_norm": 0.08629575371742249, "learning_rate": 1.5813412329233057e-08, "loss": 0.0074, "step": 47460 }, { "epoch": 9.92012758322812, "grad_norm": 0.05882268399000168, "learning_rate": 1.5281974929861608e-08, "loss": 0.009, "step": 47470 }, { "epoch": 9.92225397036348, "grad_norm": 0.08525555580854416, "learning_rate": 1.475961734171527e-08, "loss": 0.0068, "step": 47480 }, { "epoch": 9.924380357498837, "grad_norm": 0.09469709545373917, "learning_rate": 1.4246339802117537e-08, "loss": 0.005, "step": 47490 }, { "epoch": 9.926506744634194, "grad_norm": 0.06491371989250183, "learning_rate": 1.3742142544275194e-08, "loss": 0.0056, "step": 47500 }, { "epoch": 9.928633131769553, "grad_norm": 0.07797796279191971, "learning_rate": 1.3247025797258339e-08, "loss": 0.0085, "step": 47510 }, { "epoch": 9.93075951890491, "grad_norm": 0.07234423607587814, "learning_rate": 1.2760989786022583e-08, "loss": 0.0052, "step": 47520 }, { "epoch": 9.93288590604027, "grad_norm": 0.10033213347196579, "learning_rate": 1.2284034731389061e-08, "loss": 0.0082, "step": 47530 }, { "epoch": 9.935012293175626, "grad_norm": 0.06419480592012405, "learning_rate": 1.1816160850055547e-08, "loss": 0.0049, "step": 47540 }, { "epoch": 9.937138680310984, "grad_norm": 0.06003078818321228, "learning_rate": 1.1357368354598664e-08, "loss": 0.0056, "step": 47550 }, { "epoch": 9.939265067446343, "grad_norm": 0.17142388224601746, "learning_rate": 1.0907657453460563e-08, "loss": 0.009, "step": 47560 }, { "epoch": 9.9413914545817, "grad_norm": 0.09216486662626266, "learning_rate": 1.046702835096669e-08, "loss": 0.0056, "step": 47570 }, { "epoch": 9.943517841717057, "grad_norm": 0.06426838040351868, "learning_rate": 1.003548124730802e-08, "loss": 0.0101, "step": 47580 }, { "epoch": 9.945644228852416, "grad_norm": 0.10110671073198318, "learning_rate": 9.613016338554382e-09, "loss": 0.0055, "step": 47590 }, { "epoch": 9.947770615987773, "grad_norm": 0.10425698012113571, "learning_rate": 9.19963381664557e-09, "loss": 0.0112, "step": 47600 }, { "epoch": 9.94989700312313, "grad_norm": 0.0740184560418129, "learning_rate": 8.795333869398016e-09, "loss": 0.007, "step": 47610 }, { "epoch": 9.95202339025849, "grad_norm": 0.11799513548612595, "learning_rate": 8.400116680500337e-09, "loss": 0.0078, "step": 47620 }, { "epoch": 9.954149777393846, "grad_norm": 0.11577790975570679, "learning_rate": 8.013982429515565e-09, "loss": 0.0064, "step": 47630 }, { "epoch": 9.956276164529205, "grad_norm": 0.04880143702030182, "learning_rate": 7.636931291878925e-09, "loss": 0.0103, "step": 47640 }, { "epoch": 9.958402551664562, "grad_norm": 0.23198460042476654, "learning_rate": 7.2689634388978245e-09, "loss": 0.011, "step": 47650 }, { "epoch": 9.96052893879992, "grad_norm": 0.0632665827870369, "learning_rate": 6.910079037754091e-09, "loss": 0.0064, "step": 47660 }, { "epoch": 9.962655325935279, "grad_norm": 0.0889221727848053, "learning_rate": 6.56027825150396e-09, "loss": 0.0054, "step": 47670 }, { "epoch": 9.964781713070636, "grad_norm": 0.08801542967557907, "learning_rate": 6.219561239071414e-09, "loss": 0.0058, "step": 47680 }, { "epoch": 9.966908100205993, "grad_norm": 0.060094889253377914, "learning_rate": 5.887928155261513e-09, "loss": 0.0045, "step": 47690 }, { "epoch": 9.969034487341352, "grad_norm": 0.21591047942638397, "learning_rate": 5.565379150744843e-09, "loss": 0.0074, "step": 47700 }, { "epoch": 9.971160874476709, "grad_norm": 0.11017168313264847, "learning_rate": 5.2519143720708435e-09, "loss": 0.0072, "step": 47710 }, { "epoch": 9.973287261612068, "grad_norm": 0.04329228028655052, "learning_rate": 4.947533961652262e-09, "loss": 0.0043, "step": 47720 }, { "epoch": 9.975413648747425, "grad_norm": 0.08211742341518402, "learning_rate": 4.652238057787362e-09, "loss": 0.004, "step": 47730 }, { "epoch": 9.977540035882782, "grad_norm": 0.062386803328990936, "learning_rate": 4.366026794635492e-09, "loss": 0.0072, "step": 47740 }, { "epoch": 9.979666423018141, "grad_norm": 0.0585026890039444, "learning_rate": 4.088900302237075e-09, "loss": 0.0056, "step": 47750 }, { "epoch": 9.981792810153499, "grad_norm": 0.056861963123083115, "learning_rate": 3.820858706495845e-09, "loss": 0.0064, "step": 47760 }, { "epoch": 9.983919197288856, "grad_norm": 0.05580397695302963, "learning_rate": 3.5619021291988243e-09, "loss": 0.0053, "step": 47770 }, { "epoch": 9.986045584424215, "grad_norm": 0.16614583134651184, "learning_rate": 3.3120306879963483e-09, "loss": 0.0072, "step": 47780 }, { "epoch": 9.988171971559572, "grad_norm": 0.07879739254713058, "learning_rate": 3.0712444964153822e-09, "loss": 0.008, "step": 47790 }, { "epoch": 9.990298358694929, "grad_norm": 0.10770705342292786, "learning_rate": 2.8395436638550823e-09, "loss": 0.0073, "step": 47800 }, { "epoch": 9.992424745830288, "grad_norm": 0.05445759370923042, "learning_rate": 2.6169282955845755e-09, "loss": 0.0059, "step": 47810 }, { "epoch": 9.994551132965645, "grad_norm": 0.10630927979946136, "learning_rate": 2.403398492745179e-09, "loss": 0.0059, "step": 47820 }, { "epoch": 9.996677520101004, "grad_norm": 0.15896163880825043, "learning_rate": 2.198954352354843e-09, "loss": 0.0071, "step": 47830 }, { "epoch": 9.998803907236361, "grad_norm": 0.08631344884634018, "learning_rate": 2.0035959672992657e-09, "loss": 0.0061, "step": 47840 }, { "epoch": 10.000930294371718, "grad_norm": 0.1044807881116867, "learning_rate": 1.8173234263363371e-09, "loss": 0.0045, "step": 47850 }, { "epoch": 10.003056681507077, "grad_norm": 0.05990280583500862, "learning_rate": 1.6401368140961382e-09, "loss": 0.0075, "step": 47860 }, { "epoch": 10.005183068642435, "grad_norm": 0.24975143373012543, "learning_rate": 1.4720362110809405e-09, "loss": 0.0066, "step": 47870 }, { "epoch": 10.007309455777794, "grad_norm": 0.07268635183572769, "learning_rate": 1.3130216936674268e-09, "loss": 0.0093, "step": 47880 }, { "epoch": 10.00943584291315, "grad_norm": 0.12219683825969696, "learning_rate": 1.1630933341000294e-09, "loss": 0.0081, "step": 47890 }, { "epoch": 10.011562230048508, "grad_norm": 0.05602068454027176, "learning_rate": 1.0222512004975926e-09, "loss": 0.0049, "step": 47900 }, { "epoch": 10.013688617183867, "grad_norm": 0.17097996175289154, "learning_rate": 8.904953568489305e-10, "loss": 0.0066, "step": 47910 }, { "epoch": 10.015815004319224, "grad_norm": 0.10852757841348648, "learning_rate": 7.678258630172686e-10, "loss": 0.008, "step": 47920 }, { "epoch": 10.017941391454581, "grad_norm": 0.06873766332864761, "learning_rate": 6.542427747358027e-10, "loss": 0.0049, "step": 47930 }, { "epoch": 10.02006777858994, "grad_norm": 0.1313423365354538, "learning_rate": 5.497461436099194e-10, "loss": 0.0081, "step": 47940 }, { "epoch": 10.020705694730546, "step": 47943, "total_flos": 4.6307239835234206e+19, "train_loss": 0.0013970831559684137, "train_runtime": 169192.0368, "train_samples_per_second": 36.357, "train_steps_per_second": 0.284 } ], "logging_steps": 10, "max_steps": 48050, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.6307239835234206e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }