diff --git "a/contextlm_gpt2_large/trainer_state.json" "b/contextlm_gpt2_large/trainer_state.json" new file mode 100644--- /dev/null +++ "b/contextlm_gpt2_large/trainer_state.json" @@ -0,0 +1,12240 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999709884243814, + "eval_steps": 1000, + "global_step": 17234, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000580231512373437, + "grad_norm": 4.4758100509643555, + "learning_rate": 6.264501160092807e-06, + "loss": 10.4749, + "step": 10 + }, + { + "epoch": 0.001160463024746874, + "grad_norm": 1.6773627996444702, + "learning_rate": 1.322505800464037e-05, + "loss": 9.159, + "step": 20 + }, + { + "epoch": 0.001740694537120311, + "grad_norm": 1.5999170541763306, + "learning_rate": 2.018561484918793e-05, + "loss": 8.8189, + "step": 30 + }, + { + "epoch": 0.002320926049493748, + "grad_norm": 1.9260104894638062, + "learning_rate": 2.7146171693735496e-05, + "loss": 8.4574, + "step": 40 + }, + { + "epoch": 0.002901157561867185, + "grad_norm": 2.173593282699585, + "learning_rate": 3.410672853828306e-05, + "loss": 8.0835, + "step": 50 + }, + { + "epoch": 0.003481389074240622, + "grad_norm": 1.5830281972885132, + "learning_rate": 4.1067285382830626e-05, + "loss": 7.7376, + "step": 60 + }, + { + "epoch": 0.004061620586614059, + "grad_norm": 2.772728443145752, + "learning_rate": 4.802784222737819e-05, + "loss": 7.4168, + "step": 70 + }, + { + "epoch": 0.004641852098987496, + "grad_norm": 1.511775016784668, + "learning_rate": 5.498839907192575e-05, + "loss": 7.1442, + "step": 80 + }, + { + "epoch": 0.005222083611360933, + "grad_norm": 1.9058183431625366, + "learning_rate": 6.194895591647331e-05, + "loss": 6.9324, + "step": 90 + }, + { + "epoch": 0.00580231512373437, + "grad_norm": 1.6976985931396484, + "learning_rate": 6.890951276102087e-05, + "loss": 6.8005, + "step": 100 + }, + { + "epoch": 0.006382546636107807, + "grad_norm": 1.4346176385879517, + "learning_rate": 7.587006960556844e-05, + "loss": 6.6814, + "step": 110 + }, + { + "epoch": 0.006962778148481244, + "grad_norm": 1.0364270210266113, + "learning_rate": 8.283062645011599e-05, + "loss": 6.5547, + "step": 120 + }, + { + "epoch": 0.007543009660854681, + "grad_norm": 0.6528536677360535, + "learning_rate": 8.979118329466357e-05, + "loss": 6.4482, + "step": 130 + }, + { + "epoch": 0.008123241173228117, + "grad_norm": 1.1468390226364136, + "learning_rate": 9.675174013921112e-05, + "loss": 6.3518, + "step": 140 + }, + { + "epoch": 0.008703472685601555, + "grad_norm": 0.6249582171440125, + "learning_rate": 0.0001037122969837587, + "loss": 6.2749, + "step": 150 + }, + { + "epoch": 0.009283704197974993, + "grad_norm": 0.9577043652534485, + "learning_rate": 0.00011067285382830626, + "loss": 6.2026, + "step": 160 + }, + { + "epoch": 0.009863935710348428, + "grad_norm": 1.156731367111206, + "learning_rate": 0.00011763341067285381, + "loss": 6.1482, + "step": 170 + }, + { + "epoch": 0.010444167222721866, + "grad_norm": 0.7919487357139587, + "learning_rate": 0.0001245939675174014, + "loss": 6.0907, + "step": 180 + }, + { + "epoch": 0.011024398735095304, + "grad_norm": 0.5902596712112427, + "learning_rate": 0.00013155452436194894, + "loss": 6.0469, + "step": 190 + }, + { + "epoch": 0.01160463024746874, + "grad_norm": 0.9712298512458801, + "learning_rate": 0.00013851508120649652, + "loss": 6.0128, + "step": 200 + }, + { + "epoch": 0.012184861759842177, + "grad_norm": 0.6487208008766174, + "learning_rate": 0.00014547563805104407, + "loss": 5.949, + "step": 210 + }, + { + "epoch": 0.012765093272215615, + "grad_norm": 0.6659431457519531, + "learning_rate": 0.00015243619489559162, + "loss": 5.9004, + "step": 220 + }, + { + "epoch": 0.01334532478458905, + "grad_norm": 0.9973188042640686, + "learning_rate": 0.0001593967517401392, + "loss": 5.8727, + "step": 230 + }, + { + "epoch": 0.013925556296962488, + "grad_norm": 0.592413067817688, + "learning_rate": 0.00016635730858468675, + "loss": 5.8594, + "step": 240 + }, + { + "epoch": 0.014505787809335926, + "grad_norm": 0.6143619418144226, + "learning_rate": 0.00017331786542923433, + "loss": 5.8114, + "step": 250 + }, + { + "epoch": 0.015086019321709361, + "grad_norm": 0.5780689120292664, + "learning_rate": 0.00018027842227378188, + "loss": 5.7829, + "step": 260 + }, + { + "epoch": 0.0156662508340828, + "grad_norm": 0.41307076811790466, + "learning_rate": 0.00018723897911832944, + "loss": 5.7197, + "step": 270 + }, + { + "epoch": 0.016246482346456235, + "grad_norm": 0.6880993247032166, + "learning_rate": 0.00019419953596287701, + "loss": 5.7168, + "step": 280 + }, + { + "epoch": 0.016826713858829674, + "grad_norm": 0.4273562431335449, + "learning_rate": 0.0002011600928074246, + "loss": 5.6639, + "step": 290 + }, + { + "epoch": 0.01740694537120311, + "grad_norm": 0.5025382041931152, + "learning_rate": 0.00020812064965197212, + "loss": 5.6305, + "step": 300 + }, + { + "epoch": 0.017987176883576546, + "grad_norm": 0.7127647995948792, + "learning_rate": 0.0002150812064965197, + "loss": 5.5991, + "step": 310 + }, + { + "epoch": 0.018567408395949985, + "grad_norm": 0.6494776010513306, + "learning_rate": 0.00022204176334106727, + "loss": 5.5961, + "step": 320 + }, + { + "epoch": 0.01914763990832342, + "grad_norm": 0.43809765577316284, + "learning_rate": 0.00022900232018561485, + "loss": 5.5242, + "step": 330 + }, + { + "epoch": 0.019727871420696857, + "grad_norm": 0.5514947175979614, + "learning_rate": 0.00023596287703016238, + "loss": 5.4885, + "step": 340 + }, + { + "epoch": 0.020308102933070296, + "grad_norm": 0.7086557745933533, + "learning_rate": 0.00024292343387470995, + "loss": 5.4558, + "step": 350 + }, + { + "epoch": 0.020888334445443732, + "grad_norm": 0.44333210587501526, + "learning_rate": 0.0002498839907192575, + "loss": 5.4249, + "step": 360 + }, + { + "epoch": 0.021468565957817168, + "grad_norm": 0.5971847772598267, + "learning_rate": 0.0002568445475638051, + "loss": 5.3896, + "step": 370 + }, + { + "epoch": 0.022048797470190607, + "grad_norm": 0.5358195900917053, + "learning_rate": 0.0002638051044083526, + "loss": 5.3647, + "step": 380 + }, + { + "epoch": 0.022629028982564043, + "grad_norm": 0.4231407046318054, + "learning_rate": 0.0002707656612529002, + "loss": 5.3325, + "step": 390 + }, + { + "epoch": 0.02320926049493748, + "grad_norm": 0.48789191246032715, + "learning_rate": 0.00027772621809744777, + "loss": 5.2922, + "step": 400 + }, + { + "epoch": 0.023789492007310918, + "grad_norm": 0.46154582500457764, + "learning_rate": 0.0002846867749419953, + "loss": 5.2881, + "step": 410 + }, + { + "epoch": 0.024369723519684354, + "grad_norm": 0.44972172379493713, + "learning_rate": 0.00029164733178654287, + "loss": 5.2397, + "step": 420 + }, + { + "epoch": 0.02494995503205779, + "grad_norm": 0.505415678024292, + "learning_rate": 0.0002986078886310905, + "loss": 5.1841, + "step": 430 + }, + { + "epoch": 0.02553018654443123, + "grad_norm": 0.42717623710632324, + "learning_rate": 0.0003055684454756381, + "loss": 5.1848, + "step": 440 + }, + { + "epoch": 0.026110418056804665, + "grad_norm": 0.4216056168079376, + "learning_rate": 0.0003125290023201856, + "loss": 5.1447, + "step": 450 + }, + { + "epoch": 0.0266906495691781, + "grad_norm": 0.5051509141921997, + "learning_rate": 0.00031948955916473313, + "loss": 5.1084, + "step": 460 + }, + { + "epoch": 0.02727088108155154, + "grad_norm": 0.5205376744270325, + "learning_rate": 0.0003264501160092807, + "loss": 5.0462, + "step": 470 + }, + { + "epoch": 0.027851112593924976, + "grad_norm": 0.5111084580421448, + "learning_rate": 0.0003334106728538283, + "loss": 5.0225, + "step": 480 + }, + { + "epoch": 0.028431344106298412, + "grad_norm": 0.4395337402820587, + "learning_rate": 0.00034037122969837584, + "loss": 4.991, + "step": 490 + }, + { + "epoch": 0.02901157561867185, + "grad_norm": 0.2879785895347595, + "learning_rate": 0.00034733178654292344, + "loss": 4.9628, + "step": 500 + }, + { + "epoch": 0.029591807131045287, + "grad_norm": 0.3356530964374542, + "learning_rate": 0.000354292343387471, + "loss": 4.9165, + "step": 510 + }, + { + "epoch": 0.030172038643418723, + "grad_norm": 0.39410287141799927, + "learning_rate": 0.00036125290023201855, + "loss": 4.8802, + "step": 520 + }, + { + "epoch": 0.030752270155792162, + "grad_norm": 0.4210626184940338, + "learning_rate": 0.00036821345707656604, + "loss": 4.8403, + "step": 530 + }, + { + "epoch": 0.0313325016681656, + "grad_norm": 0.4170067608356476, + "learning_rate": 0.00037517401392111365, + "loss": 4.8156, + "step": 540 + }, + { + "epoch": 0.031912733180539034, + "grad_norm": 0.40876781940460205, + "learning_rate": 0.0003821345707656612, + "loss": 4.7932, + "step": 550 + }, + { + "epoch": 0.03249296469291247, + "grad_norm": 0.3717671036720276, + "learning_rate": 0.0003890951276102088, + "loss": 4.7812, + "step": 560 + }, + { + "epoch": 0.03307319620528591, + "grad_norm": 0.37275081872940063, + "learning_rate": 0.00039605568445475636, + "loss": 4.7324, + "step": 570 + }, + { + "epoch": 0.03365342771765935, + "grad_norm": 0.32523536682128906, + "learning_rate": 0.0004030162412993039, + "loss": 4.6891, + "step": 580 + }, + { + "epoch": 0.034233659230032784, + "grad_norm": 0.2909957468509674, + "learning_rate": 0.0004099767981438515, + "loss": 4.6555, + "step": 590 + }, + { + "epoch": 0.03481389074240622, + "grad_norm": 0.40268951654434204, + "learning_rate": 0.00041693735498839906, + "loss": 4.622, + "step": 600 + }, + { + "epoch": 0.035394122254779656, + "grad_norm": 0.433383584022522, + "learning_rate": 0.00042389791183294656, + "loss": 4.6122, + "step": 610 + }, + { + "epoch": 0.03597435376715309, + "grad_norm": 0.3096088171005249, + "learning_rate": 0.0004308584686774941, + "loss": 4.5976, + "step": 620 + }, + { + "epoch": 0.036554585279526534, + "grad_norm": 0.30540433526039124, + "learning_rate": 0.0004378190255220417, + "loss": 4.5569, + "step": 630 + }, + { + "epoch": 0.03713481679189997, + "grad_norm": 0.3136671781539917, + "learning_rate": 0.00044477958236658927, + "loss": 4.5228, + "step": 640 + }, + { + "epoch": 0.037715048304273406, + "grad_norm": 0.332621693611145, + "learning_rate": 0.0004517401392111369, + "loss": 4.4901, + "step": 650 + }, + { + "epoch": 0.03829527981664684, + "grad_norm": 0.3817736804485321, + "learning_rate": 0.0004587006960556844, + "loss": 4.475, + "step": 660 + }, + { + "epoch": 0.03887551132902028, + "grad_norm": 0.458741158246994, + "learning_rate": 0.000465661252900232, + "loss": 4.4545, + "step": 670 + }, + { + "epoch": 0.039455742841393714, + "grad_norm": 0.27561265230178833, + "learning_rate": 0.0004726218097447796, + "loss": 4.4406, + "step": 680 + }, + { + "epoch": 0.040035974353767156, + "grad_norm": 0.380633145570755, + "learning_rate": 0.0004795823665893271, + "loss": 4.4027, + "step": 690 + }, + { + "epoch": 0.04061620586614059, + "grad_norm": 0.3662358820438385, + "learning_rate": 0.00048654292343387463, + "loss": 4.377, + "step": 700 + }, + { + "epoch": 0.04119643737851403, + "grad_norm": 0.31104594469070435, + "learning_rate": 0.0004935034802784222, + "loss": 4.3399, + "step": 710 + }, + { + "epoch": 0.041776668890887464, + "grad_norm": 0.43897074460983276, + "learning_rate": 0.0005004640371229698, + "loss": 4.3229, + "step": 720 + }, + { + "epoch": 0.0423569004032609, + "grad_norm": 0.2685506343841553, + "learning_rate": 0.0005074245939675173, + "loss": 4.302, + "step": 730 + }, + { + "epoch": 0.042937131915634336, + "grad_norm": 0.2662206292152405, + "learning_rate": 0.0005143851508120649, + "loss": 4.2533, + "step": 740 + }, + { + "epoch": 0.04351736342800778, + "grad_norm": 0.31665244698524475, + "learning_rate": 0.0005213457076566126, + "loss": 4.2463, + "step": 750 + }, + { + "epoch": 0.044097594940381214, + "grad_norm": 0.3573771119117737, + "learning_rate": 0.0005283062645011601, + "loss": 4.2177, + "step": 760 + }, + { + "epoch": 0.04467782645275465, + "grad_norm": 0.3051789402961731, + "learning_rate": 0.0005352668213457077, + "loss": 4.2098, + "step": 770 + }, + { + "epoch": 0.045258057965128086, + "grad_norm": 0.26946839690208435, + "learning_rate": 0.0005422273781902551, + "loss": 4.1739, + "step": 780 + }, + { + "epoch": 0.04583828947750152, + "grad_norm": 0.21327945590019226, + "learning_rate": 0.0005491879350348028, + "loss": 4.151, + "step": 790 + }, + { + "epoch": 0.04641852098987496, + "grad_norm": 0.28413307666778564, + "learning_rate": 0.0005561484918793503, + "loss": 4.1455, + "step": 800 + }, + { + "epoch": 0.0469987525022484, + "grad_norm": 0.2847752869129181, + "learning_rate": 0.0005631090487238979, + "loss": 4.1166, + "step": 810 + }, + { + "epoch": 0.047578984014621836, + "grad_norm": 0.25382527709007263, + "learning_rate": 0.0005700696055684454, + "loss": 4.0986, + "step": 820 + }, + { + "epoch": 0.04815921552699527, + "grad_norm": 0.2375078797340393, + "learning_rate": 0.000577030162412993, + "loss": 4.0765, + "step": 830 + }, + { + "epoch": 0.04873944703936871, + "grad_norm": 0.3032638430595398, + "learning_rate": 0.0005839907192575406, + "loss": 4.085, + "step": 840 + }, + { + "epoch": 0.049319678551742144, + "grad_norm": 0.2454582005739212, + "learning_rate": 0.0005909512761020882, + "loss": 4.0505, + "step": 850 + }, + { + "epoch": 0.04989991006411558, + "grad_norm": 0.23829826712608337, + "learning_rate": 0.0005979118329466356, + "loss": 4.0391, + "step": 860 + }, + { + "epoch": 0.05048014157648902, + "grad_norm": 0.29694074392318726, + "learning_rate": 0.0005999997293652579, + "loss": 4.0195, + "step": 870 + }, + { + "epoch": 0.05106037308886246, + "grad_norm": 0.20268426835536957, + "learning_rate": 0.0005999984038085133, + "loss": 4.0023, + "step": 880 + }, + { + "epoch": 0.051640604601235894, + "grad_norm": 0.2563273310661316, + "learning_rate": 0.000599995973626219, + "loss": 3.98, + "step": 890 + }, + { + "epoch": 0.05222083611360933, + "grad_norm": 0.26515451073646545, + "learning_rate": 0.0005999924388273229, + "loss": 3.9799, + "step": 900 + }, + { + "epoch": 0.052801067625982766, + "grad_norm": 0.23011842370033264, + "learning_rate": 0.0005999877994248407, + "loss": 3.9592, + "step": 910 + }, + { + "epoch": 0.0533812991383562, + "grad_norm": 0.21570523083209991, + "learning_rate": 0.0005999820554358552, + "loss": 3.9366, + "step": 920 + }, + { + "epoch": 0.053961530650729644, + "grad_norm": 0.24623119831085205, + "learning_rate": 0.0005999752068815162, + "loss": 3.923, + "step": 930 + }, + { + "epoch": 0.05454176216310308, + "grad_norm": 0.26557642221450806, + "learning_rate": 0.0005999672537870409, + "loss": 3.9114, + "step": 940 + }, + { + "epoch": 0.055121993675476516, + "grad_norm": 0.23711174726486206, + "learning_rate": 0.0005999581961817135, + "loss": 3.9021, + "step": 950 + }, + { + "epoch": 0.05570222518784995, + "grad_norm": 0.2636472284793854, + "learning_rate": 0.000599948034098885, + "loss": 3.8945, + "step": 960 + }, + { + "epoch": 0.05628245670022339, + "grad_norm": 0.2139461785554886, + "learning_rate": 0.000599936767575973, + "loss": 3.8742, + "step": 970 + }, + { + "epoch": 0.056862688212596824, + "grad_norm": 0.2411975860595703, + "learning_rate": 0.0005999243966544624, + "loss": 3.8627, + "step": 980 + }, + { + "epoch": 0.057442919724970266, + "grad_norm": 0.22522902488708496, + "learning_rate": 0.000599910921379904, + "loss": 3.8439, + "step": 990 + }, + { + "epoch": 0.0580231512373437, + "grad_norm": 0.2505146861076355, + "learning_rate": 0.0005998963418019153, + "loss": 3.8376, + "step": 1000 + }, + { + "epoch": 0.0580231512373437, + "eval_loss": 3.7977514266967773, + "eval_runtime": 3.2666, + "eval_samples_per_second": 1325.524, + "eval_steps_per_second": 2.755, + "step": 1000 + }, + { + "epoch": 0.05860338274971714, + "grad_norm": 0.21931585669517517, + "learning_rate": 0.0005998806579741798, + "loss": 3.8196, + "step": 1010 + }, + { + "epoch": 0.059183614262090574, + "grad_norm": 0.19973556697368622, + "learning_rate": 0.0005998638699544469, + "loss": 3.813, + "step": 1020 + }, + { + "epoch": 0.05976384577446401, + "grad_norm": 0.21615122258663177, + "learning_rate": 0.0005998459778045319, + "loss": 3.7993, + "step": 1030 + }, + { + "epoch": 0.060344077286837446, + "grad_norm": 0.18904747068881989, + "learning_rate": 0.0005998269815903156, + "loss": 3.8122, + "step": 1040 + }, + { + "epoch": 0.06092430879921089, + "grad_norm": 0.20379868149757385, + "learning_rate": 0.000599806881381744, + "loss": 3.7891, + "step": 1050 + }, + { + "epoch": 0.061504540311584324, + "grad_norm": 0.21616701781749725, + "learning_rate": 0.0005997856772528283, + "loss": 3.7768, + "step": 1060 + }, + { + "epoch": 0.06208477182395776, + "grad_norm": 0.1838783323764801, + "learning_rate": 0.0005997633692816442, + "loss": 3.7744, + "step": 1070 + }, + { + "epoch": 0.0626650033363312, + "grad_norm": 0.17894767224788666, + "learning_rate": 0.0005997399575503321, + "loss": 3.7667, + "step": 1080 + }, + { + "epoch": 0.06324523484870463, + "grad_norm": 0.20992882549762726, + "learning_rate": 0.0005997154421450963, + "loss": 3.7449, + "step": 1090 + }, + { + "epoch": 0.06382546636107807, + "grad_norm": 0.19586902856826782, + "learning_rate": 0.0005996898231562051, + "loss": 3.7423, + "step": 1100 + }, + { + "epoch": 0.0644056978734515, + "grad_norm": 0.24105612933635712, + "learning_rate": 0.0005996631006779903, + "loss": 3.7223, + "step": 1110 + }, + { + "epoch": 0.06498592938582494, + "grad_norm": 0.19526907801628113, + "learning_rate": 0.0005996352748088471, + "loss": 3.7189, + "step": 1120 + }, + { + "epoch": 0.06556616089819838, + "grad_norm": 0.16144131124019623, + "learning_rate": 0.000599606345651233, + "loss": 3.7118, + "step": 1130 + }, + { + "epoch": 0.06614639241057182, + "grad_norm": 0.167442187666893, + "learning_rate": 0.0005995763133116683, + "loss": 3.6986, + "step": 1140 + }, + { + "epoch": 0.06672662392294526, + "grad_norm": 0.23503893613815308, + "learning_rate": 0.0005995451779007352, + "loss": 3.7049, + "step": 1150 + }, + { + "epoch": 0.0673068554353187, + "grad_norm": 0.2096278965473175, + "learning_rate": 0.0005995129395330776, + "loss": 3.6865, + "step": 1160 + }, + { + "epoch": 0.06788708694769213, + "grad_norm": 0.19825097918510437, + "learning_rate": 0.0005994795983274004, + "loss": 3.6712, + "step": 1170 + }, + { + "epoch": 0.06846731846006557, + "grad_norm": 0.15405306220054626, + "learning_rate": 0.0005994451544064696, + "loss": 3.6711, + "step": 1180 + }, + { + "epoch": 0.069047549972439, + "grad_norm": 0.563884437084198, + "learning_rate": 0.0005994096078971111, + "loss": 3.677, + "step": 1190 + }, + { + "epoch": 0.06962778148481244, + "grad_norm": 0.1655234694480896, + "learning_rate": 0.0005993729589302111, + "loss": 3.7143, + "step": 1200 + }, + { + "epoch": 0.07020801299718588, + "grad_norm": 0.15598031878471375, + "learning_rate": 0.0005993352076407148, + "loss": 3.6689, + "step": 1210 + }, + { + "epoch": 0.07078824450955931, + "grad_norm": 0.14992448687553406, + "learning_rate": 0.0005992963541676265, + "loss": 3.6581, + "step": 1220 + }, + { + "epoch": 0.07136847602193275, + "grad_norm": 0.1618255376815796, + "learning_rate": 0.0005992563986540086, + "loss": 3.642, + "step": 1230 + }, + { + "epoch": 0.07194870753430618, + "grad_norm": 0.16188852488994598, + "learning_rate": 0.0005992153412469816, + "loss": 3.6399, + "step": 1240 + }, + { + "epoch": 0.07252893904667962, + "grad_norm": 0.17180649936199188, + "learning_rate": 0.0005991731820977231, + "loss": 3.6252, + "step": 1250 + }, + { + "epoch": 0.07310917055905307, + "grad_norm": 0.1691058874130249, + "learning_rate": 0.0005991299213614678, + "loss": 3.6244, + "step": 1260 + }, + { + "epoch": 0.0736894020714265, + "grad_norm": 0.19470703601837158, + "learning_rate": 0.0005990855591975059, + "loss": 3.6199, + "step": 1270 + }, + { + "epoch": 0.07426963358379994, + "grad_norm": 0.15482653677463531, + "learning_rate": 0.0005990400957691835, + "loss": 3.6176, + "step": 1280 + }, + { + "epoch": 0.07484986509617338, + "grad_norm": 0.18342998623847961, + "learning_rate": 0.000598993531243902, + "loss": 3.6082, + "step": 1290 + }, + { + "epoch": 0.07543009660854681, + "grad_norm": 0.17348110675811768, + "learning_rate": 0.0005989458657931167, + "loss": 3.6063, + "step": 1300 + }, + { + "epoch": 0.07601032812092025, + "grad_norm": 0.1687677949666977, + "learning_rate": 0.0005988970995923368, + "loss": 3.6015, + "step": 1310 + }, + { + "epoch": 0.07659055963329368, + "grad_norm": 0.19341568648815155, + "learning_rate": 0.0005988472328211246, + "loss": 3.5912, + "step": 1320 + }, + { + "epoch": 0.07717079114566712, + "grad_norm": 0.15345478057861328, + "learning_rate": 0.0005987962656630947, + "loss": 3.586, + "step": 1330 + }, + { + "epoch": 0.07775102265804056, + "grad_norm": 0.16126085817813873, + "learning_rate": 0.0005987441983059136, + "loss": 3.5797, + "step": 1340 + }, + { + "epoch": 0.07833125417041399, + "grad_norm": 0.1716892272233963, + "learning_rate": 0.0005986910309412986, + "loss": 3.5751, + "step": 1350 + }, + { + "epoch": 0.07891148568278743, + "grad_norm": 0.15669932961463928, + "learning_rate": 0.0005986367637650177, + "loss": 3.5799, + "step": 1360 + }, + { + "epoch": 0.07949171719516086, + "grad_norm": 0.19878168404102325, + "learning_rate": 0.0005985813969768884, + "loss": 3.572, + "step": 1370 + }, + { + "epoch": 0.08007194870753431, + "grad_norm": 0.1505119651556015, + "learning_rate": 0.0005985249307807767, + "loss": 3.567, + "step": 1380 + }, + { + "epoch": 0.08065218021990775, + "grad_norm": 0.1548507809638977, + "learning_rate": 0.0005984673653845972, + "loss": 3.5427, + "step": 1390 + }, + { + "epoch": 0.08123241173228118, + "grad_norm": 0.15786635875701904, + "learning_rate": 0.0005984087010003119, + "loss": 3.5637, + "step": 1400 + }, + { + "epoch": 0.08181264324465462, + "grad_norm": 0.15546779334545135, + "learning_rate": 0.0005983489378439289, + "loss": 3.5475, + "step": 1410 + }, + { + "epoch": 0.08239287475702806, + "grad_norm": 0.17267097532749176, + "learning_rate": 0.0005982880761355026, + "loss": 3.5519, + "step": 1420 + }, + { + "epoch": 0.08297310626940149, + "grad_norm": 0.2120850831270218, + "learning_rate": 0.0005982261160991321, + "loss": 3.545, + "step": 1430 + }, + { + "epoch": 0.08355333778177493, + "grad_norm": 0.1541440784931183, + "learning_rate": 0.0005981630579629609, + "loss": 3.5236, + "step": 1440 + }, + { + "epoch": 0.08413356929414836, + "grad_norm": 0.1610753834247589, + "learning_rate": 0.0005980989019591753, + "loss": 3.5153, + "step": 1450 + }, + { + "epoch": 0.0847138008065218, + "grad_norm": 0.1872093677520752, + "learning_rate": 0.0005980336483240048, + "loss": 3.5208, + "step": 1460 + }, + { + "epoch": 0.08529403231889524, + "grad_norm": 0.15793032944202423, + "learning_rate": 0.0005979672972977201, + "loss": 3.5294, + "step": 1470 + }, + { + "epoch": 0.08587426383126867, + "grad_norm": 0.1738296002149582, + "learning_rate": 0.0005978998491246324, + "loss": 3.5234, + "step": 1480 + }, + { + "epoch": 0.08645449534364211, + "grad_norm": 0.1644987314939499, + "learning_rate": 0.0005978313040530931, + "loss": 3.515, + "step": 1490 + }, + { + "epoch": 0.08703472685601556, + "grad_norm": 0.16707918047904968, + "learning_rate": 0.0005977616623354923, + "loss": 3.5014, + "step": 1500 + }, + { + "epoch": 0.08761495836838899, + "grad_norm": 0.14812146127223969, + "learning_rate": 0.0005976909242282581, + "loss": 3.4923, + "step": 1510 + }, + { + "epoch": 0.08819518988076243, + "grad_norm": 0.15653282403945923, + "learning_rate": 0.0005976190899918555, + "loss": 3.4899, + "step": 1520 + }, + { + "epoch": 0.08877542139313586, + "grad_norm": 0.1531265377998352, + "learning_rate": 0.0005975461598907858, + "loss": 3.4939, + "step": 1530 + }, + { + "epoch": 0.0893556529055093, + "grad_norm": 0.19499650597572327, + "learning_rate": 0.0005974721341935854, + "loss": 3.4776, + "step": 1540 + }, + { + "epoch": 0.08993588441788274, + "grad_norm": 0.16522051393985748, + "learning_rate": 0.0005973970131728245, + "loss": 3.4843, + "step": 1550 + }, + { + "epoch": 0.09051611593025617, + "grad_norm": 0.14911240339279175, + "learning_rate": 0.0005973207971051066, + "loss": 3.4854, + "step": 1560 + }, + { + "epoch": 0.09109634744262961, + "grad_norm": 0.1797751784324646, + "learning_rate": 0.0005972434862710673, + "loss": 3.4814, + "step": 1570 + }, + { + "epoch": 0.09167657895500304, + "grad_norm": 0.14958298206329346, + "learning_rate": 0.0005971650809553729, + "loss": 3.4791, + "step": 1580 + }, + { + "epoch": 0.09225681046737648, + "grad_norm": 0.17834265530109406, + "learning_rate": 0.0005970855814467205, + "loss": 3.4633, + "step": 1590 + }, + { + "epoch": 0.09283704197974992, + "grad_norm": 0.15738125145435333, + "learning_rate": 0.0005970049880378353, + "loss": 3.4676, + "step": 1600 + }, + { + "epoch": 0.09341727349212335, + "grad_norm": 0.14483994245529175, + "learning_rate": 0.0005969233010254707, + "loss": 3.4661, + "step": 1610 + }, + { + "epoch": 0.0939975050044968, + "grad_norm": 0.14126789569854736, + "learning_rate": 0.0005968405207104068, + "loss": 3.4571, + "step": 1620 + }, + { + "epoch": 0.09457773651687024, + "grad_norm": 0.1578633040189743, + "learning_rate": 0.0005967566473974495, + "loss": 3.4558, + "step": 1630 + }, + { + "epoch": 0.09515796802924367, + "grad_norm": 0.1565486639738083, + "learning_rate": 0.000596671681395429, + "loss": 3.4604, + "step": 1640 + }, + { + "epoch": 0.09573819954161711, + "grad_norm": 0.13866451382637024, + "learning_rate": 0.0005965856230171993, + "loss": 3.4552, + "step": 1650 + }, + { + "epoch": 0.09631843105399054, + "grad_norm": 0.2121124267578125, + "learning_rate": 0.0005964984725796359, + "loss": 3.4541, + "step": 1660 + }, + { + "epoch": 0.09689866256636398, + "grad_norm": 0.17082008719444275, + "learning_rate": 0.0005964102304036363, + "loss": 3.4382, + "step": 1670 + }, + { + "epoch": 0.09747889407873742, + "grad_norm": 0.20681622624397278, + "learning_rate": 0.0005963208968141172, + "loss": 3.4372, + "step": 1680 + }, + { + "epoch": 0.09805912559111085, + "grad_norm": 0.1384105086326599, + "learning_rate": 0.0005962304721400142, + "loss": 3.4484, + "step": 1690 + }, + { + "epoch": 0.09863935710348429, + "grad_norm": 0.16820856928825378, + "learning_rate": 0.0005961389567142806, + "loss": 3.4302, + "step": 1700 + }, + { + "epoch": 0.09921958861585772, + "grad_norm": 0.16617996990680695, + "learning_rate": 0.0005960463508738855, + "loss": 3.4328, + "step": 1710 + }, + { + "epoch": 0.09979982012823116, + "grad_norm": 0.16344214975833893, + "learning_rate": 0.0005959526549598137, + "loss": 3.4326, + "step": 1720 + }, + { + "epoch": 0.1003800516406046, + "grad_norm": 0.16235540807247162, + "learning_rate": 0.000595857869317063, + "loss": 3.4271, + "step": 1730 + }, + { + "epoch": 0.10096028315297804, + "grad_norm": 0.1524738371372223, + "learning_rate": 0.0005957619942946442, + "loss": 3.424, + "step": 1740 + }, + { + "epoch": 0.10154051466535148, + "grad_norm": 0.18023791909217834, + "learning_rate": 0.0005956650302455793, + "loss": 3.4266, + "step": 1750 + }, + { + "epoch": 0.10212074617772492, + "grad_norm": 0.17738115787506104, + "learning_rate": 0.0005955669775268999, + "loss": 3.4046, + "step": 1760 + }, + { + "epoch": 0.10270097769009835, + "grad_norm": 0.13939271867275238, + "learning_rate": 0.0005954678364996466, + "loss": 3.4177, + "step": 1770 + }, + { + "epoch": 0.10328120920247179, + "grad_norm": 0.18028447031974792, + "learning_rate": 0.0005953676075288668, + "loss": 3.4113, + "step": 1780 + }, + { + "epoch": 0.10386144071484522, + "grad_norm": 0.15911422669887543, + "learning_rate": 0.0005952662909836142, + "loss": 3.4191, + "step": 1790 + }, + { + "epoch": 0.10444167222721866, + "grad_norm": 0.15596607327461243, + "learning_rate": 0.0005951638872369469, + "loss": 3.3993, + "step": 1800 + }, + { + "epoch": 0.1050219037395921, + "grad_norm": 0.15493981540203094, + "learning_rate": 0.0005950603966659264, + "loss": 3.4043, + "step": 1810 + }, + { + "epoch": 0.10560213525196553, + "grad_norm": 0.1727568507194519, + "learning_rate": 0.0005949558196516154, + "loss": 3.4028, + "step": 1820 + }, + { + "epoch": 0.10618236676433897, + "grad_norm": 0.1614874303340912, + "learning_rate": 0.0005948501565790779, + "loss": 3.3998, + "step": 1830 + }, + { + "epoch": 0.1067625982767124, + "grad_norm": 0.13620299100875854, + "learning_rate": 0.000594743407837376, + "loss": 3.3896, + "step": 1840 + }, + { + "epoch": 0.10734282978908584, + "grad_norm": 0.15391112864017487, + "learning_rate": 0.0005946355738195701, + "loss": 3.3823, + "step": 1850 + }, + { + "epoch": 0.10792306130145929, + "grad_norm": 0.15937426686286926, + "learning_rate": 0.0005945266549227162, + "loss": 3.3893, + "step": 1860 + }, + { + "epoch": 0.10850329281383272, + "grad_norm": 0.16253319382667542, + "learning_rate": 0.0005944166515478649, + "loss": 3.3905, + "step": 1870 + }, + { + "epoch": 0.10908352432620616, + "grad_norm": 0.14502382278442383, + "learning_rate": 0.0005943055641000604, + "loss": 3.3836, + "step": 1880 + }, + { + "epoch": 0.1096637558385796, + "grad_norm": 0.14128324389457703, + "learning_rate": 0.0005941933929883384, + "loss": 3.3854, + "step": 1890 + }, + { + "epoch": 0.11024398735095303, + "grad_norm": 0.19345618784427643, + "learning_rate": 0.0005940801386257244, + "loss": 3.3746, + "step": 1900 + }, + { + "epoch": 0.11082421886332647, + "grad_norm": 0.1499020904302597, + "learning_rate": 0.000593965801429233, + "loss": 3.3729, + "step": 1910 + }, + { + "epoch": 0.1114044503756999, + "grad_norm": 0.14975206553936005, + "learning_rate": 0.0005938503818198656, + "loss": 3.3676, + "step": 1920 + }, + { + "epoch": 0.11198468188807334, + "grad_norm": 0.13726426661014557, + "learning_rate": 0.0005937338802226094, + "loss": 3.373, + "step": 1930 + }, + { + "epoch": 0.11256491340044678, + "grad_norm": 0.1749139279127121, + "learning_rate": 0.0005936162970664355, + "loss": 3.3761, + "step": 1940 + }, + { + "epoch": 0.11314514491282021, + "grad_norm": 0.14197006821632385, + "learning_rate": 0.0005934976327842974, + "loss": 3.3513, + "step": 1950 + }, + { + "epoch": 0.11372537642519365, + "grad_norm": 0.15288510918617249, + "learning_rate": 0.0005933778878131294, + "loss": 3.357, + "step": 1960 + }, + { + "epoch": 0.11430560793756708, + "grad_norm": 0.1787514090538025, + "learning_rate": 0.000593257062593845, + "loss": 3.3642, + "step": 1970 + }, + { + "epoch": 0.11488583944994053, + "grad_norm": 0.13630741834640503, + "learning_rate": 0.0005931351575713353, + "loss": 3.3614, + "step": 1980 + }, + { + "epoch": 0.11546607096231397, + "grad_norm": 0.16102264821529388, + "learning_rate": 0.0005930121731944674, + "loss": 3.3523, + "step": 1990 + }, + { + "epoch": 0.1160463024746874, + "grad_norm": 0.16226573288440704, + "learning_rate": 0.0005928881099160826, + "loss": 3.3595, + "step": 2000 + }, + { + "epoch": 0.1160463024746874, + "eval_loss": 3.3178560733795166, + "eval_runtime": 3.2576, + "eval_samples_per_second": 1329.214, + "eval_steps_per_second": 2.763, + "step": 2000 + }, + { + "epoch": 0.11662653398706084, + "grad_norm": 0.14609858393669128, + "learning_rate": 0.0005927629681929951, + "loss": 3.3585, + "step": 2010 + }, + { + "epoch": 0.11720676549943428, + "grad_norm": 0.14387281239032745, + "learning_rate": 0.0005926367484859896, + "loss": 3.3517, + "step": 2020 + }, + { + "epoch": 0.11778699701180771, + "grad_norm": 0.14605766534805298, + "learning_rate": 0.0005925094512598202, + "loss": 3.3524, + "step": 2030 + }, + { + "epoch": 0.11836722852418115, + "grad_norm": 0.22022885084152222, + "learning_rate": 0.000592381076983209, + "loss": 3.3356, + "step": 2040 + }, + { + "epoch": 0.11894746003655458, + "grad_norm": 0.1847839504480362, + "learning_rate": 0.0005922516261288431, + "loss": 3.3441, + "step": 2050 + }, + { + "epoch": 0.11952769154892802, + "grad_norm": 0.13915176689624786, + "learning_rate": 0.0005921210991733745, + "loss": 3.352, + "step": 2060 + }, + { + "epoch": 0.12010792306130146, + "grad_norm": 0.1398390680551529, + "learning_rate": 0.0005919894965974168, + "loss": 3.3455, + "step": 2070 + }, + { + "epoch": 0.12068815457367489, + "grad_norm": 0.1368722915649414, + "learning_rate": 0.0005918568188855447, + "loss": 3.3403, + "step": 2080 + }, + { + "epoch": 0.12126838608604833, + "grad_norm": 0.16239017248153687, + "learning_rate": 0.0005917230665262914, + "loss": 3.3334, + "step": 2090 + }, + { + "epoch": 0.12184861759842178, + "grad_norm": 0.14380386471748352, + "learning_rate": 0.000591588240012147, + "loss": 3.3294, + "step": 2100 + }, + { + "epoch": 0.12242884911079521, + "grad_norm": 0.16626037657260895, + "learning_rate": 0.0005914523398395569, + "loss": 3.3425, + "step": 2110 + }, + { + "epoch": 0.12300908062316865, + "grad_norm": 0.15981921553611755, + "learning_rate": 0.0005913153665089197, + "loss": 3.3403, + "step": 2120 + }, + { + "epoch": 0.12358931213554208, + "grad_norm": 0.15275150537490845, + "learning_rate": 0.0005911773205245857, + "loss": 3.3261, + "step": 2130 + }, + { + "epoch": 0.12416954364791552, + "grad_norm": 0.1598198413848877, + "learning_rate": 0.0005910382023948546, + "loss": 3.3264, + "step": 2140 + }, + { + "epoch": 0.12474977516028896, + "grad_norm": 0.138661190867424, + "learning_rate": 0.0005908980126319739, + "loss": 3.3216, + "step": 2150 + }, + { + "epoch": 0.1253300066726624, + "grad_norm": 0.15583263337612152, + "learning_rate": 0.000590756751752137, + "loss": 3.3204, + "step": 2160 + }, + { + "epoch": 0.12591023818503583, + "grad_norm": 0.15883944928646088, + "learning_rate": 0.0005906144202754813, + "loss": 3.3274, + "step": 2170 + }, + { + "epoch": 0.12649046969740926, + "grad_norm": 0.15031637251377106, + "learning_rate": 0.0005904710187260862, + "loss": 3.3224, + "step": 2180 + }, + { + "epoch": 0.1270707012097827, + "grad_norm": 0.1994715929031372, + "learning_rate": 0.0005903265476319712, + "loss": 3.3204, + "step": 2190 + }, + { + "epoch": 0.12765093272215614, + "grad_norm": 0.16986873745918274, + "learning_rate": 0.000590181007525094, + "loss": 3.327, + "step": 2200 + }, + { + "epoch": 0.12823116423452957, + "grad_norm": 0.147616907954216, + "learning_rate": 0.0005900343989413485, + "loss": 3.3063, + "step": 2210 + }, + { + "epoch": 0.128811395746903, + "grad_norm": 0.16532088816165924, + "learning_rate": 0.0005898867224205629, + "loss": 3.3198, + "step": 2220 + }, + { + "epoch": 0.12939162725927644, + "grad_norm": 0.16687408089637756, + "learning_rate": 0.0005897379785064977, + "loss": 3.3193, + "step": 2230 + }, + { + "epoch": 0.12997185877164988, + "grad_norm": 0.16683116555213928, + "learning_rate": 0.0005895881677468434, + "loss": 3.3078, + "step": 2240 + }, + { + "epoch": 0.13055209028402331, + "grad_norm": 0.15461483597755432, + "learning_rate": 0.000589437290693219, + "loss": 3.3126, + "step": 2250 + }, + { + "epoch": 0.13113232179639675, + "grad_norm": 0.1432589441537857, + "learning_rate": 0.0005892853479011696, + "loss": 3.3004, + "step": 2260 + }, + { + "epoch": 0.13171255330877019, + "grad_norm": 0.1792496293783188, + "learning_rate": 0.0005891323399301646, + "loss": 3.2946, + "step": 2270 + }, + { + "epoch": 0.13229278482114365, + "grad_norm": 0.15189994871616364, + "learning_rate": 0.0005889782673435952, + "loss": 3.3013, + "step": 2280 + }, + { + "epoch": 0.13287301633351709, + "grad_norm": 0.15026351809501648, + "learning_rate": 0.0005888231307087728, + "loss": 3.295, + "step": 2290 + }, + { + "epoch": 0.13345324784589052, + "grad_norm": 0.16199465095996857, + "learning_rate": 0.0005886669305969269, + "loss": 3.2955, + "step": 2300 + }, + { + "epoch": 0.13403347935826396, + "grad_norm": 0.16704988479614258, + "learning_rate": 0.0005885096675832027, + "loss": 3.3057, + "step": 2310 + }, + { + "epoch": 0.1346137108706374, + "grad_norm": 0.14401213824748993, + "learning_rate": 0.0005883513422466588, + "loss": 3.2876, + "step": 2320 + }, + { + "epoch": 0.13519394238301083, + "grad_norm": 0.15336865186691284, + "learning_rate": 0.000588191955170266, + "loss": 3.2903, + "step": 2330 + }, + { + "epoch": 0.13577417389538426, + "grad_norm": 0.16176366806030273, + "learning_rate": 0.0005880315069409039, + "loss": 3.2873, + "step": 2340 + }, + { + "epoch": 0.1363544054077577, + "grad_norm": 0.14728406071662903, + "learning_rate": 0.00058786999814936, + "loss": 3.2862, + "step": 2350 + }, + { + "epoch": 0.13693463692013114, + "grad_norm": 0.14426636695861816, + "learning_rate": 0.0005877074293903264, + "loss": 3.2786, + "step": 2360 + }, + { + "epoch": 0.13751486843250457, + "grad_norm": 0.15023665130138397, + "learning_rate": 0.0005875438012623984, + "loss": 3.2888, + "step": 2370 + }, + { + "epoch": 0.138095099944878, + "grad_norm": 0.1882687211036682, + "learning_rate": 0.0005873791143680718, + "loss": 3.2806, + "step": 2380 + }, + { + "epoch": 0.13867533145725144, + "grad_norm": 0.14847789704799652, + "learning_rate": 0.000587213369313741, + "loss": 3.2698, + "step": 2390 + }, + { + "epoch": 0.13925556296962488, + "grad_norm": 0.14070352911949158, + "learning_rate": 0.0005870465667096969, + "loss": 3.2782, + "step": 2400 + }, + { + "epoch": 0.13983579448199832, + "grad_norm": 0.19226056337356567, + "learning_rate": 0.0005868787071701238, + "loss": 3.2639, + "step": 2410 + }, + { + "epoch": 0.14041602599437175, + "grad_norm": 0.1776312291622162, + "learning_rate": 0.0005867097913130982, + "loss": 3.2792, + "step": 2420 + }, + { + "epoch": 0.1409962575067452, + "grad_norm": 0.13482613861560822, + "learning_rate": 0.0005865398197605863, + "loss": 3.2834, + "step": 2430 + }, + { + "epoch": 0.14157648901911862, + "grad_norm": 0.16731715202331543, + "learning_rate": 0.0005863687931384408, + "loss": 3.2773, + "step": 2440 + }, + { + "epoch": 0.14215672053149206, + "grad_norm": 0.14542406797409058, + "learning_rate": 0.0005861967120763997, + "loss": 3.2676, + "step": 2450 + }, + { + "epoch": 0.1427369520438655, + "grad_norm": 0.1490476280450821, + "learning_rate": 0.0005860235772080836, + "loss": 3.2783, + "step": 2460 + }, + { + "epoch": 0.14331718355623893, + "grad_norm": 0.1446717530488968, + "learning_rate": 0.0005858493891709932, + "loss": 3.283, + "step": 2470 + }, + { + "epoch": 0.14389741506861237, + "grad_norm": 0.1412891447544098, + "learning_rate": 0.0005856741486065071, + "loss": 3.2652, + "step": 2480 + }, + { + "epoch": 0.1444776465809858, + "grad_norm": 0.14674563705921173, + "learning_rate": 0.0005854978561598794, + "loss": 3.2613, + "step": 2490 + }, + { + "epoch": 0.14505787809335924, + "grad_norm": 0.14808981120586395, + "learning_rate": 0.0005853205124802374, + "loss": 3.2742, + "step": 2500 + }, + { + "epoch": 0.14563810960573267, + "grad_norm": 0.14043253660202026, + "learning_rate": 0.0005851421182205789, + "loss": 3.2685, + "step": 2510 + }, + { + "epoch": 0.14621834111810614, + "grad_norm": 0.1568257212638855, + "learning_rate": 0.0005849626740377705, + "loss": 3.2711, + "step": 2520 + }, + { + "epoch": 0.14679857263047957, + "grad_norm": 0.13545943796634674, + "learning_rate": 0.0005847821805925444, + "loss": 3.2573, + "step": 2530 + }, + { + "epoch": 0.147378804142853, + "grad_norm": 0.18863698840141296, + "learning_rate": 0.0005846006385494964, + "loss": 3.2526, + "step": 2540 + }, + { + "epoch": 0.14795903565522645, + "grad_norm": 0.14628858864307404, + "learning_rate": 0.0005844180485770832, + "loss": 3.2629, + "step": 2550 + }, + { + "epoch": 0.14853926716759988, + "grad_norm": 0.1624503880739212, + "learning_rate": 0.0005842344113476202, + "loss": 3.2529, + "step": 2560 + }, + { + "epoch": 0.14911949867997332, + "grad_norm": 0.16218945384025574, + "learning_rate": 0.0005840497275372792, + "loss": 3.2548, + "step": 2570 + }, + { + "epoch": 0.14969973019234675, + "grad_norm": 0.16516704857349396, + "learning_rate": 0.0005838639978260851, + "loss": 3.2501, + "step": 2580 + }, + { + "epoch": 0.1502799617047202, + "grad_norm": 0.1366761326789856, + "learning_rate": 0.0005836772228979142, + "loss": 3.2467, + "step": 2590 + }, + { + "epoch": 0.15086019321709362, + "grad_norm": 0.15526661276817322, + "learning_rate": 0.0005834894034404913, + "loss": 3.242, + "step": 2600 + }, + { + "epoch": 0.15144042472946706, + "grad_norm": 0.1441916972398758, + "learning_rate": 0.0005833005401453874, + "loss": 3.2399, + "step": 2610 + }, + { + "epoch": 0.1520206562418405, + "grad_norm": 0.1708252727985382, + "learning_rate": 0.0005831106337080169, + "loss": 3.2427, + "step": 2620 + }, + { + "epoch": 0.15260088775421393, + "grad_norm": 0.14945155382156372, + "learning_rate": 0.0005829196848276351, + "loss": 3.2449, + "step": 2630 + }, + { + "epoch": 0.15318111926658737, + "grad_norm": 0.1512700468301773, + "learning_rate": 0.000582727694207336, + "loss": 3.2438, + "step": 2640 + }, + { + "epoch": 0.1537613507789608, + "grad_norm": 0.15101619064807892, + "learning_rate": 0.0005825346625540491, + "loss": 3.2396, + "step": 2650 + }, + { + "epoch": 0.15434158229133424, + "grad_norm": 0.13658584654331207, + "learning_rate": 0.000582340590578537, + "loss": 3.2475, + "step": 2660 + }, + { + "epoch": 0.15492181380370768, + "grad_norm": 0.16723176836967468, + "learning_rate": 0.0005821454789953932, + "loss": 3.2385, + "step": 2670 + }, + { + "epoch": 0.1555020453160811, + "grad_norm": 0.16236084699630737, + "learning_rate": 0.000581949328523039, + "loss": 3.2287, + "step": 2680 + }, + { + "epoch": 0.15608227682845455, + "grad_norm": 0.1473713517189026, + "learning_rate": 0.0005817521398837209, + "loss": 3.2335, + "step": 2690 + }, + { + "epoch": 0.15666250834082798, + "grad_norm": 0.14422966539859772, + "learning_rate": 0.0005815539138035082, + "loss": 3.2217, + "step": 2700 + }, + { + "epoch": 0.15724273985320142, + "grad_norm": 0.1676100343465805, + "learning_rate": 0.00058135465101229, + "loss": 3.2329, + "step": 2710 + }, + { + "epoch": 0.15782297136557485, + "grad_norm": 0.14574168622493744, + "learning_rate": 0.000581154352243773, + "loss": 3.2278, + "step": 2720 + }, + { + "epoch": 0.1584032028779483, + "grad_norm": 0.16981543600559235, + "learning_rate": 0.000580953018235478, + "loss": 3.229, + "step": 2730 + }, + { + "epoch": 0.15898343439032173, + "grad_norm": 0.13945645093917847, + "learning_rate": 0.0005807506497287379, + "loss": 3.2297, + "step": 2740 + }, + { + "epoch": 0.15956366590269516, + "grad_norm": 0.17302276194095612, + "learning_rate": 0.0005805472474686949, + "loss": 3.2227, + "step": 2750 + }, + { + "epoch": 0.16014389741506863, + "grad_norm": 0.15059055387973785, + "learning_rate": 0.0005803428122042974, + "loss": 3.2288, + "step": 2760 + }, + { + "epoch": 0.16072412892744206, + "grad_norm": 0.14908020198345184, + "learning_rate": 0.0005801373446882973, + "loss": 3.2293, + "step": 2770 + }, + { + "epoch": 0.1613043604398155, + "grad_norm": 0.1653462052345276, + "learning_rate": 0.0005799308456772478, + "loss": 3.2189, + "step": 2780 + }, + { + "epoch": 0.16188459195218893, + "grad_norm": 0.14483293890953064, + "learning_rate": 0.0005797233159314997, + "loss": 3.2239, + "step": 2790 + }, + { + "epoch": 0.16246482346456237, + "grad_norm": 0.15277917683124542, + "learning_rate": 0.0005795147562151992, + "loss": 3.2155, + "step": 2800 + }, + { + "epoch": 0.1630450549769358, + "grad_norm": 0.13660204410552979, + "learning_rate": 0.0005793051672962852, + "loss": 3.2183, + "step": 2810 + }, + { + "epoch": 0.16362528648930924, + "grad_norm": 0.15595564246177673, + "learning_rate": 0.0005790945499464861, + "loss": 3.2163, + "step": 2820 + }, + { + "epoch": 0.16420551800168268, + "grad_norm": 0.14608708024024963, + "learning_rate": 0.0005788829049413167, + "loss": 3.2222, + "step": 2830 + }, + { + "epoch": 0.1647857495140561, + "grad_norm": 0.14129003882408142, + "learning_rate": 0.0005786702330600764, + "loss": 3.2115, + "step": 2840 + }, + { + "epoch": 0.16536598102642955, + "grad_norm": 0.13925908505916595, + "learning_rate": 0.0005784565350858453, + "loss": 3.2115, + "step": 2850 + }, + { + "epoch": 0.16594621253880298, + "grad_norm": 0.15094564855098724, + "learning_rate": 0.0005782418118054816, + "loss": 3.216, + "step": 2860 + }, + { + "epoch": 0.16652644405117642, + "grad_norm": 0.1384998857975006, + "learning_rate": 0.0005780260640096189, + "loss": 3.2084, + "step": 2870 + }, + { + "epoch": 0.16710667556354986, + "grad_norm": 0.15442876517772675, + "learning_rate": 0.0005778092924926634, + "loss": 3.2071, + "step": 2880 + }, + { + "epoch": 0.1676869070759233, + "grad_norm": 0.16494965553283691, + "learning_rate": 0.0005775914980527904, + "loss": 3.2101, + "step": 2890 + }, + { + "epoch": 0.16826713858829673, + "grad_norm": 0.16855239868164062, + "learning_rate": 0.0005773726814919419, + "loss": 3.2019, + "step": 2900 + }, + { + "epoch": 0.16884737010067016, + "grad_norm": 0.1579483449459076, + "learning_rate": 0.0005771528436158233, + "loss": 3.209, + "step": 2910 + }, + { + "epoch": 0.1694276016130436, + "grad_norm": 0.1417829543352127, + "learning_rate": 0.0005769319852339008, + "loss": 3.2019, + "step": 2920 + }, + { + "epoch": 0.17000783312541703, + "grad_norm": 0.14454993605613708, + "learning_rate": 0.0005767101071593979, + "loss": 3.2047, + "step": 2930 + }, + { + "epoch": 0.17058806463779047, + "grad_norm": 0.16087666153907776, + "learning_rate": 0.0005764872102092931, + "loss": 3.2062, + "step": 2940 + }, + { + "epoch": 0.1711682961501639, + "grad_norm": 0.139312744140625, + "learning_rate": 0.0005762632952043163, + "loss": 3.1988, + "step": 2950 + }, + { + "epoch": 0.17174852766253734, + "grad_norm": 0.15459179878234863, + "learning_rate": 0.000576038362968946, + "loss": 3.2002, + "step": 2960 + }, + { + "epoch": 0.17232875917491078, + "grad_norm": 0.18820500373840332, + "learning_rate": 0.0005758124143314062, + "loss": 3.2035, + "step": 2970 + }, + { + "epoch": 0.17290899068728421, + "grad_norm": 0.14626365900039673, + "learning_rate": 0.0005755854501236635, + "loss": 3.194, + "step": 2980 + }, + { + "epoch": 0.17348922219965765, + "grad_norm": 0.14270606637001038, + "learning_rate": 0.0005753574711814238, + "loss": 3.1879, + "step": 2990 + }, + { + "epoch": 0.1740694537120311, + "grad_norm": 0.15857936441898346, + "learning_rate": 0.0005751284783441297, + "loss": 3.207, + "step": 3000 + }, + { + "epoch": 0.1740694537120311, + "eval_loss": 3.158046245574951, + "eval_runtime": 3.2654, + "eval_samples_per_second": 1326.029, + "eval_steps_per_second": 2.756, + "step": 3000 + }, + { + "epoch": 0.17464968522440455, + "grad_norm": 0.14403465390205383, + "learning_rate": 0.0005748984724549565, + "loss": 3.1895, + "step": 3010 + }, + { + "epoch": 0.17522991673677799, + "grad_norm": 0.1392756998538971, + "learning_rate": 0.0005746674543608101, + "loss": 3.1942, + "step": 3020 + }, + { + "epoch": 0.17581014824915142, + "grad_norm": 0.13957557082176208, + "learning_rate": 0.0005744354249123234, + "loss": 3.1969, + "step": 3030 + }, + { + "epoch": 0.17639037976152486, + "grad_norm": 0.151198148727417, + "learning_rate": 0.0005742023849638531, + "loss": 3.1903, + "step": 3040 + }, + { + "epoch": 0.1769706112738983, + "grad_norm": 0.14607684314250946, + "learning_rate": 0.0005739683353734766, + "loss": 3.2003, + "step": 3050 + }, + { + "epoch": 0.17755084278627173, + "grad_norm": 0.13925622403621674, + "learning_rate": 0.0005737332770029891, + "loss": 3.1927, + "step": 3060 + }, + { + "epoch": 0.17813107429864516, + "grad_norm": 0.13125456869602203, + "learning_rate": 0.0005734972107179001, + "loss": 3.1849, + "step": 3070 + }, + { + "epoch": 0.1787113058110186, + "grad_norm": 0.16905735433101654, + "learning_rate": 0.0005732601373874306, + "loss": 3.187, + "step": 3080 + }, + { + "epoch": 0.17929153732339204, + "grad_norm": 0.13563838601112366, + "learning_rate": 0.0005730220578845091, + "loss": 3.1853, + "step": 3090 + }, + { + "epoch": 0.17987176883576547, + "grad_norm": 0.15470236539840698, + "learning_rate": 0.0005727829730857695, + "loss": 3.1906, + "step": 3100 + }, + { + "epoch": 0.1804520003481389, + "grad_norm": 0.160013347864151, + "learning_rate": 0.0005725428838715469, + "loss": 3.1705, + "step": 3110 + }, + { + "epoch": 0.18103223186051234, + "grad_norm": 0.14684250950813293, + "learning_rate": 0.0005723017911258752, + "loss": 3.1825, + "step": 3120 + }, + { + "epoch": 0.18161246337288578, + "grad_norm": 0.1529027372598648, + "learning_rate": 0.0005720596957364829, + "loss": 3.1817, + "step": 3130 + }, + { + "epoch": 0.18219269488525922, + "grad_norm": 0.13860736787319183, + "learning_rate": 0.0005718165985947907, + "loss": 3.1844, + "step": 3140 + }, + { + "epoch": 0.18277292639763265, + "grad_norm": 0.14795511960983276, + "learning_rate": 0.0005715725005959077, + "loss": 3.1741, + "step": 3150 + }, + { + "epoch": 0.1833531579100061, + "grad_norm": 0.1455545276403427, + "learning_rate": 0.0005713274026386283, + "loss": 3.1869, + "step": 3160 + }, + { + "epoch": 0.18393338942237952, + "grad_norm": 0.14845995604991913, + "learning_rate": 0.0005710813056254289, + "loss": 3.1735, + "step": 3170 + }, + { + "epoch": 0.18451362093475296, + "grad_norm": 0.14949209988117218, + "learning_rate": 0.0005708342104624645, + "loss": 3.178, + "step": 3180 + }, + { + "epoch": 0.1850938524471264, + "grad_norm": 0.16276435554027557, + "learning_rate": 0.0005705861180595653, + "loss": 3.1712, + "step": 3190 + }, + { + "epoch": 0.18567408395949983, + "grad_norm": 0.14152179658412933, + "learning_rate": 0.0005703370293302335, + "loss": 3.1752, + "step": 3200 + }, + { + "epoch": 0.18625431547187327, + "grad_norm": 0.1554255187511444, + "learning_rate": 0.00057008694519164, + "loss": 3.169, + "step": 3210 + }, + { + "epoch": 0.1868345469842467, + "grad_norm": 0.14890237152576447, + "learning_rate": 0.0005698358665646207, + "loss": 3.1706, + "step": 3220 + }, + { + "epoch": 0.18741477849662014, + "grad_norm": 0.15197904407978058, + "learning_rate": 0.0005695837943736735, + "loss": 3.1691, + "step": 3230 + }, + { + "epoch": 0.1879950100089936, + "grad_norm": 0.15369053184986115, + "learning_rate": 0.0005693307295469547, + "loss": 3.1678, + "step": 3240 + }, + { + "epoch": 0.18857524152136704, + "grad_norm": 0.19938114285469055, + "learning_rate": 0.0005690766730162752, + "loss": 3.1706, + "step": 3250 + }, + { + "epoch": 0.18915547303374047, + "grad_norm": 0.14962078630924225, + "learning_rate": 0.0005688216257170979, + "loss": 3.1665, + "step": 3260 + }, + { + "epoch": 0.1897357045461139, + "grad_norm": 0.14826686680316925, + "learning_rate": 0.0005685655885885337, + "loss": 3.1478, + "step": 3270 + }, + { + "epoch": 0.19031593605848734, + "grad_norm": 0.137392058968544, + "learning_rate": 0.0005683085625733382, + "loss": 3.1645, + "step": 3280 + }, + { + "epoch": 0.19089616757086078, + "grad_norm": 0.15559589862823486, + "learning_rate": 0.000568050548617908, + "loss": 3.1674, + "step": 3290 + }, + { + "epoch": 0.19147639908323422, + "grad_norm": 0.17506170272827148, + "learning_rate": 0.0005677915476722775, + "loss": 3.1606, + "step": 3300 + }, + { + "epoch": 0.19205663059560765, + "grad_norm": 0.1602877825498581, + "learning_rate": 0.0005675315606901155, + "loss": 3.1586, + "step": 3310 + }, + { + "epoch": 0.1926368621079811, + "grad_norm": 0.13343220949172974, + "learning_rate": 0.0005672705886287211, + "loss": 3.1553, + "step": 3320 + }, + { + "epoch": 0.19321709362035452, + "grad_norm": 0.15390737354755402, + "learning_rate": 0.0005670086324490208, + "loss": 3.1687, + "step": 3330 + }, + { + "epoch": 0.19379732513272796, + "grad_norm": 0.13513082265853882, + "learning_rate": 0.0005667456931155647, + "loss": 3.1543, + "step": 3340 + }, + { + "epoch": 0.1943775566451014, + "grad_norm": 0.1489078551530838, + "learning_rate": 0.0005664817715965231, + "loss": 3.1623, + "step": 3350 + }, + { + "epoch": 0.19495778815747483, + "grad_norm": 0.14149461686611176, + "learning_rate": 0.0005662168688636826, + "loss": 3.1487, + "step": 3360 + }, + { + "epoch": 0.19553801966984827, + "grad_norm": 0.150479257106781, + "learning_rate": 0.0005659509858924428, + "loss": 3.1588, + "step": 3370 + }, + { + "epoch": 0.1961182511822217, + "grad_norm": 0.15041102468967438, + "learning_rate": 0.0005656841236618127, + "loss": 3.155, + "step": 3380 + }, + { + "epoch": 0.19669848269459514, + "grad_norm": 0.14053913950920105, + "learning_rate": 0.0005654162831544068, + "loss": 3.1581, + "step": 3390 + }, + { + "epoch": 0.19727871420696858, + "grad_norm": 0.15485486388206482, + "learning_rate": 0.0005651474653564421, + "loss": 3.1465, + "step": 3400 + }, + { + "epoch": 0.197858945719342, + "grad_norm": 0.1425885111093521, + "learning_rate": 0.0005648776712577338, + "loss": 3.1535, + "step": 3410 + }, + { + "epoch": 0.19843917723171545, + "grad_norm": 0.1361316442489624, + "learning_rate": 0.0005646069018516921, + "loss": 3.1466, + "step": 3420 + }, + { + "epoch": 0.19901940874408888, + "grad_norm": 0.15521439909934998, + "learning_rate": 0.0005643351581353184, + "loss": 3.1415, + "step": 3430 + }, + { + "epoch": 0.19959964025646232, + "grad_norm": 0.14644280076026917, + "learning_rate": 0.0005640624411092014, + "loss": 3.1411, + "step": 3440 + }, + { + "epoch": 0.20017987176883575, + "grad_norm": 0.14116531610488892, + "learning_rate": 0.0005637887517775137, + "loss": 3.1542, + "step": 3450 + }, + { + "epoch": 0.2007601032812092, + "grad_norm": 0.1301729828119278, + "learning_rate": 0.0005635140911480082, + "loss": 3.1448, + "step": 3460 + }, + { + "epoch": 0.20134033479358263, + "grad_norm": 0.16307103633880615, + "learning_rate": 0.000563238460232014, + "loss": 3.1397, + "step": 3470 + }, + { + "epoch": 0.2019205663059561, + "grad_norm": 0.13141117990016937, + "learning_rate": 0.0005629618600444332, + "loss": 3.1469, + "step": 3480 + }, + { + "epoch": 0.20250079781832953, + "grad_norm": 0.13741467893123627, + "learning_rate": 0.0005626842916037365, + "loss": 3.1419, + "step": 3490 + }, + { + "epoch": 0.20308102933070296, + "grad_norm": 0.16112880408763885, + "learning_rate": 0.0005624057559319601, + "loss": 3.1449, + "step": 3500 + }, + { + "epoch": 0.2036612608430764, + "grad_norm": 0.153072327375412, + "learning_rate": 0.0005621262540547015, + "loss": 3.1365, + "step": 3510 + }, + { + "epoch": 0.20424149235544983, + "grad_norm": 0.1413891613483429, + "learning_rate": 0.0005618457870011158, + "loss": 3.1307, + "step": 3520 + }, + { + "epoch": 0.20482172386782327, + "grad_norm": 0.15589068830013275, + "learning_rate": 0.0005615643558039121, + "loss": 3.1418, + "step": 3530 + }, + { + "epoch": 0.2054019553801967, + "grad_norm": 0.12889379262924194, + "learning_rate": 0.0005612819614993496, + "loss": 3.1366, + "step": 3540 + }, + { + "epoch": 0.20598218689257014, + "grad_norm": 0.14375300705432892, + "learning_rate": 0.0005609986051272336, + "loss": 3.13, + "step": 3550 + }, + { + "epoch": 0.20656241840494358, + "grad_norm": 0.1587209552526474, + "learning_rate": 0.000560714287730912, + "loss": 3.1338, + "step": 3560 + }, + { + "epoch": 0.207142649917317, + "grad_norm": 0.15273341536521912, + "learning_rate": 0.0005604290103572714, + "loss": 3.1393, + "step": 3570 + }, + { + "epoch": 0.20772288142969045, + "grad_norm": 0.13435807824134827, + "learning_rate": 0.0005601427740567328, + "loss": 3.137, + "step": 3580 + }, + { + "epoch": 0.20830311294206388, + "grad_norm": 0.1391715109348297, + "learning_rate": 0.0005598555798832482, + "loss": 3.1347, + "step": 3590 + }, + { + "epoch": 0.20888334445443732, + "grad_norm": 0.16318084299564362, + "learning_rate": 0.0005595674288942969, + "loss": 3.1279, + "step": 3600 + }, + { + "epoch": 0.20946357596681076, + "grad_norm": 0.1386035829782486, + "learning_rate": 0.0005592783221508807, + "loss": 3.1335, + "step": 3610 + }, + { + "epoch": 0.2100438074791842, + "grad_norm": 0.14639577269554138, + "learning_rate": 0.000558988260717521, + "loss": 3.142, + "step": 3620 + }, + { + "epoch": 0.21062403899155763, + "grad_norm": 0.13666051626205444, + "learning_rate": 0.0005586972456622546, + "loss": 3.1287, + "step": 3630 + }, + { + "epoch": 0.21120427050393106, + "grad_norm": 0.14930284023284912, + "learning_rate": 0.0005584052780566293, + "loss": 3.1283, + "step": 3640 + }, + { + "epoch": 0.2117845020163045, + "grad_norm": 0.13987945020198822, + "learning_rate": 0.0005581123589757002, + "loss": 3.1329, + "step": 3650 + }, + { + "epoch": 0.21236473352867793, + "grad_norm": 0.1452946811914444, + "learning_rate": 0.0005578184894980263, + "loss": 3.1294, + "step": 3660 + }, + { + "epoch": 0.21294496504105137, + "grad_norm": 0.15192043781280518, + "learning_rate": 0.0005575236707056657, + "loss": 3.1206, + "step": 3670 + }, + { + "epoch": 0.2135251965534248, + "grad_norm": 0.16006827354431152, + "learning_rate": 0.0005572279036841721, + "loss": 3.1273, + "step": 3680 + }, + { + "epoch": 0.21410542806579824, + "grad_norm": 0.18141302466392517, + "learning_rate": 0.0005569311895225906, + "loss": 3.1245, + "step": 3690 + }, + { + "epoch": 0.21468565957817168, + "grad_norm": 0.14263153076171875, + "learning_rate": 0.0005566335293134539, + "loss": 3.1211, + "step": 3700 + }, + { + "epoch": 0.21526589109054511, + "grad_norm": 0.1435001790523529, + "learning_rate": 0.0005563349241527781, + "loss": 3.1258, + "step": 3710 + }, + { + "epoch": 0.21584612260291858, + "grad_norm": 0.15155887603759766, + "learning_rate": 0.0005560353751400585, + "loss": 3.1233, + "step": 3720 + }, + { + "epoch": 0.216426354115292, + "grad_norm": 0.1545734703540802, + "learning_rate": 0.0005557348833782663, + "loss": 3.1292, + "step": 3730 + }, + { + "epoch": 0.21700658562766545, + "grad_norm": 0.15549300611019135, + "learning_rate": 0.0005554334499738433, + "loss": 3.1142, + "step": 3740 + }, + { + "epoch": 0.21758681714003889, + "grad_norm": 0.15990693867206573, + "learning_rate": 0.000555131076036699, + "loss": 3.125, + "step": 3750 + }, + { + "epoch": 0.21816704865241232, + "grad_norm": 0.16630201041698456, + "learning_rate": 0.0005548277626802058, + "loss": 3.1216, + "step": 3760 + }, + { + "epoch": 0.21874728016478576, + "grad_norm": 0.1408713161945343, + "learning_rate": 0.0005545235110211954, + "loss": 3.1111, + "step": 3770 + }, + { + "epoch": 0.2193275116771592, + "grad_norm": 0.1488475650548935, + "learning_rate": 0.0005542183221799544, + "loss": 3.1253, + "step": 3780 + }, + { + "epoch": 0.21990774318953263, + "grad_norm": 0.14259935915470123, + "learning_rate": 0.0005539121972802198, + "loss": 3.1179, + "step": 3790 + }, + { + "epoch": 0.22048797470190606, + "grad_norm": 0.14055614173412323, + "learning_rate": 0.0005536051374491757, + "loss": 3.1113, + "step": 3800 + }, + { + "epoch": 0.2210682062142795, + "grad_norm": 0.1665177196264267, + "learning_rate": 0.0005532971438174485, + "loss": 3.1197, + "step": 3810 + }, + { + "epoch": 0.22164843772665294, + "grad_norm": 0.15349626541137695, + "learning_rate": 0.0005529882175191031, + "loss": 3.1086, + "step": 3820 + }, + { + "epoch": 0.22222866923902637, + "grad_norm": 0.14321498572826385, + "learning_rate": 0.0005526783596916385, + "loss": 3.1161, + "step": 3830 + }, + { + "epoch": 0.2228089007513998, + "grad_norm": 0.14768148958683014, + "learning_rate": 0.0005523675714759835, + "loss": 3.1164, + "step": 3840 + }, + { + "epoch": 0.22338913226377324, + "grad_norm": 0.1546637862920761, + "learning_rate": 0.000552055854016493, + "loss": 3.1185, + "step": 3850 + }, + { + "epoch": 0.22396936377614668, + "grad_norm": 0.16114896535873413, + "learning_rate": 0.0005517432084609434, + "loss": 3.1083, + "step": 3860 + }, + { + "epoch": 0.22454959528852012, + "grad_norm": 0.13796792924404144, + "learning_rate": 0.0005514296359605284, + "loss": 3.102, + "step": 3870 + }, + { + "epoch": 0.22512982680089355, + "grad_norm": 0.13948635756969452, + "learning_rate": 0.0005511151376698546, + "loss": 3.1079, + "step": 3880 + }, + { + "epoch": 0.225710058313267, + "grad_norm": 0.13826532661914825, + "learning_rate": 0.0005507997147469378, + "loss": 3.107, + "step": 3890 + }, + { + "epoch": 0.22629028982564042, + "grad_norm": 0.1437525451183319, + "learning_rate": 0.0005504833683531981, + "loss": 3.1076, + "step": 3900 + }, + { + "epoch": 0.22687052133801386, + "grad_norm": 0.14256474375724792, + "learning_rate": 0.0005501660996534563, + "loss": 3.1056, + "step": 3910 + }, + { + "epoch": 0.2274507528503873, + "grad_norm": 0.1531156748533249, + "learning_rate": 0.0005498479098159289, + "loss": 3.101, + "step": 3920 + }, + { + "epoch": 0.22803098436276073, + "grad_norm": 0.16901366412639618, + "learning_rate": 0.0005495288000122242, + "loss": 3.0981, + "step": 3930 + }, + { + "epoch": 0.22861121587513417, + "grad_norm": 0.1440243273973465, + "learning_rate": 0.0005492087714173378, + "loss": 3.1052, + "step": 3940 + }, + { + "epoch": 0.2291914473875076, + "grad_norm": 0.1603139340877533, + "learning_rate": 0.0005488878252096487, + "loss": 3.105, + "step": 3950 + }, + { + "epoch": 0.22977167889988107, + "grad_norm": 0.1588706523180008, + "learning_rate": 0.0005485659625709144, + "loss": 3.1107, + "step": 3960 + }, + { + "epoch": 0.2303519104122545, + "grad_norm": 0.1452343761920929, + "learning_rate": 0.0005482431846862667, + "loss": 3.1074, + "step": 3970 + }, + { + "epoch": 0.23093214192462794, + "grad_norm": 0.15799881517887115, + "learning_rate": 0.0005479194927442078, + "loss": 3.0985, + "step": 3980 + }, + { + "epoch": 0.23151237343700137, + "grad_norm": 0.12657681107521057, + "learning_rate": 0.0005475948879366053, + "loss": 3.0958, + "step": 3990 + }, + { + "epoch": 0.2320926049493748, + "grad_norm": 0.13606688380241394, + "learning_rate": 0.000547269371458688, + "loss": 3.0999, + "step": 4000 + }, + { + "epoch": 0.2320926049493748, + "eval_loss": 3.0630993843078613, + "eval_runtime": 3.264, + "eval_samples_per_second": 1326.576, + "eval_steps_per_second": 2.757, + "step": 4000 + }, + { + "epoch": 0.23267283646174824, + "grad_norm": 0.16136619448661804, + "learning_rate": 0.0005469429445090417, + "loss": 3.1004, + "step": 4010 + }, + { + "epoch": 0.23325306797412168, + "grad_norm": 0.14767828583717346, + "learning_rate": 0.0005466156082896047, + "loss": 3.1075, + "step": 4020 + }, + { + "epoch": 0.23383329948649512, + "grad_norm": 0.1492021530866623, + "learning_rate": 0.0005462873640056632, + "loss": 3.1025, + "step": 4030 + }, + { + "epoch": 0.23441353099886855, + "grad_norm": 0.14654645323753357, + "learning_rate": 0.000545958212865847, + "loss": 3.0966, + "step": 4040 + }, + { + "epoch": 0.234993762511242, + "grad_norm": 0.15648731589317322, + "learning_rate": 0.0005456281560821252, + "loss": 3.0937, + "step": 4050 + }, + { + "epoch": 0.23557399402361542, + "grad_norm": 0.13584694266319275, + "learning_rate": 0.0005452971948698014, + "loss": 3.1052, + "step": 4060 + }, + { + "epoch": 0.23615422553598886, + "grad_norm": 0.13829472661018372, + "learning_rate": 0.0005449653304475094, + "loss": 3.0933, + "step": 4070 + }, + { + "epoch": 0.2367344570483623, + "grad_norm": 0.16889816522598267, + "learning_rate": 0.0005446325640372088, + "loss": 3.0949, + "step": 4080 + }, + { + "epoch": 0.23731468856073573, + "grad_norm": 0.12351599335670471, + "learning_rate": 0.0005442988968641804, + "loss": 3.0914, + "step": 4090 + }, + { + "epoch": 0.23789492007310917, + "grad_norm": 0.14327877759933472, + "learning_rate": 0.0005439643301570216, + "loss": 3.0814, + "step": 4100 + }, + { + "epoch": 0.2384751515854826, + "grad_norm": 0.15155468881130219, + "learning_rate": 0.0005436288651476421, + "loss": 3.0849, + "step": 4110 + }, + { + "epoch": 0.23905538309785604, + "grad_norm": 0.14292922616004944, + "learning_rate": 0.0005432925030712594, + "loss": 3.0887, + "step": 4120 + }, + { + "epoch": 0.23963561461022947, + "grad_norm": 0.14884264767169952, + "learning_rate": 0.0005429552451663936, + "loss": 3.0911, + "step": 4130 + }, + { + "epoch": 0.2402158461226029, + "grad_norm": 0.1403530389070511, + "learning_rate": 0.0005426170926748639, + "loss": 3.0926, + "step": 4140 + }, + { + "epoch": 0.24079607763497635, + "grad_norm": 0.14543718099594116, + "learning_rate": 0.0005422780468417829, + "loss": 3.0897, + "step": 4150 + }, + { + "epoch": 0.24137630914734978, + "grad_norm": 0.12813718616962433, + "learning_rate": 0.0005419381089155532, + "loss": 3.0902, + "step": 4160 + }, + { + "epoch": 0.24195654065972322, + "grad_norm": 0.13375824689865112, + "learning_rate": 0.0005415972801478617, + "loss": 3.0915, + "step": 4170 + }, + { + "epoch": 0.24253677217209665, + "grad_norm": 0.14347635209560394, + "learning_rate": 0.0005412555617936755, + "loss": 3.0892, + "step": 4180 + }, + { + "epoch": 0.2431170036844701, + "grad_norm": 0.14166522026062012, + "learning_rate": 0.0005409129551112377, + "loss": 3.0808, + "step": 4190 + }, + { + "epoch": 0.24369723519684355, + "grad_norm": 0.13924048840999603, + "learning_rate": 0.0005405694613620617, + "loss": 3.0854, + "step": 4200 + }, + { + "epoch": 0.244277466709217, + "grad_norm": 0.13338492810726166, + "learning_rate": 0.0005402250818109276, + "loss": 3.0836, + "step": 4210 + }, + { + "epoch": 0.24485769822159043, + "grad_norm": 0.14531342685222626, + "learning_rate": 0.0005398798177258768, + "loss": 3.0971, + "step": 4220 + }, + { + "epoch": 0.24543792973396386, + "grad_norm": 0.1432162970304489, + "learning_rate": 0.0005395336703782082, + "loss": 3.0838, + "step": 4230 + }, + { + "epoch": 0.2460181612463373, + "grad_norm": 0.15475274622440338, + "learning_rate": 0.0005391866410424722, + "loss": 3.0764, + "step": 4240 + }, + { + "epoch": 0.24659839275871073, + "grad_norm": 0.15521539747714996, + "learning_rate": 0.0005388387309964675, + "loss": 3.0837, + "step": 4250 + }, + { + "epoch": 0.24717862427108417, + "grad_norm": 0.1430870145559311, + "learning_rate": 0.0005384899415212351, + "loss": 3.0889, + "step": 4260 + }, + { + "epoch": 0.2477588557834576, + "grad_norm": 0.14807622134685516, + "learning_rate": 0.0005381402739010545, + "loss": 3.0769, + "step": 4270 + }, + { + "epoch": 0.24833908729583104, + "grad_norm": 0.1509249359369278, + "learning_rate": 0.0005377897294234385, + "loss": 3.0815, + "step": 4280 + }, + { + "epoch": 0.24891931880820448, + "grad_norm": 0.1451188027858734, + "learning_rate": 0.0005374383093791287, + "loss": 3.0766, + "step": 4290 + }, + { + "epoch": 0.2494995503205779, + "grad_norm": 0.130240797996521, + "learning_rate": 0.0005370860150620901, + "loss": 3.0824, + "step": 4300 + }, + { + "epoch": 0.2500797818329513, + "grad_norm": 0.14696471393108368, + "learning_rate": 0.0005367328477695077, + "loss": 3.0678, + "step": 4310 + }, + { + "epoch": 0.2506600133453248, + "grad_norm": 0.13198255002498627, + "learning_rate": 0.0005363788088017803, + "loss": 3.0759, + "step": 4320 + }, + { + "epoch": 0.25124024485769825, + "grad_norm": 0.1413690447807312, + "learning_rate": 0.0005360238994625166, + "loss": 3.0842, + "step": 4330 + }, + { + "epoch": 0.25182047637007166, + "grad_norm": 0.1560727059841156, + "learning_rate": 0.0005356681210585297, + "loss": 3.074, + "step": 4340 + }, + { + "epoch": 0.2524007078824451, + "grad_norm": 0.13727669417858124, + "learning_rate": 0.0005353114748998332, + "loss": 3.082, + "step": 4350 + }, + { + "epoch": 0.2529809393948185, + "grad_norm": 0.1479531228542328, + "learning_rate": 0.0005349539622996356, + "loss": 3.0804, + "step": 4360 + }, + { + "epoch": 0.253561170907192, + "grad_norm": 0.13756506145000458, + "learning_rate": 0.0005345955845743358, + "loss": 3.0829, + "step": 4370 + }, + { + "epoch": 0.2541414024195654, + "grad_norm": 0.14778585731983185, + "learning_rate": 0.0005342363430435177, + "loss": 3.0785, + "step": 4380 + }, + { + "epoch": 0.25472163393193886, + "grad_norm": 0.13227440416812897, + "learning_rate": 0.0005338762390299467, + "loss": 3.0776, + "step": 4390 + }, + { + "epoch": 0.25530186544431227, + "grad_norm": 0.14178766310214996, + "learning_rate": 0.0005335152738595634, + "loss": 3.0799, + "step": 4400 + }, + { + "epoch": 0.25588209695668573, + "grad_norm": 0.14833244681358337, + "learning_rate": 0.0005331534488614794, + "loss": 3.0674, + "step": 4410 + }, + { + "epoch": 0.25646232846905914, + "grad_norm": 0.13829241693019867, + "learning_rate": 0.0005327907653679721, + "loss": 3.0643, + "step": 4420 + }, + { + "epoch": 0.2570425599814326, + "grad_norm": 0.16908784210681915, + "learning_rate": 0.0005324272247144802, + "loss": 3.0649, + "step": 4430 + }, + { + "epoch": 0.257622791493806, + "grad_norm": 0.14392369985580444, + "learning_rate": 0.0005320628282395985, + "loss": 3.0761, + "step": 4440 + }, + { + "epoch": 0.2582030230061795, + "grad_norm": 0.16387993097305298, + "learning_rate": 0.0005316975772850729, + "loss": 3.0666, + "step": 4450 + }, + { + "epoch": 0.2587832545185529, + "grad_norm": 0.13506962358951569, + "learning_rate": 0.0005313314731957957, + "loss": 3.0672, + "step": 4460 + }, + { + "epoch": 0.25936348603092635, + "grad_norm": 0.1522989273071289, + "learning_rate": 0.0005309645173198007, + "loss": 3.0607, + "step": 4470 + }, + { + "epoch": 0.25994371754329976, + "grad_norm": 0.13824021816253662, + "learning_rate": 0.0005305967110082576, + "loss": 3.0627, + "step": 4480 + }, + { + "epoch": 0.2605239490556732, + "grad_norm": 0.13685718178749084, + "learning_rate": 0.000530228055615468, + "loss": 3.0612, + "step": 4490 + }, + { + "epoch": 0.26110418056804663, + "grad_norm": 0.13309134542942047, + "learning_rate": 0.0005298585524988594, + "loss": 3.0548, + "step": 4500 + }, + { + "epoch": 0.2616844120804201, + "grad_norm": 0.17121103405952454, + "learning_rate": 0.0005294882030189812, + "loss": 3.066, + "step": 4510 + }, + { + "epoch": 0.2622646435927935, + "grad_norm": 0.13467055559158325, + "learning_rate": 0.000529117008539499, + "loss": 3.0606, + "step": 4520 + }, + { + "epoch": 0.26284487510516696, + "grad_norm": 0.12970523536205292, + "learning_rate": 0.0005287449704271896, + "loss": 3.0553, + "step": 4530 + }, + { + "epoch": 0.26342510661754037, + "grad_norm": 0.1509917676448822, + "learning_rate": 0.0005283720900519365, + "loss": 3.0571, + "step": 4540 + }, + { + "epoch": 0.26400533812991384, + "grad_norm": 0.1372883915901184, + "learning_rate": 0.0005279983687867243, + "loss": 3.0635, + "step": 4550 + }, + { + "epoch": 0.2645855696422873, + "grad_norm": 0.1482354998588562, + "learning_rate": 0.0005276238080076335, + "loss": 3.0619, + "step": 4560 + }, + { + "epoch": 0.2651658011546607, + "grad_norm": 0.13884900510311127, + "learning_rate": 0.0005272484090938365, + "loss": 3.069, + "step": 4570 + }, + { + "epoch": 0.26574603266703417, + "grad_norm": 0.14500798285007477, + "learning_rate": 0.0005268721734275914, + "loss": 3.0715, + "step": 4580 + }, + { + "epoch": 0.2663262641794076, + "grad_norm": 0.1357218474149704, + "learning_rate": 0.000526495102394237, + "loss": 3.0584, + "step": 4590 + }, + { + "epoch": 0.26690649569178104, + "grad_norm": 0.14025723934173584, + "learning_rate": 0.0005261171973821887, + "loss": 3.0613, + "step": 4600 + }, + { + "epoch": 0.26748672720415445, + "grad_norm": 0.15253092348575592, + "learning_rate": 0.0005257384597829322, + "loss": 3.0584, + "step": 4610 + }, + { + "epoch": 0.2680669587165279, + "grad_norm": 0.14573270082473755, + "learning_rate": 0.0005253588909910191, + "loss": 3.0634, + "step": 4620 + }, + { + "epoch": 0.2686471902289013, + "grad_norm": 0.15005233883857727, + "learning_rate": 0.0005249784924040614, + "loss": 3.0526, + "step": 4630 + }, + { + "epoch": 0.2692274217412748, + "grad_norm": 0.15314225852489471, + "learning_rate": 0.0005245972654227265, + "loss": 3.0635, + "step": 4640 + }, + { + "epoch": 0.2698076532536482, + "grad_norm": 0.14412705600261688, + "learning_rate": 0.0005242152114507321, + "loss": 3.055, + "step": 4650 + }, + { + "epoch": 0.27038788476602166, + "grad_norm": 0.15046367049217224, + "learning_rate": 0.0005238323318948412, + "loss": 3.066, + "step": 4660 + }, + { + "epoch": 0.27096811627839507, + "grad_norm": 0.12618590891361237, + "learning_rate": 0.0005234486281648559, + "loss": 3.0433, + "step": 4670 + }, + { + "epoch": 0.27154834779076853, + "grad_norm": 0.14097653329372406, + "learning_rate": 0.000523064101673614, + "loss": 3.0593, + "step": 4680 + }, + { + "epoch": 0.27212857930314194, + "grad_norm": 0.14015048742294312, + "learning_rate": 0.0005226787538369821, + "loss": 3.057, + "step": 4690 + }, + { + "epoch": 0.2727088108155154, + "grad_norm": 0.1534152328968048, + "learning_rate": 0.0005222925860738513, + "loss": 3.06, + "step": 4700 + }, + { + "epoch": 0.2732890423278888, + "grad_norm": 0.1350966989994049, + "learning_rate": 0.0005219055998061319, + "loss": 3.0518, + "step": 4710 + }, + { + "epoch": 0.2738692738402623, + "grad_norm": 0.15589705109596252, + "learning_rate": 0.0005215177964587478, + "loss": 3.0468, + "step": 4720 + }, + { + "epoch": 0.2744495053526357, + "grad_norm": 0.14144299924373627, + "learning_rate": 0.0005211291774596316, + "loss": 3.0555, + "step": 4730 + }, + { + "epoch": 0.27502973686500914, + "grad_norm": 0.14553704857826233, + "learning_rate": 0.000520739744239719, + "loss": 3.0531, + "step": 4740 + }, + { + "epoch": 0.27560996837738255, + "grad_norm": 0.15157508850097656, + "learning_rate": 0.0005203494982329441, + "loss": 3.0504, + "step": 4750 + }, + { + "epoch": 0.276190199889756, + "grad_norm": 0.14391539990901947, + "learning_rate": 0.0005199584408762335, + "loss": 3.0512, + "step": 4760 + }, + { + "epoch": 0.2767704314021294, + "grad_norm": 0.1297539621591568, + "learning_rate": 0.0005195665736095013, + "loss": 3.036, + "step": 4770 + }, + { + "epoch": 0.2773506629145029, + "grad_norm": 0.13723768293857574, + "learning_rate": 0.0005191738978756439, + "loss": 3.0532, + "step": 4780 + }, + { + "epoch": 0.2779308944268763, + "grad_norm": 0.1422174870967865, + "learning_rate": 0.0005187804151205345, + "loss": 3.0605, + "step": 4790 + }, + { + "epoch": 0.27851112593924976, + "grad_norm": 0.137346088886261, + "learning_rate": 0.0005183861267930177, + "loss": 3.0552, + "step": 4800 + }, + { + "epoch": 0.2790913574516232, + "grad_norm": 0.13471810519695282, + "learning_rate": 0.0005179910343449046, + "loss": 3.0426, + "step": 4810 + }, + { + "epoch": 0.27967158896399663, + "grad_norm": 0.12727439403533936, + "learning_rate": 0.0005175951392309669, + "loss": 3.0448, + "step": 4820 + }, + { + "epoch": 0.2802518204763701, + "grad_norm": 0.13242101669311523, + "learning_rate": 0.0005171984429089318, + "loss": 3.0546, + "step": 4830 + }, + { + "epoch": 0.2808320519887435, + "grad_norm": 0.14276637136936188, + "learning_rate": 0.0005168009468394769, + "loss": 3.0392, + "step": 4840 + }, + { + "epoch": 0.28141228350111697, + "grad_norm": 0.1340208798646927, + "learning_rate": 0.0005164026524862242, + "loss": 3.0491, + "step": 4850 + }, + { + "epoch": 0.2819925150134904, + "grad_norm": 0.14000356197357178, + "learning_rate": 0.0005160035613157354, + "loss": 3.0396, + "step": 4860 + }, + { + "epoch": 0.28257274652586384, + "grad_norm": 0.15974439680576324, + "learning_rate": 0.0005156036747975059, + "loss": 3.0406, + "step": 4870 + }, + { + "epoch": 0.28315297803823725, + "grad_norm": 0.1382746398448944, + "learning_rate": 0.0005152029944039597, + "loss": 3.0449, + "step": 4880 + }, + { + "epoch": 0.2837332095506107, + "grad_norm": 0.14049001038074493, + "learning_rate": 0.000514801521610444, + "loss": 3.0463, + "step": 4890 + }, + { + "epoch": 0.2843134410629841, + "grad_norm": 0.13699445128440857, + "learning_rate": 0.0005143992578952238, + "loss": 3.0393, + "step": 4900 + }, + { + "epoch": 0.2848936725753576, + "grad_norm": 0.1515870988368988, + "learning_rate": 0.0005139962047394761, + "loss": 3.0399, + "step": 4910 + }, + { + "epoch": 0.285473904087731, + "grad_norm": 0.1437605917453766, + "learning_rate": 0.0005135923636272849, + "loss": 3.0378, + "step": 4920 + }, + { + "epoch": 0.28605413560010445, + "grad_norm": 0.13769088685512543, + "learning_rate": 0.0005131877360456355, + "loss": 3.0377, + "step": 4930 + }, + { + "epoch": 0.28663436711247786, + "grad_norm": 0.15194256603717804, + "learning_rate": 0.000512782323484409, + "loss": 3.0399, + "step": 4940 + }, + { + "epoch": 0.2872145986248513, + "grad_norm": 0.14672812819480896, + "learning_rate": 0.0005123761274363769, + "loss": 3.04, + "step": 4950 + }, + { + "epoch": 0.28779483013722473, + "grad_norm": 0.13162557780742645, + "learning_rate": 0.0005119691493971957, + "loss": 3.0317, + "step": 4960 + }, + { + "epoch": 0.2883750616495982, + "grad_norm": 0.13286751508712769, + "learning_rate": 0.0005115613908654011, + "loss": 3.0486, + "step": 4970 + }, + { + "epoch": 0.2889552931619716, + "grad_norm": 0.13034851849079132, + "learning_rate": 0.0005111528533424027, + "loss": 3.0399, + "step": 4980 + }, + { + "epoch": 0.28953552467434507, + "grad_norm": 0.1405908614397049, + "learning_rate": 0.0005107435383324786, + "loss": 3.0372, + "step": 4990 + }, + { + "epoch": 0.2901157561867185, + "grad_norm": 0.16415055096149445, + "learning_rate": 0.0005103334473427695, + "loss": 3.0333, + "step": 5000 + }, + { + "epoch": 0.2901157561867185, + "eval_loss": 2.9981322288513184, + "eval_runtime": 3.2581, + "eval_samples_per_second": 1329.001, + "eval_steps_per_second": 2.762, + "step": 5000 + }, + { + "epoch": 0.29069598769909194, + "grad_norm": 0.12301915884017944, + "learning_rate": 0.0005099225818832731, + "loss": 3.0312, + "step": 5010 + }, + { + "epoch": 0.29127621921146535, + "grad_norm": 0.16767041385173798, + "learning_rate": 0.0005095109434668395, + "loss": 3.0247, + "step": 5020 + }, + { + "epoch": 0.2918564507238388, + "grad_norm": 0.13234609365463257, + "learning_rate": 0.0005090985336091642, + "loss": 3.0348, + "step": 5030 + }, + { + "epoch": 0.2924366822362123, + "grad_norm": 0.14020933210849762, + "learning_rate": 0.0005086853538287835, + "loss": 3.0317, + "step": 5040 + }, + { + "epoch": 0.2930169137485857, + "grad_norm": 0.14580604434013367, + "learning_rate": 0.0005082714056470687, + "loss": 3.0321, + "step": 5050 + }, + { + "epoch": 0.29359714526095915, + "grad_norm": 0.13627541065216064, + "learning_rate": 0.0005078566905882205, + "loss": 3.0318, + "step": 5060 + }, + { + "epoch": 0.29417737677333256, + "grad_norm": 0.12629657983779907, + "learning_rate": 0.0005074412101792631, + "loss": 3.0284, + "step": 5070 + }, + { + "epoch": 0.294757608285706, + "grad_norm": 0.13409367203712463, + "learning_rate": 0.0005070249659500387, + "loss": 3.0381, + "step": 5080 + }, + { + "epoch": 0.2953378397980794, + "grad_norm": 0.1341470181941986, + "learning_rate": 0.0005066079594332023, + "loss": 3.0229, + "step": 5090 + }, + { + "epoch": 0.2959180713104529, + "grad_norm": 0.1630919873714447, + "learning_rate": 0.0005061901921642156, + "loss": 3.0315, + "step": 5100 + }, + { + "epoch": 0.2964983028228263, + "grad_norm": 0.12825888395309448, + "learning_rate": 0.0005057716656813416, + "loss": 3.0249, + "step": 5110 + }, + { + "epoch": 0.29707853433519976, + "grad_norm": 0.1613105833530426, + "learning_rate": 0.0005053523815256384, + "loss": 3.0238, + "step": 5120 + }, + { + "epoch": 0.29765876584757317, + "grad_norm": 0.14038483798503876, + "learning_rate": 0.0005049323412409542, + "loss": 3.0294, + "step": 5130 + }, + { + "epoch": 0.29823899735994663, + "grad_norm": 0.16509568691253662, + "learning_rate": 0.0005045115463739215, + "loss": 3.0356, + "step": 5140 + }, + { + "epoch": 0.29881922887232004, + "grad_norm": 0.14289237558841705, + "learning_rate": 0.0005040899984739509, + "loss": 3.0228, + "step": 5150 + }, + { + "epoch": 0.2993994603846935, + "grad_norm": 0.14584140479564667, + "learning_rate": 0.000503667699093226, + "loss": 3.0294, + "step": 5160 + }, + { + "epoch": 0.2999796918970669, + "grad_norm": 0.12970221042633057, + "learning_rate": 0.0005032446497866973, + "loss": 3.0321, + "step": 5170 + }, + { + "epoch": 0.3005599234094404, + "grad_norm": 0.13744401931762695, + "learning_rate": 0.0005028208521120769, + "loss": 3.0236, + "step": 5180 + }, + { + "epoch": 0.3011401549218138, + "grad_norm": 0.1317235380411148, + "learning_rate": 0.0005023963076298321, + "loss": 3.0254, + "step": 5190 + }, + { + "epoch": 0.30172038643418725, + "grad_norm": 0.14213494956493378, + "learning_rate": 0.0005019710179031801, + "loss": 3.0275, + "step": 5200 + }, + { + "epoch": 0.30230061794656066, + "grad_norm": 0.13712069392204285, + "learning_rate": 0.0005015449844980823, + "loss": 3.0249, + "step": 5210 + }, + { + "epoch": 0.3028808494589341, + "grad_norm": 0.14411009848117828, + "learning_rate": 0.0005011182089832381, + "loss": 3.0215, + "step": 5220 + }, + { + "epoch": 0.30346108097130753, + "grad_norm": 0.12583871185779572, + "learning_rate": 0.0005006906929300799, + "loss": 3.0275, + "step": 5230 + }, + { + "epoch": 0.304041312483681, + "grad_norm": 0.14499635994434357, + "learning_rate": 0.0005002624379127666, + "loss": 3.0258, + "step": 5240 + }, + { + "epoch": 0.3046215439960544, + "grad_norm": 0.14918765425682068, + "learning_rate": 0.0004998334455081779, + "loss": 3.0209, + "step": 5250 + }, + { + "epoch": 0.30520177550842786, + "grad_norm": 0.13245496153831482, + "learning_rate": 0.0004994037172959089, + "loss": 3.0212, + "step": 5260 + }, + { + "epoch": 0.3057820070208013, + "grad_norm": 0.12850724160671234, + "learning_rate": 0.0004989732548582638, + "loss": 3.0258, + "step": 5270 + }, + { + "epoch": 0.30636223853317474, + "grad_norm": 0.1346123367547989, + "learning_rate": 0.0004985420597802503, + "loss": 3.0138, + "step": 5280 + }, + { + "epoch": 0.3069424700455482, + "grad_norm": 0.14746621251106262, + "learning_rate": 0.0004981101336495741, + "loss": 3.0202, + "step": 5290 + }, + { + "epoch": 0.3075227015579216, + "grad_norm": 0.140406534075737, + "learning_rate": 0.0004976774780566324, + "loss": 3.0276, + "step": 5300 + }, + { + "epoch": 0.30810293307029507, + "grad_norm": 0.133416548371315, + "learning_rate": 0.0004972440945945083, + "loss": 3.0228, + "step": 5310 + }, + { + "epoch": 0.3086831645826685, + "grad_norm": 0.140433207154274, + "learning_rate": 0.0004968099848589651, + "loss": 3.0219, + "step": 5320 + }, + { + "epoch": 0.30926339609504194, + "grad_norm": 0.14963370561599731, + "learning_rate": 0.0004963751504484403, + "loss": 3.0119, + "step": 5330 + }, + { + "epoch": 0.30984362760741535, + "grad_norm": 0.12273452430963516, + "learning_rate": 0.0004959395929640401, + "loss": 3.0136, + "step": 5340 + }, + { + "epoch": 0.3104238591197888, + "grad_norm": 0.14232607185840607, + "learning_rate": 0.0004955033140095322, + "loss": 3.0088, + "step": 5350 + }, + { + "epoch": 0.3110040906321622, + "grad_norm": 0.15276071429252625, + "learning_rate": 0.0004950663151913419, + "loss": 3.0189, + "step": 5360 + }, + { + "epoch": 0.3115843221445357, + "grad_norm": 0.14110638201236725, + "learning_rate": 0.0004946285981185446, + "loss": 3.0273, + "step": 5370 + }, + { + "epoch": 0.3121645536569091, + "grad_norm": 0.12971307337284088, + "learning_rate": 0.0004941901644028601, + "loss": 3.0181, + "step": 5380 + }, + { + "epoch": 0.31274478516928256, + "grad_norm": 0.12775759398937225, + "learning_rate": 0.0004937510156586474, + "loss": 3.0108, + "step": 5390 + }, + { + "epoch": 0.31332501668165597, + "grad_norm": 0.15120139718055725, + "learning_rate": 0.0004933111535028983, + "loss": 3.0142, + "step": 5400 + }, + { + "epoch": 0.31390524819402943, + "grad_norm": 0.14965811371803284, + "learning_rate": 0.0004928705795552312, + "loss": 3.0137, + "step": 5410 + }, + { + "epoch": 0.31448547970640284, + "grad_norm": 0.1459018588066101, + "learning_rate": 0.0004924292954378856, + "loss": 3.0146, + "step": 5420 + }, + { + "epoch": 0.3150657112187763, + "grad_norm": 0.1286230981349945, + "learning_rate": 0.0004919873027757159, + "loss": 3.0162, + "step": 5430 + }, + { + "epoch": 0.3156459427311497, + "grad_norm": 0.13560357689857483, + "learning_rate": 0.0004915446031961854, + "loss": 3.0129, + "step": 5440 + }, + { + "epoch": 0.3162261742435232, + "grad_norm": 0.1419978141784668, + "learning_rate": 0.0004911011983293601, + "loss": 3.0115, + "step": 5450 + }, + { + "epoch": 0.3168064057558966, + "grad_norm": 0.12910611927509308, + "learning_rate": 0.0004906570898079032, + "loss": 3.0151, + "step": 5460 + }, + { + "epoch": 0.31738663726827004, + "grad_norm": 0.15491628646850586, + "learning_rate": 0.0004902122792670692, + "loss": 3.0118, + "step": 5470 + }, + { + "epoch": 0.31796686878064345, + "grad_norm": 0.12448934465646744, + "learning_rate": 0.0004897667683446967, + "loss": 3.0119, + "step": 5480 + }, + { + "epoch": 0.3185471002930169, + "grad_norm": 0.1288510411977768, + "learning_rate": 0.0004893205586812036, + "loss": 3.0078, + "step": 5490 + }, + { + "epoch": 0.3191273318053903, + "grad_norm": 0.12903016805648804, + "learning_rate": 0.000488873651919581, + "loss": 3.0085, + "step": 5500 + }, + { + "epoch": 0.3197075633177638, + "grad_norm": 0.14042973518371582, + "learning_rate": 0.0004884260497053859, + "loss": 3.0093, + "step": 5510 + }, + { + "epoch": 0.32028779483013725, + "grad_norm": 0.13995361328125, + "learning_rate": 0.0004879777536867369, + "loss": 3.0009, + "step": 5520 + }, + { + "epoch": 0.32086802634251066, + "grad_norm": 0.13979199528694153, + "learning_rate": 0.00048752876551430677, + "loss": 3.0089, + "step": 5530 + }, + { + "epoch": 0.3214482578548841, + "grad_norm": 0.130417600274086, + "learning_rate": 0.0004870790868413171, + "loss": 3.0087, + "step": 5540 + }, + { + "epoch": 0.32202848936725753, + "grad_norm": 0.13676275312900543, + "learning_rate": 0.00048662871932353164, + "loss": 3.0092, + "step": 5550 + }, + { + "epoch": 0.322608720879631, + "grad_norm": 0.12869158387184143, + "learning_rate": 0.00048617766461925104, + "loss": 3.0074, + "step": 5560 + }, + { + "epoch": 0.3231889523920044, + "grad_norm": 0.13846737146377563, + "learning_rate": 0.0004857259243893058, + "loss": 3.0079, + "step": 5570 + }, + { + "epoch": 0.32376918390437787, + "grad_norm": 0.1349971890449524, + "learning_rate": 0.0004852735002970509, + "loss": 2.9915, + "step": 5580 + }, + { + "epoch": 0.3243494154167513, + "grad_norm": 0.13398951292037964, + "learning_rate": 0.000484820394008359, + "loss": 2.9982, + "step": 5590 + }, + { + "epoch": 0.32492964692912474, + "grad_norm": 0.13627557456493378, + "learning_rate": 0.0004843666071916152, + "loss": 3.0019, + "step": 5600 + }, + { + "epoch": 0.32550987844149815, + "grad_norm": 0.13470283150672913, + "learning_rate": 0.00048391214151771, + "loss": 3.0015, + "step": 5610 + }, + { + "epoch": 0.3260901099538716, + "grad_norm": 0.14207038283348083, + "learning_rate": 0.0004834569986600336, + "loss": 3.0051, + "step": 5620 + }, + { + "epoch": 0.326670341466245, + "grad_norm": 0.13324964046478271, + "learning_rate": 0.00048300118029446967, + "loss": 2.9956, + "step": 5630 + }, + { + "epoch": 0.3272505729786185, + "grad_norm": 0.15288645029067993, + "learning_rate": 0.0004825446880993892, + "loss": 3.0087, + "step": 5640 + }, + { + "epoch": 0.3278308044909919, + "grad_norm": 0.13744772970676422, + "learning_rate": 0.00048208752375564424, + "loss": 3.0049, + "step": 5650 + }, + { + "epoch": 0.32841103600336535, + "grad_norm": 0.13114534318447113, + "learning_rate": 0.00048162968894656193, + "loss": 2.9993, + "step": 5660 + }, + { + "epoch": 0.32899126751573876, + "grad_norm": 0.1254429966211319, + "learning_rate": 0.00048117118535793773, + "loss": 2.9937, + "step": 5670 + }, + { + "epoch": 0.3295714990281122, + "grad_norm": 0.15155521035194397, + "learning_rate": 0.00048071201467803017, + "loss": 3.0017, + "step": 5680 + }, + { + "epoch": 0.33015173054048563, + "grad_norm": 0.1420249044895172, + "learning_rate": 0.00048025217859755365, + "loss": 3.017, + "step": 5690 + }, + { + "epoch": 0.3307319620528591, + "grad_norm": 0.14615775644779205, + "learning_rate": 0.0004797916788096728, + "loss": 3.0052, + "step": 5700 + }, + { + "epoch": 0.3313121935652325, + "grad_norm": 0.12851493060588837, + "learning_rate": 0.00047933051700999605, + "loss": 3.0041, + "step": 5710 + }, + { + "epoch": 0.33189242507760597, + "grad_norm": 0.13371190428733826, + "learning_rate": 0.00047886869489656956, + "loss": 2.9879, + "step": 5720 + }, + { + "epoch": 0.3324726565899794, + "grad_norm": 0.13223771750926971, + "learning_rate": 0.0004784062141698707, + "loss": 2.993, + "step": 5730 + }, + { + "epoch": 0.33305288810235284, + "grad_norm": 0.13460920751094818, + "learning_rate": 0.00047794307653280184, + "loss": 2.9928, + "step": 5740 + }, + { + "epoch": 0.3336331196147263, + "grad_norm": 0.12678171694278717, + "learning_rate": 0.0004774792836906844, + "loss": 3.0053, + "step": 5750 + }, + { + "epoch": 0.3342133511270997, + "grad_norm": 0.14595790207386017, + "learning_rate": 0.0004770148373512522, + "loss": 2.9974, + "step": 5760 + }, + { + "epoch": 0.3347935826394732, + "grad_norm": 0.1505734771490097, + "learning_rate": 0.00047654973922464525, + "loss": 3.0053, + "step": 5770 + }, + { + "epoch": 0.3353738141518466, + "grad_norm": 0.13636811077594757, + "learning_rate": 0.00047608399102340367, + "loss": 2.9984, + "step": 5780 + }, + { + "epoch": 0.33595404566422005, + "grad_norm": 0.14487333595752716, + "learning_rate": 0.000475617594462461, + "loss": 3.0013, + "step": 5790 + }, + { + "epoch": 0.33653427717659345, + "grad_norm": 0.13392585515975952, + "learning_rate": 0.00047515055125913825, + "loss": 2.9897, + "step": 5800 + }, + { + "epoch": 0.3371145086889669, + "grad_norm": 0.1241224929690361, + "learning_rate": 0.0004746828631331376, + "loss": 2.9918, + "step": 5810 + }, + { + "epoch": 0.3376947402013403, + "grad_norm": 0.1381169706583023, + "learning_rate": 0.00047421453180653553, + "loss": 2.9874, + "step": 5820 + }, + { + "epoch": 0.3382749717137138, + "grad_norm": 0.12413561344146729, + "learning_rate": 0.00047374555900377716, + "loss": 2.9928, + "step": 5830 + }, + { + "epoch": 0.3388552032260872, + "grad_norm": 0.13286706805229187, + "learning_rate": 0.0004732759464516694, + "loss": 2.9907, + "step": 5840 + }, + { + "epoch": 0.33943543473846066, + "grad_norm": 0.1558184027671814, + "learning_rate": 0.0004728056958793749, + "loss": 3.0036, + "step": 5850 + }, + { + "epoch": 0.34001566625083407, + "grad_norm": 0.13220670819282532, + "learning_rate": 0.0004723348090184056, + "loss": 2.9945, + "step": 5860 + }, + { + "epoch": 0.34059589776320753, + "grad_norm": 0.13015997409820557, + "learning_rate": 0.00047186328760261603, + "loss": 3.0005, + "step": 5870 + }, + { + "epoch": 0.34117612927558094, + "grad_norm": 0.146441251039505, + "learning_rate": 0.0004713911333681976, + "loss": 2.9984, + "step": 5880 + }, + { + "epoch": 0.3417563607879544, + "grad_norm": 0.12352869659662247, + "learning_rate": 0.0004709183480536718, + "loss": 2.9946, + "step": 5890 + }, + { + "epoch": 0.3423365923003278, + "grad_norm": 0.12516902387142181, + "learning_rate": 0.0004704449333998834, + "loss": 2.9918, + "step": 5900 + }, + { + "epoch": 0.3429168238127013, + "grad_norm": 0.14155182242393494, + "learning_rate": 0.00046997089114999494, + "loss": 2.9937, + "step": 5910 + }, + { + "epoch": 0.3434970553250747, + "grad_norm": 0.12636148929595947, + "learning_rate": 0.0004694962230494796, + "loss": 2.9869, + "step": 5920 + }, + { + "epoch": 0.34407728683744815, + "grad_norm": 0.14390048384666443, + "learning_rate": 0.000469020930846115, + "loss": 2.9759, + "step": 5930 + }, + { + "epoch": 0.34465751834982156, + "grad_norm": 0.14705798029899597, + "learning_rate": 0.0004685450162899768, + "loss": 2.9876, + "step": 5940 + }, + { + "epoch": 0.345237749862195, + "grad_norm": 0.13937653601169586, + "learning_rate": 0.00046806848113343234, + "loss": 2.9872, + "step": 5950 + }, + { + "epoch": 0.34581798137456843, + "grad_norm": 0.13351042568683624, + "learning_rate": 0.00046759132713113403, + "loss": 2.986, + "step": 5960 + }, + { + "epoch": 0.3463982128869419, + "grad_norm": 0.133000910282135, + "learning_rate": 0.0004671135560400127, + "loss": 2.9886, + "step": 5970 + }, + { + "epoch": 0.3469784443993153, + "grad_norm": 0.1261400580406189, + "learning_rate": 0.0004666351696192718, + "loss": 2.9811, + "step": 5980 + }, + { + "epoch": 0.34755867591168876, + "grad_norm": 0.13575439155101776, + "learning_rate": 0.00046615616963038007, + "loss": 2.9796, + "step": 5990 + }, + { + "epoch": 0.3481389074240622, + "grad_norm": 0.13202066719532013, + "learning_rate": 0.0004656765578370657, + "loss": 2.9958, + "step": 6000 + }, + { + "epoch": 0.3481389074240622, + "eval_loss": 2.949599027633667, + "eval_runtime": 3.2655, + "eval_samples_per_second": 1325.986, + "eval_steps_per_second": 2.756, + "step": 6000 + }, + { + "epoch": 0.34871913893643564, + "grad_norm": 0.14002783596515656, + "learning_rate": 0.0004651963360053096, + "loss": 2.9811, + "step": 6010 + }, + { + "epoch": 0.3492993704488091, + "grad_norm": 0.1519598364830017, + "learning_rate": 0.00046471550590333874, + "loss": 2.9884, + "step": 6020 + }, + { + "epoch": 0.3498796019611825, + "grad_norm": 0.1435564160346985, + "learning_rate": 0.00046423406930162, + "loss": 2.9831, + "step": 6030 + }, + { + "epoch": 0.35045983347355597, + "grad_norm": 0.1241581067442894, + "learning_rate": 0.0004637520279728534, + "loss": 2.9801, + "step": 6040 + }, + { + "epoch": 0.3510400649859294, + "grad_norm": 0.124722421169281, + "learning_rate": 0.00046326938369196566, + "loss": 2.9872, + "step": 6050 + }, + { + "epoch": 0.35162029649830284, + "grad_norm": 0.12400694936513901, + "learning_rate": 0.0004627861382361034, + "loss": 2.9863, + "step": 6060 + }, + { + "epoch": 0.35220052801067625, + "grad_norm": 0.14388398826122284, + "learning_rate": 0.0004623022933846272, + "loss": 2.973, + "step": 6070 + }, + { + "epoch": 0.3527807595230497, + "grad_norm": 0.14111004769802094, + "learning_rate": 0.0004618178509191045, + "loss": 2.9902, + "step": 6080 + }, + { + "epoch": 0.3533609910354231, + "grad_norm": 0.1257510930299759, + "learning_rate": 0.000461332812623303, + "loss": 2.9877, + "step": 6090 + }, + { + "epoch": 0.3539412225477966, + "grad_norm": 0.1282566338777542, + "learning_rate": 0.00046084718028318466, + "loss": 2.9832, + "step": 6100 + }, + { + "epoch": 0.35452145406017, + "grad_norm": 0.14325213432312012, + "learning_rate": 0.00046036095568689864, + "loss": 2.9782, + "step": 6110 + }, + { + "epoch": 0.35510168557254346, + "grad_norm": 0.1563083529472351, + "learning_rate": 0.0004598741406247748, + "loss": 2.9793, + "step": 6120 + }, + { + "epoch": 0.35568191708491687, + "grad_norm": 0.1327456384897232, + "learning_rate": 0.0004593867368893172, + "loss": 2.9843, + "step": 6130 + }, + { + "epoch": 0.35626214859729033, + "grad_norm": 0.13930997252464294, + "learning_rate": 0.0004588987462751975, + "loss": 2.976, + "step": 6140 + }, + { + "epoch": 0.35684238010966374, + "grad_norm": 0.1295255720615387, + "learning_rate": 0.00045841017057924807, + "loss": 2.9801, + "step": 6150 + }, + { + "epoch": 0.3574226116220372, + "grad_norm": 0.1404607594013214, + "learning_rate": 0.00045792101160045613, + "loss": 2.9788, + "step": 6160 + }, + { + "epoch": 0.3580028431344106, + "grad_norm": 0.12297389656305313, + "learning_rate": 0.0004574312711399561, + "loss": 2.9853, + "step": 6170 + }, + { + "epoch": 0.3585830746467841, + "grad_norm": 0.15521986782550812, + "learning_rate": 0.0004569409510010236, + "loss": 2.9825, + "step": 6180 + }, + { + "epoch": 0.3591633061591575, + "grad_norm": 0.12915629148483276, + "learning_rate": 0.00045645005298906887, + "loss": 2.984, + "step": 6190 + }, + { + "epoch": 0.35974353767153094, + "grad_norm": 0.12852182984352112, + "learning_rate": 0.00045595857891162964, + "loss": 2.9703, + "step": 6200 + }, + { + "epoch": 0.36032376918390435, + "grad_norm": 0.1300152987241745, + "learning_rate": 0.00045546653057836517, + "loss": 2.971, + "step": 6210 + }, + { + "epoch": 0.3609040006962778, + "grad_norm": 0.13348935544490814, + "learning_rate": 0.00045497390980104885, + "loss": 2.9762, + "step": 6220 + }, + { + "epoch": 0.3614842322086513, + "grad_norm": 0.13476519286632538, + "learning_rate": 0.00045448071839356203, + "loss": 2.9756, + "step": 6230 + }, + { + "epoch": 0.3620644637210247, + "grad_norm": 0.13884297013282776, + "learning_rate": 0.000453986958171887, + "loss": 2.9829, + "step": 6240 + }, + { + "epoch": 0.36264469523339815, + "grad_norm": 0.12928573787212372, + "learning_rate": 0.00045349263095410087, + "loss": 2.9752, + "step": 6250 + }, + { + "epoch": 0.36322492674577156, + "grad_norm": 0.13350141048431396, + "learning_rate": 0.000452997738560368, + "loss": 2.9748, + "step": 6260 + }, + { + "epoch": 0.363805158258145, + "grad_norm": 0.13747799396514893, + "learning_rate": 0.00045250228281293423, + "loss": 2.9705, + "step": 6270 + }, + { + "epoch": 0.36438538977051843, + "grad_norm": 0.1344989687204361, + "learning_rate": 0.00045200626553611943, + "loss": 2.9801, + "step": 6280 + }, + { + "epoch": 0.3649656212828919, + "grad_norm": 0.1321888118982315, + "learning_rate": 0.00045150968855631104, + "loss": 2.9781, + "step": 6290 + }, + { + "epoch": 0.3655458527952653, + "grad_norm": 0.12561041116714478, + "learning_rate": 0.0004510125537019577, + "loss": 2.973, + "step": 6300 + }, + { + "epoch": 0.36612608430763877, + "grad_norm": 0.13948814570903778, + "learning_rate": 0.00045051486280356194, + "loss": 2.9731, + "step": 6310 + }, + { + "epoch": 0.3667063158200122, + "grad_norm": 0.12595129013061523, + "learning_rate": 0.0004500166176936739, + "loss": 2.9659, + "step": 6320 + }, + { + "epoch": 0.36728654733238564, + "grad_norm": 0.12941335141658783, + "learning_rate": 0.00044951782020688415, + "loss": 2.973, + "step": 6330 + }, + { + "epoch": 0.36786677884475905, + "grad_norm": 0.14215658605098724, + "learning_rate": 0.00044901847217981736, + "loss": 2.975, + "step": 6340 + }, + { + "epoch": 0.3684470103571325, + "grad_norm": 0.12309448421001434, + "learning_rate": 0.00044851857545112525, + "loss": 2.9749, + "step": 6350 + }, + { + "epoch": 0.3690272418695059, + "grad_norm": 0.12824192643165588, + "learning_rate": 0.00044801813186147986, + "loss": 2.9672, + "step": 6360 + }, + { + "epoch": 0.3696074733818794, + "grad_norm": 0.12063992768526077, + "learning_rate": 0.00044751714325356697, + "loss": 2.9708, + "step": 6370 + }, + { + "epoch": 0.3701877048942528, + "grad_norm": 0.12898465991020203, + "learning_rate": 0.0004470156114720792, + "loss": 2.9699, + "step": 6380 + }, + { + "epoch": 0.37076793640662625, + "grad_norm": 0.1321457326412201, + "learning_rate": 0.00044651353836370897, + "loss": 2.9661, + "step": 6390 + }, + { + "epoch": 0.37134816791899966, + "grad_norm": 0.13804246485233307, + "learning_rate": 0.0004460109257771422, + "loss": 2.9783, + "step": 6400 + }, + { + "epoch": 0.3719283994313731, + "grad_norm": 0.12447643280029297, + "learning_rate": 0.00044550777556305094, + "loss": 2.9691, + "step": 6410 + }, + { + "epoch": 0.37250863094374653, + "grad_norm": 0.1610770970582962, + "learning_rate": 0.00044500408957408706, + "loss": 2.972, + "step": 6420 + }, + { + "epoch": 0.37308886245612, + "grad_norm": 0.1278504580259323, + "learning_rate": 0.00044449986966487527, + "loss": 2.9694, + "step": 6430 + }, + { + "epoch": 0.3736690939684934, + "grad_norm": 0.13527578115463257, + "learning_rate": 0.0004439951176920059, + "loss": 2.9707, + "step": 6440 + }, + { + "epoch": 0.37424932548086687, + "grad_norm": 0.14050637185573578, + "learning_rate": 0.0004434898355140287, + "loss": 2.9712, + "step": 6450 + }, + { + "epoch": 0.3748295569932403, + "grad_norm": 0.1513315588235855, + "learning_rate": 0.00044298402499144554, + "loss": 2.9705, + "step": 6460 + }, + { + "epoch": 0.37540978850561374, + "grad_norm": 0.1299854964017868, + "learning_rate": 0.00044247768798670367, + "loss": 2.9662, + "step": 6470 + }, + { + "epoch": 0.3759900200179872, + "grad_norm": 0.1321675330400467, + "learning_rate": 0.00044197082636418907, + "loss": 2.9675, + "step": 6480 + }, + { + "epoch": 0.3765702515303606, + "grad_norm": 0.1453583687543869, + "learning_rate": 0.00044146344199021934, + "loss": 2.9639, + "step": 6490 + }, + { + "epoch": 0.3771504830427341, + "grad_norm": 0.13450521230697632, + "learning_rate": 0.00044095553673303685, + "loss": 2.9661, + "step": 6500 + }, + { + "epoch": 0.3777307145551075, + "grad_norm": 0.13579097390174866, + "learning_rate": 0.00044044711246280215, + "loss": 2.9608, + "step": 6510 + }, + { + "epoch": 0.37831094606748095, + "grad_norm": 0.1469910442829132, + "learning_rate": 0.00043993817105158627, + "loss": 2.9686, + "step": 6520 + }, + { + "epoch": 0.37889117757985435, + "grad_norm": 0.1311839371919632, + "learning_rate": 0.00043942871437336527, + "loss": 2.9636, + "step": 6530 + }, + { + "epoch": 0.3794714090922278, + "grad_norm": 0.15060357749462128, + "learning_rate": 0.0004389187443040116, + "loss": 2.9613, + "step": 6540 + }, + { + "epoch": 0.3800516406046012, + "grad_norm": 0.13408997654914856, + "learning_rate": 0.00043840826272128873, + "loss": 2.9626, + "step": 6550 + }, + { + "epoch": 0.3806318721169747, + "grad_norm": 0.1458410769701004, + "learning_rate": 0.0004378972715048434, + "loss": 2.9604, + "step": 6560 + }, + { + "epoch": 0.3812121036293481, + "grad_norm": 0.13342171907424927, + "learning_rate": 0.0004373857725361984, + "loss": 2.9602, + "step": 6570 + }, + { + "epoch": 0.38179233514172156, + "grad_norm": 0.12624911963939667, + "learning_rate": 0.00043687376769874686, + "loss": 2.9703, + "step": 6580 + }, + { + "epoch": 0.38237256665409497, + "grad_norm": 0.13120518624782562, + "learning_rate": 0.0004363612588777442, + "loss": 2.9601, + "step": 6590 + }, + { + "epoch": 0.38295279816646843, + "grad_norm": 0.1357596516609192, + "learning_rate": 0.00043584824796030145, + "loss": 2.9561, + "step": 6600 + }, + { + "epoch": 0.38353302967884184, + "grad_norm": 0.1270647495985031, + "learning_rate": 0.00043533473683537863, + "loss": 2.9522, + "step": 6610 + }, + { + "epoch": 0.3841132611912153, + "grad_norm": 0.1325126439332962, + "learning_rate": 0.0004348207273937776, + "loss": 2.9603, + "step": 6620 + }, + { + "epoch": 0.3846934927035887, + "grad_norm": 0.13015331327915192, + "learning_rate": 0.0004343062215281347, + "loss": 2.955, + "step": 6630 + }, + { + "epoch": 0.3852737242159622, + "grad_norm": 0.12867479026317596, + "learning_rate": 0.00043379122113291465, + "loss": 2.9692, + "step": 6640 + }, + { + "epoch": 0.3858539557283356, + "grad_norm": 0.14423881471157074, + "learning_rate": 0.00043327572810440283, + "loss": 2.9539, + "step": 6650 + }, + { + "epoch": 0.38643418724070905, + "grad_norm": 0.13097575306892395, + "learning_rate": 0.00043275974434069846, + "loss": 2.9576, + "step": 6660 + }, + { + "epoch": 0.38701441875308246, + "grad_norm": 0.129910409450531, + "learning_rate": 0.0004322432717417079, + "loss": 2.9617, + "step": 6670 + }, + { + "epoch": 0.3875946502654559, + "grad_norm": 0.13308489322662354, + "learning_rate": 0.00043172631220913735, + "loss": 2.9514, + "step": 6680 + }, + { + "epoch": 0.38817488177782933, + "grad_norm": 0.12263292074203491, + "learning_rate": 0.00043120886764648605, + "loss": 2.9557, + "step": 6690 + }, + { + "epoch": 0.3887551132902028, + "grad_norm": 0.1288110911846161, + "learning_rate": 0.0004306909399590389, + "loss": 2.9558, + "step": 6700 + }, + { + "epoch": 0.38933534480257626, + "grad_norm": 0.12322728335857391, + "learning_rate": 0.00043017253105386005, + "loss": 2.9551, + "step": 6710 + }, + { + "epoch": 0.38991557631494966, + "grad_norm": 0.1551227867603302, + "learning_rate": 0.0004296536428397853, + "loss": 2.9583, + "step": 6720 + }, + { + "epoch": 0.3904958078273231, + "grad_norm": 0.12883497774600983, + "learning_rate": 0.00042913427722741546, + "loss": 2.9495, + "step": 6730 + }, + { + "epoch": 0.39107603933969654, + "grad_norm": 0.12460558116436005, + "learning_rate": 0.00042861443612910913, + "loss": 2.9597, + "step": 6740 + }, + { + "epoch": 0.39165627085207, + "grad_norm": 0.122388556599617, + "learning_rate": 0.00042809412145897576, + "loss": 2.9557, + "step": 6750 + }, + { + "epoch": 0.3922365023644434, + "grad_norm": 0.12150498479604721, + "learning_rate": 0.00042757333513286834, + "loss": 2.9489, + "step": 6760 + }, + { + "epoch": 0.39281673387681687, + "grad_norm": 0.15273340046405792, + "learning_rate": 0.00042705207906837666, + "loss": 2.9503, + "step": 6770 + }, + { + "epoch": 0.3933969653891903, + "grad_norm": 0.13954737782478333, + "learning_rate": 0.00042653035518482025, + "loss": 2.9481, + "step": 6780 + }, + { + "epoch": 0.39397719690156374, + "grad_norm": 0.15386004745960236, + "learning_rate": 0.0004260081654032411, + "loss": 2.9596, + "step": 6790 + }, + { + "epoch": 0.39455742841393715, + "grad_norm": 0.1319696307182312, + "learning_rate": 0.0004254855116463966, + "loss": 2.9526, + "step": 6800 + }, + { + "epoch": 0.3951376599263106, + "grad_norm": 0.14486876130104065, + "learning_rate": 0.00042496239583875286, + "loss": 2.9501, + "step": 6810 + }, + { + "epoch": 0.395717891438684, + "grad_norm": 0.12461838871240616, + "learning_rate": 0.0004244388199064768, + "loss": 2.9519, + "step": 6820 + }, + { + "epoch": 0.3962981229510575, + "grad_norm": 0.14132647216320038, + "learning_rate": 0.00042391478577743006, + "loss": 2.9533, + "step": 6830 + }, + { + "epoch": 0.3968783544634309, + "grad_norm": 0.12907026708126068, + "learning_rate": 0.00042339029538116104, + "loss": 2.9451, + "step": 6840 + }, + { + "epoch": 0.39745858597580436, + "grad_norm": 0.13801275193691254, + "learning_rate": 0.0004228653506488984, + "loss": 2.9382, + "step": 6850 + }, + { + "epoch": 0.39803881748817777, + "grad_norm": 0.11962810158729553, + "learning_rate": 0.00042233995351354366, + "loss": 2.9501, + "step": 6860 + }, + { + "epoch": 0.39861904900055123, + "grad_norm": 0.12804014980793, + "learning_rate": 0.00042181410590966413, + "loss": 2.9556, + "step": 6870 + }, + { + "epoch": 0.39919928051292464, + "grad_norm": 0.1232592836022377, + "learning_rate": 0.0004212878097734857, + "loss": 2.9493, + "step": 6880 + }, + { + "epoch": 0.3997795120252981, + "grad_norm": 0.12467402964830399, + "learning_rate": 0.0004207610670428859, + "loss": 2.9518, + "step": 6890 + }, + { + "epoch": 0.4003597435376715, + "grad_norm": 0.13029509782791138, + "learning_rate": 0.0004202338796573866, + "loss": 2.9476, + "step": 6900 + }, + { + "epoch": 0.40093997505004497, + "grad_norm": 0.13504283130168915, + "learning_rate": 0.0004197062495581471, + "loss": 2.9457, + "step": 6910 + }, + { + "epoch": 0.4015202065624184, + "grad_norm": 0.12205976992845535, + "learning_rate": 0.00041917817868795666, + "loss": 2.9418, + "step": 6920 + }, + { + "epoch": 0.40210043807479184, + "grad_norm": 0.14173905551433563, + "learning_rate": 0.0004186496689912275, + "loss": 2.9401, + "step": 6930 + }, + { + "epoch": 0.40268066958716525, + "grad_norm": 0.131003275513649, + "learning_rate": 0.00041812072241398764, + "loss": 2.9416, + "step": 6940 + }, + { + "epoch": 0.4032609010995387, + "grad_norm": 0.1430942267179489, + "learning_rate": 0.00041759134090387396, + "loss": 2.9526, + "step": 6950 + }, + { + "epoch": 0.4038411326119122, + "grad_norm": 0.11908053606748581, + "learning_rate": 0.00041706152641012435, + "loss": 2.9457, + "step": 6960 + }, + { + "epoch": 0.4044213641242856, + "grad_norm": 0.12189971655607224, + "learning_rate": 0.0004165312808835716, + "loss": 2.9497, + "step": 6970 + }, + { + "epoch": 0.40500159563665905, + "grad_norm": 0.1238475888967514, + "learning_rate": 0.00041600060627663515, + "loss": 2.9426, + "step": 6980 + }, + { + "epoch": 0.40558182714903246, + "grad_norm": 0.13269031047821045, + "learning_rate": 0.00041546950454331437, + "loss": 2.9441, + "step": 6990 + }, + { + "epoch": 0.4061620586614059, + "grad_norm": 0.14216388761997223, + "learning_rate": 0.0004149379776391817, + "loss": 2.9443, + "step": 7000 + }, + { + "epoch": 0.4061620586614059, + "eval_loss": 2.910210609436035, + "eval_runtime": 3.2597, + "eval_samples_per_second": 1328.339, + "eval_steps_per_second": 2.761, + "step": 7000 + }, + { + "epoch": 0.40674229017377933, + "grad_norm": 0.13298869132995605, + "learning_rate": 0.0004144060275213747, + "loss": 2.946, + "step": 7010 + }, + { + "epoch": 0.4073225216861528, + "grad_norm": 0.14648084342479706, + "learning_rate": 0.00041387365614858955, + "loss": 2.9468, + "step": 7020 + }, + { + "epoch": 0.4079027531985262, + "grad_norm": 0.13918638229370117, + "learning_rate": 0.00041334086548107336, + "loss": 2.9561, + "step": 7030 + }, + { + "epoch": 0.40848298471089967, + "grad_norm": 0.1421622335910797, + "learning_rate": 0.00041280765748061727, + "loss": 2.9437, + "step": 7040 + }, + { + "epoch": 0.4090632162232731, + "grad_norm": 0.1364564597606659, + "learning_rate": 0.0004122740341105488, + "loss": 2.9354, + "step": 7050 + }, + { + "epoch": 0.40964344773564654, + "grad_norm": 0.1310495287179947, + "learning_rate": 0.00041173999733572523, + "loss": 2.9471, + "step": 7060 + }, + { + "epoch": 0.41022367924801995, + "grad_norm": 0.14024296402931213, + "learning_rate": 0.000411205549122526, + "loss": 2.9372, + "step": 7070 + }, + { + "epoch": 0.4108039107603934, + "grad_norm": 0.1430574357509613, + "learning_rate": 0.0004106706914388452, + "loss": 2.9468, + "step": 7080 + }, + { + "epoch": 0.4113841422727668, + "grad_norm": 0.12103896588087082, + "learning_rate": 0.00041013542625408504, + "loss": 2.9463, + "step": 7090 + }, + { + "epoch": 0.4119643737851403, + "grad_norm": 0.12720054388046265, + "learning_rate": 0.00040959975553914787, + "loss": 2.9427, + "step": 7100 + }, + { + "epoch": 0.4125446052975137, + "grad_norm": 0.14135150611400604, + "learning_rate": 0.0004090636812664295, + "loss": 2.9407, + "step": 7110 + }, + { + "epoch": 0.41312483680988715, + "grad_norm": 0.14666588604450226, + "learning_rate": 0.0004085272054098115, + "loss": 2.9435, + "step": 7120 + }, + { + "epoch": 0.41370506832226056, + "grad_norm": 0.13804596662521362, + "learning_rate": 0.0004079903299446541, + "loss": 2.9365, + "step": 7130 + }, + { + "epoch": 0.414285299834634, + "grad_norm": 0.1470736414194107, + "learning_rate": 0.00040745305684778907, + "loss": 2.9278, + "step": 7140 + }, + { + "epoch": 0.41486553134700743, + "grad_norm": 0.12926244735717773, + "learning_rate": 0.00040691538809751234, + "loss": 2.9354, + "step": 7150 + }, + { + "epoch": 0.4154457628593809, + "grad_norm": 0.1294509321451187, + "learning_rate": 0.00040637732567357635, + "loss": 2.9466, + "step": 7160 + }, + { + "epoch": 0.4160259943717543, + "grad_norm": 0.12196213006973267, + "learning_rate": 0.0004058388715571835, + "loss": 2.9322, + "step": 7170 + }, + { + "epoch": 0.41660622588412777, + "grad_norm": 0.15902066230773926, + "learning_rate": 0.00040530002773097825, + "loss": 2.9448, + "step": 7180 + }, + { + "epoch": 0.41718645739650123, + "grad_norm": 0.11859998106956482, + "learning_rate": 0.0004047607961790399, + "loss": 2.9428, + "step": 7190 + }, + { + "epoch": 0.41776668890887464, + "grad_norm": 0.13470393419265747, + "learning_rate": 0.00040422117888687555, + "loss": 2.942, + "step": 7200 + }, + { + "epoch": 0.4183469204212481, + "grad_norm": 0.1288190484046936, + "learning_rate": 0.0004036811778414125, + "loss": 2.9362, + "step": 7210 + }, + { + "epoch": 0.4189271519336215, + "grad_norm": 0.12759481370449066, + "learning_rate": 0.0004031407950309915, + "loss": 2.9447, + "step": 7220 + }, + { + "epoch": 0.419507383445995, + "grad_norm": 0.13468439877033234, + "learning_rate": 0.0004026000324453584, + "loss": 2.9313, + "step": 7230 + }, + { + "epoch": 0.4200876149583684, + "grad_norm": 0.12287794053554535, + "learning_rate": 0.0004020588920756577, + "loss": 2.9369, + "step": 7240 + }, + { + "epoch": 0.42066784647074185, + "grad_norm": 0.12006892263889313, + "learning_rate": 0.00040151737591442497, + "loss": 2.9329, + "step": 7250 + }, + { + "epoch": 0.42124807798311525, + "grad_norm": 0.13062633574008942, + "learning_rate": 0.00040097548595557935, + "loss": 2.9474, + "step": 7260 + }, + { + "epoch": 0.4218283094954887, + "grad_norm": 0.12141095846891403, + "learning_rate": 0.00040043322419441667, + "loss": 2.9386, + "step": 7270 + }, + { + "epoch": 0.4224085410078621, + "grad_norm": 0.13452979922294617, + "learning_rate": 0.0003998905926276014, + "loss": 2.9203, + "step": 7280 + }, + { + "epoch": 0.4229887725202356, + "grad_norm": 0.13672851026058197, + "learning_rate": 0.0003993475932531598, + "loss": 2.9353, + "step": 7290 + }, + { + "epoch": 0.423569004032609, + "grad_norm": 0.1266540139913559, + "learning_rate": 0.0003988042280704724, + "loss": 2.929, + "step": 7300 + }, + { + "epoch": 0.42414923554498246, + "grad_norm": 0.1192171648144722, + "learning_rate": 0.0003982604990802668, + "loss": 2.9314, + "step": 7310 + }, + { + "epoch": 0.42472946705735587, + "grad_norm": 0.11528236418962479, + "learning_rate": 0.0003977164082846101, + "loss": 2.9349, + "step": 7320 + }, + { + "epoch": 0.42530969856972933, + "grad_norm": 0.12837885320186615, + "learning_rate": 0.00039717195768690155, + "loss": 2.9211, + "step": 7330 + }, + { + "epoch": 0.42588993008210274, + "grad_norm": 0.1254536211490631, + "learning_rate": 0.0003966271492918654, + "loss": 2.9311, + "step": 7340 + }, + { + "epoch": 0.4264701615944762, + "grad_norm": 0.12365511804819107, + "learning_rate": 0.0003960819851055432, + "loss": 2.9411, + "step": 7350 + }, + { + "epoch": 0.4270503931068496, + "grad_norm": 0.14178220927715302, + "learning_rate": 0.00039553646713528644, + "loss": 2.9322, + "step": 7360 + }, + { + "epoch": 0.4276306246192231, + "grad_norm": 0.13220851123332977, + "learning_rate": 0.0003949905973897496, + "loss": 2.9397, + "step": 7370 + }, + { + "epoch": 0.4282108561315965, + "grad_norm": 0.12264362722635269, + "learning_rate": 0.00039444437787888224, + "loss": 2.9355, + "step": 7380 + }, + { + "epoch": 0.42879108764396995, + "grad_norm": 0.12907512485980988, + "learning_rate": 0.00039389781061392184, + "loss": 2.9259, + "step": 7390 + }, + { + "epoch": 0.42937131915634336, + "grad_norm": 0.1319524645805359, + "learning_rate": 0.00039335089760738625, + "loss": 2.9284, + "step": 7400 + }, + { + "epoch": 0.4299515506687168, + "grad_norm": 0.1404864490032196, + "learning_rate": 0.0003928036408730664, + "loss": 2.932, + "step": 7410 + }, + { + "epoch": 0.43053178218109023, + "grad_norm": 0.12499509751796722, + "learning_rate": 0.00039225604242601914, + "loss": 2.9313, + "step": 7420 + }, + { + "epoch": 0.4311120136934637, + "grad_norm": 0.13161097466945648, + "learning_rate": 0.0003917081042825591, + "loss": 2.9261, + "step": 7430 + }, + { + "epoch": 0.43169224520583716, + "grad_norm": 0.13262121379375458, + "learning_rate": 0.000391159828460252, + "loss": 2.9302, + "step": 7440 + }, + { + "epoch": 0.43227247671821056, + "grad_norm": 0.13169781863689423, + "learning_rate": 0.0003906112169779069, + "loss": 2.9247, + "step": 7450 + }, + { + "epoch": 0.432852708230584, + "grad_norm": 0.1297696828842163, + "learning_rate": 0.00039006227185556865, + "loss": 2.9422, + "step": 7460 + }, + { + "epoch": 0.43343293974295743, + "grad_norm": 0.1292199194431305, + "learning_rate": 0.00038951299511451077, + "loss": 2.9232, + "step": 7470 + }, + { + "epoch": 0.4340131712553309, + "grad_norm": 0.13055439293384552, + "learning_rate": 0.0003889633887772278, + "loss": 2.9246, + "step": 7480 + }, + { + "epoch": 0.4345934027677043, + "grad_norm": 0.1166820153594017, + "learning_rate": 0.0003884134548674278, + "loss": 2.9361, + "step": 7490 + }, + { + "epoch": 0.43517363428007777, + "grad_norm": 0.12382174283266068, + "learning_rate": 0.00038786319541002487, + "loss": 2.9221, + "step": 7500 + }, + { + "epoch": 0.4357538657924512, + "grad_norm": 0.12510880827903748, + "learning_rate": 0.0003873126124311323, + "loss": 2.9289, + "step": 7510 + }, + { + "epoch": 0.43633409730482464, + "grad_norm": 0.13196755945682526, + "learning_rate": 0.000386761707958054, + "loss": 2.9203, + "step": 7520 + }, + { + "epoch": 0.43691432881719805, + "grad_norm": 0.13719266653060913, + "learning_rate": 0.00038621048401927817, + "loss": 2.9319, + "step": 7530 + }, + { + "epoch": 0.4374945603295715, + "grad_norm": 0.13211804628372192, + "learning_rate": 0.000385658942644469, + "loss": 2.9326, + "step": 7540 + }, + { + "epoch": 0.4380747918419449, + "grad_norm": 0.12999597191810608, + "learning_rate": 0.0003851070858644596, + "loss": 2.9239, + "step": 7550 + }, + { + "epoch": 0.4386550233543184, + "grad_norm": 0.13165125250816345, + "learning_rate": 0.0003845549157112445, + "loss": 2.9312, + "step": 7560 + }, + { + "epoch": 0.4392352548666918, + "grad_norm": 0.13743376731872559, + "learning_rate": 0.00038400243421797206, + "loss": 2.9254, + "step": 7570 + }, + { + "epoch": 0.43981548637906526, + "grad_norm": 0.12621231377124786, + "learning_rate": 0.00038344964341893684, + "loss": 2.9203, + "step": 7580 + }, + { + "epoch": 0.44039571789143866, + "grad_norm": 0.12167075276374817, + "learning_rate": 0.00038289654534957266, + "loss": 2.9281, + "step": 7590 + }, + { + "epoch": 0.44097594940381213, + "grad_norm": 0.13523493707180023, + "learning_rate": 0.0003823431420464444, + "loss": 2.916, + "step": 7600 + }, + { + "epoch": 0.44155618091618554, + "grad_norm": 0.11718156933784485, + "learning_rate": 0.0003817894355472413, + "loss": 2.9145, + "step": 7610 + }, + { + "epoch": 0.442136412428559, + "grad_norm": 0.13470205664634705, + "learning_rate": 0.0003812354278907683, + "loss": 2.9173, + "step": 7620 + }, + { + "epoch": 0.4427166439409324, + "grad_norm": 0.1286102533340454, + "learning_rate": 0.00038068112111693984, + "loss": 2.9249, + "step": 7630 + }, + { + "epoch": 0.44329687545330587, + "grad_norm": 0.13669750094413757, + "learning_rate": 0.00038012651726677146, + "loss": 2.9239, + "step": 7640 + }, + { + "epoch": 0.4438771069656793, + "grad_norm": 0.14638318121433258, + "learning_rate": 0.0003795716183823728, + "loss": 2.9306, + "step": 7650 + }, + { + "epoch": 0.44445733847805274, + "grad_norm": 0.13569045066833496, + "learning_rate": 0.00037901642650693944, + "loss": 2.9168, + "step": 7660 + }, + { + "epoch": 0.4450375699904262, + "grad_norm": 0.1257532387971878, + "learning_rate": 0.00037846094368474613, + "loss": 2.9242, + "step": 7670 + }, + { + "epoch": 0.4456178015027996, + "grad_norm": 0.11852803826332092, + "learning_rate": 0.0003779051719611389, + "loss": 2.9209, + "step": 7680 + }, + { + "epoch": 0.4461980330151731, + "grad_norm": 0.12594154477119446, + "learning_rate": 0.0003773491133825273, + "loss": 2.929, + "step": 7690 + }, + { + "epoch": 0.4467782645275465, + "grad_norm": 0.12566526234149933, + "learning_rate": 0.00037679276999637746, + "loss": 2.9119, + "step": 7700 + }, + { + "epoch": 0.44735849603991995, + "grad_norm": 0.13207079470157623, + "learning_rate": 0.0003762361438512038, + "loss": 2.917, + "step": 7710 + }, + { + "epoch": 0.44793872755229336, + "grad_norm": 0.13788865506649017, + "learning_rate": 0.00037567923699656226, + "loss": 2.92, + "step": 7720 + }, + { + "epoch": 0.4485189590646668, + "grad_norm": 0.13110986351966858, + "learning_rate": 0.00037512205148304204, + "loss": 2.9249, + "step": 7730 + }, + { + "epoch": 0.44909919057704023, + "grad_norm": 0.1643168181180954, + "learning_rate": 0.00037456458936225873, + "loss": 2.9232, + "step": 7740 + }, + { + "epoch": 0.4496794220894137, + "grad_norm": 0.14076946675777435, + "learning_rate": 0.00037400685268684623, + "loss": 2.9252, + "step": 7750 + }, + { + "epoch": 0.4502596536017871, + "grad_norm": 0.1238834485411644, + "learning_rate": 0.0003734488435104494, + "loss": 2.9093, + "step": 7760 + }, + { + "epoch": 0.45083988511416057, + "grad_norm": 0.11924099922180176, + "learning_rate": 0.00037289056388771643, + "loss": 2.9324, + "step": 7770 + }, + { + "epoch": 0.451420116626534, + "grad_norm": 0.13720078766345978, + "learning_rate": 0.0003723320158742914, + "loss": 2.9154, + "step": 7780 + }, + { + "epoch": 0.45200034813890744, + "grad_norm": 0.12532520294189453, + "learning_rate": 0.00037177320152680663, + "loss": 2.9228, + "step": 7790 + }, + { + "epoch": 0.45258057965128085, + "grad_norm": 0.129350483417511, + "learning_rate": 0.0003712141229028751, + "loss": 2.9071, + "step": 7800 + }, + { + "epoch": 0.4531608111636543, + "grad_norm": 0.12484076619148254, + "learning_rate": 0.0003706547820610828, + "loss": 2.9107, + "step": 7810 + }, + { + "epoch": 0.4537410426760277, + "grad_norm": 0.12527912855148315, + "learning_rate": 0.0003700951810609815, + "loss": 2.9166, + "step": 7820 + }, + { + "epoch": 0.4543212741884012, + "grad_norm": 0.1453130692243576, + "learning_rate": 0.0003695353219630803, + "loss": 2.9195, + "step": 7830 + }, + { + "epoch": 0.4549015057007746, + "grad_norm": 0.1291913241147995, + "learning_rate": 0.0003689752068288395, + "loss": 2.9124, + "step": 7840 + }, + { + "epoch": 0.45548173721314805, + "grad_norm": 0.12470022588968277, + "learning_rate": 0.0003684148377206615, + "loss": 2.9241, + "step": 7850 + }, + { + "epoch": 0.45606196872552146, + "grad_norm": 0.1276790350675583, + "learning_rate": 0.00036785421670188395, + "loss": 2.9178, + "step": 7860 + }, + { + "epoch": 0.4566422002378949, + "grad_norm": 0.15164950489997864, + "learning_rate": 0.0003672933458367724, + "loss": 2.9072, + "step": 7870 + }, + { + "epoch": 0.45722243175026833, + "grad_norm": 0.14891022443771362, + "learning_rate": 0.00036673222719051194, + "loss": 2.9235, + "step": 7880 + }, + { + "epoch": 0.4578026632626418, + "grad_norm": 0.1266569346189499, + "learning_rate": 0.0003661708628292003, + "loss": 2.9159, + "step": 7890 + }, + { + "epoch": 0.4583828947750152, + "grad_norm": 0.12030439078807831, + "learning_rate": 0.0003656092548198399, + "loss": 2.912, + "step": 7900 + }, + { + "epoch": 0.45896312628738867, + "grad_norm": 0.12590278685092926, + "learning_rate": 0.00036504740523033016, + "loss": 2.91, + "step": 7910 + }, + { + "epoch": 0.45954335779976213, + "grad_norm": 0.1255042403936386, + "learning_rate": 0.0003644853161294601, + "loss": 2.9127, + "step": 7920 + }, + { + "epoch": 0.46012358931213554, + "grad_norm": 0.1253713071346283, + "learning_rate": 0.0003639229895869009, + "loss": 2.9242, + "step": 7930 + }, + { + "epoch": 0.460703820824509, + "grad_norm": 0.1254982203245163, + "learning_rate": 0.0003633604276731975, + "loss": 2.9115, + "step": 7940 + }, + { + "epoch": 0.4612840523368824, + "grad_norm": 0.12157725542783737, + "learning_rate": 0.00036279763245976207, + "loss": 2.9114, + "step": 7950 + }, + { + "epoch": 0.4618642838492559, + "grad_norm": 0.12421195954084396, + "learning_rate": 0.00036223460601886537, + "loss": 2.9083, + "step": 7960 + }, + { + "epoch": 0.4624445153616293, + "grad_norm": 0.11870937049388885, + "learning_rate": 0.00036167135042362977, + "loss": 2.907, + "step": 7970 + }, + { + "epoch": 0.46302474687400275, + "grad_norm": 0.12460967898368835, + "learning_rate": 0.00036110786774802133, + "loss": 2.9088, + "step": 7980 + }, + { + "epoch": 0.46360497838637615, + "grad_norm": 0.1310334950685501, + "learning_rate": 0.00036054416006684245, + "loss": 2.9102, + "step": 7990 + }, + { + "epoch": 0.4641852098987496, + "grad_norm": 0.12560488283634186, + "learning_rate": 0.00035998022945572366, + "loss": 2.9097, + "step": 8000 + }, + { + "epoch": 0.4641852098987496, + "eval_loss": 2.875955820083618, + "eval_runtime": 3.2545, + "eval_samples_per_second": 1330.484, + "eval_steps_per_second": 2.765, + "step": 8000 + }, + { + "epoch": 0.464765441411123, + "grad_norm": 0.12761953473091125, + "learning_rate": 0.00035941607799111675, + "loss": 2.91, + "step": 8010 + }, + { + "epoch": 0.4653456729234965, + "grad_norm": 0.1247384324669838, + "learning_rate": 0.0003588517077502864, + "loss": 2.9149, + "step": 8020 + }, + { + "epoch": 0.4659259044358699, + "grad_norm": 0.14209751784801483, + "learning_rate": 0.00035828712081130296, + "loss": 2.9083, + "step": 8030 + }, + { + "epoch": 0.46650613594824336, + "grad_norm": 0.12985317409038544, + "learning_rate": 0.00035772231925303464, + "loss": 2.9046, + "step": 8040 + }, + { + "epoch": 0.46708636746061677, + "grad_norm": 0.14672869443893433, + "learning_rate": 0.00035715730515514, + "loss": 2.9113, + "step": 8050 + }, + { + "epoch": 0.46766659897299023, + "grad_norm": 0.13361111283302307, + "learning_rate": 0.0003565920805980602, + "loss": 2.913, + "step": 8060 + }, + { + "epoch": 0.46824683048536364, + "grad_norm": 0.12082985788583755, + "learning_rate": 0.0003560266476630112, + "loss": 2.9138, + "step": 8070 + }, + { + "epoch": 0.4688270619977371, + "grad_norm": 0.1150035560131073, + "learning_rate": 0.0003554610084319763, + "loss": 2.9048, + "step": 8080 + }, + { + "epoch": 0.4694072935101105, + "grad_norm": 0.1214471235871315, + "learning_rate": 0.0003548951649876984, + "loss": 2.9123, + "step": 8090 + }, + { + "epoch": 0.469987525022484, + "grad_norm": 0.12934035062789917, + "learning_rate": 0.0003543291194136723, + "loss": 2.9028, + "step": 8100 + }, + { + "epoch": 0.4705677565348574, + "grad_norm": 0.15276013314723969, + "learning_rate": 0.00035376287379413723, + "loss": 2.9031, + "step": 8110 + }, + { + "epoch": 0.47114798804723085, + "grad_norm": 0.1335725337266922, + "learning_rate": 0.00035319643021406886, + "loss": 2.9124, + "step": 8120 + }, + { + "epoch": 0.47172821955960426, + "grad_norm": 0.12289181351661682, + "learning_rate": 0.00035262979075917166, + "loss": 2.9053, + "step": 8130 + }, + { + "epoch": 0.4723084510719777, + "grad_norm": 0.11827896535396576, + "learning_rate": 0.0003520629575158715, + "loss": 2.9138, + "step": 8140 + }, + { + "epoch": 0.4728886825843512, + "grad_norm": 0.12505313754081726, + "learning_rate": 0.0003514959325713078, + "loss": 2.909, + "step": 8150 + }, + { + "epoch": 0.4734689140967246, + "grad_norm": 0.1321611851453781, + "learning_rate": 0.00035092871801332574, + "loss": 2.9075, + "step": 8160 + }, + { + "epoch": 0.47404914560909805, + "grad_norm": 0.12144722044467926, + "learning_rate": 0.00035036131593046895, + "loss": 2.9046, + "step": 8170 + }, + { + "epoch": 0.47462937712147146, + "grad_norm": 0.11893021315336227, + "learning_rate": 0.0003497937284119711, + "loss": 2.9021, + "step": 8180 + }, + { + "epoch": 0.4752096086338449, + "grad_norm": 0.13043691217899323, + "learning_rate": 0.0003492259575477491, + "loss": 2.9052, + "step": 8190 + }, + { + "epoch": 0.47578984014621833, + "grad_norm": 0.12443230301141739, + "learning_rate": 0.00034865800542839445, + "loss": 2.9003, + "step": 8200 + }, + { + "epoch": 0.4763700716585918, + "grad_norm": 0.1350659728050232, + "learning_rate": 0.0003480898741451667, + "loss": 2.9077, + "step": 8210 + }, + { + "epoch": 0.4769503031709652, + "grad_norm": 0.13212652504444122, + "learning_rate": 0.0003475215657899844, + "loss": 2.8955, + "step": 8220 + }, + { + "epoch": 0.47753053468333867, + "grad_norm": 0.13865076005458832, + "learning_rate": 0.0003469530824554188, + "loss": 2.9015, + "step": 8230 + }, + { + "epoch": 0.4781107661957121, + "grad_norm": 0.1313691884279251, + "learning_rate": 0.00034638442623468484, + "loss": 2.9014, + "step": 8240 + }, + { + "epoch": 0.47869099770808554, + "grad_norm": 0.13368923962116241, + "learning_rate": 0.00034581559922163447, + "loss": 2.8962, + "step": 8250 + }, + { + "epoch": 0.47927122922045895, + "grad_norm": 0.12228936702013016, + "learning_rate": 0.0003452466035107481, + "loss": 2.8997, + "step": 8260 + }, + { + "epoch": 0.4798514607328324, + "grad_norm": 0.12648892402648926, + "learning_rate": 0.00034467744119712787, + "loss": 2.9052, + "step": 8270 + }, + { + "epoch": 0.4804316922452058, + "grad_norm": 0.12937045097351074, + "learning_rate": 0.00034410811437648873, + "loss": 2.9037, + "step": 8280 + }, + { + "epoch": 0.4810119237575793, + "grad_norm": 0.12095940858125687, + "learning_rate": 0.00034353862514515185, + "loss": 2.9002, + "step": 8290 + }, + { + "epoch": 0.4815921552699527, + "grad_norm": 0.11992644518613815, + "learning_rate": 0.0003429689756000362, + "loss": 2.9051, + "step": 8300 + }, + { + "epoch": 0.48217238678232616, + "grad_norm": 0.1110587939620018, + "learning_rate": 0.0003423991678386511, + "loss": 2.9046, + "step": 8310 + }, + { + "epoch": 0.48275261829469956, + "grad_norm": 0.11831989139318466, + "learning_rate": 0.00034182920395908837, + "loss": 2.9001, + "step": 8320 + }, + { + "epoch": 0.48333284980707303, + "grad_norm": 0.11492130905389786, + "learning_rate": 0.0003412590860600148, + "loss": 2.8944, + "step": 8330 + }, + { + "epoch": 0.48391308131944644, + "grad_norm": 0.12855441868305206, + "learning_rate": 0.00034068881624066405, + "loss": 2.8941, + "step": 8340 + }, + { + "epoch": 0.4844933128318199, + "grad_norm": 0.12829254567623138, + "learning_rate": 0.0003401183966008296, + "loss": 2.8989, + "step": 8350 + }, + { + "epoch": 0.4850735443441933, + "grad_norm": 0.1167573556303978, + "learning_rate": 0.00033954782924085604, + "loss": 2.9027, + "step": 8360 + }, + { + "epoch": 0.48565377585656677, + "grad_norm": 0.12906575202941895, + "learning_rate": 0.0003389771162616324, + "loss": 2.893, + "step": 8370 + }, + { + "epoch": 0.4862340073689402, + "grad_norm": 0.12219451367855072, + "learning_rate": 0.00033840625976458357, + "loss": 2.8971, + "step": 8380 + }, + { + "epoch": 0.48681423888131364, + "grad_norm": 0.1430503875017166, + "learning_rate": 0.00033783526185166295, + "loss": 2.8945, + "step": 8390 + }, + { + "epoch": 0.4873944703936871, + "grad_norm": 0.1279267519712448, + "learning_rate": 0.00033726412462534454, + "loss": 2.8969, + "step": 8400 + }, + { + "epoch": 0.4879747019060605, + "grad_norm": 0.1239406168460846, + "learning_rate": 0.00033669285018861567, + "loss": 2.8994, + "step": 8410 + }, + { + "epoch": 0.488554933418434, + "grad_norm": 0.1379164159297943, + "learning_rate": 0.00033612144064496853, + "loss": 2.8949, + "step": 8420 + }, + { + "epoch": 0.4891351649308074, + "grad_norm": 0.12819483876228333, + "learning_rate": 0.00033554989809839294, + "loss": 2.897, + "step": 8430 + }, + { + "epoch": 0.48971539644318085, + "grad_norm": 0.12451434880495071, + "learning_rate": 0.00033497822465336854, + "loss": 2.903, + "step": 8440 + }, + { + "epoch": 0.49029562795555426, + "grad_norm": 0.1466275155544281, + "learning_rate": 0.0003344064224148567, + "loss": 2.8912, + "step": 8450 + }, + { + "epoch": 0.4908758594679277, + "grad_norm": 0.12186205387115479, + "learning_rate": 0.0003338344934882932, + "loss": 2.8998, + "step": 8460 + }, + { + "epoch": 0.49145609098030113, + "grad_norm": 0.12687867879867554, + "learning_rate": 0.00033326243997958014, + "loss": 2.8983, + "step": 8470 + }, + { + "epoch": 0.4920363224926746, + "grad_norm": 0.12620693445205688, + "learning_rate": 0.00033269026399507874, + "loss": 2.895, + "step": 8480 + }, + { + "epoch": 0.492616554005048, + "grad_norm": 0.1362224668264389, + "learning_rate": 0.00033211796764160074, + "loss": 2.9007, + "step": 8490 + }, + { + "epoch": 0.49319678551742147, + "grad_norm": 0.1300470530986786, + "learning_rate": 0.00033154555302640135, + "loss": 2.8914, + "step": 8500 + }, + { + "epoch": 0.4937770170297949, + "grad_norm": 0.12057654559612274, + "learning_rate": 0.00033097302225717096, + "loss": 2.8971, + "step": 8510 + }, + { + "epoch": 0.49435724854216834, + "grad_norm": 0.13263335824012756, + "learning_rate": 0.00033040037744202805, + "loss": 2.8971, + "step": 8520 + }, + { + "epoch": 0.49493748005454175, + "grad_norm": 0.12660051882266998, + "learning_rate": 0.00032982762068951073, + "loss": 2.8914, + "step": 8530 + }, + { + "epoch": 0.4955177115669152, + "grad_norm": 0.12398383021354675, + "learning_rate": 0.0003292547541085694, + "loss": 2.8936, + "step": 8540 + }, + { + "epoch": 0.4960979430792886, + "grad_norm": 0.1229000836610794, + "learning_rate": 0.00032868177980855876, + "loss": 2.888, + "step": 8550 + }, + { + "epoch": 0.4966781745916621, + "grad_norm": 0.11801040917634964, + "learning_rate": 0.0003281086998992303, + "loss": 2.8909, + "step": 8560 + }, + { + "epoch": 0.4972584061040355, + "grad_norm": 0.12945981323719025, + "learning_rate": 0.0003275355164907241, + "loss": 2.8878, + "step": 8570 + }, + { + "epoch": 0.49783863761640895, + "grad_norm": 0.12002068758010864, + "learning_rate": 0.0003269622316935618, + "loss": 2.892, + "step": 8580 + }, + { + "epoch": 0.49841886912878236, + "grad_norm": 0.12449994683265686, + "learning_rate": 0.0003263888476186377, + "loss": 2.8912, + "step": 8590 + }, + { + "epoch": 0.4989991006411558, + "grad_norm": 0.13638156652450562, + "learning_rate": 0.0003258153663772124, + "loss": 2.8877, + "step": 8600 + }, + { + "epoch": 0.49957933215352923, + "grad_norm": 0.12280316650867462, + "learning_rate": 0.0003252417900809038, + "loss": 2.8879, + "step": 8610 + }, + { + "epoch": 0.5001595636659026, + "grad_norm": 0.12275322526693344, + "learning_rate": 0.0003246681208416797, + "loss": 2.8906, + "step": 8620 + }, + { + "epoch": 0.5007397951782762, + "grad_norm": 0.1220172718167305, + "learning_rate": 0.0003240943607718506, + "loss": 2.8952, + "step": 8630 + }, + { + "epoch": 0.5013200266906496, + "grad_norm": 0.11458177119493484, + "learning_rate": 0.00032352051198406104, + "loss": 2.902, + "step": 8640 + }, + { + "epoch": 0.501900258203023, + "grad_norm": 0.12652765214443207, + "learning_rate": 0.0003229465765912824, + "loss": 2.9038, + "step": 8650 + }, + { + "epoch": 0.5024804897153965, + "grad_norm": 0.12456042319536209, + "learning_rate": 0.000322372556706805, + "loss": 2.8844, + "step": 8660 + }, + { + "epoch": 0.5030607212277699, + "grad_norm": 0.13799023628234863, + "learning_rate": 0.0003217984544442301, + "loss": 2.8987, + "step": 8670 + }, + { + "epoch": 0.5036409527401433, + "grad_norm": 0.12474406510591507, + "learning_rate": 0.00032122427191746234, + "loss": 2.8976, + "step": 8680 + }, + { + "epoch": 0.5042211842525167, + "grad_norm": 0.12724703550338745, + "learning_rate": 0.00032065001124070207, + "loss": 2.8862, + "step": 8690 + }, + { + "epoch": 0.5048014157648902, + "grad_norm": 0.11946358531713486, + "learning_rate": 0.0003200756745284371, + "loss": 2.8926, + "step": 8700 + }, + { + "epoch": 0.5053816472772636, + "grad_norm": 0.1258503645658493, + "learning_rate": 0.0003195012638954354, + "loss": 2.8932, + "step": 8710 + }, + { + "epoch": 0.505961878789637, + "grad_norm": 0.12079302221536636, + "learning_rate": 0.00031892678145673724, + "loss": 2.8914, + "step": 8720 + }, + { + "epoch": 0.5065421103020105, + "grad_norm": 0.12168605625629425, + "learning_rate": 0.000318352229327647, + "loss": 2.8867, + "step": 8730 + }, + { + "epoch": 0.507122341814384, + "grad_norm": 0.13427579402923584, + "learning_rate": 0.00031777760962372584, + "loss": 2.8893, + "step": 8740 + }, + { + "epoch": 0.5077025733267574, + "grad_norm": 0.1176985576748848, + "learning_rate": 0.00031720292446078374, + "loss": 2.8887, + "step": 8750 + }, + { + "epoch": 0.5082828048391308, + "grad_norm": 0.12351604551076889, + "learning_rate": 0.00031662817595487166, + "loss": 2.8915, + "step": 8760 + }, + { + "epoch": 0.5088630363515042, + "grad_norm": 0.1390778124332428, + "learning_rate": 0.00031605336622227365, + "loss": 2.8737, + "step": 8770 + }, + { + "epoch": 0.5094432678638777, + "grad_norm": 0.11954103410243988, + "learning_rate": 0.00031547849737949957, + "loss": 2.8888, + "step": 8780 + }, + { + "epoch": 0.5100234993762511, + "grad_norm": 0.12293373793363571, + "learning_rate": 0.00031490357154327674, + "loss": 2.8814, + "step": 8790 + }, + { + "epoch": 0.5106037308886245, + "grad_norm": 0.12284509837627411, + "learning_rate": 0.0003143285908305422, + "loss": 2.8874, + "step": 8800 + }, + { + "epoch": 0.511183962400998, + "grad_norm": 0.11924895644187927, + "learning_rate": 0.00031375355735843523, + "loss": 2.8813, + "step": 8810 + }, + { + "epoch": 0.5117641939133715, + "grad_norm": 0.12003005295991898, + "learning_rate": 0.00031317847324428924, + "loss": 2.8836, + "step": 8820 + }, + { + "epoch": 0.5123444254257449, + "grad_norm": 0.13070861995220184, + "learning_rate": 0.00031260334060562416, + "loss": 2.8851, + "step": 8830 + }, + { + "epoch": 0.5129246569381183, + "grad_norm": 0.11900255084037781, + "learning_rate": 0.0003120281615601387, + "loss": 2.8827, + "step": 8840 + }, + { + "epoch": 0.5135048884504917, + "grad_norm": 0.12470702081918716, + "learning_rate": 0.0003114529382257024, + "loss": 2.8916, + "step": 8850 + }, + { + "epoch": 0.5140851199628652, + "grad_norm": 0.1312616765499115, + "learning_rate": 0.0003108776727203478, + "loss": 2.897, + "step": 8860 + }, + { + "epoch": 0.5146653514752386, + "grad_norm": 0.13872870802879333, + "learning_rate": 0.00031030236716226265, + "loss": 2.8836, + "step": 8870 + }, + { + "epoch": 0.515245582987612, + "grad_norm": 0.11608674377202988, + "learning_rate": 0.00030972702366978237, + "loss": 2.8875, + "step": 8880 + }, + { + "epoch": 0.5158258144999855, + "grad_norm": 0.12205769121646881, + "learning_rate": 0.000309151644361382, + "loss": 2.8862, + "step": 8890 + }, + { + "epoch": 0.516406046012359, + "grad_norm": 0.12009671330451965, + "learning_rate": 0.0003085762313556683, + "loss": 2.8797, + "step": 8900 + }, + { + "epoch": 0.5169862775247324, + "grad_norm": 0.12120591104030609, + "learning_rate": 0.0003080007867713724, + "loss": 2.8905, + "step": 8910 + }, + { + "epoch": 0.5175665090371058, + "grad_norm": 0.12842518091201782, + "learning_rate": 0.00030742531272734153, + "loss": 2.8747, + "step": 8920 + }, + { + "epoch": 0.5181467405494793, + "grad_norm": 0.12532438337802887, + "learning_rate": 0.00030684981134253123, + "loss": 2.8892, + "step": 8930 + }, + { + "epoch": 0.5187269720618527, + "grad_norm": 0.1295221596956253, + "learning_rate": 0.0003062742847359981, + "loss": 2.8842, + "step": 8940 + }, + { + "epoch": 0.5193072035742261, + "grad_norm": 0.1296953707933426, + "learning_rate": 0.00030569873502689116, + "loss": 2.878, + "step": 8950 + }, + { + "epoch": 0.5198874350865995, + "grad_norm": 0.14120282232761383, + "learning_rate": 0.00030512316433444495, + "loss": 2.8809, + "step": 8960 + }, + { + "epoch": 0.520467666598973, + "grad_norm": 0.12610268592834473, + "learning_rate": 0.000304547574777971, + "loss": 2.8794, + "step": 8970 + }, + { + "epoch": 0.5210478981113464, + "grad_norm": 0.11908390372991562, + "learning_rate": 0.0003039719684768503, + "loss": 2.8839, + "step": 8980 + }, + { + "epoch": 0.5216281296237198, + "grad_norm": 0.13508306443691254, + "learning_rate": 0.0003033963475505256, + "loss": 2.8782, + "step": 8990 + }, + { + "epoch": 0.5222083611360933, + "grad_norm": 0.12108524888753891, + "learning_rate": 0.00030282071411849343, + "loss": 2.879, + "step": 9000 + }, + { + "epoch": 0.5222083611360933, + "eval_loss": 2.845144271850586, + "eval_runtime": 3.2553, + "eval_samples_per_second": 1330.14, + "eval_steps_per_second": 2.765, + "step": 9000 + }, + { + "epoch": 0.5227885926484668, + "grad_norm": 0.13046176731586456, + "learning_rate": 0.00030224507030029627, + "loss": 2.8809, + "step": 9010 + }, + { + "epoch": 0.5233688241608402, + "grad_norm": 0.12113803625106812, + "learning_rate": 0.0003016694182155152, + "loss": 2.8839, + "step": 9020 + }, + { + "epoch": 0.5239490556732136, + "grad_norm": 0.12337899953126907, + "learning_rate": 0.0003010937599837613, + "loss": 2.8821, + "step": 9030 + }, + { + "epoch": 0.524529287185587, + "grad_norm": 0.11981160938739777, + "learning_rate": 0.0003005180977246686, + "loss": 2.888, + "step": 9040 + }, + { + "epoch": 0.5251095186979605, + "grad_norm": 0.12357629835605621, + "learning_rate": 0.0002999424335578858, + "loss": 2.8804, + "step": 9050 + }, + { + "epoch": 0.5256897502103339, + "grad_norm": 0.11688230186700821, + "learning_rate": 0.00029936676960306863, + "loss": 2.8891, + "step": 9060 + }, + { + "epoch": 0.5262699817227073, + "grad_norm": 0.11743608117103577, + "learning_rate": 0.0002987911079798723, + "loss": 2.8685, + "step": 9070 + }, + { + "epoch": 0.5268502132350807, + "grad_norm": 0.1338096410036087, + "learning_rate": 0.0002982154508079428, + "loss": 2.8758, + "step": 9080 + }, + { + "epoch": 0.5274304447474543, + "grad_norm": 0.13182982802391052, + "learning_rate": 0.0002976398002069105, + "loss": 2.882, + "step": 9090 + }, + { + "epoch": 0.5280106762598277, + "grad_norm": 0.12470164895057678, + "learning_rate": 0.000297064158296381, + "loss": 2.8817, + "step": 9100 + }, + { + "epoch": 0.5285909077722011, + "grad_norm": 0.11741513013839722, + "learning_rate": 0.0002964885271959282, + "loss": 2.8768, + "step": 9110 + }, + { + "epoch": 0.5291711392845746, + "grad_norm": 0.1364392340183258, + "learning_rate": 0.0002959129090250863, + "loss": 2.8822, + "step": 9120 + }, + { + "epoch": 0.529751370796948, + "grad_norm": 0.12005024403333664, + "learning_rate": 0.0002953373059033413, + "loss": 2.8789, + "step": 9130 + }, + { + "epoch": 0.5303316023093214, + "grad_norm": 0.1239180713891983, + "learning_rate": 0.0002947617199501245, + "loss": 2.8754, + "step": 9140 + }, + { + "epoch": 0.5309118338216948, + "grad_norm": 0.12774530053138733, + "learning_rate": 0.00029418615328480357, + "loss": 2.8773, + "step": 9150 + }, + { + "epoch": 0.5314920653340683, + "grad_norm": 0.11815381795167923, + "learning_rate": 0.00029361060802667526, + "loss": 2.8711, + "step": 9160 + }, + { + "epoch": 0.5320722968464418, + "grad_norm": 0.12450312077999115, + "learning_rate": 0.0002930350862949577, + "loss": 2.8743, + "step": 9170 + }, + { + "epoch": 0.5326525283588152, + "grad_norm": 0.12741632759571075, + "learning_rate": 0.00029245959020878187, + "loss": 2.8846, + "step": 9180 + }, + { + "epoch": 0.5332327598711886, + "grad_norm": 0.12712997198104858, + "learning_rate": 0.0002918841218871848, + "loss": 2.8774, + "step": 9190 + }, + { + "epoch": 0.5338129913835621, + "grad_norm": 0.11238303780555725, + "learning_rate": 0.0002913086834491012, + "loss": 2.8782, + "step": 9200 + }, + { + "epoch": 0.5343932228959355, + "grad_norm": 0.1266774982213974, + "learning_rate": 0.00029073327701335566, + "loss": 2.883, + "step": 9210 + }, + { + "epoch": 0.5349734544083089, + "grad_norm": 0.12266207486391068, + "learning_rate": 0.00029015790469865484, + "loss": 2.8735, + "step": 9220 + }, + { + "epoch": 0.5355536859206823, + "grad_norm": 0.10979332774877548, + "learning_rate": 0.0002895825686235799, + "loss": 2.8791, + "step": 9230 + }, + { + "epoch": 0.5361339174330558, + "grad_norm": 0.11939531564712524, + "learning_rate": 0.0002890072709065787, + "loss": 2.8745, + "step": 9240 + }, + { + "epoch": 0.5367141489454292, + "grad_norm": 0.12080537527799606, + "learning_rate": 0.0002884320136659575, + "loss": 2.8775, + "step": 9250 + }, + { + "epoch": 0.5372943804578026, + "grad_norm": 0.12394317239522934, + "learning_rate": 0.00028785679901987394, + "loss": 2.8734, + "step": 9260 + }, + { + "epoch": 0.537874611970176, + "grad_norm": 0.12320924550294876, + "learning_rate": 0.0002872816290863283, + "loss": 2.8703, + "step": 9270 + }, + { + "epoch": 0.5384548434825496, + "grad_norm": 0.12183520197868347, + "learning_rate": 0.0002867065059831568, + "loss": 2.8731, + "step": 9280 + }, + { + "epoch": 0.539035074994923, + "grad_norm": 0.13638751208782196, + "learning_rate": 0.0002861314318280229, + "loss": 2.8725, + "step": 9290 + }, + { + "epoch": 0.5396153065072964, + "grad_norm": 0.12684093415737152, + "learning_rate": 0.0002855564087384098, + "loss": 2.8714, + "step": 9300 + }, + { + "epoch": 0.5401955380196698, + "grad_norm": 0.11322664469480515, + "learning_rate": 0.00028498143883161277, + "loss": 2.8693, + "step": 9310 + }, + { + "epoch": 0.5407757695320433, + "grad_norm": 0.11759771406650543, + "learning_rate": 0.00028440652422473124, + "loss": 2.8679, + "step": 9320 + }, + { + "epoch": 0.5413560010444167, + "grad_norm": 0.12511123716831207, + "learning_rate": 0.0002838316670346612, + "loss": 2.8744, + "step": 9330 + }, + { + "epoch": 0.5419362325567901, + "grad_norm": 0.1160508468747139, + "learning_rate": 0.00028325686937808673, + "loss": 2.874, + "step": 9340 + }, + { + "epoch": 0.5425164640691637, + "grad_norm": 0.11813979595899582, + "learning_rate": 0.0002826821333714732, + "loss": 2.8691, + "step": 9350 + }, + { + "epoch": 0.5430966955815371, + "grad_norm": 0.11728700250387192, + "learning_rate": 0.0002821074611310588, + "loss": 2.8717, + "step": 9360 + }, + { + "epoch": 0.5436769270939105, + "grad_norm": 0.12824493646621704, + "learning_rate": 0.0002815328547728469, + "loss": 2.875, + "step": 9370 + }, + { + "epoch": 0.5442571586062839, + "grad_norm": 0.12653270363807678, + "learning_rate": 0.0002809583164125983, + "loss": 2.8682, + "step": 9380 + }, + { + "epoch": 0.5448373901186574, + "grad_norm": 0.13113363087177277, + "learning_rate": 0.00028038384816582337, + "loss": 2.8583, + "step": 9390 + }, + { + "epoch": 0.5454176216310308, + "grad_norm": 0.11145169287919998, + "learning_rate": 0.0002798094521477744, + "loss": 2.8714, + "step": 9400 + }, + { + "epoch": 0.5459978531434042, + "grad_norm": 0.12025914341211319, + "learning_rate": 0.0002792351304734378, + "loss": 2.8689, + "step": 9410 + }, + { + "epoch": 0.5465780846557776, + "grad_norm": 0.1347450315952301, + "learning_rate": 0.000278660885257526, + "loss": 2.8803, + "step": 9420 + }, + { + "epoch": 0.5471583161681511, + "grad_norm": 0.11728854477405548, + "learning_rate": 0.0002780867186144703, + "loss": 2.8614, + "step": 9430 + }, + { + "epoch": 0.5477385476805245, + "grad_norm": 0.1399793028831482, + "learning_rate": 0.00027751263265841204, + "loss": 2.8777, + "step": 9440 + }, + { + "epoch": 0.548318779192898, + "grad_norm": 0.13229645788669586, + "learning_rate": 0.0002769386295031961, + "loss": 2.8723, + "step": 9450 + }, + { + "epoch": 0.5488990107052714, + "grad_norm": 0.12199070304632187, + "learning_rate": 0.00027636471126236213, + "loss": 2.8577, + "step": 9460 + }, + { + "epoch": 0.5494792422176449, + "grad_norm": 0.14131730794906616, + "learning_rate": 0.0002757908800491373, + "loss": 2.857, + "step": 9470 + }, + { + "epoch": 0.5500594737300183, + "grad_norm": 0.1343252956867218, + "learning_rate": 0.0002752171379764283, + "loss": 2.8689, + "step": 9480 + }, + { + "epoch": 0.5506397052423917, + "grad_norm": 0.1338685154914856, + "learning_rate": 0.0002746434871568133, + "loss": 2.8775, + "step": 9490 + }, + { + "epoch": 0.5512199367547651, + "grad_norm": 0.12388128787279129, + "learning_rate": 0.00027406992970253506, + "loss": 2.8761, + "step": 9500 + }, + { + "epoch": 0.5518001682671386, + "grad_norm": 0.12272147834300995, + "learning_rate": 0.0002734964677254918, + "loss": 2.8722, + "step": 9510 + }, + { + "epoch": 0.552380399779512, + "grad_norm": 0.12000911682844162, + "learning_rate": 0.00027292310333723086, + "loss": 2.8743, + "step": 9520 + }, + { + "epoch": 0.5529606312918854, + "grad_norm": 0.13635672628879547, + "learning_rate": 0.00027234983864894, + "loss": 2.8657, + "step": 9530 + }, + { + "epoch": 0.5535408628042588, + "grad_norm": 0.12129581719636917, + "learning_rate": 0.0002717766757714398, + "loss": 2.8661, + "step": 9540 + }, + { + "epoch": 0.5541210943166324, + "grad_norm": 0.11717355996370316, + "learning_rate": 0.00027120361681517606, + "loss": 2.8707, + "step": 9550 + }, + { + "epoch": 0.5547013258290058, + "grad_norm": 0.12199341505765915, + "learning_rate": 0.0002706306638902117, + "loss": 2.8555, + "step": 9560 + }, + { + "epoch": 0.5552815573413792, + "grad_norm": 0.1175154522061348, + "learning_rate": 0.0002700578191062196, + "loss": 2.8721, + "step": 9570 + }, + { + "epoch": 0.5558617888537526, + "grad_norm": 0.12546683847904205, + "learning_rate": 0.00026948508457247416, + "loss": 2.8689, + "step": 9580 + }, + { + "epoch": 0.5564420203661261, + "grad_norm": 0.11439734697341919, + "learning_rate": 0.000268912462397844, + "loss": 2.8552, + "step": 9590 + }, + { + "epoch": 0.5570222518784995, + "grad_norm": 0.13139833509922028, + "learning_rate": 0.00026833995469078404, + "loss": 2.8728, + "step": 9600 + }, + { + "epoch": 0.5576024833908729, + "grad_norm": 0.14722158014774323, + "learning_rate": 0.00026776756355932743, + "loss": 2.8594, + "step": 9610 + }, + { + "epoch": 0.5581827149032464, + "grad_norm": 0.12206868082284927, + "learning_rate": 0.00026719529111107846, + "loss": 2.8713, + "step": 9620 + }, + { + "epoch": 0.5587629464156199, + "grad_norm": 0.11777371913194656, + "learning_rate": 0.00026662313945320404, + "loss": 2.8656, + "step": 9630 + }, + { + "epoch": 0.5593431779279933, + "grad_norm": 0.12058188021183014, + "learning_rate": 0.00026605111069242664, + "loss": 2.8712, + "step": 9640 + }, + { + "epoch": 0.5599234094403667, + "grad_norm": 0.1278459131717682, + "learning_rate": 0.00026547920693501616, + "loss": 2.8686, + "step": 9650 + }, + { + "epoch": 0.5605036409527402, + "grad_norm": 0.12272592633962631, + "learning_rate": 0.00026490743028678194, + "loss": 2.8636, + "step": 9660 + }, + { + "epoch": 0.5610838724651136, + "grad_norm": 0.11543965339660645, + "learning_rate": 0.00026433578285306567, + "loss": 2.8592, + "step": 9670 + }, + { + "epoch": 0.561664103977487, + "grad_norm": 0.11765621602535248, + "learning_rate": 0.0002637642667387329, + "loss": 2.867, + "step": 9680 + }, + { + "epoch": 0.5622443354898604, + "grad_norm": 0.12996822595596313, + "learning_rate": 0.0002631928840481662, + "loss": 2.8669, + "step": 9690 + }, + { + "epoch": 0.5628245670022339, + "grad_norm": 0.11992313712835312, + "learning_rate": 0.00026262163688525606, + "loss": 2.8576, + "step": 9700 + }, + { + "epoch": 0.5634047985146073, + "grad_norm": 0.1216612309217453, + "learning_rate": 0.00026205052735339457, + "loss": 2.8656, + "step": 9710 + }, + { + "epoch": 0.5639850300269807, + "grad_norm": 0.11923664063215256, + "learning_rate": 0.00026147955755546686, + "loss": 2.8625, + "step": 9720 + }, + { + "epoch": 0.5645652615393542, + "grad_norm": 0.1174679845571518, + "learning_rate": 0.00026090872959384353, + "loss": 2.8589, + "step": 9730 + }, + { + "epoch": 0.5651454930517277, + "grad_norm": 0.12439408898353577, + "learning_rate": 0.00026033804557037304, + "loss": 2.8573, + "step": 9740 + }, + { + "epoch": 0.5657257245641011, + "grad_norm": 0.12268688529729843, + "learning_rate": 0.0002597675075863735, + "loss": 2.8612, + "step": 9750 + }, + { + "epoch": 0.5663059560764745, + "grad_norm": 0.11994469910860062, + "learning_rate": 0.0002591971177426256, + "loss": 2.8667, + "step": 9760 + }, + { + "epoch": 0.5668861875888479, + "grad_norm": 0.12739793956279755, + "learning_rate": 0.0002586268781393648, + "loss": 2.8657, + "step": 9770 + }, + { + "epoch": 0.5674664191012214, + "grad_norm": 0.12942016124725342, + "learning_rate": 0.00025805679087627267, + "loss": 2.863, + "step": 9780 + }, + { + "epoch": 0.5680466506135948, + "grad_norm": 0.12867708504199982, + "learning_rate": 0.00025748685805247046, + "loss": 2.8596, + "step": 9790 + }, + { + "epoch": 0.5686268821259682, + "grad_norm": 0.1384700983762741, + "learning_rate": 0.00025691708176651034, + "loss": 2.8612, + "step": 9800 + }, + { + "epoch": 0.5692071136383416, + "grad_norm": 0.11695626378059387, + "learning_rate": 0.0002563474641163686, + "loss": 2.8613, + "step": 9810 + }, + { + "epoch": 0.5697873451507152, + "grad_norm": 0.12379258126020432, + "learning_rate": 0.0002557780071994367, + "loss": 2.8637, + "step": 9820 + }, + { + "epoch": 0.5703675766630886, + "grad_norm": 0.13220758736133575, + "learning_rate": 0.00025520871311251493, + "loss": 2.8572, + "step": 9830 + }, + { + "epoch": 0.570947808175462, + "grad_norm": 0.12004509568214417, + "learning_rate": 0.00025463958395180377, + "loss": 2.8614, + "step": 9840 + }, + { + "epoch": 0.5715280396878355, + "grad_norm": 0.12457242608070374, + "learning_rate": 0.0002540706218128962, + "loss": 2.8606, + "step": 9850 + }, + { + "epoch": 0.5721082712002089, + "grad_norm": 0.125260129570961, + "learning_rate": 0.0002535018287907707, + "loss": 2.8606, + "step": 9860 + }, + { + "epoch": 0.5726885027125823, + "grad_norm": 0.11718660593032837, + "learning_rate": 0.00025293320697978254, + "loss": 2.86, + "step": 9870 + }, + { + "epoch": 0.5732687342249557, + "grad_norm": 0.1096329316496849, + "learning_rate": 0.0002523647584736568, + "loss": 2.8743, + "step": 9880 + }, + { + "epoch": 0.5738489657373292, + "grad_norm": 0.11327598243951797, + "learning_rate": 0.0002517964853654806, + "loss": 2.8492, + "step": 9890 + }, + { + "epoch": 0.5744291972497026, + "grad_norm": 0.1237105280160904, + "learning_rate": 0.0002512283897476949, + "loss": 2.852, + "step": 9900 + }, + { + "epoch": 0.5750094287620761, + "grad_norm": 0.11739984154701233, + "learning_rate": 0.0002506604737120874, + "loss": 2.8535, + "step": 9910 + }, + { + "epoch": 0.5755896602744495, + "grad_norm": 0.12682320177555084, + "learning_rate": 0.00025009273934978424, + "loss": 2.8575, + "step": 9920 + }, + { + "epoch": 0.576169891786823, + "grad_norm": 0.12347414344549179, + "learning_rate": 0.00024952518875124305, + "loss": 2.8596, + "step": 9930 + }, + { + "epoch": 0.5767501232991964, + "grad_norm": 0.11207421123981476, + "learning_rate": 0.0002489578240062444, + "loss": 2.8563, + "step": 9940 + }, + { + "epoch": 0.5773303548115698, + "grad_norm": 0.12151192873716354, + "learning_rate": 0.0002483906472038848, + "loss": 2.8513, + "step": 9950 + }, + { + "epoch": 0.5779105863239432, + "grad_norm": 0.11661417037248611, + "learning_rate": 0.00024782366043256876, + "loss": 2.8538, + "step": 9960 + }, + { + "epoch": 0.5784908178363167, + "grad_norm": 0.11908597499132156, + "learning_rate": 0.0002472568657800007, + "loss": 2.8549, + "step": 9970 + }, + { + "epoch": 0.5790710493486901, + "grad_norm": 0.12369140982627869, + "learning_rate": 0.00024669026533317816, + "loss": 2.859, + "step": 9980 + }, + { + "epoch": 0.5796512808610635, + "grad_norm": 0.12169597297906876, + "learning_rate": 0.0002461238611783832, + "loss": 2.8516, + "step": 9990 + }, + { + "epoch": 0.580231512373437, + "grad_norm": 0.1137092188000679, + "learning_rate": 0.0002455576554011753, + "loss": 2.8506, + "step": 10000 + }, + { + "epoch": 0.580231512373437, + "eval_loss": 2.8198139667510986, + "eval_runtime": 3.2544, + "eval_samples_per_second": 1330.504, + "eval_steps_per_second": 2.765, + "step": 10000 + }, + { + "epoch": 0.5808117438858105, + "grad_norm": 0.11945224553346634, + "learning_rate": 0.00024499165008638355, + "loss": 2.8527, + "step": 10010 + }, + { + "epoch": 0.5813919753981839, + "grad_norm": 0.12194681167602539, + "learning_rate": 0.0002444258473180986, + "loss": 2.8676, + "step": 10020 + }, + { + "epoch": 0.5819722069105573, + "grad_norm": 0.12587039172649384, + "learning_rate": 0.00024386024917966563, + "loss": 2.8468, + "step": 10030 + }, + { + "epoch": 0.5825524384229307, + "grad_norm": 0.12192162871360779, + "learning_rate": 0.0002432948577536762, + "loss": 2.8484, + "step": 10040 + }, + { + "epoch": 0.5831326699353042, + "grad_norm": 0.11401449888944626, + "learning_rate": 0.00024272967512196093, + "loss": 2.8636, + "step": 10050 + }, + { + "epoch": 0.5837129014476776, + "grad_norm": 0.12227935343980789, + "learning_rate": 0.0002421647033655812, + "loss": 2.8497, + "step": 10060 + }, + { + "epoch": 0.584293132960051, + "grad_norm": 0.11773716658353806, + "learning_rate": 0.00024159994456482233, + "loss": 2.857, + "step": 10070 + }, + { + "epoch": 0.5848733644724246, + "grad_norm": 0.124253049492836, + "learning_rate": 0.00024103540079918555, + "loss": 2.8499, + "step": 10080 + }, + { + "epoch": 0.585453595984798, + "grad_norm": 0.11704014986753464, + "learning_rate": 0.00024047107414737985, + "loss": 2.8522, + "step": 10090 + }, + { + "epoch": 0.5860338274971714, + "grad_norm": 0.11885286867618561, + "learning_rate": 0.0002399069666873153, + "loss": 2.855, + "step": 10100 + }, + { + "epoch": 0.5866140590095448, + "grad_norm": 0.12006965279579163, + "learning_rate": 0.00023934308049609453, + "loss": 2.8488, + "step": 10110 + }, + { + "epoch": 0.5871942905219183, + "grad_norm": 0.12023113667964935, + "learning_rate": 0.00023877941765000564, + "loss": 2.8542, + "step": 10120 + }, + { + "epoch": 0.5877745220342917, + "grad_norm": 0.12737338244915009, + "learning_rate": 0.00023821598022451436, + "loss": 2.8588, + "step": 10130 + }, + { + "epoch": 0.5883547535466651, + "grad_norm": 0.11698620766401291, + "learning_rate": 0.00023765277029425607, + "loss": 2.8544, + "step": 10140 + }, + { + "epoch": 0.5889349850590385, + "grad_norm": 0.12589864432811737, + "learning_rate": 0.000237089789933029, + "loss": 2.8448, + "step": 10150 + }, + { + "epoch": 0.589515216571412, + "grad_norm": 0.11532309651374817, + "learning_rate": 0.0002365270412137856, + "loss": 2.8618, + "step": 10160 + }, + { + "epoch": 0.5900954480837854, + "grad_norm": 0.10937913507223129, + "learning_rate": 0.00023596452620862585, + "loss": 2.8527, + "step": 10170 + }, + { + "epoch": 0.5906756795961589, + "grad_norm": 0.11980416625738144, + "learning_rate": 0.00023540224698878861, + "loss": 2.8553, + "step": 10180 + }, + { + "epoch": 0.5912559111085323, + "grad_norm": 0.11810686439275742, + "learning_rate": 0.00023484020562464507, + "loss": 2.8545, + "step": 10190 + }, + { + "epoch": 0.5918361426209058, + "grad_norm": 0.11651547253131866, + "learning_rate": 0.00023427840418569043, + "loss": 2.8522, + "step": 10200 + }, + { + "epoch": 0.5924163741332792, + "grad_norm": 0.11145967990159988, + "learning_rate": 0.00023371684474053633, + "loss": 2.8564, + "step": 10210 + }, + { + "epoch": 0.5929966056456526, + "grad_norm": 0.11742381006479263, + "learning_rate": 0.0002331555293569037, + "loss": 2.8529, + "step": 10220 + }, + { + "epoch": 0.593576837158026, + "grad_norm": 0.1287650465965271, + "learning_rate": 0.00023259446010161425, + "loss": 2.847, + "step": 10230 + }, + { + "epoch": 0.5941570686703995, + "grad_norm": 0.12560808658599854, + "learning_rate": 0.00023203363904058394, + "loss": 2.8424, + "step": 10240 + }, + { + "epoch": 0.5947373001827729, + "grad_norm": 0.13144509494304657, + "learning_rate": 0.0002314730682388147, + "loss": 2.8497, + "step": 10250 + }, + { + "epoch": 0.5953175316951463, + "grad_norm": 0.11483640223741531, + "learning_rate": 0.00023091274976038686, + "loss": 2.8525, + "step": 10260 + }, + { + "epoch": 0.5958977632075197, + "grad_norm": 0.12085619568824768, + "learning_rate": 0.0002303526856684519, + "loss": 2.846, + "step": 10270 + }, + { + "epoch": 0.5964779947198933, + "grad_norm": 0.13581375777721405, + "learning_rate": 0.00022979287802522423, + "loss": 2.8471, + "step": 10280 + }, + { + "epoch": 0.5970582262322667, + "grad_norm": 0.11522037535905838, + "learning_rate": 0.00022923332889197447, + "loss": 2.841, + "step": 10290 + }, + { + "epoch": 0.5976384577446401, + "grad_norm": 0.1114853248000145, + "learning_rate": 0.00022867404032902097, + "loss": 2.8507, + "step": 10300 + }, + { + "epoch": 0.5982186892570136, + "grad_norm": 0.1106984093785286, + "learning_rate": 0.00022811501439572288, + "loss": 2.8501, + "step": 10310 + }, + { + "epoch": 0.598798920769387, + "grad_norm": 0.12095363438129425, + "learning_rate": 0.0002275562531504724, + "loss": 2.8392, + "step": 10320 + }, + { + "epoch": 0.5993791522817604, + "grad_norm": 0.11527710407972336, + "learning_rate": 0.00022699775865068667, + "loss": 2.8498, + "step": 10330 + }, + { + "epoch": 0.5999593837941338, + "grad_norm": 0.11631615459918976, + "learning_rate": 0.00022643953295280127, + "loss": 2.8526, + "step": 10340 + }, + { + "epoch": 0.6005396153065073, + "grad_norm": 0.1107979491353035, + "learning_rate": 0.0002258815781122614, + "loss": 2.8488, + "step": 10350 + }, + { + "epoch": 0.6011198468188808, + "grad_norm": 0.1126491129398346, + "learning_rate": 0.00022532389618351532, + "loss": 2.8404, + "step": 10360 + }, + { + "epoch": 0.6017000783312542, + "grad_norm": 0.11740950495004654, + "learning_rate": 0.00022476648922000646, + "loss": 2.8499, + "step": 10370 + }, + { + "epoch": 0.6022803098436276, + "grad_norm": 0.11938904970884323, + "learning_rate": 0.00022420935927416547, + "loss": 2.8547, + "step": 10380 + }, + { + "epoch": 0.6028605413560011, + "grad_norm": 0.11484769731760025, + "learning_rate": 0.00022365250839740338, + "loss": 2.8392, + "step": 10390 + }, + { + "epoch": 0.6034407728683745, + "grad_norm": 0.12051428109407425, + "learning_rate": 0.0002230959386401032, + "loss": 2.8416, + "step": 10400 + }, + { + "epoch": 0.6040210043807479, + "grad_norm": 0.12364054471254349, + "learning_rate": 0.00022253965205161326, + "loss": 2.8343, + "step": 10410 + }, + { + "epoch": 0.6046012358931213, + "grad_norm": 0.1125280112028122, + "learning_rate": 0.00022198365068023892, + "loss": 2.8441, + "step": 10420 + }, + { + "epoch": 0.6051814674054948, + "grad_norm": 0.11715447157621384, + "learning_rate": 0.00022142793657323558, + "loss": 2.8391, + "step": 10430 + }, + { + "epoch": 0.6057616989178682, + "grad_norm": 0.11433437466621399, + "learning_rate": 0.00022087251177680086, + "loss": 2.8549, + "step": 10440 + }, + { + "epoch": 0.6063419304302416, + "grad_norm": 0.1222948208451271, + "learning_rate": 0.00022031737833606686, + "loss": 2.8406, + "step": 10450 + }, + { + "epoch": 0.6069221619426151, + "grad_norm": 0.11805406212806702, + "learning_rate": 0.0002197625382950932, + "loss": 2.8415, + "step": 10460 + }, + { + "epoch": 0.6075023934549886, + "grad_norm": 0.13002602756023407, + "learning_rate": 0.00021920799369685892, + "loss": 2.851, + "step": 10470 + }, + { + "epoch": 0.608082624967362, + "grad_norm": 0.11929357796907425, + "learning_rate": 0.00021865374658325544, + "loss": 2.8437, + "step": 10480 + }, + { + "epoch": 0.6086628564797354, + "grad_norm": 0.11752030998468399, + "learning_rate": 0.00021809979899507876, + "loss": 2.8532, + "step": 10490 + }, + { + "epoch": 0.6092430879921088, + "grad_norm": 0.12201694399118423, + "learning_rate": 0.00021754615297202168, + "loss": 2.8474, + "step": 10500 + }, + { + "epoch": 0.6098233195044823, + "grad_norm": 0.12019883096218109, + "learning_rate": 0.00021699281055266706, + "loss": 2.8422, + "step": 10510 + }, + { + "epoch": 0.6104035510168557, + "grad_norm": 0.12413442134857178, + "learning_rate": 0.00021643977377447954, + "loss": 2.8316, + "step": 10520 + }, + { + "epoch": 0.6109837825292291, + "grad_norm": 0.11983013898134232, + "learning_rate": 0.00021588704467379862, + "loss": 2.8448, + "step": 10530 + }, + { + "epoch": 0.6115640140416027, + "grad_norm": 0.13365738093852997, + "learning_rate": 0.0002153346252858306, + "loss": 2.837, + "step": 10540 + }, + { + "epoch": 0.6121442455539761, + "grad_norm": 0.13185539841651917, + "learning_rate": 0.00021478251764464148, + "loss": 2.8468, + "step": 10550 + }, + { + "epoch": 0.6127244770663495, + "grad_norm": 0.1213960349559784, + "learning_rate": 0.00021423072378314964, + "loss": 2.8444, + "step": 10560 + }, + { + "epoch": 0.6133047085787229, + "grad_norm": 0.12037312239408493, + "learning_rate": 0.00021367924573311773, + "loss": 2.8438, + "step": 10570 + }, + { + "epoch": 0.6138849400910964, + "grad_norm": 0.12542636692523956, + "learning_rate": 0.00021312808552514592, + "loss": 2.8424, + "step": 10580 + }, + { + "epoch": 0.6144651716034698, + "grad_norm": 0.14415085315704346, + "learning_rate": 0.00021257724518866352, + "loss": 2.8417, + "step": 10590 + }, + { + "epoch": 0.6150454031158432, + "grad_norm": 0.1150176003575325, + "learning_rate": 0.00021202672675192248, + "loss": 2.8435, + "step": 10600 + }, + { + "epoch": 0.6156256346282166, + "grad_norm": 0.11662835627794266, + "learning_rate": 0.00021147653224198951, + "loss": 2.8441, + "step": 10610 + }, + { + "epoch": 0.6162058661405901, + "grad_norm": 0.11693531274795532, + "learning_rate": 0.00021092666368473817, + "loss": 2.8391, + "step": 10620 + }, + { + "epoch": 0.6167860976529635, + "grad_norm": 0.11077579110860825, + "learning_rate": 0.0002103771231048423, + "loss": 2.8345, + "step": 10630 + }, + { + "epoch": 0.617366329165337, + "grad_norm": 0.11653861403465271, + "learning_rate": 0.00020982791252576773, + "loss": 2.8448, + "step": 10640 + }, + { + "epoch": 0.6179465606777104, + "grad_norm": 0.11749275773763657, + "learning_rate": 0.00020927903396976552, + "loss": 2.8558, + "step": 10650 + }, + { + "epoch": 0.6185267921900839, + "grad_norm": 0.11677636206150055, + "learning_rate": 0.00020873048945786382, + "loss": 2.8353, + "step": 10660 + }, + { + "epoch": 0.6191070237024573, + "grad_norm": 0.11745753139257431, + "learning_rate": 0.00020818228100986106, + "loss": 2.8494, + "step": 10670 + }, + { + "epoch": 0.6196872552148307, + "grad_norm": 0.11747489869594574, + "learning_rate": 0.00020763441064431827, + "loss": 2.8397, + "step": 10680 + }, + { + "epoch": 0.6202674867272041, + "grad_norm": 0.11356910318136215, + "learning_rate": 0.00020708688037855138, + "loss": 2.8472, + "step": 10690 + }, + { + "epoch": 0.6208477182395776, + "grad_norm": 0.11063719540834427, + "learning_rate": 0.00020653969222862435, + "loss": 2.8508, + "step": 10700 + }, + { + "epoch": 0.621427949751951, + "grad_norm": 0.10978058725595474, + "learning_rate": 0.00020599284820934112, + "loss": 2.8308, + "step": 10710 + }, + { + "epoch": 0.6220081812643244, + "grad_norm": 0.11860186606645584, + "learning_rate": 0.00020544635033423867, + "loss": 2.8263, + "step": 10720 + }, + { + "epoch": 0.6225884127766979, + "grad_norm": 0.1312050074338913, + "learning_rate": 0.00020490020061557953, + "loss": 2.8455, + "step": 10730 + }, + { + "epoch": 0.6231686442890714, + "grad_norm": 0.13181331753730774, + "learning_rate": 0.00020435440106434408, + "loss": 2.8489, + "step": 10740 + }, + { + "epoch": 0.6237488758014448, + "grad_norm": 0.1471181958913803, + "learning_rate": 0.00020380895369022357, + "loss": 2.8285, + "step": 10750 + }, + { + "epoch": 0.6243291073138182, + "grad_norm": 0.12075991183519363, + "learning_rate": 0.00020326386050161215, + "loss": 2.8402, + "step": 10760 + }, + { + "epoch": 0.6249093388261916, + "grad_norm": 0.1117480993270874, + "learning_rate": 0.0002027191235056003, + "loss": 2.8426, + "step": 10770 + }, + { + "epoch": 0.6254895703385651, + "grad_norm": 0.11622477322816849, + "learning_rate": 0.0002021747447079665, + "loss": 2.8423, + "step": 10780 + }, + { + "epoch": 0.6260698018509385, + "grad_norm": 0.11475232988595963, + "learning_rate": 0.00020163072611317055, + "loss": 2.835, + "step": 10790 + }, + { + "epoch": 0.6266500333633119, + "grad_norm": 0.12252891808748245, + "learning_rate": 0.00020108706972434606, + "loss": 2.8381, + "step": 10800 + }, + { + "epoch": 0.6272302648756855, + "grad_norm": 0.11319098621606827, + "learning_rate": 0.00020054377754329258, + "loss": 2.8326, + "step": 10810 + }, + { + "epoch": 0.6278104963880589, + "grad_norm": 0.11103735119104385, + "learning_rate": 0.00020000085157046902, + "loss": 2.8292, + "step": 10820 + }, + { + "epoch": 0.6283907279004323, + "grad_norm": 0.12254971265792847, + "learning_rate": 0.00019945829380498556, + "loss": 2.8379, + "step": 10830 + }, + { + "epoch": 0.6289709594128057, + "grad_norm": 0.1253294050693512, + "learning_rate": 0.00019891610624459674, + "loss": 2.8404, + "step": 10840 + }, + { + "epoch": 0.6295511909251792, + "grad_norm": 0.12701797485351562, + "learning_rate": 0.0001983742908856942, + "loss": 2.8331, + "step": 10850 + }, + { + "epoch": 0.6301314224375526, + "grad_norm": 0.1351822167634964, + "learning_rate": 0.00019783284972329845, + "loss": 2.831, + "step": 10860 + }, + { + "epoch": 0.630711653949926, + "grad_norm": 0.11504077911376953, + "learning_rate": 0.00019729178475105292, + "loss": 2.8397, + "step": 10870 + }, + { + "epoch": 0.6312918854622994, + "grad_norm": 0.11900710314512253, + "learning_rate": 0.00019675109796121523, + "loss": 2.8328, + "step": 10880 + }, + { + "epoch": 0.6318721169746729, + "grad_norm": 0.11879398673772812, + "learning_rate": 0.00019621079134465096, + "loss": 2.8275, + "step": 10890 + }, + { + "epoch": 0.6324523484870463, + "grad_norm": 0.11795203387737274, + "learning_rate": 0.00019567086689082562, + "loss": 2.828, + "step": 10900 + }, + { + "epoch": 0.6330325799994198, + "grad_norm": 0.1163572296500206, + "learning_rate": 0.00019513132658779758, + "loss": 2.8387, + "step": 10910 + }, + { + "epoch": 0.6336128115117932, + "grad_norm": 0.11812139302492142, + "learning_rate": 0.00019459217242221092, + "loss": 2.8336, + "step": 10920 + }, + { + "epoch": 0.6341930430241667, + "grad_norm": 0.11195320636034012, + "learning_rate": 0.00019405340637928755, + "loss": 2.8427, + "step": 10930 + }, + { + "epoch": 0.6347732745365401, + "grad_norm": 0.11674754321575165, + "learning_rate": 0.0001935150304428206, + "loss": 2.8279, + "step": 10940 + }, + { + "epoch": 0.6353535060489135, + "grad_norm": 0.11432943493127823, + "learning_rate": 0.00019297704659516655, + "loss": 2.8267, + "step": 10950 + }, + { + "epoch": 0.6359337375612869, + "grad_norm": 0.12507887184619904, + "learning_rate": 0.0001924394568172384, + "loss": 2.8309, + "step": 10960 + }, + { + "epoch": 0.6365139690736604, + "grad_norm": 0.12057894468307495, + "learning_rate": 0.0001919022630884981, + "loss": 2.8422, + "step": 10970 + }, + { + "epoch": 0.6370942005860338, + "grad_norm": 0.11377721279859543, + "learning_rate": 0.000191365467386949, + "loss": 2.8381, + "step": 10980 + }, + { + "epoch": 0.6376744320984072, + "grad_norm": 0.11800755560398102, + "learning_rate": 0.00019082907168912932, + "loss": 2.8331, + "step": 10990 + }, + { + "epoch": 0.6382546636107806, + "grad_norm": 0.12301038950681686, + "learning_rate": 0.00019029307797010402, + "loss": 2.831, + "step": 11000 + }, + { + "epoch": 0.6382546636107806, + "eval_loss": 2.796895742416382, + "eval_runtime": 3.2627, + "eval_samples_per_second": 1327.123, + "eval_steps_per_second": 2.758, + "step": 11000 + }, + { + "epoch": 0.6388348951231542, + "grad_norm": 0.1179603561758995, + "learning_rate": 0.00018975748820345838, + "loss": 2.8436, + "step": 11010 + }, + { + "epoch": 0.6394151266355276, + "grad_norm": 0.13155020773410797, + "learning_rate": 0.0001892223043612898, + "loss": 2.8317, + "step": 11020 + }, + { + "epoch": 0.639995358147901, + "grad_norm": 0.11468763649463654, + "learning_rate": 0.00018868752841420122, + "loss": 2.8284, + "step": 11030 + }, + { + "epoch": 0.6405755896602745, + "grad_norm": 0.10960279405117035, + "learning_rate": 0.00018815316233129393, + "loss": 2.8286, + "step": 11040 + }, + { + "epoch": 0.6411558211726479, + "grad_norm": 0.1298363208770752, + "learning_rate": 0.00018761920808015966, + "loss": 2.8326, + "step": 11050 + }, + { + "epoch": 0.6417360526850213, + "grad_norm": 0.11535240709781647, + "learning_rate": 0.00018708566762687403, + "loss": 2.8281, + "step": 11060 + }, + { + "epoch": 0.6423162841973947, + "grad_norm": 0.12528617680072784, + "learning_rate": 0.00018655254293598866, + "loss": 2.8179, + "step": 11070 + }, + { + "epoch": 0.6428965157097682, + "grad_norm": 0.11952237784862518, + "learning_rate": 0.00018601983597052468, + "loss": 2.8294, + "step": 11080 + }, + { + "epoch": 0.6434767472221417, + "grad_norm": 0.12121649086475372, + "learning_rate": 0.00018548754869196496, + "loss": 2.8336, + "step": 11090 + }, + { + "epoch": 0.6440569787345151, + "grad_norm": 0.12465447187423706, + "learning_rate": 0.00018495568306024687, + "loss": 2.8314, + "step": 11100 + }, + { + "epoch": 0.6446372102468885, + "grad_norm": 0.10858411341905594, + "learning_rate": 0.00018442424103375563, + "loss": 2.8191, + "step": 11110 + }, + { + "epoch": 0.645217441759262, + "grad_norm": 0.1240803673863411, + "learning_rate": 0.00018389322456931616, + "loss": 2.8334, + "step": 11120 + }, + { + "epoch": 0.6457976732716354, + "grad_norm": 0.11604313552379608, + "learning_rate": 0.00018336263562218695, + "loss": 2.8241, + "step": 11130 + }, + { + "epoch": 0.6463779047840088, + "grad_norm": 0.10764401406049728, + "learning_rate": 0.00018283247614605185, + "loss": 2.8343, + "step": 11140 + }, + { + "epoch": 0.6469581362963822, + "grad_norm": 0.11341771483421326, + "learning_rate": 0.00018230274809301377, + "loss": 2.8323, + "step": 11150 + }, + { + "epoch": 0.6475383678087557, + "grad_norm": 0.11618595570325851, + "learning_rate": 0.00018177345341358699, + "loss": 2.8295, + "step": 11160 + }, + { + "epoch": 0.6481185993211291, + "grad_norm": 0.11492364853620529, + "learning_rate": 0.00018124459405668967, + "loss": 2.8253, + "step": 11170 + }, + { + "epoch": 0.6486988308335025, + "grad_norm": 0.12541726231575012, + "learning_rate": 0.0001807161719696377, + "loss": 2.8305, + "step": 11180 + }, + { + "epoch": 0.649279062345876, + "grad_norm": 0.1240224838256836, + "learning_rate": 0.0001801881890981362, + "loss": 2.832, + "step": 11190 + }, + { + "epoch": 0.6498592938582495, + "grad_norm": 0.12260005623102188, + "learning_rate": 0.00017966064738627363, + "loss": 2.8274, + "step": 11200 + }, + { + "epoch": 0.6504395253706229, + "grad_norm": 0.11284399777650833, + "learning_rate": 0.00017913354877651386, + "loss": 2.8291, + "step": 11210 + }, + { + "epoch": 0.6510197568829963, + "grad_norm": 0.11993937194347382, + "learning_rate": 0.00017860689520968906, + "loss": 2.8357, + "step": 11220 + }, + { + "epoch": 0.6515999883953697, + "grad_norm": 0.11259515583515167, + "learning_rate": 0.00017808068862499302, + "loss": 2.8134, + "step": 11230 + }, + { + "epoch": 0.6521802199077432, + "grad_norm": 0.1146656796336174, + "learning_rate": 0.0001775549309599733, + "loss": 2.8275, + "step": 11240 + }, + { + "epoch": 0.6527604514201166, + "grad_norm": 0.11118417978286743, + "learning_rate": 0.0001770296241505248, + "loss": 2.8276, + "step": 11250 + }, + { + "epoch": 0.65334068293249, + "grad_norm": 0.1155654564499855, + "learning_rate": 0.00017650477013088218, + "loss": 2.8333, + "step": 11260 + }, + { + "epoch": 0.6539209144448636, + "grad_norm": 0.12370238453149796, + "learning_rate": 0.000175980370833613, + "loss": 2.8209, + "step": 11270 + }, + { + "epoch": 0.654501145957237, + "grad_norm": 0.11332956701517105, + "learning_rate": 0.00017545642818961045, + "loss": 2.824, + "step": 11280 + }, + { + "epoch": 0.6550813774696104, + "grad_norm": 0.11696597188711166, + "learning_rate": 0.00017493294412808603, + "loss": 2.8285, + "step": 11290 + }, + { + "epoch": 0.6556616089819838, + "grad_norm": 0.11556991934776306, + "learning_rate": 0.00017440992057656302, + "loss": 2.833, + "step": 11300 + }, + { + "epoch": 0.6562418404943573, + "grad_norm": 0.11072834581136703, + "learning_rate": 0.000173887359460869, + "loss": 2.8202, + "step": 11310 + }, + { + "epoch": 0.6568220720067307, + "grad_norm": 0.12139474600553513, + "learning_rate": 0.0001733652627051285, + "loss": 2.8323, + "step": 11320 + }, + { + "epoch": 0.6574023035191041, + "grad_norm": 0.11882605403661728, + "learning_rate": 0.0001728436322317567, + "loss": 2.8325, + "step": 11330 + }, + { + "epoch": 0.6579825350314775, + "grad_norm": 0.10851707309484482, + "learning_rate": 0.00017232246996145163, + "loss": 2.8304, + "step": 11340 + }, + { + "epoch": 0.658562766543851, + "grad_norm": 0.11566723883152008, + "learning_rate": 0.0001718017778131873, + "loss": 2.8359, + "step": 11350 + }, + { + "epoch": 0.6591429980562244, + "grad_norm": 0.1224483922123909, + "learning_rate": 0.00017128155770420673, + "loss": 2.8246, + "step": 11360 + }, + { + "epoch": 0.6597232295685979, + "grad_norm": 0.11472085118293762, + "learning_rate": 0.00017076181155001492, + "loss": 2.8274, + "step": 11370 + }, + { + "epoch": 0.6603034610809713, + "grad_norm": 0.11463634669780731, + "learning_rate": 0.00017024254126437149, + "loss": 2.8208, + "step": 11380 + }, + { + "epoch": 0.6608836925933448, + "grad_norm": 0.11640073359012604, + "learning_rate": 0.00016972374875928427, + "loss": 2.8351, + "step": 11390 + }, + { + "epoch": 0.6614639241057182, + "grad_norm": 0.12146312743425369, + "learning_rate": 0.00016920543594500147, + "loss": 2.8249, + "step": 11400 + }, + { + "epoch": 0.6620441556180916, + "grad_norm": 0.11683548241853714, + "learning_rate": 0.00016868760473000524, + "loss": 2.8281, + "step": 11410 + }, + { + "epoch": 0.662624387130465, + "grad_norm": 0.11443763226270676, + "learning_rate": 0.0001681702570210043, + "loss": 2.8239, + "step": 11420 + }, + { + "epoch": 0.6632046186428385, + "grad_norm": 0.1136617586016655, + "learning_rate": 0.00016765339472292714, + "loss": 2.827, + "step": 11430 + }, + { + "epoch": 0.6637848501552119, + "grad_norm": 0.11093004792928696, + "learning_rate": 0.00016713701973891472, + "loss": 2.8359, + "step": 11440 + }, + { + "epoch": 0.6643650816675853, + "grad_norm": 0.12110643088817596, + "learning_rate": 0.00016662113397031413, + "loss": 2.8164, + "step": 11450 + }, + { + "epoch": 0.6649453131799588, + "grad_norm": 0.12236957252025604, + "learning_rate": 0.00016610573931667065, + "loss": 2.8295, + "step": 11460 + }, + { + "epoch": 0.6655255446923323, + "grad_norm": 0.11643628776073456, + "learning_rate": 0.0001655908376757214, + "loss": 2.8199, + "step": 11470 + }, + { + "epoch": 0.6661057762047057, + "grad_norm": 0.12198419123888016, + "learning_rate": 0.00016507643094338818, + "loss": 2.8234, + "step": 11480 + }, + { + "epoch": 0.6666860077170791, + "grad_norm": 0.11697736382484436, + "learning_rate": 0.00016456252101377042, + "loss": 2.8309, + "step": 11490 + }, + { + "epoch": 0.6672662392294526, + "grad_norm": 0.11377154290676117, + "learning_rate": 0.00016404910977913824, + "loss": 2.8174, + "step": 11500 + }, + { + "epoch": 0.667846470741826, + "grad_norm": 0.1169874370098114, + "learning_rate": 0.0001635361991299258, + "loss": 2.8174, + "step": 11510 + }, + { + "epoch": 0.6684267022541994, + "grad_norm": 0.11022408306598663, + "learning_rate": 0.00016302379095472374, + "loss": 2.8251, + "step": 11520 + }, + { + "epoch": 0.6690069337665728, + "grad_norm": 0.11143022775650024, + "learning_rate": 0.00016251188714027265, + "loss": 2.832, + "step": 11530 + }, + { + "epoch": 0.6695871652789464, + "grad_norm": 0.11829391121864319, + "learning_rate": 0.00016200048957145597, + "loss": 2.8181, + "step": 11540 + }, + { + "epoch": 0.6701673967913198, + "grad_norm": 0.11668332666158676, + "learning_rate": 0.00016148960013129303, + "loss": 2.8163, + "step": 11550 + }, + { + "epoch": 0.6707476283036932, + "grad_norm": 0.11444656550884247, + "learning_rate": 0.0001609792207009325, + "loss": 2.8171, + "step": 11560 + }, + { + "epoch": 0.6713278598160666, + "grad_norm": 0.11538255959749222, + "learning_rate": 0.00016046935315964476, + "loss": 2.8192, + "step": 11570 + }, + { + "epoch": 0.6719080913284401, + "grad_norm": 0.13890443742275238, + "learning_rate": 0.0001599599993848155, + "loss": 2.814, + "step": 11580 + }, + { + "epoch": 0.6724883228408135, + "grad_norm": 0.10878733545541763, + "learning_rate": 0.00015945116125193876, + "loss": 2.8161, + "step": 11590 + }, + { + "epoch": 0.6730685543531869, + "grad_norm": 0.11337769776582718, + "learning_rate": 0.00015894284063460966, + "loss": 2.8161, + "step": 11600 + }, + { + "epoch": 0.6736487858655603, + "grad_norm": 0.1095629557967186, + "learning_rate": 0.00015843503940451834, + "loss": 2.8087, + "step": 11610 + }, + { + "epoch": 0.6742290173779338, + "grad_norm": 0.1378069370985031, + "learning_rate": 0.00015792775943144165, + "loss": 2.8151, + "step": 11620 + }, + { + "epoch": 0.6748092488903072, + "grad_norm": 0.1202809140086174, + "learning_rate": 0.00015742100258323794, + "loss": 2.831, + "step": 11630 + }, + { + "epoch": 0.6753894804026807, + "grad_norm": 0.12298610061407089, + "learning_rate": 0.00015691477072583894, + "loss": 2.8247, + "step": 11640 + }, + { + "epoch": 0.6759697119150541, + "grad_norm": 0.11947082728147507, + "learning_rate": 0.00015640906572324319, + "loss": 2.8238, + "step": 11650 + }, + { + "epoch": 0.6765499434274276, + "grad_norm": 0.11039472371339798, + "learning_rate": 0.00015590388943750988, + "loss": 2.8267, + "step": 11660 + }, + { + "epoch": 0.677130174939801, + "grad_norm": 0.11807908117771149, + "learning_rate": 0.0001553992437287505, + "loss": 2.8222, + "step": 11670 + }, + { + "epoch": 0.6777104064521744, + "grad_norm": 0.11934113502502441, + "learning_rate": 0.00015489513045512386, + "loss": 2.8193, + "step": 11680 + }, + { + "epoch": 0.6782906379645478, + "grad_norm": 0.11163033545017242, + "learning_rate": 0.00015439155147282764, + "loss": 2.8137, + "step": 11690 + }, + { + "epoch": 0.6788708694769213, + "grad_norm": 0.11381068080663681, + "learning_rate": 0.0001538885086360923, + "loss": 2.8202, + "step": 11700 + }, + { + "epoch": 0.6794511009892947, + "grad_norm": 0.11011006683111191, + "learning_rate": 0.0001533860037971747, + "loss": 2.8213, + "step": 11710 + }, + { + "epoch": 0.6800313325016681, + "grad_norm": 0.11611464619636536, + "learning_rate": 0.0001528840388063497, + "loss": 2.8216, + "step": 11720 + }, + { + "epoch": 0.6806115640140415, + "grad_norm": 0.10734301805496216, + "learning_rate": 0.0001523826155119055, + "loss": 2.8188, + "step": 11730 + }, + { + "epoch": 0.6811917955264151, + "grad_norm": 0.12189003825187683, + "learning_rate": 0.00015188173576013482, + "loss": 2.8206, + "step": 11740 + }, + { + "epoch": 0.6817720270387885, + "grad_norm": 0.11146776378154755, + "learning_rate": 0.0001513814013953296, + "loss": 2.8176, + "step": 11750 + }, + { + "epoch": 0.6823522585511619, + "grad_norm": 0.11531021445989609, + "learning_rate": 0.0001508816142597733, + "loss": 2.8192, + "step": 11760 + }, + { + "epoch": 0.6829324900635354, + "grad_norm": 0.11541693657636642, + "learning_rate": 0.00015038237619373443, + "loss": 2.8219, + "step": 11770 + }, + { + "epoch": 0.6835127215759088, + "grad_norm": 0.11345332115888596, + "learning_rate": 0.0001498836890354602, + "loss": 2.8024, + "step": 11780 + }, + { + "epoch": 0.6840929530882822, + "grad_norm": 0.10796009749174118, + "learning_rate": 0.00014938555462116842, + "loss": 2.8119, + "step": 11790 + }, + { + "epoch": 0.6846731846006556, + "grad_norm": 0.11463455855846405, + "learning_rate": 0.00014888797478504261, + "loss": 2.8119, + "step": 11800 + }, + { + "epoch": 0.6852534161130291, + "grad_norm": 0.11192594468593597, + "learning_rate": 0.00014839095135922372, + "loss": 2.8252, + "step": 11810 + }, + { + "epoch": 0.6858336476254026, + "grad_norm": 0.11805829405784607, + "learning_rate": 0.000147894486173804, + "loss": 2.8095, + "step": 11820 + }, + { + "epoch": 0.686413879137776, + "grad_norm": 0.11721805483102798, + "learning_rate": 0.00014739858105682053, + "loss": 2.8123, + "step": 11830 + }, + { + "epoch": 0.6869941106501494, + "grad_norm": 0.11619780957698822, + "learning_rate": 0.0001469032378342475, + "loss": 2.8177, + "step": 11840 + }, + { + "epoch": 0.6875743421625229, + "grad_norm": 0.10933215916156769, + "learning_rate": 0.00014640845832999087, + "loss": 2.8078, + "step": 11850 + }, + { + "epoch": 0.6881545736748963, + "grad_norm": 0.11362309753894806, + "learning_rate": 0.0001459142443658805, + "loss": 2.8103, + "step": 11860 + }, + { + "epoch": 0.6887348051872697, + "grad_norm": 0.10805781930685043, + "learning_rate": 0.00014542059776166382, + "loss": 2.8073, + "step": 11870 + }, + { + "epoch": 0.6893150366996431, + "grad_norm": 0.124758280813694, + "learning_rate": 0.00014492752033499977, + "loss": 2.8133, + "step": 11880 + }, + { + "epoch": 0.6898952682120166, + "grad_norm": 0.11096182465553284, + "learning_rate": 0.00014443501390145057, + "loss": 2.8061, + "step": 11890 + }, + { + "epoch": 0.69047549972439, + "grad_norm": 0.1132817193865776, + "learning_rate": 0.00014394308027447685, + "loss": 2.8209, + "step": 11900 + }, + { + "epoch": 0.6910557312367634, + "grad_norm": 0.10996360331773758, + "learning_rate": 0.00014345172126542966, + "loss": 2.8161, + "step": 11910 + }, + { + "epoch": 0.6916359627491369, + "grad_norm": 0.11297384649515152, + "learning_rate": 0.0001429609386835442, + "loss": 2.8116, + "step": 11920 + }, + { + "epoch": 0.6922161942615104, + "grad_norm": 0.12191120535135269, + "learning_rate": 0.00014247073433593373, + "loss": 2.8156, + "step": 11930 + }, + { + "epoch": 0.6927964257738838, + "grad_norm": 0.11631318181753159, + "learning_rate": 0.00014198111002758154, + "loss": 2.8225, + "step": 11940 + }, + { + "epoch": 0.6933766572862572, + "grad_norm": 0.14487071335315704, + "learning_rate": 0.00014149206756133595, + "loss": 2.8153, + "step": 11950 + }, + { + "epoch": 0.6939568887986306, + "grad_norm": 0.11780226230621338, + "learning_rate": 0.00014100360873790248, + "loss": 2.8163, + "step": 11960 + }, + { + "epoch": 0.6945371203110041, + "grad_norm": 0.11396613717079163, + "learning_rate": 0.00014051573535583766, + "loss": 2.8101, + "step": 11970 + }, + { + "epoch": 0.6951173518233775, + "grad_norm": 0.11514125019311905, + "learning_rate": 0.00014002844921154233, + "loss": 2.819, + "step": 11980 + }, + { + "epoch": 0.6956975833357509, + "grad_norm": 0.11687569320201874, + "learning_rate": 0.00013954175209925513, + "loss": 2.8106, + "step": 11990 + }, + { + "epoch": 0.6962778148481245, + "grad_norm": 0.11218845099210739, + "learning_rate": 0.00013905564581104607, + "loss": 2.8156, + "step": 12000 + }, + { + "epoch": 0.6962778148481245, + "eval_loss": 2.778130531311035, + "eval_runtime": 3.2555, + "eval_samples_per_second": 1330.053, + "eval_steps_per_second": 2.765, + "step": 12000 + }, + { + "epoch": 0.6968580463604979, + "grad_norm": 0.11513704061508179, + "learning_rate": 0.000138570132136809, + "loss": 2.8185, + "step": 12010 + }, + { + "epoch": 0.6974382778728713, + "grad_norm": 0.12384956330060959, + "learning_rate": 0.00013808521286425644, + "loss": 2.8159, + "step": 12020 + }, + { + "epoch": 0.6980185093852447, + "grad_norm": 0.11136494576931, + "learning_rate": 0.0001376008897789119, + "loss": 2.8196, + "step": 12030 + }, + { + "epoch": 0.6985987408976182, + "grad_norm": 0.11704517900943756, + "learning_rate": 0.00013711716466410353, + "loss": 2.8118, + "step": 12040 + }, + { + "epoch": 0.6991789724099916, + "grad_norm": 0.11521551758050919, + "learning_rate": 0.00013663403930095827, + "loss": 2.8131, + "step": 12050 + }, + { + "epoch": 0.699759203922365, + "grad_norm": 0.10568945109844208, + "learning_rate": 0.00013615151546839382, + "loss": 2.8098, + "step": 12060 + }, + { + "epoch": 0.7003394354347384, + "grad_norm": 0.1213884949684143, + "learning_rate": 0.00013566959494311386, + "loss": 2.8091, + "step": 12070 + }, + { + "epoch": 0.7009196669471119, + "grad_norm": 0.11004059761762619, + "learning_rate": 0.00013518827949960015, + "loss": 2.8238, + "step": 12080 + }, + { + "epoch": 0.7014998984594853, + "grad_norm": 0.11095508933067322, + "learning_rate": 0.00013470757091010649, + "loss": 2.8116, + "step": 12090 + }, + { + "epoch": 0.7020801299718588, + "grad_norm": 0.11275944113731384, + "learning_rate": 0.00013422747094465234, + "loss": 2.8109, + "step": 12100 + }, + { + "epoch": 0.7026603614842322, + "grad_norm": 0.11312493681907654, + "learning_rate": 0.00013374798137101595, + "loss": 2.814, + "step": 12110 + }, + { + "epoch": 0.7032405929966057, + "grad_norm": 0.10738647729158401, + "learning_rate": 0.00013326910395472833, + "loss": 2.8111, + "step": 12120 + }, + { + "epoch": 0.7038208245089791, + "grad_norm": 0.11198966205120087, + "learning_rate": 0.00013279084045906623, + "loss": 2.806, + "step": 12130 + }, + { + "epoch": 0.7044010560213525, + "grad_norm": 0.11718153953552246, + "learning_rate": 0.00013231319264504594, + "loss": 2.8186, + "step": 12140 + }, + { + "epoch": 0.7049812875337259, + "grad_norm": 0.11054380983114243, + "learning_rate": 0.00013183616227141674, + "loss": 2.8144, + "step": 12150 + }, + { + "epoch": 0.7055615190460994, + "grad_norm": 0.11579257249832153, + "learning_rate": 0.0001313597510946543, + "loss": 2.8101, + "step": 12160 + }, + { + "epoch": 0.7061417505584728, + "grad_norm": 0.10710903257131577, + "learning_rate": 0.00013088396086895476, + "loss": 2.8104, + "step": 12170 + }, + { + "epoch": 0.7067219820708462, + "grad_norm": 0.11220473051071167, + "learning_rate": 0.00013040879334622738, + "loss": 2.8049, + "step": 12180 + }, + { + "epoch": 0.7073022135832197, + "grad_norm": 0.10872667282819748, + "learning_rate": 0.00012993425027608884, + "loss": 2.8175, + "step": 12190 + }, + { + "epoch": 0.7078824450955932, + "grad_norm": 0.10861840099096298, + "learning_rate": 0.00012946033340585641, + "loss": 2.8072, + "step": 12200 + }, + { + "epoch": 0.7084626766079666, + "grad_norm": 0.11558268964290619, + "learning_rate": 0.00012898704448054162, + "loss": 2.8034, + "step": 12210 + }, + { + "epoch": 0.70904290812034, + "grad_norm": 0.11709378659725189, + "learning_rate": 0.00012851438524284382, + "loss": 2.8047, + "step": 12220 + }, + { + "epoch": 0.7096231396327135, + "grad_norm": 0.12139759957790375, + "learning_rate": 0.00012804235743314401, + "loss": 2.8056, + "step": 12230 + }, + { + "epoch": 0.7102033711450869, + "grad_norm": 0.11130308359861374, + "learning_rate": 0.00012757096278949792, + "loss": 2.8138, + "step": 12240 + }, + { + "epoch": 0.7107836026574603, + "grad_norm": 0.1112653836607933, + "learning_rate": 0.00012710020304763003, + "loss": 2.8004, + "step": 12250 + }, + { + "epoch": 0.7113638341698337, + "grad_norm": 0.11182957142591476, + "learning_rate": 0.00012663007994092703, + "loss": 2.8064, + "step": 12260 + }, + { + "epoch": 0.7119440656822072, + "grad_norm": 0.13386094570159912, + "learning_rate": 0.00012616059520043145, + "loss": 2.8148, + "step": 12270 + }, + { + "epoch": 0.7125242971945807, + "grad_norm": 0.11641652137041092, + "learning_rate": 0.0001256917505548352, + "loss": 2.8102, + "step": 12280 + }, + { + "epoch": 0.7131045287069541, + "grad_norm": 0.10916447639465332, + "learning_rate": 0.00012522354773047352, + "loss": 2.8148, + "step": 12290 + }, + { + "epoch": 0.7136847602193275, + "grad_norm": 0.10887318104505539, + "learning_rate": 0.0001247559884513182, + "loss": 2.8047, + "step": 12300 + }, + { + "epoch": 0.714264991731701, + "grad_norm": 0.11701834946870804, + "learning_rate": 0.0001242890744389715, + "loss": 2.8144, + "step": 12310 + }, + { + "epoch": 0.7148452232440744, + "grad_norm": 0.10473381727933884, + "learning_rate": 0.00012382280741265968, + "loss": 2.8057, + "step": 12320 + }, + { + "epoch": 0.7154254547564478, + "grad_norm": 0.10586260259151459, + "learning_rate": 0.00012335718908922685, + "loss": 2.8032, + "step": 12330 + }, + { + "epoch": 0.7160056862688212, + "grad_norm": 0.10688824206590652, + "learning_rate": 0.00012289222118312822, + "loss": 2.8054, + "step": 12340 + }, + { + "epoch": 0.7165859177811947, + "grad_norm": 0.11233460903167725, + "learning_rate": 0.0001224279054064247, + "loss": 2.801, + "step": 12350 + }, + { + "epoch": 0.7171661492935681, + "grad_norm": 0.10600557923316956, + "learning_rate": 0.00012196424346877541, + "loss": 2.8035, + "step": 12360 + }, + { + "epoch": 0.7177463808059416, + "grad_norm": 0.11300963163375854, + "learning_rate": 0.00012150123707743219, + "loss": 2.8098, + "step": 12370 + }, + { + "epoch": 0.718326612318315, + "grad_norm": 0.11773265898227692, + "learning_rate": 0.00012103888793723312, + "loss": 2.8103, + "step": 12380 + }, + { + "epoch": 0.7189068438306885, + "grad_norm": 0.11092250049114227, + "learning_rate": 0.00012057719775059602, + "loss": 2.8028, + "step": 12390 + }, + { + "epoch": 0.7194870753430619, + "grad_norm": 0.10554751008749008, + "learning_rate": 0.00012011616821751271, + "loss": 2.8044, + "step": 12400 + }, + { + "epoch": 0.7200673068554353, + "grad_norm": 0.1148175522685051, + "learning_rate": 0.0001196558010355422, + "loss": 2.8099, + "step": 12410 + }, + { + "epoch": 0.7206475383678087, + "grad_norm": 0.10981535166501999, + "learning_rate": 0.00011919609789980458, + "loss": 2.7991, + "step": 12420 + }, + { + "epoch": 0.7212277698801822, + "grad_norm": 0.11188452690839767, + "learning_rate": 0.00011873706050297508, + "loss": 2.8067, + "step": 12430 + }, + { + "epoch": 0.7218080013925556, + "grad_norm": 0.11328940838575363, + "learning_rate": 0.00011827869053527727, + "loss": 2.8049, + "step": 12440 + }, + { + "epoch": 0.722388232904929, + "grad_norm": 0.11542364954948425, + "learning_rate": 0.00011782098968447774, + "loss": 2.7988, + "step": 12450 + }, + { + "epoch": 0.7229684644173026, + "grad_norm": 0.11087549477815628, + "learning_rate": 0.00011736395963587857, + "loss": 2.8102, + "step": 12460 + }, + { + "epoch": 0.723548695929676, + "grad_norm": 0.11298040300607681, + "learning_rate": 0.00011690760207231256, + "loss": 2.8063, + "step": 12470 + }, + { + "epoch": 0.7241289274420494, + "grad_norm": 0.10775293409824371, + "learning_rate": 0.00011645191867413596, + "loss": 2.8065, + "step": 12480 + }, + { + "epoch": 0.7247091589544228, + "grad_norm": 0.11240221560001373, + "learning_rate": 0.00011599691111922272, + "loss": 2.8062, + "step": 12490 + }, + { + "epoch": 0.7252893904667963, + "grad_norm": 0.1069854348897934, + "learning_rate": 0.00011554258108295859, + "loss": 2.79, + "step": 12500 + }, + { + "epoch": 0.7258696219791697, + "grad_norm": 0.11566832661628723, + "learning_rate": 0.00011508893023823393, + "loss": 2.7977, + "step": 12510 + }, + { + "epoch": 0.7264498534915431, + "grad_norm": 0.11771980673074722, + "learning_rate": 0.00011463596025543905, + "loss": 2.803, + "step": 12520 + }, + { + "epoch": 0.7270300850039165, + "grad_norm": 0.11435101926326752, + "learning_rate": 0.0001141836728024567, + "loss": 2.7985, + "step": 12530 + }, + { + "epoch": 0.72761031651629, + "grad_norm": 0.10902056097984314, + "learning_rate": 0.0001137320695446566, + "loss": 2.8096, + "step": 12540 + }, + { + "epoch": 0.7281905480286635, + "grad_norm": 0.10939980298280716, + "learning_rate": 0.0001132811521448896, + "loss": 2.8121, + "step": 12550 + }, + { + "epoch": 0.7287707795410369, + "grad_norm": 0.10922636091709137, + "learning_rate": 0.00011283092226348031, + "loss": 2.8093, + "step": 12560 + }, + { + "epoch": 0.7293510110534103, + "grad_norm": 0.10520195960998535, + "learning_rate": 0.00011238138155822275, + "loss": 2.8031, + "step": 12570 + }, + { + "epoch": 0.7299312425657838, + "grad_norm": 0.10655706375837326, + "learning_rate": 0.00011193253168437253, + "loss": 2.8083, + "step": 12580 + }, + { + "epoch": 0.7305114740781572, + "grad_norm": 0.11627507954835892, + "learning_rate": 0.00011148437429464215, + "loss": 2.7994, + "step": 12590 + }, + { + "epoch": 0.7310917055905306, + "grad_norm": 0.1093965470790863, + "learning_rate": 0.00011103691103919401, + "loss": 2.8054, + "step": 12600 + }, + { + "epoch": 0.731671937102904, + "grad_norm": 0.113887257874012, + "learning_rate": 0.00011059014356563458, + "loss": 2.7963, + "step": 12610 + }, + { + "epoch": 0.7322521686152775, + "grad_norm": 0.10929399728775024, + "learning_rate": 0.00011014407351900879, + "loss": 2.8033, + "step": 12620 + }, + { + "epoch": 0.7328324001276509, + "grad_norm": 0.11176785826683044, + "learning_rate": 0.00010969870254179285, + "loss": 2.8061, + "step": 12630 + }, + { + "epoch": 0.7334126316400243, + "grad_norm": 0.10631275177001953, + "learning_rate": 0.00010925403227388973, + "loss": 2.8107, + "step": 12640 + }, + { + "epoch": 0.7339928631523978, + "grad_norm": 0.11108485609292984, + "learning_rate": 0.00010881006435262179, + "loss": 2.8059, + "step": 12650 + }, + { + "epoch": 0.7345730946647713, + "grad_norm": 0.10749488323926926, + "learning_rate": 0.00010836680041272536, + "loss": 2.8004, + "step": 12660 + }, + { + "epoch": 0.7351533261771447, + "grad_norm": 0.10994744300842285, + "learning_rate": 0.00010792424208634495, + "loss": 2.8093, + "step": 12670 + }, + { + "epoch": 0.7357335576895181, + "grad_norm": 0.10910103470087051, + "learning_rate": 0.00010748239100302627, + "loss": 2.7928, + "step": 12680 + }, + { + "epoch": 0.7363137892018915, + "grad_norm": 0.10835743695497513, + "learning_rate": 0.0001070412487897117, + "loss": 2.8077, + "step": 12690 + }, + { + "epoch": 0.736894020714265, + "grad_norm": 0.10580655187368393, + "learning_rate": 0.00010660081707073288, + "loss": 2.7991, + "step": 12700 + }, + { + "epoch": 0.7374742522266384, + "grad_norm": 0.10928157716989517, + "learning_rate": 0.00010616109746780546, + "loss": 2.7905, + "step": 12710 + }, + { + "epoch": 0.7380544837390118, + "grad_norm": 0.10654684156179428, + "learning_rate": 0.00010572209160002339, + "loss": 2.8021, + "step": 12720 + }, + { + "epoch": 0.7386347152513854, + "grad_norm": 0.10834140330553055, + "learning_rate": 0.00010528380108385186, + "loss": 2.805, + "step": 12730 + }, + { + "epoch": 0.7392149467637588, + "grad_norm": 0.1152142882347107, + "learning_rate": 0.00010484622753312279, + "loss": 2.7916, + "step": 12740 + }, + { + "epoch": 0.7397951782761322, + "grad_norm": 0.10981319844722748, + "learning_rate": 0.0001044093725590277, + "loss": 2.8029, + "step": 12750 + }, + { + "epoch": 0.7403754097885056, + "grad_norm": 0.1065368577837944, + "learning_rate": 0.00010397323777011229, + "loss": 2.8048, + "step": 12760 + }, + { + "epoch": 0.7409556413008791, + "grad_norm": 0.10563939809799194, + "learning_rate": 0.00010353782477227083, + "loss": 2.8058, + "step": 12770 + }, + { + "epoch": 0.7415358728132525, + "grad_norm": 0.11117275804281235, + "learning_rate": 0.00010310313516873922, + "loss": 2.7985, + "step": 12780 + }, + { + "epoch": 0.7421161043256259, + "grad_norm": 0.11544723808765411, + "learning_rate": 0.00010266917056009036, + "loss": 2.8001, + "step": 12790 + }, + { + "epoch": 0.7426963358379993, + "grad_norm": 0.11005005240440369, + "learning_rate": 0.00010223593254422733, + "loss": 2.7954, + "step": 12800 + }, + { + "epoch": 0.7432765673503728, + "grad_norm": 0.11374104768037796, + "learning_rate": 0.0001018034227163779, + "loss": 2.8053, + "step": 12810 + }, + { + "epoch": 0.7438567988627462, + "grad_norm": 0.11264318227767944, + "learning_rate": 0.00010137164266908854, + "loss": 2.8029, + "step": 12820 + }, + { + "epoch": 0.7444370303751197, + "grad_norm": 0.10718287527561188, + "learning_rate": 0.00010094059399221855, + "loss": 2.7964, + "step": 12830 + }, + { + "epoch": 0.7450172618874931, + "grad_norm": 0.11395127326250076, + "learning_rate": 0.00010051027827293457, + "loss": 2.8057, + "step": 12840 + }, + { + "epoch": 0.7455974933998666, + "grad_norm": 0.11251317709684372, + "learning_rate": 0.00010008069709570378, + "loss": 2.8036, + "step": 12850 + }, + { + "epoch": 0.74617772491224, + "grad_norm": 0.1180030032992363, + "learning_rate": 9.965185204228941e-05, + "loss": 2.8016, + "step": 12860 + }, + { + "epoch": 0.7467579564246134, + "grad_norm": 0.12361141294240952, + "learning_rate": 9.922374469174372e-05, + "loss": 2.7891, + "step": 12870 + }, + { + "epoch": 0.7473381879369868, + "grad_norm": 0.11456003040075302, + "learning_rate": 9.879637662040275e-05, + "loss": 2.8028, + "step": 12880 + }, + { + "epoch": 0.7479184194493603, + "grad_norm": 0.11008987575769424, + "learning_rate": 9.83697494018808e-05, + "loss": 2.8093, + "step": 12890 + }, + { + "epoch": 0.7484986509617337, + "grad_norm": 0.11017616838216782, + "learning_rate": 9.794386460706356e-05, + "loss": 2.8005, + "step": 12900 + }, + { + "epoch": 0.7490788824741071, + "grad_norm": 0.11627316474914551, + "learning_rate": 9.751872380410378e-05, + "loss": 2.799, + "step": 12910 + }, + { + "epoch": 0.7496591139864806, + "grad_norm": 0.11369270831346512, + "learning_rate": 9.709432855841436e-05, + "loss": 2.7941, + "step": 12920 + }, + { + "epoch": 0.7502393454988541, + "grad_norm": 0.10983362793922424, + "learning_rate": 9.667068043266302e-05, + "loss": 2.7996, + "step": 12930 + }, + { + "epoch": 0.7508195770112275, + "grad_norm": 0.10419350117444992, + "learning_rate": 9.624778098676652e-05, + "loss": 2.8052, + "step": 12940 + }, + { + "epoch": 0.7513998085236009, + "grad_norm": 0.10500075668096542, + "learning_rate": 9.582563177788487e-05, + "loss": 2.7993, + "step": 12950 + }, + { + "epoch": 0.7519800400359744, + "grad_norm": 0.10765775293111801, + "learning_rate": 9.540423436041585e-05, + "loss": 2.7964, + "step": 12960 + }, + { + "epoch": 0.7525602715483478, + "grad_norm": 0.10872151702642441, + "learning_rate": 9.49835902859888e-05, + "loss": 2.7876, + "step": 12970 + }, + { + "epoch": 0.7531405030607212, + "grad_norm": 0.10935165733098984, + "learning_rate": 9.456370110345927e-05, + "loss": 2.8003, + "step": 12980 + }, + { + "epoch": 0.7537207345730946, + "grad_norm": 0.1083398386836052, + "learning_rate": 9.414456835890322e-05, + "loss": 2.7945, + "step": 12990 + }, + { + "epoch": 0.7543009660854681, + "grad_norm": 0.10846253484487534, + "learning_rate": 9.372619359561121e-05, + "loss": 2.799, + "step": 13000 + }, + { + "epoch": 0.7543009660854681, + "eval_loss": 2.7616169452667236, + "eval_runtime": 3.2768, + "eval_samples_per_second": 1321.408, + "eval_steps_per_second": 2.747, + "step": 13000 + }, + { + "epoch": 0.7548811975978416, + "grad_norm": 0.10937865823507309, + "learning_rate": 9.330857835408318e-05, + "loss": 2.7962, + "step": 13010 + }, + { + "epoch": 0.755461429110215, + "grad_norm": 0.10633205622434616, + "learning_rate": 9.289172417202205e-05, + "loss": 2.7989, + "step": 13020 + }, + { + "epoch": 0.7560416606225884, + "grad_norm": 0.11001235246658325, + "learning_rate": 9.247563258432861e-05, + "loss": 2.7955, + "step": 13030 + }, + { + "epoch": 0.7566218921349619, + "grad_norm": 0.10847952216863632, + "learning_rate": 9.206030512309566e-05, + "loss": 2.7959, + "step": 13040 + }, + { + "epoch": 0.7572021236473353, + "grad_norm": 0.10858704149723053, + "learning_rate": 9.164574331760246e-05, + "loss": 2.7965, + "step": 13050 + }, + { + "epoch": 0.7577823551597087, + "grad_norm": 0.10710106790065765, + "learning_rate": 9.123194869430888e-05, + "loss": 2.7921, + "step": 13060 + }, + { + "epoch": 0.7583625866720821, + "grad_norm": 0.10932508111000061, + "learning_rate": 9.081892277685026e-05, + "loss": 2.7921, + "step": 13070 + }, + { + "epoch": 0.7589428181844556, + "grad_norm": 0.11362321674823761, + "learning_rate": 9.040666708603125e-05, + "loss": 2.7981, + "step": 13080 + }, + { + "epoch": 0.759523049696829, + "grad_norm": 0.10791613906621933, + "learning_rate": 8.999518313982039e-05, + "loss": 2.7993, + "step": 13090 + }, + { + "epoch": 0.7601032812092025, + "grad_norm": 0.11038652807474136, + "learning_rate": 8.958447245334476e-05, + "loss": 2.7922, + "step": 13100 + }, + { + "epoch": 0.7606835127215759, + "grad_norm": 0.11153964698314667, + "learning_rate": 8.91745365388841e-05, + "loss": 2.8016, + "step": 13110 + }, + { + "epoch": 0.7612637442339494, + "grad_norm": 0.10748942941427231, + "learning_rate": 8.876537690586529e-05, + "loss": 2.791, + "step": 13120 + }, + { + "epoch": 0.7618439757463228, + "grad_norm": 0.1106482520699501, + "learning_rate": 8.83569950608572e-05, + "loss": 2.8008, + "step": 13130 + }, + { + "epoch": 0.7624242072586962, + "grad_norm": 0.10443028807640076, + "learning_rate": 8.794939250756441e-05, + "loss": 2.7936, + "step": 13140 + }, + { + "epoch": 0.7630044387710696, + "grad_norm": 0.11383570730686188, + "learning_rate": 8.754257074682222e-05, + "loss": 2.7912, + "step": 13150 + }, + { + "epoch": 0.7635846702834431, + "grad_norm": 0.10836578160524368, + "learning_rate": 8.713653127659105e-05, + "loss": 2.7939, + "step": 13160 + }, + { + "epoch": 0.7641649017958165, + "grad_norm": 0.10870825499296188, + "learning_rate": 8.673127559195066e-05, + "loss": 2.7991, + "step": 13170 + }, + { + "epoch": 0.7647451333081899, + "grad_norm": 0.10718671977519989, + "learning_rate": 8.632680518509492e-05, + "loss": 2.7879, + "step": 13180 + }, + { + "epoch": 0.7653253648205635, + "grad_norm": 0.11277935653924942, + "learning_rate": 8.592312154532637e-05, + "loss": 2.7947, + "step": 13190 + }, + { + "epoch": 0.7659055963329369, + "grad_norm": 0.11088382452726364, + "learning_rate": 8.552022615905038e-05, + "loss": 2.7996, + "step": 13200 + }, + { + "epoch": 0.7664858278453103, + "grad_norm": 0.10912182927131653, + "learning_rate": 8.511812050977003e-05, + "loss": 2.7943, + "step": 13210 + }, + { + "epoch": 0.7670660593576837, + "grad_norm": 0.10919041931629181, + "learning_rate": 8.471680607808035e-05, + "loss": 2.7992, + "step": 13220 + }, + { + "epoch": 0.7676462908700572, + "grad_norm": 0.10616286844015121, + "learning_rate": 8.431628434166309e-05, + "loss": 2.7977, + "step": 13230 + }, + { + "epoch": 0.7682265223824306, + "grad_norm": 0.10572168231010437, + "learning_rate": 8.391655677528143e-05, + "loss": 2.7959, + "step": 13240 + }, + { + "epoch": 0.768806753894804, + "grad_norm": 0.10937794297933578, + "learning_rate": 8.3517624850774e-05, + "loss": 2.793, + "step": 13250 + }, + { + "epoch": 0.7693869854071774, + "grad_norm": 0.10820769518613815, + "learning_rate": 8.311949003704996e-05, + "loss": 2.7991, + "step": 13260 + }, + { + "epoch": 0.769967216919551, + "grad_norm": 0.10802992433309555, + "learning_rate": 8.272215380008343e-05, + "loss": 2.7965, + "step": 13270 + }, + { + "epoch": 0.7705474484319244, + "grad_norm": 0.10747858881950378, + "learning_rate": 8.232561760290794e-05, + "loss": 2.7957, + "step": 13280 + }, + { + "epoch": 0.7711276799442978, + "grad_norm": 0.11238089948892593, + "learning_rate": 8.192988290561157e-05, + "loss": 2.7922, + "step": 13290 + }, + { + "epoch": 0.7717079114566712, + "grad_norm": 0.1034981980919838, + "learning_rate": 8.153495116533056e-05, + "loss": 2.789, + "step": 13300 + }, + { + "epoch": 0.7722881429690447, + "grad_norm": 0.10910629481077194, + "learning_rate": 8.11408238362453e-05, + "loss": 2.7899, + "step": 13310 + }, + { + "epoch": 0.7728683744814181, + "grad_norm": 0.11309719830751419, + "learning_rate": 8.07475023695737e-05, + "loss": 2.7978, + "step": 13320 + }, + { + "epoch": 0.7734486059937915, + "grad_norm": 0.10908596217632294, + "learning_rate": 8.035498821356664e-05, + "loss": 2.7938, + "step": 13330 + }, + { + "epoch": 0.7740288375061649, + "grad_norm": 0.11714279651641846, + "learning_rate": 7.996328281350252e-05, + "loss": 2.7967, + "step": 13340 + }, + { + "epoch": 0.7746090690185384, + "grad_norm": 0.10943669080734253, + "learning_rate": 7.957238761168135e-05, + "loss": 2.7803, + "step": 13350 + }, + { + "epoch": 0.7751893005309118, + "grad_norm": 0.11171719431877136, + "learning_rate": 7.918230404742045e-05, + "loss": 2.7941, + "step": 13360 + }, + { + "epoch": 0.7757695320432852, + "grad_norm": 0.10363152623176575, + "learning_rate": 7.879303355704834e-05, + "loss": 2.8043, + "step": 13370 + }, + { + "epoch": 0.7763497635556587, + "grad_norm": 0.1147744432091713, + "learning_rate": 7.840457757389968e-05, + "loss": 2.8022, + "step": 13380 + }, + { + "epoch": 0.7769299950680322, + "grad_norm": 0.10682083666324615, + "learning_rate": 7.801693752831012e-05, + "loss": 2.7914, + "step": 13390 + }, + { + "epoch": 0.7775102265804056, + "grad_norm": 0.11352023482322693, + "learning_rate": 7.763011484761082e-05, + "loss": 2.7958, + "step": 13400 + }, + { + "epoch": 0.778090458092779, + "grad_norm": 0.10785870254039764, + "learning_rate": 7.724411095612366e-05, + "loss": 2.7971, + "step": 13410 + }, + { + "epoch": 0.7786706896051525, + "grad_norm": 0.10762759298086166, + "learning_rate": 7.68589272751551e-05, + "loss": 2.7916, + "step": 13420 + }, + { + "epoch": 0.7792509211175259, + "grad_norm": 0.10556434839963913, + "learning_rate": 7.647456522299207e-05, + "loss": 2.784, + "step": 13430 + }, + { + "epoch": 0.7798311526298993, + "grad_norm": 0.1077750101685524, + "learning_rate": 7.609102621489577e-05, + "loss": 2.7906, + "step": 13440 + }, + { + "epoch": 0.7804113841422727, + "grad_norm": 0.10472170263528824, + "learning_rate": 7.570831166309693e-05, + "loss": 2.7833, + "step": 13450 + }, + { + "epoch": 0.7809916156546463, + "grad_norm": 0.1061674952507019, + "learning_rate": 7.532642297679093e-05, + "loss": 2.796, + "step": 13460 + }, + { + "epoch": 0.7815718471670197, + "grad_norm": 0.10716653615236282, + "learning_rate": 7.494536156213151e-05, + "loss": 2.791, + "step": 13470 + }, + { + "epoch": 0.7821520786793931, + "grad_norm": 0.11008104681968689, + "learning_rate": 7.456512882222703e-05, + "loss": 2.7874, + "step": 13480 + }, + { + "epoch": 0.7827323101917665, + "grad_norm": 0.11095033586025238, + "learning_rate": 7.418572615713413e-05, + "loss": 2.7874, + "step": 13490 + }, + { + "epoch": 0.78331254170414, + "grad_norm": 0.10690274834632874, + "learning_rate": 7.380715496385316e-05, + "loss": 2.7897, + "step": 13500 + }, + { + "epoch": 0.7838927732165134, + "grad_norm": 0.10463336110115051, + "learning_rate": 7.34294166363231e-05, + "loss": 2.7965, + "step": 13510 + }, + { + "epoch": 0.7844730047288868, + "grad_norm": 0.10628803819417953, + "learning_rate": 7.30525125654157e-05, + "loss": 2.7878, + "step": 13520 + }, + { + "epoch": 0.7850532362412602, + "grad_norm": 0.10758186876773834, + "learning_rate": 7.267644413893152e-05, + "loss": 2.7893, + "step": 13530 + }, + { + "epoch": 0.7856334677536337, + "grad_norm": 0.10785481333732605, + "learning_rate": 7.230121274159384e-05, + "loss": 2.7896, + "step": 13540 + }, + { + "epoch": 0.7862136992660071, + "grad_norm": 0.10700030624866486, + "learning_rate": 7.192681975504382e-05, + "loss": 2.786, + "step": 13550 + }, + { + "epoch": 0.7867939307783806, + "grad_norm": 0.10182949900627136, + "learning_rate": 7.155326655783597e-05, + "loss": 2.7889, + "step": 13560 + }, + { + "epoch": 0.787374162290754, + "grad_norm": 0.10802864283323288, + "learning_rate": 7.118055452543193e-05, + "loss": 2.7946, + "step": 13570 + }, + { + "epoch": 0.7879543938031275, + "grad_norm": 0.10849913954734802, + "learning_rate": 7.080868503019672e-05, + "loss": 2.786, + "step": 13580 + }, + { + "epoch": 0.7885346253155009, + "grad_norm": 0.10770730674266815, + "learning_rate": 7.043765944139264e-05, + "loss": 2.7804, + "step": 13590 + }, + { + "epoch": 0.7891148568278743, + "grad_norm": 0.11441770195960999, + "learning_rate": 7.006747912517475e-05, + "loss": 2.79, + "step": 13600 + }, + { + "epoch": 0.7896950883402477, + "grad_norm": 0.10908571630716324, + "learning_rate": 6.9698145444586e-05, + "loss": 2.7897, + "step": 13610 + }, + { + "epoch": 0.7902753198526212, + "grad_norm": 0.10705877095460892, + "learning_rate": 6.932965975955134e-05, + "loss": 2.7857, + "step": 13620 + }, + { + "epoch": 0.7908555513649946, + "grad_norm": 0.11635982990264893, + "learning_rate": 6.896202342687397e-05, + "loss": 2.7888, + "step": 13630 + }, + { + "epoch": 0.791435782877368, + "grad_norm": 0.1107436865568161, + "learning_rate": 6.859523780022911e-05, + "loss": 2.7902, + "step": 13640 + }, + { + "epoch": 0.7920160143897415, + "grad_norm": 0.11131720244884491, + "learning_rate": 6.822930423016003e-05, + "loss": 2.7982, + "step": 13650 + }, + { + "epoch": 0.792596245902115, + "grad_norm": 0.10535065829753876, + "learning_rate": 6.786422406407247e-05, + "loss": 2.7838, + "step": 13660 + }, + { + "epoch": 0.7931764774144884, + "grad_norm": 0.10784085094928741, + "learning_rate": 6.749999864622973e-05, + "loss": 2.7778, + "step": 13670 + }, + { + "epoch": 0.7937567089268618, + "grad_norm": 0.10266363620758057, + "learning_rate": 6.713662931774818e-05, + "loss": 2.7929, + "step": 13680 + }, + { + "epoch": 0.7943369404392353, + "grad_norm": 0.11121921241283417, + "learning_rate": 6.677411741659145e-05, + "loss": 2.787, + "step": 13690 + }, + { + "epoch": 0.7949171719516087, + "grad_norm": 0.10687406361103058, + "learning_rate": 6.641246427756657e-05, + "loss": 2.7915, + "step": 13700 + }, + { + "epoch": 0.7954974034639821, + "grad_norm": 0.10604474693536758, + "learning_rate": 6.605167123231822e-05, + "loss": 2.7816, + "step": 13710 + }, + { + "epoch": 0.7960776349763555, + "grad_norm": 0.10484491288661957, + "learning_rate": 6.569173960932404e-05, + "loss": 2.7844, + "step": 13720 + }, + { + "epoch": 0.796657866488729, + "grad_norm": 0.10788851231336594, + "learning_rate": 6.533267073389034e-05, + "loss": 2.7815, + "step": 13730 + }, + { + "epoch": 0.7972380980011025, + "grad_norm": 0.10421809554100037, + "learning_rate": 6.49744659281459e-05, + "loss": 2.7953, + "step": 13740 + }, + { + "epoch": 0.7978183295134759, + "grad_norm": 0.10567434132099152, + "learning_rate": 6.461712651103859e-05, + "loss": 2.7898, + "step": 13750 + }, + { + "epoch": 0.7983985610258493, + "grad_norm": 0.10381162911653519, + "learning_rate": 6.426065379832959e-05, + "loss": 2.7902, + "step": 13760 + }, + { + "epoch": 0.7989787925382228, + "grad_norm": 0.10707089304924011, + "learning_rate": 6.390504910258867e-05, + "loss": 2.7923, + "step": 13770 + }, + { + "epoch": 0.7995590240505962, + "grad_norm": 0.10568366944789886, + "learning_rate": 6.355031373318961e-05, + "loss": 2.793, + "step": 13780 + }, + { + "epoch": 0.8001392555629696, + "grad_norm": 0.10662976652383804, + "learning_rate": 6.319644899630514e-05, + "loss": 2.7954, + "step": 13790 + }, + { + "epoch": 0.800719487075343, + "grad_norm": 0.10822783410549164, + "learning_rate": 6.28434561949024e-05, + "loss": 2.7875, + "step": 13800 + }, + { + "epoch": 0.8012997185877165, + "grad_norm": 0.10903995484113693, + "learning_rate": 6.249133662873783e-05, + "loss": 2.7952, + "step": 13810 + }, + { + "epoch": 0.8018799501000899, + "grad_norm": 0.11016574501991272, + "learning_rate": 6.214009159435254e-05, + "loss": 2.7833, + "step": 13820 + }, + { + "epoch": 0.8024601816124634, + "grad_norm": 0.10669629275798798, + "learning_rate": 6.178972238506758e-05, + "loss": 2.7966, + "step": 13830 + }, + { + "epoch": 0.8030404131248368, + "grad_norm": 0.10725666582584381, + "learning_rate": 6.144023029097891e-05, + "loss": 2.781, + "step": 13840 + }, + { + "epoch": 0.8036206446372103, + "grad_norm": 0.10259473323822021, + "learning_rate": 6.10916165989533e-05, + "loss": 2.7858, + "step": 13850 + }, + { + "epoch": 0.8042008761495837, + "grad_norm": 0.10819372534751892, + "learning_rate": 6.0743882592622736e-05, + "loss": 2.782, + "step": 13860 + }, + { + "epoch": 0.8047811076619571, + "grad_norm": 0.09982424229383469, + "learning_rate": 6.039702955238026e-05, + "loss": 2.7767, + "step": 13870 + }, + { + "epoch": 0.8053613391743305, + "grad_norm": 0.11254626512527466, + "learning_rate": 6.005105875537515e-05, + "loss": 2.7773, + "step": 13880 + }, + { + "epoch": 0.805941570686704, + "grad_norm": 0.10880761593580246, + "learning_rate": 5.970597147550808e-05, + "loss": 2.7925, + "step": 13890 + }, + { + "epoch": 0.8065218021990774, + "grad_norm": 0.10454876720905304, + "learning_rate": 5.936176898342649e-05, + "loss": 2.7887, + "step": 13900 + }, + { + "epoch": 0.8071020337114508, + "grad_norm": 0.10871117562055588, + "learning_rate": 5.9018452546520165e-05, + "loss": 2.7914, + "step": 13910 + }, + { + "epoch": 0.8076822652238244, + "grad_norm": 0.10645408183336258, + "learning_rate": 5.8676023428916175e-05, + "loss": 2.7946, + "step": 13920 + }, + { + "epoch": 0.8082624967361978, + "grad_norm": 0.11597729474306107, + "learning_rate": 5.83344828914743e-05, + "loss": 2.7917, + "step": 13930 + }, + { + "epoch": 0.8088427282485712, + "grad_norm": 0.1034785658121109, + "learning_rate": 5.799383219178264e-05, + "loss": 2.7912, + "step": 13940 + }, + { + "epoch": 0.8094229597609446, + "grad_norm": 0.10739534348249435, + "learning_rate": 5.7654072584152787e-05, + "loss": 2.7848, + "step": 13950 + }, + { + "epoch": 0.8100031912733181, + "grad_norm": 0.10825861990451813, + "learning_rate": 5.731520531961505e-05, + "loss": 2.7908, + "step": 13960 + }, + { + "epoch": 0.8105834227856915, + "grad_norm": 0.10880185663700104, + "learning_rate": 5.697723164591441e-05, + "loss": 2.7904, + "step": 13970 + }, + { + "epoch": 0.8111636542980649, + "grad_norm": 0.1085624098777771, + "learning_rate": 5.6640152807505236e-05, + "loss": 2.7839, + "step": 13980 + }, + { + "epoch": 0.8117438858104383, + "grad_norm": 0.10740832984447479, + "learning_rate": 5.630397004554713e-05, + "loss": 2.7858, + "step": 13990 + }, + { + "epoch": 0.8123241173228118, + "grad_norm": 0.10401804000139236, + "learning_rate": 5.596868459790025e-05, + "loss": 2.7802, + "step": 14000 + }, + { + "epoch": 0.8123241173228118, + "eval_loss": 2.749423027038574, + "eval_runtime": 3.2586, + "eval_samples_per_second": 1328.792, + "eval_steps_per_second": 2.762, + "step": 14000 + }, + { + "epoch": 0.8129043488351853, + "grad_norm": 0.10784956812858582, + "learning_rate": 5.563429769912071e-05, + "loss": 2.7852, + "step": 14010 + }, + { + "epoch": 0.8134845803475587, + "grad_norm": 0.10523492097854614, + "learning_rate": 5.530081058045606e-05, + "loss": 2.7856, + "step": 14020 + }, + { + "epoch": 0.8140648118599321, + "grad_norm": 0.10354667156934738, + "learning_rate": 5.4968224469840935e-05, + "loss": 2.7826, + "step": 14030 + }, + { + "epoch": 0.8146450433723056, + "grad_norm": 0.10460636019706726, + "learning_rate": 5.4636540591892164e-05, + "loss": 2.7844, + "step": 14040 + }, + { + "epoch": 0.815225274884679, + "grad_norm": 0.11116158217191696, + "learning_rate": 5.430576016790453e-05, + "loss": 2.7879, + "step": 14050 + }, + { + "epoch": 0.8158055063970524, + "grad_norm": 0.11445162445306778, + "learning_rate": 5.3975884415846206e-05, + "loss": 2.7847, + "step": 14060 + }, + { + "epoch": 0.8163857379094258, + "grad_norm": 0.10757939517498016, + "learning_rate": 5.3646914550354204e-05, + "loss": 2.7884, + "step": 14070 + }, + { + "epoch": 0.8169659694217993, + "grad_norm": 0.10770777612924576, + "learning_rate": 5.331885178273015e-05, + "loss": 2.775, + "step": 14080 + }, + { + "epoch": 0.8175462009341727, + "grad_norm": 0.10863149166107178, + "learning_rate": 5.2991697320935486e-05, + "loss": 2.7883, + "step": 14090 + }, + { + "epoch": 0.8181264324465461, + "grad_norm": 0.10049009323120117, + "learning_rate": 5.266545236958718e-05, + "loss": 2.7878, + "step": 14100 + }, + { + "epoch": 0.8187066639589196, + "grad_norm": 0.104975625872612, + "learning_rate": 5.2340118129953346e-05, + "loss": 2.7806, + "step": 14110 + }, + { + "epoch": 0.8192868954712931, + "grad_norm": 0.10563846677541733, + "learning_rate": 5.201569579994865e-05, + "loss": 2.7807, + "step": 14120 + }, + { + "epoch": 0.8198671269836665, + "grad_norm": 0.10182633996009827, + "learning_rate": 5.1692186574130324e-05, + "loss": 2.7782, + "step": 14130 + }, + { + "epoch": 0.8204473584960399, + "grad_norm": 0.10903611779212952, + "learning_rate": 5.1369591643692896e-05, + "loss": 2.7792, + "step": 14140 + }, + { + "epoch": 0.8210275900084134, + "grad_norm": 0.10453125089406967, + "learning_rate": 5.1047912196464944e-05, + "loss": 2.7814, + "step": 14150 + }, + { + "epoch": 0.8216078215207868, + "grad_norm": 0.11026264727115631, + "learning_rate": 5.072714941690387e-05, + "loss": 2.7847, + "step": 14160 + }, + { + "epoch": 0.8221880530331602, + "grad_norm": 0.10732634365558624, + "learning_rate": 5.040730448609166e-05, + "loss": 2.7716, + "step": 14170 + }, + { + "epoch": 0.8227682845455336, + "grad_norm": 0.10351432114839554, + "learning_rate": 5.008837858173113e-05, + "loss": 2.7883, + "step": 14180 + }, + { + "epoch": 0.8233485160579072, + "grad_norm": 0.10946208238601685, + "learning_rate": 4.9770372878140575e-05, + "loss": 2.786, + "step": 14190 + }, + { + "epoch": 0.8239287475702806, + "grad_norm": 0.1038416251540184, + "learning_rate": 4.9453288546250494e-05, + "loss": 2.7799, + "step": 14200 + }, + { + "epoch": 0.824508979082654, + "grad_norm": 0.10568647086620331, + "learning_rate": 4.913712675359861e-05, + "loss": 2.7874, + "step": 14210 + }, + { + "epoch": 0.8250892105950274, + "grad_norm": 0.10334275662899017, + "learning_rate": 4.882188866432568e-05, + "loss": 2.7835, + "step": 14220 + }, + { + "epoch": 0.8256694421074009, + "grad_norm": 0.10559739917516708, + "learning_rate": 4.850757543917144e-05, + "loss": 2.7791, + "step": 14230 + }, + { + "epoch": 0.8262496736197743, + "grad_norm": 0.1026688888669014, + "learning_rate": 4.819418823546999e-05, + "loss": 2.7777, + "step": 14240 + }, + { + "epoch": 0.8268299051321477, + "grad_norm": 0.10159046947956085, + "learning_rate": 4.788172820714611e-05, + "loss": 2.7876, + "step": 14250 + }, + { + "epoch": 0.8274101366445211, + "grad_norm": 0.114133320748806, + "learning_rate": 4.7570196504710026e-05, + "loss": 2.7777, + "step": 14260 + }, + { + "epoch": 0.8279903681568946, + "grad_norm": 0.10327325016260147, + "learning_rate": 4.725959427525432e-05, + "loss": 2.7976, + "step": 14270 + }, + { + "epoch": 0.828570599669268, + "grad_norm": 0.10618502646684647, + "learning_rate": 4.694992266244889e-05, + "loss": 2.7904, + "step": 14280 + }, + { + "epoch": 0.8291508311816415, + "grad_norm": 0.10732074081897736, + "learning_rate": 4.6641182806537e-05, + "loss": 2.7724, + "step": 14290 + }, + { + "epoch": 0.8297310626940149, + "grad_norm": 0.10467931628227234, + "learning_rate": 4.63333758443313e-05, + "loss": 2.7843, + "step": 14300 + }, + { + "epoch": 0.8303112942063884, + "grad_norm": 0.10281146317720413, + "learning_rate": 4.6026502909209004e-05, + "loss": 2.7842, + "step": 14310 + }, + { + "epoch": 0.8308915257187618, + "grad_norm": 0.1023208498954773, + "learning_rate": 4.572056513110867e-05, + "loss": 2.774, + "step": 14320 + }, + { + "epoch": 0.8314717572311352, + "grad_norm": 0.10323374718427658, + "learning_rate": 4.541556363652511e-05, + "loss": 2.7755, + "step": 14330 + }, + { + "epoch": 0.8320519887435086, + "grad_norm": 0.10136920213699341, + "learning_rate": 4.5111499548505727e-05, + "loss": 2.7814, + "step": 14340 + }, + { + "epoch": 0.8326322202558821, + "grad_norm": 0.10571028292179108, + "learning_rate": 4.4808373986646565e-05, + "loss": 2.7878, + "step": 14350 + }, + { + "epoch": 0.8332124517682555, + "grad_norm": 0.10252848267555237, + "learning_rate": 4.45061880670874e-05, + "loss": 2.7754, + "step": 14360 + }, + { + "epoch": 0.8337926832806289, + "grad_norm": 0.10471548140048981, + "learning_rate": 4.420494290250869e-05, + "loss": 2.7767, + "step": 14370 + }, + { + "epoch": 0.8343729147930025, + "grad_norm": 0.10701679438352585, + "learning_rate": 4.390463960212658e-05, + "loss": 2.7792, + "step": 14380 + }, + { + "epoch": 0.8349531463053759, + "grad_norm": 0.10377515107393265, + "learning_rate": 4.3605279271689264e-05, + "loss": 2.7829, + "step": 14390 + }, + { + "epoch": 0.8355333778177493, + "grad_norm": 0.10350141674280167, + "learning_rate": 4.330686301347298e-05, + "loss": 2.7861, + "step": 14400 + }, + { + "epoch": 0.8361136093301227, + "grad_norm": 0.10299152880907059, + "learning_rate": 4.300939192627742e-05, + "loss": 2.7891, + "step": 14410 + }, + { + "epoch": 0.8366938408424962, + "grad_norm": 0.1038345992565155, + "learning_rate": 4.2712867105422465e-05, + "loss": 2.7812, + "step": 14420 + }, + { + "epoch": 0.8372740723548696, + "grad_norm": 0.10262761265039444, + "learning_rate": 4.241728964274352e-05, + "loss": 2.7784, + "step": 14430 + }, + { + "epoch": 0.837854303867243, + "grad_norm": 0.10034337639808655, + "learning_rate": 4.212266062658777e-05, + "loss": 2.7857, + "step": 14440 + }, + { + "epoch": 0.8384345353796164, + "grad_norm": 0.10054679960012436, + "learning_rate": 4.1828981141810104e-05, + "loss": 2.7783, + "step": 14450 + }, + { + "epoch": 0.83901476689199, + "grad_norm": 0.10352133959531784, + "learning_rate": 4.15362522697691e-05, + "loss": 2.7936, + "step": 14460 + }, + { + "epoch": 0.8395949984043634, + "grad_norm": 0.10465723276138306, + "learning_rate": 4.124447508832332e-05, + "loss": 2.7692, + "step": 14470 + }, + { + "epoch": 0.8401752299167368, + "grad_norm": 0.10384640097618103, + "learning_rate": 4.095365067182665e-05, + "loss": 2.781, + "step": 14480 + }, + { + "epoch": 0.8407554614291102, + "grad_norm": 0.10312188416719437, + "learning_rate": 4.066378009112523e-05, + "loss": 2.7767, + "step": 14490 + }, + { + "epoch": 0.8413356929414837, + "grad_norm": 0.10447024554014206, + "learning_rate": 4.037486441355288e-05, + "loss": 2.7832, + "step": 14500 + }, + { + "epoch": 0.8419159244538571, + "grad_norm": 0.10162138938903809, + "learning_rate": 4.008690470292732e-05, + "loss": 2.7786, + "step": 14510 + }, + { + "epoch": 0.8424961559662305, + "grad_norm": 0.09777431935071945, + "learning_rate": 3.979990201954653e-05, + "loss": 2.7792, + "step": 14520 + }, + { + "epoch": 0.8430763874786039, + "grad_norm": 0.10050346702337265, + "learning_rate": 3.9513857420184216e-05, + "loss": 2.7866, + "step": 14530 + }, + { + "epoch": 0.8436566189909774, + "grad_norm": 0.10209480673074722, + "learning_rate": 3.922877195808678e-05, + "loss": 2.7886, + "step": 14540 + }, + { + "epoch": 0.8442368505033508, + "grad_norm": 0.10496553033590317, + "learning_rate": 3.894464668296864e-05, + "loss": 2.7854, + "step": 14550 + }, + { + "epoch": 0.8448170820157243, + "grad_norm": 0.10205195099115372, + "learning_rate": 3.8661482641008866e-05, + "loss": 2.7869, + "step": 14560 + }, + { + "epoch": 0.8453973135280977, + "grad_norm": 0.10940441489219666, + "learning_rate": 3.837928087484711e-05, + "loss": 2.7799, + "step": 14570 + }, + { + "epoch": 0.8459775450404712, + "grad_norm": 0.10287832468748093, + "learning_rate": 3.8098042423579766e-05, + "loss": 2.7804, + "step": 14580 + }, + { + "epoch": 0.8465577765528446, + "grad_norm": 0.0999421551823616, + "learning_rate": 3.781776832275639e-05, + "loss": 2.7835, + "step": 14590 + }, + { + "epoch": 0.847138008065218, + "grad_norm": 0.10340355336666107, + "learning_rate": 3.753845960437557e-05, + "loss": 2.7831, + "step": 14600 + }, + { + "epoch": 0.8477182395775914, + "grad_norm": 0.10355892032384872, + "learning_rate": 3.72601172968812e-05, + "loss": 2.7749, + "step": 14610 + }, + { + "epoch": 0.8482984710899649, + "grad_norm": 0.10467097908258438, + "learning_rate": 3.6982742425158886e-05, + "loss": 2.7834, + "step": 14620 + }, + { + "epoch": 0.8488787026023383, + "grad_norm": 0.1060672402381897, + "learning_rate": 3.670633601053182e-05, + "loss": 2.7801, + "step": 14630 + }, + { + "epoch": 0.8494589341147117, + "grad_norm": 0.10443491488695145, + "learning_rate": 3.643089907075759e-05, + "loss": 2.7896, + "step": 14640 + }, + { + "epoch": 0.8500391656270853, + "grad_norm": 0.1023486852645874, + "learning_rate": 3.6156432620023726e-05, + "loss": 2.7691, + "step": 14650 + }, + { + "epoch": 0.8506193971394587, + "grad_norm": 0.10417921096086502, + "learning_rate": 3.5882937668944476e-05, + "loss": 2.7703, + "step": 14660 + }, + { + "epoch": 0.8511996286518321, + "grad_norm": 0.10138606280088425, + "learning_rate": 3.561041522455691e-05, + "loss": 2.7885, + "step": 14670 + }, + { + "epoch": 0.8517798601642055, + "grad_norm": 0.10121186077594757, + "learning_rate": 3.5338866290317204e-05, + "loss": 2.7721, + "step": 14680 + }, + { + "epoch": 0.852360091676579, + "grad_norm": 0.10391680151224136, + "learning_rate": 3.506829186609691e-05, + "loss": 2.7818, + "step": 14690 + }, + { + "epoch": 0.8529403231889524, + "grad_norm": 0.10207725316286087, + "learning_rate": 3.479869294817955e-05, + "loss": 2.775, + "step": 14700 + }, + { + "epoch": 0.8535205547013258, + "grad_norm": 0.10676626861095428, + "learning_rate": 3.4530070529256524e-05, + "loss": 2.7759, + "step": 14710 + }, + { + "epoch": 0.8541007862136992, + "grad_norm": 0.10105539858341217, + "learning_rate": 3.42624255984237e-05, + "loss": 2.7855, + "step": 14720 + }, + { + "epoch": 0.8546810177260727, + "grad_norm": 0.10040144622325897, + "learning_rate": 3.399575914117777e-05, + "loss": 2.7736, + "step": 14730 + }, + { + "epoch": 0.8552612492384462, + "grad_norm": 0.10322125256061554, + "learning_rate": 3.3730072139412456e-05, + "loss": 2.7834, + "step": 14740 + }, + { + "epoch": 0.8558414807508196, + "grad_norm": 0.10220754891633987, + "learning_rate": 3.3465365571415315e-05, + "loss": 2.7692, + "step": 14750 + }, + { + "epoch": 0.856421712263193, + "grad_norm": 0.10107099264860153, + "learning_rate": 3.3201640411863584e-05, + "loss": 2.7672, + "step": 14760 + }, + { + "epoch": 0.8570019437755665, + "grad_norm": 0.10284842550754547, + "learning_rate": 3.293889763182089e-05, + "loss": 2.7851, + "step": 14770 + }, + { + "epoch": 0.8575821752879399, + "grad_norm": 0.10386528819799423, + "learning_rate": 3.26771381987337e-05, + "loss": 2.7787, + "step": 14780 + }, + { + "epoch": 0.8581624068003133, + "grad_norm": 0.1039406880736351, + "learning_rate": 3.241636307642769e-05, + "loss": 2.7838, + "step": 14790 + }, + { + "epoch": 0.8587426383126867, + "grad_norm": 0.1034376472234726, + "learning_rate": 3.2156573225104145e-05, + "loss": 2.7794, + "step": 14800 + }, + { + "epoch": 0.8593228698250602, + "grad_norm": 0.10199546813964844, + "learning_rate": 3.189776960133645e-05, + "loss": 2.7806, + "step": 14810 + }, + { + "epoch": 0.8599031013374336, + "grad_norm": 0.10086624324321747, + "learning_rate": 3.163995315806681e-05, + "loss": 2.7666, + "step": 14820 + }, + { + "epoch": 0.860483332849807, + "grad_norm": 0.10021676123142242, + "learning_rate": 3.138312484460228e-05, + "loss": 2.7738, + "step": 14830 + }, + { + "epoch": 0.8610635643621805, + "grad_norm": 0.10465867072343826, + "learning_rate": 3.112728560661164e-05, + "loss": 2.7786, + "step": 14840 + }, + { + "epoch": 0.861643795874554, + "grad_norm": 0.10076703131198883, + "learning_rate": 3.0872436386121776e-05, + "loss": 2.7705, + "step": 14850 + }, + { + "epoch": 0.8622240273869274, + "grad_norm": 0.10121941566467285, + "learning_rate": 3.061857812151414e-05, + "loss": 2.7737, + "step": 14860 + }, + { + "epoch": 0.8628042588993008, + "grad_norm": 0.10309196263551712, + "learning_rate": 3.0365711747521538e-05, + "loss": 2.7783, + "step": 14870 + }, + { + "epoch": 0.8633844904116743, + "grad_norm": 0.10456740111112595, + "learning_rate": 3.011383819522446e-05, + "loss": 2.7809, + "step": 14880 + }, + { + "epoch": 0.8639647219240477, + "grad_norm": 0.1025143563747406, + "learning_rate": 2.986295839204764e-05, + "loss": 2.7813, + "step": 14890 + }, + { + "epoch": 0.8645449534364211, + "grad_norm": 0.10585116595029831, + "learning_rate": 2.961307326175688e-05, + "loss": 2.7738, + "step": 14900 + }, + { + "epoch": 0.8651251849487945, + "grad_norm": 0.10203658789396286, + "learning_rate": 2.936418372445527e-05, + "loss": 2.7777, + "step": 14910 + }, + { + "epoch": 0.865705416461168, + "grad_norm": 0.10538860410451889, + "learning_rate": 2.911629069658037e-05, + "loss": 2.7757, + "step": 14920 + }, + { + "epoch": 0.8662856479735415, + "grad_norm": 0.10184674710035324, + "learning_rate": 2.8869395090900037e-05, + "loss": 2.7797, + "step": 14930 + }, + { + "epoch": 0.8668658794859149, + "grad_norm": 0.10757064819335938, + "learning_rate": 2.862349781650991e-05, + "loss": 2.7837, + "step": 14940 + }, + { + "epoch": 0.8674461109982883, + "grad_norm": 0.09947676211595535, + "learning_rate": 2.8378599778829492e-05, + "loss": 2.7764, + "step": 14950 + }, + { + "epoch": 0.8680263425106618, + "grad_norm": 0.0980169028043747, + "learning_rate": 2.8134701879598965e-05, + "loss": 2.7877, + "step": 14960 + }, + { + "epoch": 0.8686065740230352, + "grad_norm": 0.09837668389081955, + "learning_rate": 2.7891805016876057e-05, + "loss": 2.7806, + "step": 14970 + }, + { + "epoch": 0.8691868055354086, + "grad_norm": 0.09911120682954788, + "learning_rate": 2.7649910085032277e-05, + "loss": 2.7807, + "step": 14980 + }, + { + "epoch": 0.869767037047782, + "grad_norm": 0.09837288409471512, + "learning_rate": 2.7409017974750257e-05, + "loss": 2.7677, + "step": 14990 + }, + { + "epoch": 0.8703472685601555, + "grad_norm": 0.10560393333435059, + "learning_rate": 2.7169129573019943e-05, + "loss": 2.7785, + "step": 15000 + }, + { + "epoch": 0.8703472685601555, + "eval_loss": 2.7414441108703613, + "eval_runtime": 3.2661, + "eval_samples_per_second": 1325.755, + "eval_steps_per_second": 2.756, + "step": 15000 + }, + { + "epoch": 0.870927500072529, + "grad_norm": 0.09839779883623123, + "learning_rate": 2.6930245763135504e-05, + "loss": 2.7759, + "step": 15010 + }, + { + "epoch": 0.8715077315849024, + "grad_norm": 0.09770379960536957, + "learning_rate": 2.6692367424692272e-05, + "loss": 2.787, + "step": 15020 + }, + { + "epoch": 0.8720879630972758, + "grad_norm": 0.09834130108356476, + "learning_rate": 2.645549543358304e-05, + "loss": 2.7731, + "step": 15030 + }, + { + "epoch": 0.8726681946096493, + "grad_norm": 0.1047162264585495, + "learning_rate": 2.6219630661995528e-05, + "loss": 2.7832, + "step": 15040 + }, + { + "epoch": 0.8732484261220227, + "grad_norm": 0.10111907124519348, + "learning_rate": 2.5984773978408257e-05, + "loss": 2.779, + "step": 15050 + }, + { + "epoch": 0.8738286576343961, + "grad_norm": 0.10093654692173004, + "learning_rate": 2.5750926247588322e-05, + "loss": 2.768, + "step": 15060 + }, + { + "epoch": 0.8744088891467695, + "grad_norm": 0.10071719437837601, + "learning_rate": 2.551808833058755e-05, + "loss": 2.7867, + "step": 15070 + }, + { + "epoch": 0.874989120659143, + "grad_norm": 0.10237322747707367, + "learning_rate": 2.5286261084739445e-05, + "loss": 2.7838, + "step": 15080 + }, + { + "epoch": 0.8755693521715164, + "grad_norm": 0.09815766662359238, + "learning_rate": 2.5055445363656358e-05, + "loss": 2.7839, + "step": 15090 + }, + { + "epoch": 0.8761495836838898, + "grad_norm": 0.10203532874584198, + "learning_rate": 2.482564201722581e-05, + "loss": 2.7878, + "step": 15100 + }, + { + "epoch": 0.8767298151962634, + "grad_norm": 0.10766585171222687, + "learning_rate": 2.4596851891607884e-05, + "loss": 2.7823, + "step": 15110 + }, + { + "epoch": 0.8773100467086368, + "grad_norm": 0.09876078367233276, + "learning_rate": 2.4369075829231766e-05, + "loss": 2.7762, + "step": 15120 + }, + { + "epoch": 0.8778902782210102, + "grad_norm": 0.10014016181230545, + "learning_rate": 2.414231466879274e-05, + "loss": 2.7733, + "step": 15130 + }, + { + "epoch": 0.8784705097333836, + "grad_norm": 0.10114018619060516, + "learning_rate": 2.3916569245249306e-05, + "loss": 2.7861, + "step": 15140 + }, + { + "epoch": 0.8790507412457571, + "grad_norm": 0.10012462735176086, + "learning_rate": 2.3691840389819526e-05, + "loss": 2.7635, + "step": 15150 + }, + { + "epoch": 0.8796309727581305, + "grad_norm": 0.10367590934038162, + "learning_rate": 2.3468128929978757e-05, + "loss": 2.7727, + "step": 15160 + }, + { + "epoch": 0.8802112042705039, + "grad_norm": 0.10224179178476334, + "learning_rate": 2.3245435689456015e-05, + "loss": 2.7712, + "step": 15170 + }, + { + "epoch": 0.8807914357828773, + "grad_norm": 0.0989450216293335, + "learning_rate": 2.302376148823102e-05, + "loss": 2.7761, + "step": 15180 + }, + { + "epoch": 0.8813716672952508, + "grad_norm": 0.10036759078502655, + "learning_rate": 2.2803107142531617e-05, + "loss": 2.7815, + "step": 15190 + }, + { + "epoch": 0.8819518988076243, + "grad_norm": 0.10400567203760147, + "learning_rate": 2.2583473464830005e-05, + "loss": 2.7826, + "step": 15200 + }, + { + "epoch": 0.8825321303199977, + "grad_norm": 0.09990741312503815, + "learning_rate": 2.2364861263840507e-05, + "loss": 2.7869, + "step": 15210 + }, + { + "epoch": 0.8831123618323711, + "grad_norm": 0.10067487508058548, + "learning_rate": 2.2147271344516128e-05, + "loss": 2.7771, + "step": 15220 + }, + { + "epoch": 0.8836925933447446, + "grad_norm": 0.10068360716104507, + "learning_rate": 2.1930704508045714e-05, + "loss": 2.781, + "step": 15230 + }, + { + "epoch": 0.884272824857118, + "grad_norm": 0.10076344013214111, + "learning_rate": 2.171516155185117e-05, + "loss": 2.7793, + "step": 15240 + }, + { + "epoch": 0.8848530563694914, + "grad_norm": 0.0988764762878418, + "learning_rate": 2.1500643269584027e-05, + "loss": 2.772, + "step": 15250 + }, + { + "epoch": 0.8854332878818648, + "grad_norm": 0.09937159717082977, + "learning_rate": 2.1287150451123224e-05, + "loss": 2.7786, + "step": 15260 + }, + { + "epoch": 0.8860135193942383, + "grad_norm": 0.10244645178318024, + "learning_rate": 2.1074683882571675e-05, + "loss": 2.7752, + "step": 15270 + }, + { + "epoch": 0.8865937509066117, + "grad_norm": 0.09691537171602249, + "learning_rate": 2.0863244346253517e-05, + "loss": 2.7735, + "step": 15280 + }, + { + "epoch": 0.8871739824189852, + "grad_norm": 0.09877140074968338, + "learning_rate": 2.065283262071128e-05, + "loss": 2.777, + "step": 15290 + }, + { + "epoch": 0.8877542139313586, + "grad_norm": 0.09832227975130081, + "learning_rate": 2.044344948070289e-05, + "loss": 2.7718, + "step": 15300 + }, + { + "epoch": 0.8883344454437321, + "grad_norm": 0.09934905916452408, + "learning_rate": 2.02350956971992e-05, + "loss": 2.7725, + "step": 15310 + }, + { + "epoch": 0.8889146769561055, + "grad_norm": 0.09960002452135086, + "learning_rate": 2.0027772037380463e-05, + "loss": 2.77, + "step": 15320 + }, + { + "epoch": 0.8894949084684789, + "grad_norm": 0.10142461210489273, + "learning_rate": 1.9821479264634234e-05, + "loss": 2.7781, + "step": 15330 + }, + { + "epoch": 0.8900751399808524, + "grad_norm": 0.09648580849170685, + "learning_rate": 1.96162181385521e-05, + "loss": 2.7774, + "step": 15340 + }, + { + "epoch": 0.8906553714932258, + "grad_norm": 0.09822871536016464, + "learning_rate": 1.9411989414926953e-05, + "loss": 2.7718, + "step": 15350 + }, + { + "epoch": 0.8912356030055992, + "grad_norm": 0.1000954881310463, + "learning_rate": 1.9208793845750504e-05, + "loss": 2.7763, + "step": 15360 + }, + { + "epoch": 0.8918158345179726, + "grad_norm": 0.10170748084783554, + "learning_rate": 1.9006632179209925e-05, + "loss": 2.78, + "step": 15370 + }, + { + "epoch": 0.8923960660303462, + "grad_norm": 0.10458207130432129, + "learning_rate": 1.8805505159685807e-05, + "loss": 2.77, + "step": 15380 + }, + { + "epoch": 0.8929762975427196, + "grad_norm": 0.09986699372529984, + "learning_rate": 1.8605413527748823e-05, + "loss": 2.776, + "step": 15390 + }, + { + "epoch": 0.893556529055093, + "grad_norm": 0.09813553094863892, + "learning_rate": 1.8406358020157364e-05, + "loss": 2.7711, + "step": 15400 + }, + { + "epoch": 0.8941367605674664, + "grad_norm": 0.09960541874170303, + "learning_rate": 1.8208339369854663e-05, + "loss": 2.7781, + "step": 15410 + }, + { + "epoch": 0.8947169920798399, + "grad_norm": 0.09737250953912735, + "learning_rate": 1.801135830596605e-05, + "loss": 2.7657, + "step": 15420 + }, + { + "epoch": 0.8952972235922133, + "grad_norm": 0.0949782207608223, + "learning_rate": 1.7815415553796575e-05, + "loss": 2.7705, + "step": 15430 + }, + { + "epoch": 0.8958774551045867, + "grad_norm": 0.09773328900337219, + "learning_rate": 1.762051183482788e-05, + "loss": 2.7684, + "step": 15440 + }, + { + "epoch": 0.8964576866169601, + "grad_norm": 0.09638100862503052, + "learning_rate": 1.7426647866715925e-05, + "loss": 2.7724, + "step": 15450 + }, + { + "epoch": 0.8970379181293336, + "grad_norm": 0.09620904177427292, + "learning_rate": 1.7233824363288118e-05, + "loss": 2.7738, + "step": 15460 + }, + { + "epoch": 0.897618149641707, + "grad_norm": 0.09929810464382172, + "learning_rate": 1.7042042034540783e-05, + "loss": 2.7754, + "step": 15470 + }, + { + "epoch": 0.8981983811540805, + "grad_norm": 0.09778960049152374, + "learning_rate": 1.6851301586636613e-05, + "loss": 2.7766, + "step": 15480 + }, + { + "epoch": 0.8987786126664539, + "grad_norm": 0.09684190899133682, + "learning_rate": 1.6661603721901873e-05, + "loss": 2.7777, + "step": 15490 + }, + { + "epoch": 0.8993588441788274, + "grad_norm": 0.09664195775985718, + "learning_rate": 1.6472949138823967e-05, + "loss": 2.7859, + "step": 15500 + }, + { + "epoch": 0.8999390756912008, + "grad_norm": 0.10036718100309372, + "learning_rate": 1.628533853204883e-05, + "loss": 2.7713, + "step": 15510 + }, + { + "epoch": 0.9005193072035742, + "grad_norm": 0.09811628609895706, + "learning_rate": 1.6098772592378417e-05, + "loss": 2.7733, + "step": 15520 + }, + { + "epoch": 0.9010995387159476, + "grad_norm": 0.09862551838159561, + "learning_rate": 1.591325200676795e-05, + "loss": 2.7701, + "step": 15530 + }, + { + "epoch": 0.9016797702283211, + "grad_norm": 0.09947618097066879, + "learning_rate": 1.5728777458323803e-05, + "loss": 2.7771, + "step": 15540 + }, + { + "epoch": 0.9022600017406945, + "grad_norm": 0.09834101796150208, + "learning_rate": 1.554534962630053e-05, + "loss": 2.7768, + "step": 15550 + }, + { + "epoch": 0.902840233253068, + "grad_norm": 0.10113567858934402, + "learning_rate": 1.5362969186098594e-05, + "loss": 2.7682, + "step": 15560 + }, + { + "epoch": 0.9034204647654415, + "grad_norm": 0.0977102592587471, + "learning_rate": 1.5181636809261921e-05, + "loss": 2.7769, + "step": 15570 + }, + { + "epoch": 0.9040006962778149, + "grad_norm": 0.09831026196479797, + "learning_rate": 1.5001353163475283e-05, + "loss": 2.7681, + "step": 15580 + }, + { + "epoch": 0.9045809277901883, + "grad_norm": 0.09537149965763092, + "learning_rate": 1.4822118912561943e-05, + "loss": 2.7628, + "step": 15590 + }, + { + "epoch": 0.9051611593025617, + "grad_norm": 0.09654498845338821, + "learning_rate": 1.4643934716481253e-05, + "loss": 2.7676, + "step": 15600 + }, + { + "epoch": 0.9057413908149352, + "grad_norm": 0.09738855808973312, + "learning_rate": 1.446680123132603e-05, + "loss": 2.7744, + "step": 15610 + }, + { + "epoch": 0.9063216223273086, + "grad_norm": 0.10082467645406723, + "learning_rate": 1.4290719109320382e-05, + "loss": 2.7706, + "step": 15620 + }, + { + "epoch": 0.906901853839682, + "grad_norm": 0.10283984988927841, + "learning_rate": 1.4115688998817043e-05, + "loss": 2.7742, + "step": 15630 + }, + { + "epoch": 0.9074820853520554, + "grad_norm": 0.09994236379861832, + "learning_rate": 1.3941711544295287e-05, + "loss": 2.7638, + "step": 15640 + }, + { + "epoch": 0.908062316864429, + "grad_norm": 0.09737379103899002, + "learning_rate": 1.3768787386358282e-05, + "loss": 2.7715, + "step": 15650 + }, + { + "epoch": 0.9086425483768024, + "grad_norm": 0.09915235638618469, + "learning_rate": 1.3596917161730902e-05, + "loss": 2.7694, + "step": 15660 + }, + { + "epoch": 0.9092227798891758, + "grad_norm": 0.09791626036167145, + "learning_rate": 1.3426101503257358e-05, + "loss": 2.7628, + "step": 15670 + }, + { + "epoch": 0.9098030114015492, + "grad_norm": 0.09681922197341919, + "learning_rate": 1.3256341039898766e-05, + "loss": 2.7741, + "step": 15680 + }, + { + "epoch": 0.9103832429139227, + "grad_norm": 0.09645412862300873, + "learning_rate": 1.3087636396730949e-05, + "loss": 2.7704, + "step": 15690 + }, + { + "epoch": 0.9109634744262961, + "grad_norm": 0.09795381873846054, + "learning_rate": 1.2919988194942011e-05, + "loss": 2.7666, + "step": 15700 + }, + { + "epoch": 0.9115437059386695, + "grad_norm": 0.09636548161506653, + "learning_rate": 1.2753397051830294e-05, + "loss": 2.7763, + "step": 15710 + }, + { + "epoch": 0.9121239374510429, + "grad_norm": 0.0992702841758728, + "learning_rate": 1.2587863580801794e-05, + "loss": 2.7693, + "step": 15720 + }, + { + "epoch": 0.9127041689634164, + "grad_norm": 0.09708980470895767, + "learning_rate": 1.2423388391368083e-05, + "loss": 2.7696, + "step": 15730 + }, + { + "epoch": 0.9132844004757898, + "grad_norm": 0.09657064080238342, + "learning_rate": 1.2259972089144054e-05, + "loss": 2.7799, + "step": 15740 + }, + { + "epoch": 0.9138646319881633, + "grad_norm": 0.09743205457925797, + "learning_rate": 1.2097615275845617e-05, + "loss": 2.7683, + "step": 15750 + }, + { + "epoch": 0.9144448635005367, + "grad_norm": 0.09803003072738647, + "learning_rate": 1.1936318549287638e-05, + "loss": 2.7731, + "step": 15760 + }, + { + "epoch": 0.9150250950129102, + "grad_norm": 0.0977969542145729, + "learning_rate": 1.1776082503381468e-05, + "loss": 2.778, + "step": 15770 + }, + { + "epoch": 0.9156053265252836, + "grad_norm": 0.0986003428697586, + "learning_rate": 1.1616907728133084e-05, + "loss": 2.7794, + "step": 15780 + }, + { + "epoch": 0.916185558037657, + "grad_norm": 0.09887285530567169, + "learning_rate": 1.1458794809640693e-05, + "loss": 2.7743, + "step": 15790 + }, + { + "epoch": 0.9167657895500304, + "grad_norm": 0.10056151449680328, + "learning_rate": 1.1301744330092522e-05, + "loss": 2.7739, + "step": 15800 + }, + { + "epoch": 0.9173460210624039, + "grad_norm": 0.09636414051055908, + "learning_rate": 1.1145756867765033e-05, + "loss": 2.7772, + "step": 15810 + }, + { + "epoch": 0.9179262525747773, + "grad_norm": 0.09793318808078766, + "learning_rate": 1.0990832997020282e-05, + "loss": 2.7729, + "step": 15820 + }, + { + "epoch": 0.9185064840871507, + "grad_norm": 0.09378232061862946, + "learning_rate": 1.0836973288304229e-05, + "loss": 2.7783, + "step": 15830 + }, + { + "epoch": 0.9190867155995243, + "grad_norm": 0.09904693067073822, + "learning_rate": 1.0684178308144498e-05, + "loss": 2.7697, + "step": 15840 + }, + { + "epoch": 0.9196669471118977, + "grad_norm": 0.0982363149523735, + "learning_rate": 1.0532448619148115e-05, + "loss": 2.7712, + "step": 15850 + }, + { + "epoch": 0.9202471786242711, + "grad_norm": 0.0995451807975769, + "learning_rate": 1.038178477999978e-05, + "loss": 2.7702, + "step": 15860 + }, + { + "epoch": 0.9208274101366445, + "grad_norm": 0.09749618917703629, + "learning_rate": 1.0232187345459431e-05, + "loss": 2.771, + "step": 15870 + }, + { + "epoch": 0.921407641649018, + "grad_norm": 0.09808894246816635, + "learning_rate": 1.0083656866360646e-05, + "loss": 2.7706, + "step": 15880 + }, + { + "epoch": 0.9219878731613914, + "grad_norm": 0.09838584810495377, + "learning_rate": 9.936193889608012e-06, + "loss": 2.7656, + "step": 15890 + }, + { + "epoch": 0.9225681046737648, + "grad_norm": 0.10016359388828278, + "learning_rate": 9.789798958175832e-06, + "loss": 2.7749, + "step": 15900 + }, + { + "epoch": 0.9231483361861382, + "grad_norm": 0.09670013934373856, + "learning_rate": 9.64447261110548e-06, + "loss": 2.7693, + "step": 15910 + }, + { + "epoch": 0.9237285676985117, + "grad_norm": 0.09639087319374084, + "learning_rate": 9.500215383503784e-06, + "loss": 2.7675, + "step": 15920 + }, + { + "epoch": 0.9243087992108852, + "grad_norm": 0.09851641952991486, + "learning_rate": 9.357027806541084e-06, + "loss": 2.7748, + "step": 15930 + }, + { + "epoch": 0.9248890307232586, + "grad_norm": 0.10145829617977142, + "learning_rate": 9.214910407448871e-06, + "loss": 2.7841, + "step": 15940 + }, + { + "epoch": 0.925469262235632, + "grad_norm": 0.09769120067358017, + "learning_rate": 9.073863709518426e-06, + "loss": 2.7703, + "step": 15950 + }, + { + "epoch": 0.9260494937480055, + "grad_norm": 0.09475893527269363, + "learning_rate": 8.933888232098408e-06, + "loss": 2.7703, + "step": 15960 + }, + { + "epoch": 0.9266297252603789, + "grad_norm": 0.09624000638723373, + "learning_rate": 8.794984490593171e-06, + "loss": 2.7753, + "step": 15970 + }, + { + "epoch": 0.9272099567727523, + "grad_norm": 0.09569297730922699, + "learning_rate": 8.657152996460958e-06, + "loss": 2.7635, + "step": 15980 + }, + { + "epoch": 0.9277901882851257, + "grad_norm": 0.10107609629631042, + "learning_rate": 8.520394257211605e-06, + "loss": 2.7714, + "step": 15990 + }, + { + "epoch": 0.9283704197974992, + "grad_norm": 0.09753672778606415, + "learning_rate": 8.384708776405236e-06, + "loss": 2.7706, + "step": 16000 + }, + { + "epoch": 0.9283704197974992, + "eval_loss": 2.7369606494903564, + "eval_runtime": 3.2559, + "eval_samples_per_second": 1329.896, + "eval_steps_per_second": 2.764, + "step": 16000 + }, + { + "epoch": 0.9289506513098726, + "grad_norm": 0.09548928588628769, + "learning_rate": 8.25009705364994e-06, + "loss": 2.7754, + "step": 16010 + }, + { + "epoch": 0.929530882822246, + "grad_norm": 0.09287203848361969, + "learning_rate": 8.116559584600201e-06, + "loss": 2.7777, + "step": 16020 + }, + { + "epoch": 0.9301111143346195, + "grad_norm": 0.0972280502319336, + "learning_rate": 7.984096860955036e-06, + "loss": 2.781, + "step": 16030 + }, + { + "epoch": 0.930691345846993, + "grad_norm": 0.09617298096418381, + "learning_rate": 7.852709370455922e-06, + "loss": 2.7692, + "step": 16040 + }, + { + "epoch": 0.9312715773593664, + "grad_norm": 0.09682459384202957, + "learning_rate": 7.72239759688551e-06, + "loss": 2.7742, + "step": 16050 + }, + { + "epoch": 0.9318518088717398, + "grad_norm": 0.09648177772760391, + "learning_rate": 7.593162020065313e-06, + "loss": 2.7783, + "step": 16060 + }, + { + "epoch": 0.9324320403841133, + "grad_norm": 0.09511367976665497, + "learning_rate": 7.4650031158542845e-06, + "loss": 2.7706, + "step": 16070 + }, + { + "epoch": 0.9330122718964867, + "grad_norm": 0.09434488415718079, + "learning_rate": 7.337921356146981e-06, + "loss": 2.7694, + "step": 16080 + }, + { + "epoch": 0.9335925034088601, + "grad_norm": 0.09737717360258102, + "learning_rate": 7.211917208871665e-06, + "loss": 2.7674, + "step": 16090 + }, + { + "epoch": 0.9341727349212335, + "grad_norm": 0.09725455194711685, + "learning_rate": 7.086991137988906e-06, + "loss": 2.7639, + "step": 16100 + }, + { + "epoch": 0.9347529664336071, + "grad_norm": 0.10136746615171432, + "learning_rate": 6.963143603489518e-06, + "loss": 2.7677, + "step": 16110 + }, + { + "epoch": 0.9353331979459805, + "grad_norm": 0.09756675362586975, + "learning_rate": 6.840375061393122e-06, + "loss": 2.765, + "step": 16120 + }, + { + "epoch": 0.9359134294583539, + "grad_norm": 0.09939330816268921, + "learning_rate": 6.718685963746318e-06, + "loss": 2.7751, + "step": 16130 + }, + { + "epoch": 0.9364936609707273, + "grad_norm": 0.09836092591285706, + "learning_rate": 6.598076758621118e-06, + "loss": 2.7828, + "step": 16140 + }, + { + "epoch": 0.9370738924831008, + "grad_norm": 0.09677501767873764, + "learning_rate": 6.4785478901133506e-06, + "loss": 2.769, + "step": 16150 + }, + { + "epoch": 0.9376541239954742, + "grad_norm": 0.097322478890419, + "learning_rate": 6.360099798340656e-06, + "loss": 2.7656, + "step": 16160 + }, + { + "epoch": 0.9382343555078476, + "grad_norm": 0.09472298622131348, + "learning_rate": 6.242732919441462e-06, + "loss": 2.7737, + "step": 16170 + }, + { + "epoch": 0.938814587020221, + "grad_norm": 0.09517394751310349, + "learning_rate": 6.126447685572844e-06, + "loss": 2.7807, + "step": 16180 + }, + { + "epoch": 0.9393948185325945, + "grad_norm": 0.09591302275657654, + "learning_rate": 6.011244524909198e-06, + "loss": 2.7774, + "step": 16190 + }, + { + "epoch": 0.939975050044968, + "grad_norm": 0.09797896444797516, + "learning_rate": 5.8971238616407405e-06, + "loss": 2.7637, + "step": 16200 + }, + { + "epoch": 0.9405552815573414, + "grad_norm": 0.09744720160961151, + "learning_rate": 5.7840861159715425e-06, + "loss": 2.7773, + "step": 16210 + }, + { + "epoch": 0.9411355130697148, + "grad_norm": 0.09814444929361343, + "learning_rate": 5.672131704118565e-06, + "loss": 2.7741, + "step": 16220 + }, + { + "epoch": 0.9417157445820883, + "grad_norm": 0.09604529291391373, + "learning_rate": 5.561261038309628e-06, + "loss": 2.7727, + "step": 16230 + }, + { + "epoch": 0.9422959760944617, + "grad_norm": 0.09737398475408554, + "learning_rate": 5.4514745267821404e-06, + "loss": 2.7737, + "step": 16240 + }, + { + "epoch": 0.9428762076068351, + "grad_norm": 0.09697815030813217, + "learning_rate": 5.342772573781507e-06, + "loss": 2.7638, + "step": 16250 + }, + { + "epoch": 0.9434564391192085, + "grad_norm": 0.09917178004980087, + "learning_rate": 5.235155579559725e-06, + "loss": 2.7709, + "step": 16260 + }, + { + "epoch": 0.944036670631582, + "grad_norm": 0.096290223300457, + "learning_rate": 5.128623940373888e-06, + "loss": 2.7674, + "step": 16270 + }, + { + "epoch": 0.9446169021439554, + "grad_norm": 0.09504272043704987, + "learning_rate": 5.023178048484589e-06, + "loss": 2.7694, + "step": 16280 + }, + { + "epoch": 0.9451971336563288, + "grad_norm": 0.09743209183216095, + "learning_rate": 4.91881829215468e-06, + "loss": 2.781, + "step": 16290 + }, + { + "epoch": 0.9457773651687024, + "grad_norm": 0.09843679517507553, + "learning_rate": 4.815545055647718e-06, + "loss": 2.776, + "step": 16300 + }, + { + "epoch": 0.9463575966810758, + "grad_norm": 0.0955999493598938, + "learning_rate": 4.713358719226523e-06, + "loss": 2.7789, + "step": 16310 + }, + { + "epoch": 0.9469378281934492, + "grad_norm": 0.09576351940631866, + "learning_rate": 4.612259659151984e-06, + "loss": 2.7716, + "step": 16320 + }, + { + "epoch": 0.9475180597058226, + "grad_norm": 0.09730935841798782, + "learning_rate": 4.512248247681394e-06, + "loss": 2.7802, + "step": 16330 + }, + { + "epoch": 0.9480982912181961, + "grad_norm": 0.09646177291870117, + "learning_rate": 4.413324853067213e-06, + "loss": 2.7765, + "step": 16340 + }, + { + "epoch": 0.9486785227305695, + "grad_norm": 0.09553349018096924, + "learning_rate": 4.3154898395557744e-06, + "loss": 2.778, + "step": 16350 + }, + { + "epoch": 0.9492587542429429, + "grad_norm": 0.09604230523109436, + "learning_rate": 4.218743567385852e-06, + "loss": 2.78, + "step": 16360 + }, + { + "epoch": 0.9498389857553163, + "grad_norm": 0.09518173336982727, + "learning_rate": 4.123086392787289e-06, + "loss": 2.7695, + "step": 16370 + }, + { + "epoch": 0.9504192172676899, + "grad_norm": 0.09625556319952011, + "learning_rate": 4.0285186679799406e-06, + "loss": 2.7694, + "step": 16380 + }, + { + "epoch": 0.9509994487800633, + "grad_norm": 0.09755248576402664, + "learning_rate": 3.935040741171969e-06, + "loss": 2.7625, + "step": 16390 + }, + { + "epoch": 0.9515796802924367, + "grad_norm": 0.09465952962636948, + "learning_rate": 3.842652956558945e-06, + "loss": 2.7658, + "step": 16400 + }, + { + "epoch": 0.9521599118048101, + "grad_norm": 0.0960998460650444, + "learning_rate": 3.7513556543223855e-06, + "loss": 2.7846, + "step": 16410 + }, + { + "epoch": 0.9527401433171836, + "grad_norm": 0.09892145544290543, + "learning_rate": 3.6611491706284856e-06, + "loss": 2.7708, + "step": 16420 + }, + { + "epoch": 0.953320374829557, + "grad_norm": 0.09714221954345703, + "learning_rate": 3.572033837626953e-06, + "loss": 2.7874, + "step": 16430 + }, + { + "epoch": 0.9539006063419304, + "grad_norm": 0.09727420657873154, + "learning_rate": 3.484009983449809e-06, + "loss": 2.7834, + "step": 16440 + }, + { + "epoch": 0.9544808378543038, + "grad_norm": 0.09665530920028687, + "learning_rate": 3.397077932210124e-06, + "loss": 2.7726, + "step": 16450 + }, + { + "epoch": 0.9550610693666773, + "grad_norm": 0.09558922797441483, + "learning_rate": 3.3112380040008156e-06, + "loss": 2.7723, + "step": 16460 + }, + { + "epoch": 0.9556413008790507, + "grad_norm": 0.0972527414560318, + "learning_rate": 3.2264905148934208e-06, + "loss": 2.772, + "step": 16470 + }, + { + "epoch": 0.9562215323914242, + "grad_norm": 0.09882599860429764, + "learning_rate": 3.142835776937158e-06, + "loss": 2.7685, + "step": 16480 + }, + { + "epoch": 0.9568017639037976, + "grad_norm": 0.09505190700292587, + "learning_rate": 3.060274098157467e-06, + "loss": 2.7694, + "step": 16490 + }, + { + "epoch": 0.9573819954161711, + "grad_norm": 0.09600254893302917, + "learning_rate": 2.9788057825551714e-06, + "loss": 2.7778, + "step": 16500 + }, + { + "epoch": 0.9579622269285445, + "grad_norm": 0.09696151316165924, + "learning_rate": 2.8984311301050835e-06, + "loss": 2.784, + "step": 16510 + }, + { + "epoch": 0.9585424584409179, + "grad_norm": 0.09621264785528183, + "learning_rate": 2.819150436755135e-06, + "loss": 2.7668, + "step": 16520 + }, + { + "epoch": 0.9591226899532914, + "grad_norm": 0.09673577547073364, + "learning_rate": 2.7409639944251162e-06, + "loss": 2.774, + "step": 16530 + }, + { + "epoch": 0.9597029214656648, + "grad_norm": 0.09513070434331894, + "learning_rate": 2.6638720910056697e-06, + "loss": 2.7783, + "step": 16540 + }, + { + "epoch": 0.9602831529780382, + "grad_norm": 0.09311112761497498, + "learning_rate": 2.587875010357332e-06, + "loss": 2.7665, + "step": 16550 + }, + { + "epoch": 0.9608633844904116, + "grad_norm": 0.09406144171953201, + "learning_rate": 2.5129730323092622e-06, + "loss": 2.7671, + "step": 16560 + }, + { + "epoch": 0.9614436160027852, + "grad_norm": 0.09770730882883072, + "learning_rate": 2.439166432658446e-06, + "loss": 2.7673, + "step": 16570 + }, + { + "epoch": 0.9620238475151586, + "grad_norm": 0.09938254207372665, + "learning_rate": 2.366455483168428e-06, + "loss": 2.7637, + "step": 16580 + }, + { + "epoch": 0.962604079027532, + "grad_norm": 0.09504234790802002, + "learning_rate": 2.2948404515686136e-06, + "loss": 2.7708, + "step": 16590 + }, + { + "epoch": 0.9631843105399054, + "grad_norm": 0.09619156271219254, + "learning_rate": 2.2243216015530362e-06, + "loss": 2.7716, + "step": 16600 + }, + { + "epoch": 0.9637645420522789, + "grad_norm": 0.09520803391933441, + "learning_rate": 2.1548991927794244e-06, + "loss": 2.771, + "step": 16610 + }, + { + "epoch": 0.9643447735646523, + "grad_norm": 0.09521950781345367, + "learning_rate": 2.0865734808684697e-06, + "loss": 2.7679, + "step": 16620 + }, + { + "epoch": 0.9649250050770257, + "grad_norm": 0.09744451195001602, + "learning_rate": 2.0193447174025268e-06, + "loss": 2.7715, + "step": 16630 + }, + { + "epoch": 0.9655052365893991, + "grad_norm": 0.09531662613153458, + "learning_rate": 1.953213149924948e-06, + "loss": 2.7824, + "step": 16640 + }, + { + "epoch": 0.9660854681017726, + "grad_norm": 0.09525689482688904, + "learning_rate": 1.8881790219391512e-06, + "loss": 2.7694, + "step": 16650 + }, + { + "epoch": 0.9666656996141461, + "grad_norm": 0.09457177668809891, + "learning_rate": 1.8242425729075527e-06, + "loss": 2.7588, + "step": 16660 + }, + { + "epoch": 0.9672459311265195, + "grad_norm": 0.09685463458299637, + "learning_rate": 1.7614040382508687e-06, + "loss": 2.7714, + "step": 16670 + }, + { + "epoch": 0.9678261626388929, + "grad_norm": 0.09774652868509293, + "learning_rate": 1.6996636493471494e-06, + "loss": 2.7683, + "step": 16680 + }, + { + "epoch": 0.9684063941512664, + "grad_norm": 0.09525836259126663, + "learning_rate": 1.6390216335309792e-06, + "loss": 2.77, + "step": 16690 + }, + { + "epoch": 0.9689866256636398, + "grad_norm": 0.09421420842409134, + "learning_rate": 1.5794782140926775e-06, + "loss": 2.7723, + "step": 16700 + }, + { + "epoch": 0.9695668571760132, + "grad_norm": 0.09693361073732376, + "learning_rate": 1.5210336102772668e-06, + "loss": 2.772, + "step": 16710 + }, + { + "epoch": 0.9701470886883866, + "grad_norm": 0.09740012139081955, + "learning_rate": 1.463688037283972e-06, + "loss": 2.7673, + "step": 16720 + }, + { + "epoch": 0.9707273202007601, + "grad_norm": 0.09596629440784454, + "learning_rate": 1.4074417062651221e-06, + "loss": 2.7878, + "step": 16730 + }, + { + "epoch": 0.9713075517131335, + "grad_norm": 0.09561031311750412, + "learning_rate": 1.3522948243256503e-06, + "loss": 2.7728, + "step": 16740 + }, + { + "epoch": 0.971887783225507, + "grad_norm": 0.09793524444103241, + "learning_rate": 1.2982475945221615e-06, + "loss": 2.7718, + "step": 16750 + }, + { + "epoch": 0.9724680147378804, + "grad_norm": 0.09407012164592743, + "learning_rate": 1.245300215862166e-06, + "loss": 2.7797, + "step": 16760 + }, + { + "epoch": 0.9730482462502539, + "grad_norm": 0.09444325417280197, + "learning_rate": 1.1934528833035139e-06, + "loss": 2.7725, + "step": 16770 + }, + { + "epoch": 0.9736284777626273, + "grad_norm": 0.09787797182798386, + "learning_rate": 1.1427057877534951e-06, + "loss": 2.7691, + "step": 16780 + }, + { + "epoch": 0.9742087092750007, + "grad_norm": 0.09456036239862442, + "learning_rate": 1.09305911606824e-06, + "loss": 2.7766, + "step": 16790 + }, + { + "epoch": 0.9747889407873742, + "grad_norm": 0.095250204205513, + "learning_rate": 1.044513051051954e-06, + "loss": 2.7701, + "step": 16800 + }, + { + "epoch": 0.9753691722997476, + "grad_norm": 0.09521818906068802, + "learning_rate": 9.970677714563835e-07, + "loss": 2.7734, + "step": 16810 + }, + { + "epoch": 0.975949403812121, + "grad_norm": 0.09462135285139084, + "learning_rate": 9.507234519800178e-07, + "loss": 2.7705, + "step": 16820 + }, + { + "epoch": 0.9765296353244944, + "grad_norm": 0.09560775011777878, + "learning_rate": 9.054802632674551e-07, + "loss": 2.7691, + "step": 16830 + }, + { + "epoch": 0.977109866836868, + "grad_norm": 0.09410873800516129, + "learning_rate": 8.61338371908904e-07, + "loss": 2.7787, + "step": 16840 + }, + { + "epoch": 0.9776900983492414, + "grad_norm": 0.09606259316205978, + "learning_rate": 8.18297940439383e-07, + "loss": 2.7766, + "step": 16850 + }, + { + "epoch": 0.9782703298616148, + "grad_norm": 0.09549134224653244, + "learning_rate": 7.763591273382885e-07, + "loss": 2.7701, + "step": 16860 + }, + { + "epoch": 0.9788505613739882, + "grad_norm": 0.09225918352603912, + "learning_rate": 7.355220870287615e-07, + "loss": 2.7635, + "step": 16870 + }, + { + "epoch": 0.9794307928863617, + "grad_norm": 0.09305543452501297, + "learning_rate": 6.95786969876988e-07, + "loss": 2.7659, + "step": 16880 + }, + { + "epoch": 0.9800110243987351, + "grad_norm": 0.09393244236707687, + "learning_rate": 6.571539221918997e-07, + "loss": 2.7743, + "step": 16890 + }, + { + "epoch": 0.9805912559111085, + "grad_norm": 0.09278815984725952, + "learning_rate": 6.196230862244078e-07, + "loss": 2.78, + "step": 16900 + }, + { + "epoch": 0.9811714874234819, + "grad_norm": 0.09347451478242874, + "learning_rate": 5.831946001669697e-07, + "loss": 2.7747, + "step": 16910 + }, + { + "epoch": 0.9817517189358554, + "grad_norm": 0.09540887176990509, + "learning_rate": 5.478685981530894e-07, + "loss": 2.7758, + "step": 16920 + }, + { + "epoch": 0.9823319504482289, + "grad_norm": 0.09621070325374603, + "learning_rate": 5.136452102567856e-07, + "loss": 2.7713, + "step": 16930 + }, + { + "epoch": 0.9829121819606023, + "grad_norm": 0.09409264475107193, + "learning_rate": 4.805245624922238e-07, + "loss": 2.7778, + "step": 16940 + }, + { + "epoch": 0.9834924134729757, + "grad_norm": 0.09619985520839691, + "learning_rate": 4.4850677681301795e-07, + "loss": 2.7701, + "step": 16950 + }, + { + "epoch": 0.9840726449853492, + "grad_norm": 0.09401355683803558, + "learning_rate": 4.1759197111206344e-07, + "loss": 2.7689, + "step": 16960 + }, + { + "epoch": 0.9846528764977226, + "grad_norm": 0.09698129445314407, + "learning_rate": 3.877802592209045e-07, + "loss": 2.7703, + "step": 16970 + }, + { + "epoch": 0.985233108010096, + "grad_norm": 0.09333529323339462, + "learning_rate": 3.590717509093677e-07, + "loss": 2.7784, + "step": 16980 + }, + { + "epoch": 0.9858133395224694, + "grad_norm": 0.09353555738925934, + "learning_rate": 3.3146655188519557e-07, + "loss": 2.7687, + "step": 16990 + }, + { + "epoch": 0.9863935710348429, + "grad_norm": 0.09438835084438324, + "learning_rate": 3.0496476379364697e-07, + "loss": 2.7665, + "step": 17000 + }, + { + "epoch": 0.9863935710348429, + "eval_loss": 2.735684633255005, + "eval_runtime": 3.2561, + "eval_samples_per_second": 1329.798, + "eval_steps_per_second": 2.764, + "step": 17000 + }, + { + "epoch": 0.9869738025472163, + "grad_norm": 0.09504197537899017, + "learning_rate": 2.7956648421703087e-07, + "loss": 2.7762, + "step": 17010 + }, + { + "epoch": 0.9875540340595897, + "grad_norm": 0.09602217376232147, + "learning_rate": 2.5527180667453963e-07, + "loss": 2.7673, + "step": 17020 + }, + { + "epoch": 0.9881342655719633, + "grad_norm": 0.09483738243579865, + "learning_rate": 2.3208082062168288e-07, + "loss": 2.7705, + "step": 17030 + }, + { + "epoch": 0.9887144970843367, + "grad_norm": 0.09395676851272583, + "learning_rate": 2.0999361145008775e-07, + "loss": 2.7692, + "step": 17040 + }, + { + "epoch": 0.9892947285967101, + "grad_norm": 0.09432484954595566, + "learning_rate": 1.8901026048719902e-07, + "loss": 2.7707, + "step": 17050 + }, + { + "epoch": 0.9898749601090835, + "grad_norm": 0.09382540732622147, + "learning_rate": 1.6913084499587948e-07, + "loss": 2.7788, + "step": 17060 + }, + { + "epoch": 0.990455191621457, + "grad_norm": 0.09619873762130737, + "learning_rate": 1.5035543817427663e-07, + "loss": 2.7604, + "step": 17070 + }, + { + "epoch": 0.9910354231338304, + "grad_norm": 0.09365525841712952, + "learning_rate": 1.3268410915532323e-07, + "loss": 2.7785, + "step": 17080 + }, + { + "epoch": 0.9916156546462038, + "grad_norm": 0.09718578308820724, + "learning_rate": 1.1611692300680376e-07, + "loss": 2.7745, + "step": 17090 + }, + { + "epoch": 0.9921958861585772, + "grad_norm": 0.0956762507557869, + "learning_rate": 1.0065394073075494e-07, + "loss": 2.7813, + "step": 17100 + }, + { + "epoch": 0.9927761176709508, + "grad_norm": 0.09347262978553772, + "learning_rate": 8.629521926353244e-08, + "loss": 2.7714, + "step": 17110 + }, + { + "epoch": 0.9933563491833242, + "grad_norm": 0.09415694326162338, + "learning_rate": 7.304081147544439e-08, + "loss": 2.7837, + "step": 17120 + }, + { + "epoch": 0.9939365806956976, + "grad_norm": 0.09390881657600403, + "learning_rate": 6.089076617058486e-08, + "loss": 2.7725, + "step": 17130 + }, + { + "epoch": 0.994516812208071, + "grad_norm": 0.09363935142755508, + "learning_rate": 4.984512808673402e-08, + "loss": 2.776, + "step": 17140 + }, + { + "epoch": 0.9950970437204445, + "grad_norm": 0.0957217812538147, + "learning_rate": 3.9903937895091606e-08, + "loss": 2.7731, + "step": 17150 + }, + { + "epoch": 0.9956772752328179, + "grad_norm": 0.09717927128076553, + "learning_rate": 3.1067232200110426e-08, + "loss": 2.7703, + "step": 17160 + }, + { + "epoch": 0.9962575067451913, + "grad_norm": 0.09413953870534897, + "learning_rate": 2.333504353952964e-08, + "loss": 2.7733, + "step": 17170 + }, + { + "epoch": 0.9968377382575647, + "grad_norm": 0.09774868190288544, + "learning_rate": 1.670740038400842e-08, + "loss": 2.7658, + "step": 17180 + }, + { + "epoch": 0.9974179697699382, + "grad_norm": 0.09658750146627426, + "learning_rate": 1.1184327137292448e-08, + "loss": 2.7734, + "step": 17190 + }, + { + "epoch": 0.9979982012823116, + "grad_norm": 0.0932522714138031, + "learning_rate": 6.765844135847576e-09, + "loss": 2.7708, + "step": 17200 + }, + { + "epoch": 0.9985784327946851, + "grad_norm": 0.09543392807245255, + "learning_rate": 3.4519676490596393e-09, + "loss": 2.7746, + "step": 17210 + }, + { + "epoch": 0.9991586643070585, + "grad_norm": 0.09391433745622635, + "learning_rate": 1.2427098789347111e-09, + "loss": 2.7707, + "step": 17220 + }, + { + "epoch": 0.999738895819432, + "grad_norm": 0.0975637212395668, + "learning_rate": 1.3807896016571064e-10, + "loss": 2.77, + "step": 17230 + }, + { + "epoch": 0.9999709884243814, + "step": 17234, + "total_flos": 4.402536853133695e+19, + "train_loss": 3.082940493684724, + "train_runtime": 20985.9807, + "train_samples_per_second": 420.462, + "train_steps_per_second": 0.821 + } + ], + "logging_steps": 10, + "max_steps": 17234, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.402536853133695e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}