{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.432265676096595, "eval_steps": 250, "global_step": 37750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00037941339076327134, "grad_norm": 2.8125, "learning_rate": 1.44e-05, "loss": 10.5404296875, "step": 10 }, { "epoch": 0.0007588267815265427, "grad_norm": 2.984375, "learning_rate": 3.04e-05, "loss": 10.230216979980469, "step": 20 }, { "epoch": 0.001138240172289814, "grad_norm": 1.5078125, "learning_rate": 4.64e-05, "loss": 9.449974822998048, "step": 30 }, { "epoch": 0.0015176535630530854, "grad_norm": 1.21875, "learning_rate": 6.24e-05, "loss": 8.948778533935547, "step": 40 }, { "epoch": 0.0018970669538163566, "grad_norm": 0.87109375, "learning_rate": 7.840000000000001e-05, "loss": 8.42608184814453, "step": 50 }, { "epoch": 0.002276480344579628, "grad_norm": 0.546875, "learning_rate": 9.44e-05, "loss": 8.00614242553711, "step": 60 }, { "epoch": 0.0026558937353428993, "grad_norm": 0.298828125, "learning_rate": 0.00011040000000000001, "loss": 7.798372650146485, "step": 70 }, { "epoch": 0.0030353071261061708, "grad_norm": 0.57421875, "learning_rate": 0.0001264, "loss": 7.717854309082031, "step": 80 }, { "epoch": 0.003414720516869442, "grad_norm": 1.21875, "learning_rate": 0.0001424, "loss": 7.60091552734375, "step": 90 }, { "epoch": 0.0037941339076327132, "grad_norm": 0.8515625, "learning_rate": 0.00015840000000000003, "loss": 7.396147155761719, "step": 100 }, { "epoch": 0.004173547298395985, "grad_norm": 0.671875, "learning_rate": 0.0001744, "loss": 7.231055450439453, "step": 110 }, { "epoch": 0.004552960689159256, "grad_norm": 0.9765625, "learning_rate": 0.0001904, "loss": 7.043311309814453, "step": 120 }, { "epoch": 0.004932374079922528, "grad_norm": 0.78515625, "learning_rate": 0.0002064, "loss": 6.837507629394532, "step": 130 }, { "epoch": 0.005311787470685799, "grad_norm": 1.15625, "learning_rate": 0.00022240000000000004, "loss": 6.688388061523438, "step": 140 }, { "epoch": 0.0056912008614490705, "grad_norm": 0.83984375, "learning_rate": 0.0002384, "loss": 6.5389404296875, "step": 150 }, { "epoch": 0.0060706142522123415, "grad_norm": 0.9375, "learning_rate": 0.0002544, "loss": 6.4129180908203125, "step": 160 }, { "epoch": 0.0064500276429756125, "grad_norm": 1.1171875, "learning_rate": 0.0002704, "loss": 6.270991516113281, "step": 170 }, { "epoch": 0.006829441033738884, "grad_norm": 0.8203125, "learning_rate": 0.0002864, "loss": 6.188150024414062, "step": 180 }, { "epoch": 0.007208854424502155, "grad_norm": 1.0703125, "learning_rate": 0.00030240000000000003, "loss": 6.075367736816406, "step": 190 }, { "epoch": 0.0075882678152654265, "grad_norm": 0.9609375, "learning_rate": 0.00031840000000000004, "loss": 5.984175491333008, "step": 200 }, { "epoch": 0.007967681206028698, "grad_norm": 0.86328125, "learning_rate": 0.0003344, "loss": 5.882094192504883, "step": 210 }, { "epoch": 0.00834709459679197, "grad_norm": 0.85546875, "learning_rate": 0.0003504, "loss": 5.790979385375977, "step": 220 }, { "epoch": 0.00872650798755524, "grad_norm": 0.90234375, "learning_rate": 0.0003664, "loss": 5.752438354492187, "step": 230 }, { "epoch": 0.009105921378318511, "grad_norm": 0.79296875, "learning_rate": 0.0003824, "loss": 5.639028549194336, "step": 240 }, { "epoch": 0.009485334769081784, "grad_norm": 0.609375, "learning_rate": 0.00039840000000000003, "loss": 5.566335678100586, "step": 250 }, { "epoch": 0.009485334769081784, "eval_loss": 5.517917156219482, "eval_runtime": 195.9468, "eval_samples_per_second": 19.439, "eval_steps_per_second": 3.241, "step": 250 }, { "epoch": 0.009864748159845055, "grad_norm": 0.478515625, "learning_rate": 0.0003999999709556304, "loss": 5.4802204132080075, "step": 260 }, { "epoch": 0.010244161550608326, "grad_norm": 1.1640625, "learning_rate": 0.00039999987055535117, "loss": 5.440499877929687, "step": 270 }, { "epoch": 0.010623574941371597, "grad_norm": 0.625, "learning_rate": 0.0003999996984406258, "loss": 5.361420822143555, "step": 280 }, { "epoch": 0.011002988332134868, "grad_norm": 0.478515625, "learning_rate": 0.00039999945461151613, "loss": 5.305873107910156, "step": 290 }, { "epoch": 0.011382401722898141, "grad_norm": 0.73828125, "learning_rate": 0.00039999913906810946, "loss": 5.265497207641602, "step": 300 }, { "epoch": 0.011761815113661412, "grad_norm": 0.85546875, "learning_rate": 0.000399998751810519, "loss": 5.239585876464844, "step": 310 }, { "epoch": 0.012141228504424683, "grad_norm": 0.55859375, "learning_rate": 0.00039999829283888366, "loss": 5.199300765991211, "step": 320 }, { "epoch": 0.012520641895187954, "grad_norm": 0.39453125, "learning_rate": 0.0003999977621533679, "loss": 5.100242233276367, "step": 330 }, { "epoch": 0.012900055285951225, "grad_norm": 0.396484375, "learning_rate": 0.000399997159754162, "loss": 5.061872482299805, "step": 340 }, { "epoch": 0.013279468676714496, "grad_norm": 0.48046875, "learning_rate": 0.00039999648564148217, "loss": 4.99063720703125, "step": 350 }, { "epoch": 0.013658882067477769, "grad_norm": 0.55859375, "learning_rate": 0.0003999957398155699, "loss": 4.9528450012207035, "step": 360 }, { "epoch": 0.01403829545824104, "grad_norm": 1.015625, "learning_rate": 0.00039999492227669277, "loss": 4.934624099731446, "step": 370 }, { "epoch": 0.01441770884900431, "grad_norm": 0.640625, "learning_rate": 0.00039999403302514385, "loss": 4.913988494873047, "step": 380 }, { "epoch": 0.014797122239767582, "grad_norm": 0.41015625, "learning_rate": 0.00039999307206124203, "loss": 4.825689315795898, "step": 390 }, { "epoch": 0.015176535630530853, "grad_norm": 0.421875, "learning_rate": 0.00039999203938533186, "loss": 4.811291885375977, "step": 400 }, { "epoch": 0.015555949021294126, "grad_norm": 0.40234375, "learning_rate": 0.0003999909349977836, "loss": 4.747502136230469, "step": 410 }, { "epoch": 0.015935362412057397, "grad_norm": 0.453125, "learning_rate": 0.00039998975889899335, "loss": 4.690442276000977, "step": 420 }, { "epoch": 0.016314775802820668, "grad_norm": 0.9453125, "learning_rate": 0.0003999885110893828, "loss": 4.650265884399414, "step": 430 }, { "epoch": 0.01669418919358394, "grad_norm": 0.70703125, "learning_rate": 0.00039998719156939926, "loss": 4.666769409179688, "step": 440 }, { "epoch": 0.01707360258434721, "grad_norm": 0.5859375, "learning_rate": 0.000399985800339516, "loss": 4.614315032958984, "step": 450 }, { "epoch": 0.01745301597511048, "grad_norm": 0.64453125, "learning_rate": 0.00039998433740023187, "loss": 4.559830856323242, "step": 460 }, { "epoch": 0.017832429365873752, "grad_norm": 0.466796875, "learning_rate": 0.00039998280275207137, "loss": 4.488445281982422, "step": 470 }, { "epoch": 0.018211842756637023, "grad_norm": 0.5390625, "learning_rate": 0.0003999811963955849, "loss": 4.457709884643554, "step": 480 }, { "epoch": 0.018591256147400297, "grad_norm": 0.4921875, "learning_rate": 0.00039997951833134834, "loss": 4.444478988647461, "step": 490 }, { "epoch": 0.01897066953816357, "grad_norm": 0.5234375, "learning_rate": 0.00039997776855996343, "loss": 4.406364059448242, "step": 500 }, { "epoch": 0.01897066953816357, "eval_loss": 4.39217472076416, "eval_runtime": 196.0442, "eval_samples_per_second": 19.429, "eval_steps_per_second": 3.239, "step": 500 }, { "epoch": 0.01935008292892684, "grad_norm": 0.48828125, "learning_rate": 0.0003999759470820576, "loss": 4.374259185791016, "step": 510 }, { "epoch": 0.01972949631969011, "grad_norm": 0.458984375, "learning_rate": 0.000399974053898284, "loss": 4.343398666381836, "step": 520 }, { "epoch": 0.02010890971045338, "grad_norm": 0.443359375, "learning_rate": 0.00039997208900932146, "loss": 4.328112411499023, "step": 530 }, { "epoch": 0.020488323101216652, "grad_norm": 0.3828125, "learning_rate": 0.00039997005241587446, "loss": 4.260250091552734, "step": 540 }, { "epoch": 0.020867736491979923, "grad_norm": 0.435546875, "learning_rate": 0.0003999679441186734, "loss": 4.243204116821289, "step": 550 }, { "epoch": 0.021247149882743194, "grad_norm": 0.515625, "learning_rate": 0.0003999657641184742, "loss": 4.26225471496582, "step": 560 }, { "epoch": 0.021626563273506465, "grad_norm": 0.447265625, "learning_rate": 0.0003999635124160585, "loss": 4.195428466796875, "step": 570 }, { "epoch": 0.022005976664269736, "grad_norm": 0.365234375, "learning_rate": 0.00039996118901223376, "loss": 4.165164184570313, "step": 580 }, { "epoch": 0.022385390055033007, "grad_norm": 0.404296875, "learning_rate": 0.0003999587939078331, "loss": 4.178491592407227, "step": 590 }, { "epoch": 0.022764803445796282, "grad_norm": 0.5, "learning_rate": 0.00039995632710371524, "loss": 4.171508407592773, "step": 600 }, { "epoch": 0.023144216836559553, "grad_norm": 0.435546875, "learning_rate": 0.0003999537886007648, "loss": 4.136694717407226, "step": 610 }, { "epoch": 0.023523630227322824, "grad_norm": 0.462890625, "learning_rate": 0.00039995117839989203, "loss": 4.142991638183593, "step": 620 }, { "epoch": 0.023903043618086095, "grad_norm": 0.458984375, "learning_rate": 0.00039994849650203275, "loss": 4.078335952758789, "step": 630 }, { "epoch": 0.024282457008849366, "grad_norm": 0.400390625, "learning_rate": 0.0003999457429081488, "loss": 4.073180389404297, "step": 640 }, { "epoch": 0.024661870399612637, "grad_norm": 0.44921875, "learning_rate": 0.0003999429176192274, "loss": 4.077816390991211, "step": 650 }, { "epoch": 0.025041283790375908, "grad_norm": 0.4375, "learning_rate": 0.00039994002063628163, "loss": 4.059943771362304, "step": 660 }, { "epoch": 0.02542069718113918, "grad_norm": 0.37890625, "learning_rate": 0.00039993705196035036, "loss": 4.063723373413086, "step": 670 }, { "epoch": 0.02580011057190245, "grad_norm": 0.3984375, "learning_rate": 0.000399934011592498, "loss": 4.047683334350586, "step": 680 }, { "epoch": 0.02617952396266572, "grad_norm": 0.451171875, "learning_rate": 0.00039993089953381484, "loss": 3.9986732482910154, "step": 690 }, { "epoch": 0.026558937353428992, "grad_norm": 0.361328125, "learning_rate": 0.00039992771578541657, "loss": 3.999075698852539, "step": 700 }, { "epoch": 0.026938350744192267, "grad_norm": 0.376953125, "learning_rate": 0.00039992446034844503, "loss": 3.9915229797363283, "step": 710 }, { "epoch": 0.027317764134955538, "grad_norm": 0.466796875, "learning_rate": 0.0003999211332240673, "loss": 4.002296447753906, "step": 720 }, { "epoch": 0.02769717752571881, "grad_norm": 0.59765625, "learning_rate": 0.00039991773441347666, "loss": 3.987569808959961, "step": 730 }, { "epoch": 0.02807659091648208, "grad_norm": 0.3828125, "learning_rate": 0.00039991426391789163, "loss": 3.972361373901367, "step": 740 }, { "epoch": 0.02845600430724535, "grad_norm": 0.341796875, "learning_rate": 0.0003999107217385567, "loss": 3.945170211791992, "step": 750 }, { "epoch": 0.02845600430724535, "eval_loss": 3.9254627227783203, "eval_runtime": 196.1779, "eval_samples_per_second": 19.416, "eval_steps_per_second": 3.237, "step": 750 }, { "epoch": 0.02883541769800862, "grad_norm": 0.390625, "learning_rate": 0.0003999071078767419, "loss": 3.9154258728027345, "step": 760 }, { "epoch": 0.029214831088771893, "grad_norm": 0.4140625, "learning_rate": 0.0003999034223337433, "loss": 3.9048130035400392, "step": 770 }, { "epoch": 0.029594244479535164, "grad_norm": 0.439453125, "learning_rate": 0.0003998996651108822, "loss": 3.9022491455078123, "step": 780 }, { "epoch": 0.029973657870298435, "grad_norm": 0.4375, "learning_rate": 0.000399895836209506, "loss": 3.9012115478515623, "step": 790 }, { "epoch": 0.030353071261061706, "grad_norm": 0.40234375, "learning_rate": 0.0003998919356309875, "loss": 3.881245803833008, "step": 800 }, { "epoch": 0.030732484651824977, "grad_norm": 0.400390625, "learning_rate": 0.00039988796337672543, "loss": 3.8917861938476563, "step": 810 }, { "epoch": 0.03111189804258825, "grad_norm": 0.404296875, "learning_rate": 0.00039988391944814405, "loss": 3.8680957794189452, "step": 820 }, { "epoch": 0.03149131143335152, "grad_norm": 0.77734375, "learning_rate": 0.00039987980384669354, "loss": 3.879084014892578, "step": 830 }, { "epoch": 0.03187072482411479, "grad_norm": 0.40625, "learning_rate": 0.0003998756165738495, "loss": 3.892671585083008, "step": 840 }, { "epoch": 0.032250138214878064, "grad_norm": 0.400390625, "learning_rate": 0.00039987135763111347, "loss": 3.8625461578369142, "step": 850 }, { "epoch": 0.032629551605641335, "grad_norm": 0.404296875, "learning_rate": 0.00039986702702001246, "loss": 3.807573699951172, "step": 860 }, { "epoch": 0.033008964996404606, "grad_norm": 0.404296875, "learning_rate": 0.0003998626247420995, "loss": 3.858591079711914, "step": 870 }, { "epoch": 0.03338837838716788, "grad_norm": 0.384765625, "learning_rate": 0.0003998581507989529, "loss": 3.8467525482177733, "step": 880 }, { "epoch": 0.03376779177793115, "grad_norm": 0.390625, "learning_rate": 0.00039985360519217717, "loss": 3.813101577758789, "step": 890 }, { "epoch": 0.03414720516869442, "grad_norm": 1.5859375, "learning_rate": 0.00039984898792340195, "loss": 3.8320926666259765, "step": 900 }, { "epoch": 0.03452661855945769, "grad_norm": 0.376953125, "learning_rate": 0.0003998442989942831, "loss": 3.803506851196289, "step": 910 }, { "epoch": 0.03490603195022096, "grad_norm": 0.345703125, "learning_rate": 0.00039983953840650176, "loss": 3.782692718505859, "step": 920 }, { "epoch": 0.03528544534098423, "grad_norm": 0.333984375, "learning_rate": 0.0003998347061617651, "loss": 3.793212127685547, "step": 930 }, { "epoch": 0.035664858731747504, "grad_norm": 0.37890625, "learning_rate": 0.0003998298022618057, "loss": 3.765856170654297, "step": 940 }, { "epoch": 0.036044272122510775, "grad_norm": 0.388671875, "learning_rate": 0.00039982482670838205, "loss": 3.755923843383789, "step": 950 }, { "epoch": 0.036423685513274046, "grad_norm": 0.34375, "learning_rate": 0.00039981977950327816, "loss": 3.800998306274414, "step": 960 }, { "epoch": 0.03680309890403732, "grad_norm": 0.3359375, "learning_rate": 0.00039981466064830393, "loss": 3.7708820343017577, "step": 970 }, { "epoch": 0.037182512294800595, "grad_norm": 0.3359375, "learning_rate": 0.00039980947014529476, "loss": 3.7808441162109374, "step": 980 }, { "epoch": 0.037561925685563866, "grad_norm": 0.361328125, "learning_rate": 0.00039980420799611184, "loss": 3.731973648071289, "step": 990 }, { "epoch": 0.03794133907632714, "grad_norm": 0.373046875, "learning_rate": 0.00039979887420264206, "loss": 3.744094467163086, "step": 1000 }, { "epoch": 0.03794133907632714, "eval_loss": 3.7319464683532715, "eval_runtime": 198.5333, "eval_samples_per_second": 19.186, "eval_steps_per_second": 3.198, "step": 1000 }, { "epoch": 0.03832075246709041, "grad_norm": 0.3828125, "learning_rate": 0.0003997934687667979, "loss": 3.7279151916503905, "step": 1010 }, { "epoch": 0.03870016585785368, "grad_norm": 0.404296875, "learning_rate": 0.0003997879916905177, "loss": 3.7036182403564455, "step": 1020 }, { "epoch": 0.03907957924861695, "grad_norm": 0.33203125, "learning_rate": 0.00039978244297576534, "loss": 3.692121887207031, "step": 1030 }, { "epoch": 0.03945899263938022, "grad_norm": 0.349609375, "learning_rate": 0.00039977682262453036, "loss": 3.7168228149414064, "step": 1040 }, { "epoch": 0.03983840603014349, "grad_norm": 0.328125, "learning_rate": 0.0003997711306388282, "loss": 3.706019973754883, "step": 1050 }, { "epoch": 0.04021781942090676, "grad_norm": 0.353515625, "learning_rate": 0.0003997653670206998, "loss": 3.7065948486328124, "step": 1060 }, { "epoch": 0.040597232811670034, "grad_norm": 0.365234375, "learning_rate": 0.0003997595317722118, "loss": 3.680253601074219, "step": 1070 }, { "epoch": 0.040976646202433305, "grad_norm": 0.328125, "learning_rate": 0.0003997536248954566, "loss": 3.7154819488525392, "step": 1080 }, { "epoch": 0.041356059593196576, "grad_norm": 0.36328125, "learning_rate": 0.0003997476463925522, "loss": 3.687308120727539, "step": 1090 }, { "epoch": 0.04173547298395985, "grad_norm": 0.33203125, "learning_rate": 0.00039974159626564234, "loss": 3.6539249420166016, "step": 1100 }, { "epoch": 0.04211488637472312, "grad_norm": 0.34765625, "learning_rate": 0.00039973547451689645, "loss": 3.6446033477783204, "step": 1110 }, { "epoch": 0.04249429976548639, "grad_norm": 0.353515625, "learning_rate": 0.0003997292811485096, "loss": 3.6805885314941404, "step": 1120 }, { "epoch": 0.04287371315624966, "grad_norm": 0.40234375, "learning_rate": 0.00039972301616270257, "loss": 3.660490798950195, "step": 1130 }, { "epoch": 0.04325312654701293, "grad_norm": 0.359375, "learning_rate": 0.00039971667956172183, "loss": 3.681305694580078, "step": 1140 }, { "epoch": 0.0436325399377762, "grad_norm": 0.373046875, "learning_rate": 0.00039971027134783955, "loss": 3.6562755584716795, "step": 1150 }, { "epoch": 0.04401195332853947, "grad_norm": 0.3671875, "learning_rate": 0.00039970379152335343, "loss": 3.658782958984375, "step": 1160 }, { "epoch": 0.044391366719302744, "grad_norm": 0.357421875, "learning_rate": 0.000399697240090587, "loss": 3.6295089721679688, "step": 1170 }, { "epoch": 0.044770780110066015, "grad_norm": 0.33203125, "learning_rate": 0.0003996906170518894, "loss": 3.625703048706055, "step": 1180 }, { "epoch": 0.045150193500829286, "grad_norm": 0.337890625, "learning_rate": 0.00039968392240963556, "loss": 3.6455429077148436, "step": 1190 }, { "epoch": 0.045529606891592564, "grad_norm": 0.31640625, "learning_rate": 0.0003996771561662259, "loss": 3.615035629272461, "step": 1200 }, { "epoch": 0.045909020282355835, "grad_norm": 0.306640625, "learning_rate": 0.00039967031832408674, "loss": 3.6154457092285157, "step": 1210 }, { "epoch": 0.046288433673119106, "grad_norm": 0.34765625, "learning_rate": 0.0003996634088856697, "loss": 3.5981407165527344, "step": 1220 }, { "epoch": 0.04666784706388238, "grad_norm": 0.828125, "learning_rate": 0.00039965642785345256, "loss": 3.613970184326172, "step": 1230 }, { "epoch": 0.04704726045464565, "grad_norm": 0.36328125, "learning_rate": 0.0003996493752299384, "loss": 3.6008975982666014, "step": 1240 }, { "epoch": 0.04742667384540892, "grad_norm": 0.326171875, "learning_rate": 0.0003996422510176561, "loss": 3.646671676635742, "step": 1250 }, { "epoch": 0.04742667384540892, "eval_loss": 3.598872423171997, "eval_runtime": 199.2268, "eval_samples_per_second": 19.119, "eval_steps_per_second": 3.187, "step": 1250 }, { "epoch": 0.04780608723617219, "grad_norm": 0.41796875, "learning_rate": 0.00039963505521916026, "loss": 3.628643035888672, "step": 1260 }, { "epoch": 0.04818550062693546, "grad_norm": 0.310546875, "learning_rate": 0.0003996277878370311, "loss": 3.6121673583984375, "step": 1270 }, { "epoch": 0.04856491401769873, "grad_norm": 0.373046875, "learning_rate": 0.0003996204488738744, "loss": 3.603960418701172, "step": 1280 }, { "epoch": 0.048944327408462, "grad_norm": 0.33203125, "learning_rate": 0.0003996130383323218, "loss": 3.5739994049072266, "step": 1290 }, { "epoch": 0.049323740799225274, "grad_norm": 0.345703125, "learning_rate": 0.0003996055562150305, "loss": 3.5921630859375, "step": 1300 }, { "epoch": 0.049703154189988545, "grad_norm": 0.3203125, "learning_rate": 0.00039959800252468335, "loss": 3.575924301147461, "step": 1310 }, { "epoch": 0.050082567580751816, "grad_norm": 0.361328125, "learning_rate": 0.00039959037726398894, "loss": 3.6012348175048827, "step": 1320 }, { "epoch": 0.05046198097151509, "grad_norm": 0.306640625, "learning_rate": 0.0003995826804356815, "loss": 3.5817798614501952, "step": 1330 }, { "epoch": 0.05084139436227836, "grad_norm": 0.392578125, "learning_rate": 0.00039957491204252075, "loss": 3.538928985595703, "step": 1340 }, { "epoch": 0.05122080775304163, "grad_norm": 0.333984375, "learning_rate": 0.0003995670720872925, "loss": 3.5673004150390626, "step": 1350 }, { "epoch": 0.0516002211438049, "grad_norm": 0.365234375, "learning_rate": 0.0003995591605728076, "loss": 3.5756252288818358, "step": 1360 }, { "epoch": 0.05197963453456817, "grad_norm": 0.357421875, "learning_rate": 0.0003995511775019032, "loss": 3.527027893066406, "step": 1370 }, { "epoch": 0.05235904792533144, "grad_norm": 0.333984375, "learning_rate": 0.00039954312287744165, "loss": 3.568987274169922, "step": 1380 }, { "epoch": 0.05273846131609471, "grad_norm": 0.3359375, "learning_rate": 0.0003995349967023111, "loss": 3.5293834686279295, "step": 1390 }, { "epoch": 0.053117874706857984, "grad_norm": 0.34375, "learning_rate": 0.00039952679897942546, "loss": 3.5607799530029296, "step": 1400 }, { "epoch": 0.053497288097621255, "grad_norm": 0.3125, "learning_rate": 0.0003995185297117242, "loss": 3.558505630493164, "step": 1410 }, { "epoch": 0.05387670148838453, "grad_norm": 0.3203125, "learning_rate": 0.0003995101889021724, "loss": 3.5406063079833983, "step": 1420 }, { "epoch": 0.054256114879147804, "grad_norm": 0.330078125, "learning_rate": 0.0003995017765537608, "loss": 3.5357891082763673, "step": 1430 }, { "epoch": 0.054635528269911075, "grad_norm": 0.345703125, "learning_rate": 0.00039949329266950604, "loss": 3.550967788696289, "step": 1440 }, { "epoch": 0.055014941660674346, "grad_norm": 0.341796875, "learning_rate": 0.00039948473725245, "loss": 3.497148895263672, "step": 1450 }, { "epoch": 0.05539435505143762, "grad_norm": 0.314453125, "learning_rate": 0.0003994761103056605, "loss": 3.513141632080078, "step": 1460 }, { "epoch": 0.05577376844220089, "grad_norm": 0.361328125, "learning_rate": 0.0003994674118322309, "loss": 3.5465816497802733, "step": 1470 }, { "epoch": 0.05615318183296416, "grad_norm": 0.33203125, "learning_rate": 0.00039945864183528023, "loss": 3.5267097473144533, "step": 1480 }, { "epoch": 0.05653259522372743, "grad_norm": 0.32421875, "learning_rate": 0.00039944980031795326, "loss": 3.529467010498047, "step": 1490 }, { "epoch": 0.0569120086144907, "grad_norm": 0.77734375, "learning_rate": 0.00039944088728342017, "loss": 3.513682174682617, "step": 1500 }, { "epoch": 0.0569120086144907, "eval_loss": 3.504640579223633, "eval_runtime": 198.5869, "eval_samples_per_second": 19.181, "eval_steps_per_second": 3.198, "step": 1500 }, { "epoch": 0.05729142200525397, "grad_norm": 0.33984375, "learning_rate": 0.000399431902734877, "loss": 3.527770233154297, "step": 1510 }, { "epoch": 0.05767083539601724, "grad_norm": 0.333984375, "learning_rate": 0.00039942284667554535, "loss": 3.5062992095947267, "step": 1520 }, { "epoch": 0.058050248786780514, "grad_norm": 0.287109375, "learning_rate": 0.00039941371910867256, "loss": 3.5329803466796874, "step": 1530 }, { "epoch": 0.058429662177543785, "grad_norm": 0.322265625, "learning_rate": 0.0003994045200375314, "loss": 3.479861831665039, "step": 1540 }, { "epoch": 0.058809075568307057, "grad_norm": 0.296875, "learning_rate": 0.00039939524946542047, "loss": 3.503245162963867, "step": 1550 }, { "epoch": 0.05918848895907033, "grad_norm": 0.30078125, "learning_rate": 0.0003993859073956639, "loss": 3.4819522857666017, "step": 1560 }, { "epoch": 0.0595679023498336, "grad_norm": 0.306640625, "learning_rate": 0.0003993764938316115, "loss": 3.474462890625, "step": 1570 }, { "epoch": 0.05994731574059687, "grad_norm": 0.322265625, "learning_rate": 0.0003993670087766388, "loss": 3.472828674316406, "step": 1580 }, { "epoch": 0.06032672913136014, "grad_norm": 0.30078125, "learning_rate": 0.0003993574522341468, "loss": 3.416951370239258, "step": 1590 }, { "epoch": 0.06070614252212341, "grad_norm": 0.27734375, "learning_rate": 0.00039934782420756226, "loss": 3.4840694427490235, "step": 1600 }, { "epoch": 0.06108555591288668, "grad_norm": 0.318359375, "learning_rate": 0.00039933812470033746, "loss": 3.4658809661865235, "step": 1610 }, { "epoch": 0.061464969303649954, "grad_norm": 0.294921875, "learning_rate": 0.00039932835371595037, "loss": 3.473246765136719, "step": 1620 }, { "epoch": 0.06184438269441323, "grad_norm": 0.359375, "learning_rate": 0.00039931851125790473, "loss": 3.467228317260742, "step": 1630 }, { "epoch": 0.0622237960851765, "grad_norm": 0.318359375, "learning_rate": 0.0003993085973297296, "loss": 3.472382354736328, "step": 1640 }, { "epoch": 0.06260320947593977, "grad_norm": 0.318359375, "learning_rate": 0.00039929861193498, "loss": 3.470948028564453, "step": 1650 }, { "epoch": 0.06298262286670304, "grad_norm": 0.294921875, "learning_rate": 0.00039928855507723627, "loss": 3.4849205017089844, "step": 1660 }, { "epoch": 0.06336203625746631, "grad_norm": 0.33203125, "learning_rate": 0.0003992784267601046, "loss": 3.43970947265625, "step": 1670 }, { "epoch": 0.06374144964822959, "grad_norm": 0.306640625, "learning_rate": 0.00039926822698721677, "loss": 3.4767566680908204, "step": 1680 }, { "epoch": 0.06412086303899285, "grad_norm": 0.3203125, "learning_rate": 0.0003992579557622301, "loss": 3.4669559478759764, "step": 1690 }, { "epoch": 0.06450027642975613, "grad_norm": 0.291015625, "learning_rate": 0.0003992476130888275, "loss": 3.4744457244873046, "step": 1700 }, { "epoch": 0.06487968982051939, "grad_norm": 0.302734375, "learning_rate": 0.00039923719897071765, "loss": 3.4358070373535154, "step": 1710 }, { "epoch": 0.06525910321128267, "grad_norm": 0.29296875, "learning_rate": 0.0003992267134116347, "loss": 3.456542205810547, "step": 1720 }, { "epoch": 0.06563851660204593, "grad_norm": 0.306640625, "learning_rate": 0.00039921615641533856, "loss": 3.4641254425048826, "step": 1730 }, { "epoch": 0.06601792999280921, "grad_norm": 0.287109375, "learning_rate": 0.00039920552798561467, "loss": 3.437123489379883, "step": 1740 }, { "epoch": 0.06639734338357249, "grad_norm": 0.3046875, "learning_rate": 0.000399194828126274, "loss": 3.440760040283203, "step": 1750 }, { "epoch": 0.06639734338357249, "eval_loss": 3.43211030960083, "eval_runtime": 210.4721, "eval_samples_per_second": 18.097, "eval_steps_per_second": 3.017, "step": 1750 }, { "epoch": 0.06677675677433575, "grad_norm": 0.279296875, "learning_rate": 0.0003991840568411533, "loss": 3.454257583618164, "step": 1760 }, { "epoch": 0.06715617016509903, "grad_norm": 0.2890625, "learning_rate": 0.00039917321413411485, "loss": 3.4580535888671875, "step": 1770 }, { "epoch": 0.0675355835558623, "grad_norm": 0.306640625, "learning_rate": 0.00039916230000904663, "loss": 3.436717987060547, "step": 1780 }, { "epoch": 0.06791499694662557, "grad_norm": 0.314453125, "learning_rate": 0.0003991513144698619, "loss": 3.4585205078125, "step": 1790 }, { "epoch": 0.06829441033738884, "grad_norm": 0.33203125, "learning_rate": 0.0003991402575205, "loss": 3.4226608276367188, "step": 1800 }, { "epoch": 0.06867382372815212, "grad_norm": 0.306640625, "learning_rate": 0.00039912912916492563, "loss": 3.4409008026123047, "step": 1810 }, { "epoch": 0.06905323711891538, "grad_norm": 0.318359375, "learning_rate": 0.000399117929407129, "loss": 3.4104915618896485, "step": 1820 }, { "epoch": 0.06943265050967866, "grad_norm": 0.29296875, "learning_rate": 0.00039910665825112606, "loss": 3.4137596130371093, "step": 1830 }, { "epoch": 0.06981206390044192, "grad_norm": 0.328125, "learning_rate": 0.00039909531570095846, "loss": 3.44422607421875, "step": 1840 }, { "epoch": 0.0701914772912052, "grad_norm": 0.31640625, "learning_rate": 0.00039908390176069314, "loss": 3.3948230743408203, "step": 1850 }, { "epoch": 0.07057089068196847, "grad_norm": 0.30078125, "learning_rate": 0.00039907241643442296, "loss": 3.4220741271972654, "step": 1860 }, { "epoch": 0.07095030407273174, "grad_norm": 0.33203125, "learning_rate": 0.00039906085972626616, "loss": 3.4325870513916015, "step": 1870 }, { "epoch": 0.07132971746349501, "grad_norm": 0.29296875, "learning_rate": 0.0003990492316403668, "loss": 3.3992130279541017, "step": 1880 }, { "epoch": 0.07170913085425829, "grad_norm": 0.298828125, "learning_rate": 0.00039903753218089417, "loss": 3.4037288665771483, "step": 1890 }, { "epoch": 0.07208854424502155, "grad_norm": 0.2890625, "learning_rate": 0.00039902576135204354, "loss": 3.3768444061279297, "step": 1900 }, { "epoch": 0.07246795763578483, "grad_norm": 0.3203125, "learning_rate": 0.00039901391915803553, "loss": 3.398454284667969, "step": 1910 }, { "epoch": 0.07284737102654809, "grad_norm": 0.2890625, "learning_rate": 0.0003990020056031165, "loss": 3.3866065979003905, "step": 1920 }, { "epoch": 0.07322678441731137, "grad_norm": 0.298828125, "learning_rate": 0.0003989900206915583, "loss": 3.4163612365722655, "step": 1930 }, { "epoch": 0.07360619780807463, "grad_norm": 0.33984375, "learning_rate": 0.0003989779644276583, "loss": 3.411692810058594, "step": 1940 }, { "epoch": 0.07398561119883791, "grad_norm": 0.3125, "learning_rate": 0.0003989658368157396, "loss": 3.3672412872314452, "step": 1950 }, { "epoch": 0.07436502458960119, "grad_norm": 0.2890625, "learning_rate": 0.0003989536378601509, "loss": 3.3683753967285157, "step": 1960 }, { "epoch": 0.07474443798036445, "grad_norm": 0.5234375, "learning_rate": 0.0003989413675652663, "loss": 3.4047016143798827, "step": 1970 }, { "epoch": 0.07512385137112773, "grad_norm": 0.287109375, "learning_rate": 0.00039892902593548565, "loss": 3.374019241333008, "step": 1980 }, { "epoch": 0.075503264761891, "grad_norm": 0.30859375, "learning_rate": 0.0003989166129752343, "loss": 3.3961673736572267, "step": 1990 }, { "epoch": 0.07588267815265427, "grad_norm": 0.2890625, "learning_rate": 0.0003989041286889632, "loss": 3.3799392700195314, "step": 2000 }, { "epoch": 0.07588267815265427, "eval_loss": 3.374641180038452, "eval_runtime": 196.0433, "eval_samples_per_second": 19.429, "eval_steps_per_second": 3.239, "step": 2000 }, { "epoch": 0.07626209154341754, "grad_norm": 0.314453125, "learning_rate": 0.00039889157308114885, "loss": 3.3951652526855467, "step": 2010 }, { "epoch": 0.07664150493418082, "grad_norm": 0.287109375, "learning_rate": 0.00039887894615629337, "loss": 3.3532970428466795, "step": 2020 }, { "epoch": 0.07702091832494408, "grad_norm": 0.345703125, "learning_rate": 0.0003988662479189245, "loss": 3.388214874267578, "step": 2030 }, { "epoch": 0.07740033171570736, "grad_norm": 0.271484375, "learning_rate": 0.0003988534783735953, "loss": 3.38348388671875, "step": 2040 }, { "epoch": 0.07777974510647062, "grad_norm": 0.306640625, "learning_rate": 0.00039884063752488474, "loss": 3.3785205841064454, "step": 2050 }, { "epoch": 0.0781591584972339, "grad_norm": 0.2890625, "learning_rate": 0.0003988277253773971, "loss": 3.3606407165527346, "step": 2060 }, { "epoch": 0.07853857188799716, "grad_norm": 0.30859375, "learning_rate": 0.00039881474193576233, "loss": 3.362018585205078, "step": 2070 }, { "epoch": 0.07891798527876044, "grad_norm": 0.28515625, "learning_rate": 0.00039880168720463603, "loss": 3.379924011230469, "step": 2080 }, { "epoch": 0.0792973986695237, "grad_norm": 0.279296875, "learning_rate": 0.00039878856118869915, "loss": 3.381979751586914, "step": 2090 }, { "epoch": 0.07967681206028698, "grad_norm": 0.29296875, "learning_rate": 0.0003987753638926584, "loss": 3.364896774291992, "step": 2100 }, { "epoch": 0.08005622545105025, "grad_norm": 0.296875, "learning_rate": 0.0003987620953212459, "loss": 3.3263385772705076, "step": 2110 }, { "epoch": 0.08043563884181353, "grad_norm": 0.291015625, "learning_rate": 0.0003987487554792194, "loss": 3.3634849548339845, "step": 2120 }, { "epoch": 0.08081505223257679, "grad_norm": 0.294921875, "learning_rate": 0.0003987353443713623, "loss": 3.3564342498779296, "step": 2130 }, { "epoch": 0.08119446562334007, "grad_norm": 0.3046875, "learning_rate": 0.00039872186200248333, "loss": 3.3948665618896485, "step": 2140 }, { "epoch": 0.08157387901410333, "grad_norm": 0.291015625, "learning_rate": 0.0003987083083774169, "loss": 3.344033050537109, "step": 2150 }, { "epoch": 0.08195329240486661, "grad_norm": 0.447265625, "learning_rate": 0.00039869468350102314, "loss": 3.3565109252929686, "step": 2160 }, { "epoch": 0.08233270579562987, "grad_norm": 0.2890625, "learning_rate": 0.00039868098737818735, "loss": 3.3645370483398436, "step": 2170 }, { "epoch": 0.08271211918639315, "grad_norm": 0.302734375, "learning_rate": 0.0003986672200138206, "loss": 3.3361286163330077, "step": 2180 }, { "epoch": 0.08309153257715643, "grad_norm": 0.306640625, "learning_rate": 0.00039865338141285964, "loss": 3.329209899902344, "step": 2190 }, { "epoch": 0.0834709459679197, "grad_norm": 0.302734375, "learning_rate": 0.0003986394715802665, "loss": 3.340392303466797, "step": 2200 }, { "epoch": 0.08385035935868297, "grad_norm": 0.296875, "learning_rate": 0.0003986254905210289, "loss": 3.3661399841308595, "step": 2210 }, { "epoch": 0.08422977274944624, "grad_norm": 0.302734375, "learning_rate": 0.00039861143824016003, "loss": 3.3295963287353514, "step": 2220 }, { "epoch": 0.08460918614020951, "grad_norm": 0.283203125, "learning_rate": 0.00039859731474269864, "loss": 3.3122394561767576, "step": 2230 }, { "epoch": 0.08498859953097278, "grad_norm": 0.283203125, "learning_rate": 0.00039858312003370904, "loss": 3.3390274047851562, "step": 2240 }, { "epoch": 0.08536801292173606, "grad_norm": 0.29296875, "learning_rate": 0.0003985688541182811, "loss": 3.3098018646240233, "step": 2250 }, { "epoch": 0.08536801292173606, "eval_loss": 3.3282086849212646, "eval_runtime": 192.0721, "eval_samples_per_second": 19.831, "eval_steps_per_second": 3.306, "step": 2250 }, { "epoch": 0.08574742631249932, "grad_norm": 0.291015625, "learning_rate": 0.00039855451700153015, "loss": 3.319503402709961, "step": 2260 }, { "epoch": 0.0861268397032626, "grad_norm": 0.28515625, "learning_rate": 0.00039854010868859715, "loss": 3.324789047241211, "step": 2270 }, { "epoch": 0.08650625309402586, "grad_norm": 0.294921875, "learning_rate": 0.00039852562918464844, "loss": 3.3506725311279295, "step": 2280 }, { "epoch": 0.08688566648478914, "grad_norm": 0.294921875, "learning_rate": 0.00039851107849487596, "loss": 3.3031063079833984, "step": 2290 }, { "epoch": 0.0872650798755524, "grad_norm": 0.265625, "learning_rate": 0.00039849645662449725, "loss": 3.328424072265625, "step": 2300 }, { "epoch": 0.08764449326631568, "grad_norm": 0.271484375, "learning_rate": 0.0003984817635787553, "loss": 3.3317771911621095, "step": 2310 }, { "epoch": 0.08802390665707895, "grad_norm": 0.27734375, "learning_rate": 0.0003984669993629186, "loss": 3.2880271911621093, "step": 2320 }, { "epoch": 0.08840332004784222, "grad_norm": 0.30078125, "learning_rate": 0.00039845216398228124, "loss": 3.342202377319336, "step": 2330 }, { "epoch": 0.08878273343860549, "grad_norm": 0.28125, "learning_rate": 0.0003984372574421627, "loss": 3.316761016845703, "step": 2340 }, { "epoch": 0.08916214682936877, "grad_norm": 0.3125, "learning_rate": 0.00039842227974790816, "loss": 3.3245128631591796, "step": 2350 }, { "epoch": 0.08954156022013203, "grad_norm": 0.294921875, "learning_rate": 0.00039840723090488813, "loss": 3.310524749755859, "step": 2360 }, { "epoch": 0.08992097361089531, "grad_norm": 0.275390625, "learning_rate": 0.0003983921109184987, "loss": 3.2855403900146483, "step": 2370 }, { "epoch": 0.09030038700165857, "grad_norm": 0.2890625, "learning_rate": 0.00039837691979416155, "loss": 3.297772979736328, "step": 2380 }, { "epoch": 0.09067980039242185, "grad_norm": 0.34375, "learning_rate": 0.00039836165753732376, "loss": 3.3324337005615234, "step": 2390 }, { "epoch": 0.09105921378318513, "grad_norm": 0.287109375, "learning_rate": 0.000398346324153458, "loss": 3.3117176055908204, "step": 2400 }, { "epoch": 0.09143862717394839, "grad_norm": 0.291015625, "learning_rate": 0.00039833091964806226, "loss": 3.312748336791992, "step": 2410 }, { "epoch": 0.09181804056471167, "grad_norm": 0.294921875, "learning_rate": 0.00039831544402666027, "loss": 3.3131919860839845, "step": 2420 }, { "epoch": 0.09219745395547493, "grad_norm": 0.296875, "learning_rate": 0.00039829989729480125, "loss": 3.2897037506103515, "step": 2430 }, { "epoch": 0.09257686734623821, "grad_norm": 0.265625, "learning_rate": 0.00039828427945805966, "loss": 3.336315155029297, "step": 2440 }, { "epoch": 0.09295628073700148, "grad_norm": 0.306640625, "learning_rate": 0.00039826859052203575, "loss": 3.303927993774414, "step": 2450 }, { "epoch": 0.09333569412776475, "grad_norm": 0.27734375, "learning_rate": 0.0003982528304923551, "loss": 3.311422348022461, "step": 2460 }, { "epoch": 0.09371510751852802, "grad_norm": 0.28125, "learning_rate": 0.0003982369993746688, "loss": 3.3016288757324217, "step": 2470 }, { "epoch": 0.0940945209092913, "grad_norm": 0.27734375, "learning_rate": 0.0003982210971746535, "loss": 3.3147655487060548, "step": 2480 }, { "epoch": 0.09447393430005456, "grad_norm": 0.265625, "learning_rate": 0.00039820512389801126, "loss": 3.305263900756836, "step": 2490 }, { "epoch": 0.09485334769081784, "grad_norm": 0.310546875, "learning_rate": 0.0003981890795504696, "loss": 3.272055435180664, "step": 2500 }, { "epoch": 0.09485334769081784, "eval_loss": 3.2935683727264404, "eval_runtime": 191.7709, "eval_samples_per_second": 19.862, "eval_steps_per_second": 3.311, "step": 2500 }, { "epoch": 0.0952327610815811, "grad_norm": 0.283203125, "learning_rate": 0.00039817296413778173, "loss": 3.319232940673828, "step": 2510 }, { "epoch": 0.09561217447234438, "grad_norm": 0.287109375, "learning_rate": 0.0003981567776657261, "loss": 3.298080062866211, "step": 2520 }, { "epoch": 0.09599158786310764, "grad_norm": 0.291015625, "learning_rate": 0.0003981405201401066, "loss": 3.2867507934570312, "step": 2530 }, { "epoch": 0.09637100125387092, "grad_norm": 0.310546875, "learning_rate": 0.00039812419156675305, "loss": 3.302895355224609, "step": 2540 }, { "epoch": 0.09675041464463419, "grad_norm": 0.28515625, "learning_rate": 0.0003981077919515201, "loss": 3.286672592163086, "step": 2550 }, { "epoch": 0.09712982803539746, "grad_norm": 0.267578125, "learning_rate": 0.0003980913213002884, "loss": 3.289723205566406, "step": 2560 }, { "epoch": 0.09750924142616073, "grad_norm": 0.28515625, "learning_rate": 0.0003980747796189638, "loss": 3.305155563354492, "step": 2570 }, { "epoch": 0.097888654816924, "grad_norm": 0.2734375, "learning_rate": 0.0003980581669134778, "loss": 3.271694564819336, "step": 2580 }, { "epoch": 0.09826806820768727, "grad_norm": 0.294921875, "learning_rate": 0.00039804148318978703, "loss": 3.274801254272461, "step": 2590 }, { "epoch": 0.09864748159845055, "grad_norm": 0.310546875, "learning_rate": 0.00039802472845387407, "loss": 3.298069381713867, "step": 2600 }, { "epoch": 0.09902689498921383, "grad_norm": 0.3046875, "learning_rate": 0.0003980079027117465, "loss": 3.2600337982177736, "step": 2610 }, { "epoch": 0.09940630837997709, "grad_norm": 0.294921875, "learning_rate": 0.0003979910059694377, "loss": 3.279772186279297, "step": 2620 }, { "epoch": 0.09978572177074037, "grad_norm": 0.287109375, "learning_rate": 0.00039797403823300633, "loss": 3.295903778076172, "step": 2630 }, { "epoch": 0.10016513516150363, "grad_norm": 0.279296875, "learning_rate": 0.0003979569995085366, "loss": 3.278548812866211, "step": 2640 }, { "epoch": 0.10054454855226691, "grad_norm": 0.5546875, "learning_rate": 0.00039793988980213804, "loss": 3.2898914337158205, "step": 2650 }, { "epoch": 0.10092396194303017, "grad_norm": 0.330078125, "learning_rate": 0.0003979227091199458, "loss": 3.24792594909668, "step": 2660 }, { "epoch": 0.10130337533379345, "grad_norm": 0.28125, "learning_rate": 0.0003979054574681203, "loss": 3.254792404174805, "step": 2670 }, { "epoch": 0.10168278872455672, "grad_norm": 0.27734375, "learning_rate": 0.0003978881348528476, "loss": 3.259050750732422, "step": 2680 }, { "epoch": 0.10206220211532, "grad_norm": 0.259765625, "learning_rate": 0.00039787074128033907, "loss": 3.305150604248047, "step": 2690 }, { "epoch": 0.10244161550608326, "grad_norm": 0.28125, "learning_rate": 0.0003978532767568316, "loss": 3.312991714477539, "step": 2700 }, { "epoch": 0.10282102889684654, "grad_norm": 0.306640625, "learning_rate": 0.0003978357412885874, "loss": 3.2554290771484373, "step": 2710 }, { "epoch": 0.1032004422876098, "grad_norm": 0.28515625, "learning_rate": 0.00039781813488189437, "loss": 3.272433853149414, "step": 2720 }, { "epoch": 0.10357985567837308, "grad_norm": 0.275390625, "learning_rate": 0.0003978004575430655, "loss": 3.225101089477539, "step": 2730 }, { "epoch": 0.10395926906913634, "grad_norm": 0.306640625, "learning_rate": 0.00039778270927843954, "loss": 3.260712432861328, "step": 2740 }, { "epoch": 0.10433868245989962, "grad_norm": 0.271484375, "learning_rate": 0.00039776489009438047, "loss": 3.255579376220703, "step": 2750 }, { "epoch": 0.10433868245989962, "eval_loss": 3.259019136428833, "eval_runtime": 199.5272, "eval_samples_per_second": 19.09, "eval_steps_per_second": 3.183, "step": 2750 }, { "epoch": 0.10471809585066288, "grad_norm": 0.34765625, "learning_rate": 0.00039774699999727773, "loss": 3.3122615814208984, "step": 2760 }, { "epoch": 0.10509750924142616, "grad_norm": 0.283203125, "learning_rate": 0.0003977290389935463, "loss": 3.2601490020751953, "step": 2770 }, { "epoch": 0.10547692263218943, "grad_norm": 0.279296875, "learning_rate": 0.0003977110070896264, "loss": 3.24356689453125, "step": 2780 }, { "epoch": 0.1058563360229527, "grad_norm": 0.28515625, "learning_rate": 0.0003976929042919838, "loss": 3.224711608886719, "step": 2790 }, { "epoch": 0.10623574941371597, "grad_norm": 0.271484375, "learning_rate": 0.00039767473060710974, "loss": 3.2539390563964843, "step": 2800 }, { "epoch": 0.10661516280447925, "grad_norm": 0.29296875, "learning_rate": 0.00039765648604152065, "loss": 3.248219299316406, "step": 2810 }, { "epoch": 0.10699457619524251, "grad_norm": 0.275390625, "learning_rate": 0.0003976381706017587, "loss": 3.2163249969482424, "step": 2820 }, { "epoch": 0.10737398958600579, "grad_norm": 0.37109375, "learning_rate": 0.00039761978429439125, "loss": 3.243408966064453, "step": 2830 }, { "epoch": 0.10775340297676907, "grad_norm": 0.279296875, "learning_rate": 0.00039760132712601104, "loss": 3.2626617431640623, "step": 2840 }, { "epoch": 0.10813281636753233, "grad_norm": 0.2734375, "learning_rate": 0.00039758279910323644, "loss": 3.2402717590332033, "step": 2850 }, { "epoch": 0.10851222975829561, "grad_norm": 0.294921875, "learning_rate": 0.000397564200232711, "loss": 3.238645553588867, "step": 2860 }, { "epoch": 0.10889164314905887, "grad_norm": 0.2734375, "learning_rate": 0.00039754553052110374, "loss": 3.2292362213134767, "step": 2870 }, { "epoch": 0.10927105653982215, "grad_norm": 0.267578125, "learning_rate": 0.0003975267899751092, "loss": 3.2438121795654298, "step": 2880 }, { "epoch": 0.10965046993058541, "grad_norm": 0.275390625, "learning_rate": 0.00039750797860144707, "loss": 3.248378372192383, "step": 2890 }, { "epoch": 0.11002988332134869, "grad_norm": 0.2734375, "learning_rate": 0.00039748909640686275, "loss": 3.270367431640625, "step": 2900 }, { "epoch": 0.11040929671211196, "grad_norm": 0.267578125, "learning_rate": 0.0003974701433981268, "loss": 3.238606643676758, "step": 2910 }, { "epoch": 0.11078871010287523, "grad_norm": 0.271484375, "learning_rate": 0.00039745111958203525, "loss": 3.2293777465820312, "step": 2920 }, { "epoch": 0.1111681234936385, "grad_norm": 0.2734375, "learning_rate": 0.00039743202496540954, "loss": 3.2732200622558594, "step": 2930 }, { "epoch": 0.11154753688440178, "grad_norm": 0.279296875, "learning_rate": 0.00039741285955509646, "loss": 3.254547882080078, "step": 2940 }, { "epoch": 0.11192695027516504, "grad_norm": 0.287109375, "learning_rate": 0.0003973936233579682, "loss": 3.2391891479492188, "step": 2950 }, { "epoch": 0.11230636366592832, "grad_norm": 0.291015625, "learning_rate": 0.0003973743163809223, "loss": 3.240161895751953, "step": 2960 }, { "epoch": 0.11268577705669158, "grad_norm": 0.302734375, "learning_rate": 0.0003973549386308817, "loss": 3.2186958312988283, "step": 2970 }, { "epoch": 0.11306519044745486, "grad_norm": 0.2890625, "learning_rate": 0.0003973354901147949, "loss": 3.23299560546875, "step": 2980 }, { "epoch": 0.11344460383821812, "grad_norm": 0.283203125, "learning_rate": 0.00039731597083963533, "loss": 3.2404205322265627, "step": 2990 }, { "epoch": 0.1138240172289814, "grad_norm": 0.369140625, "learning_rate": 0.0003972963808124023, "loss": 3.246659851074219, "step": 3000 }, { "epoch": 0.1138240172289814, "eval_loss": 3.2267839908599854, "eval_runtime": 191.8366, "eval_samples_per_second": 19.855, "eval_steps_per_second": 3.31, "step": 3000 }, { "epoch": 0.11420343061974467, "grad_norm": 0.28515625, "learning_rate": 0.0003972767200401201, "loss": 3.2318279266357424, "step": 3010 }, { "epoch": 0.11458284401050794, "grad_norm": 0.26171875, "learning_rate": 0.00039725698852983857, "loss": 3.241360092163086, "step": 3020 }, { "epoch": 0.11496225740127121, "grad_norm": 0.267578125, "learning_rate": 0.00039723718628863295, "loss": 3.2138748168945312, "step": 3030 }, { "epoch": 0.11534167079203449, "grad_norm": 0.294921875, "learning_rate": 0.0003972173133236038, "loss": 3.230352783203125, "step": 3040 }, { "epoch": 0.11572108418279776, "grad_norm": 0.291015625, "learning_rate": 0.0003971973696418769, "loss": 3.220648193359375, "step": 3050 }, { "epoch": 0.11610049757356103, "grad_norm": 0.29296875, "learning_rate": 0.00039717735525060353, "loss": 3.2253433227539063, "step": 3060 }, { "epoch": 0.1164799109643243, "grad_norm": 0.265625, "learning_rate": 0.0003971572701569604, "loss": 3.1920917510986326, "step": 3070 }, { "epoch": 0.11685932435508757, "grad_norm": 0.267578125, "learning_rate": 0.0003971371143681494, "loss": 3.2291126251220703, "step": 3080 }, { "epoch": 0.11723873774585085, "grad_norm": 0.279296875, "learning_rate": 0.0003971168878913978, "loss": 3.2412979125976564, "step": 3090 }, { "epoch": 0.11761815113661411, "grad_norm": 0.291015625, "learning_rate": 0.0003970965907339584, "loss": 3.2690067291259766, "step": 3100 }, { "epoch": 0.11799756452737739, "grad_norm": 0.279296875, "learning_rate": 0.000397076222903109, "loss": 3.231862258911133, "step": 3110 }, { "epoch": 0.11837697791814066, "grad_norm": 0.26953125, "learning_rate": 0.0003970557844061531, "loss": 3.2309947967529298, "step": 3120 }, { "epoch": 0.11875639130890393, "grad_norm": 0.263671875, "learning_rate": 0.0003970352752504193, "loss": 3.1961137771606447, "step": 3130 }, { "epoch": 0.1191358046996672, "grad_norm": 0.2578125, "learning_rate": 0.0003970146954432617, "loss": 3.2005115509033204, "step": 3140 }, { "epoch": 0.11951521809043048, "grad_norm": 0.283203125, "learning_rate": 0.0003969940449920595, "loss": 3.2140071868896483, "step": 3150 }, { "epoch": 0.11989463148119374, "grad_norm": 0.2734375, "learning_rate": 0.00039697332390421764, "loss": 3.1643339157104493, "step": 3160 }, { "epoch": 0.12027404487195702, "grad_norm": 0.298828125, "learning_rate": 0.00039695253218716587, "loss": 3.2335384368896483, "step": 3170 }, { "epoch": 0.12065345826272028, "grad_norm": 0.27734375, "learning_rate": 0.00039693166984835967, "loss": 3.234294891357422, "step": 3180 }, { "epoch": 0.12103287165348356, "grad_norm": 0.3359375, "learning_rate": 0.00039691073689527965, "loss": 3.216410827636719, "step": 3190 }, { "epoch": 0.12141228504424682, "grad_norm": 0.271484375, "learning_rate": 0.0003968897333354318, "loss": 3.224312591552734, "step": 3200 }, { "epoch": 0.1217916984350101, "grad_norm": 0.26953125, "learning_rate": 0.0003968686591763475, "loss": 3.234964370727539, "step": 3210 }, { "epoch": 0.12217111182577337, "grad_norm": 0.26953125, "learning_rate": 0.0003968475144255832, "loss": 3.2149890899658202, "step": 3220 }, { "epoch": 0.12255052521653664, "grad_norm": 0.271484375, "learning_rate": 0.0003968262990907209, "loss": 3.2030715942382812, "step": 3230 }, { "epoch": 0.12292993860729991, "grad_norm": 0.2734375, "learning_rate": 0.00039680501317936794, "loss": 3.1917633056640624, "step": 3240 }, { "epoch": 0.12330935199806319, "grad_norm": 0.2890625, "learning_rate": 0.0003967836566991567, "loss": 3.2128475189208983, "step": 3250 }, { "epoch": 0.12330935199806319, "eval_loss": 3.1993772983551025, "eval_runtime": 190.5698, "eval_samples_per_second": 19.987, "eval_steps_per_second": 3.332, "step": 3250 }, { "epoch": 0.12368876538882646, "grad_norm": 0.259765625, "learning_rate": 0.00039676222965774507, "loss": 3.219722366333008, "step": 3260 }, { "epoch": 0.12406817877958973, "grad_norm": 0.271484375, "learning_rate": 0.0003967407320628163, "loss": 3.2288341522216797, "step": 3270 }, { "epoch": 0.124447592170353, "grad_norm": 0.26953125, "learning_rate": 0.0003967191639220787, "loss": 3.201400375366211, "step": 3280 }, { "epoch": 0.12482700556111627, "grad_norm": 0.3046875, "learning_rate": 0.00039669752524326604, "loss": 3.191239929199219, "step": 3290 }, { "epoch": 0.12520641895187953, "grad_norm": 0.267578125, "learning_rate": 0.0003966758160341375, "loss": 3.179216003417969, "step": 3300 }, { "epoch": 0.1255858323426428, "grad_norm": 0.26953125, "learning_rate": 0.00039665403630247717, "loss": 3.1856828689575196, "step": 3310 }, { "epoch": 0.1259652457334061, "grad_norm": 0.263671875, "learning_rate": 0.0003966321860560948, "loss": 3.2130191802978514, "step": 3320 }, { "epoch": 0.12634465912416937, "grad_norm": 0.265625, "learning_rate": 0.0003966102653028253, "loss": 3.1774335861206056, "step": 3330 }, { "epoch": 0.12672407251493262, "grad_norm": 0.2734375, "learning_rate": 0.0003965882740505288, "loss": 3.158614730834961, "step": 3340 }, { "epoch": 0.1271034859056959, "grad_norm": 0.267578125, "learning_rate": 0.00039656621230709074, "loss": 3.2069393157958985, "step": 3350 }, { "epoch": 0.12748289929645917, "grad_norm": 0.267578125, "learning_rate": 0.000396544080080422, "loss": 3.1481386184692384, "step": 3360 }, { "epoch": 0.12786231268722245, "grad_norm": 0.2578125, "learning_rate": 0.00039652187737845835, "loss": 3.1857141494750976, "step": 3370 }, { "epoch": 0.1282417260779857, "grad_norm": 0.26171875, "learning_rate": 0.0003964996042091612, "loss": 3.174033355712891, "step": 3380 }, { "epoch": 0.12862113946874898, "grad_norm": 0.267578125, "learning_rate": 0.00039647726058051713, "loss": 3.1931631088256838, "step": 3390 }, { "epoch": 0.12900055285951226, "grad_norm": 0.26953125, "learning_rate": 0.0003964548465005379, "loss": 3.1501256942749025, "step": 3400 }, { "epoch": 0.12937996625027554, "grad_norm": 0.26953125, "learning_rate": 0.0003964323619772605, "loss": 3.154039001464844, "step": 3410 }, { "epoch": 0.12975937964103879, "grad_norm": 0.294921875, "learning_rate": 0.0003964098070187475, "loss": 3.1992359161376953, "step": 3420 }, { "epoch": 0.13013879303180206, "grad_norm": 0.27734375, "learning_rate": 0.0003963871816330862, "loss": 3.1773324966430665, "step": 3430 }, { "epoch": 0.13051820642256534, "grad_norm": 0.2578125, "learning_rate": 0.00039636448582838963, "loss": 3.1678487777709963, "step": 3440 }, { "epoch": 0.13089761981332862, "grad_norm": 0.259765625, "learning_rate": 0.0003963417196127958, "loss": 3.1810958862304686, "step": 3450 }, { "epoch": 0.13127703320409187, "grad_norm": 0.271484375, "learning_rate": 0.0003963188829944681, "loss": 3.1987842559814452, "step": 3460 }, { "epoch": 0.13165644659485515, "grad_norm": 0.263671875, "learning_rate": 0.00039629597598159514, "loss": 3.203843688964844, "step": 3470 }, { "epoch": 0.13203585998561843, "grad_norm": 0.27734375, "learning_rate": 0.00039627299858239064, "loss": 3.2131175994873047, "step": 3480 }, { "epoch": 0.1324152733763817, "grad_norm": 0.263671875, "learning_rate": 0.00039624995080509367, "loss": 3.198674201965332, "step": 3490 }, { "epoch": 0.13279468676714498, "grad_norm": 0.271484375, "learning_rate": 0.00039622683265796863, "loss": 3.217747116088867, "step": 3500 }, { "epoch": 0.13279468676714498, "eval_loss": 3.1763858795166016, "eval_runtime": 190.4916, "eval_samples_per_second": 19.996, "eval_steps_per_second": 3.333, "step": 3500 }, { "epoch": 0.13317410015790823, "grad_norm": 0.2431640625, "learning_rate": 0.000396203644149305, "loss": 3.1646514892578126, "step": 3510 }, { "epoch": 0.1335535135486715, "grad_norm": 0.26953125, "learning_rate": 0.00039618038528741747, "loss": 3.1898372650146483, "step": 3520 }, { "epoch": 0.1339329269394348, "grad_norm": 0.265625, "learning_rate": 0.0003961570560806461, "loss": 3.165264129638672, "step": 3530 }, { "epoch": 0.13431234033019807, "grad_norm": 0.287109375, "learning_rate": 0.0003961336565373561, "loss": 3.1918710708618163, "step": 3540 }, { "epoch": 0.13469175372096132, "grad_norm": 0.263671875, "learning_rate": 0.00039611018666593794, "loss": 3.1953025817871095, "step": 3550 }, { "epoch": 0.1350711671117246, "grad_norm": 0.255859375, "learning_rate": 0.00039608664647480715, "loss": 3.201638031005859, "step": 3560 }, { "epoch": 0.13545058050248787, "grad_norm": 0.259765625, "learning_rate": 0.00039606303597240473, "loss": 3.1955610275268556, "step": 3570 }, { "epoch": 0.13582999389325115, "grad_norm": 0.265625, "learning_rate": 0.0003960393551671967, "loss": 3.2074302673339843, "step": 3580 }, { "epoch": 0.1362094072840144, "grad_norm": 0.28125, "learning_rate": 0.00039601560406767424, "loss": 3.1699798583984373, "step": 3590 }, { "epoch": 0.13658882067477768, "grad_norm": 0.298828125, "learning_rate": 0.00039599178268235404, "loss": 3.13268928527832, "step": 3600 }, { "epoch": 0.13696823406554096, "grad_norm": 0.28515625, "learning_rate": 0.00039596789101977765, "loss": 3.2017677307128904, "step": 3610 }, { "epoch": 0.13734764745630423, "grad_norm": 0.271484375, "learning_rate": 0.00039594392908851196, "loss": 3.179719924926758, "step": 3620 }, { "epoch": 0.13772706084706748, "grad_norm": 0.298828125, "learning_rate": 0.00039591989689714916, "loss": 3.1756086349487305, "step": 3630 }, { "epoch": 0.13810647423783076, "grad_norm": 0.310546875, "learning_rate": 0.0003958957944543065, "loss": 3.2004966735839844, "step": 3640 }, { "epoch": 0.13848588762859404, "grad_norm": 0.26171875, "learning_rate": 0.0003958716217686264, "loss": 3.1591590881347655, "step": 3650 }, { "epoch": 0.13886530101935732, "grad_norm": 0.259765625, "learning_rate": 0.00039584737884877647, "loss": 3.1892677307128907, "step": 3660 }, { "epoch": 0.13924471441012057, "grad_norm": 0.2890625, "learning_rate": 0.0003958230657034497, "loss": 3.185030937194824, "step": 3670 }, { "epoch": 0.13962412780088385, "grad_norm": 0.2578125, "learning_rate": 0.00039579868234136407, "loss": 3.156549263000488, "step": 3680 }, { "epoch": 0.14000354119164712, "grad_norm": 0.267578125, "learning_rate": 0.00039577422877126267, "loss": 3.1410797119140623, "step": 3690 }, { "epoch": 0.1403829545824104, "grad_norm": 0.26171875, "learning_rate": 0.00039574970500191403, "loss": 3.1465173721313477, "step": 3700 }, { "epoch": 0.14076236797317368, "grad_norm": 0.265625, "learning_rate": 0.0003957251110421116, "loss": 3.145336151123047, "step": 3710 }, { "epoch": 0.14114178136393693, "grad_norm": 0.25390625, "learning_rate": 0.0003957004469006741, "loss": 3.1785070419311525, "step": 3720 }, { "epoch": 0.1415211947547002, "grad_norm": 0.263671875, "learning_rate": 0.0003956757125864455, "loss": 3.1859132766723635, "step": 3730 }, { "epoch": 0.14190060814546349, "grad_norm": 0.27734375, "learning_rate": 0.0003956509081082947, "loss": 3.16925163269043, "step": 3740 }, { "epoch": 0.14228002153622676, "grad_norm": 0.267578125, "learning_rate": 0.00039562603347511607, "loss": 3.1520597457885744, "step": 3750 }, { "epoch": 0.14228002153622676, "eval_loss": 3.161163806915283, "eval_runtime": 190.5194, "eval_samples_per_second": 19.993, "eval_steps_per_second": 3.333, "step": 3750 }, { "epoch": 0.14265943492699001, "grad_norm": 0.267578125, "learning_rate": 0.0003956010886958289, "loss": 3.1545812606811525, "step": 3760 }, { "epoch": 0.1430388483177533, "grad_norm": 0.259765625, "learning_rate": 0.0003955760737793776, "loss": 3.1742273330688477, "step": 3770 }, { "epoch": 0.14341826170851657, "grad_norm": 0.26171875, "learning_rate": 0.00039555098873473194, "loss": 3.1577796936035156, "step": 3780 }, { "epoch": 0.14379767509927985, "grad_norm": 0.291015625, "learning_rate": 0.0003955258335708868, "loss": 3.1755435943603514, "step": 3790 }, { "epoch": 0.1441770884900431, "grad_norm": 0.251953125, "learning_rate": 0.0003955006082968619, "loss": 3.16396427154541, "step": 3800 }, { "epoch": 0.14455650188080638, "grad_norm": 0.25390625, "learning_rate": 0.0003954753129217025, "loss": 3.1266719818115236, "step": 3810 }, { "epoch": 0.14493591527156965, "grad_norm": 0.259765625, "learning_rate": 0.0003954499474544788, "loss": 3.139068603515625, "step": 3820 }, { "epoch": 0.14531532866233293, "grad_norm": 0.30078125, "learning_rate": 0.0003954245119042861, "loss": 3.156755065917969, "step": 3830 }, { "epoch": 0.14569474205309618, "grad_norm": 0.271484375, "learning_rate": 0.000395399006280245, "loss": 3.1434293746948243, "step": 3840 }, { "epoch": 0.14607415544385946, "grad_norm": 0.267578125, "learning_rate": 0.00039537343059150096, "loss": 3.158995246887207, "step": 3850 }, { "epoch": 0.14645356883462274, "grad_norm": 0.25390625, "learning_rate": 0.0003953477848472249, "loss": 3.156184768676758, "step": 3860 }, { "epoch": 0.14683298222538602, "grad_norm": 0.263671875, "learning_rate": 0.0003953220690566125, "loss": 3.1421764373779295, "step": 3870 }, { "epoch": 0.14721239561614927, "grad_norm": 0.28515625, "learning_rate": 0.00039529628322888485, "loss": 3.166634368896484, "step": 3880 }, { "epoch": 0.14759180900691254, "grad_norm": 0.275390625, "learning_rate": 0.000395270427373288, "loss": 3.172174072265625, "step": 3890 }, { "epoch": 0.14797122239767582, "grad_norm": 0.27734375, "learning_rate": 0.0003952445014990931, "loss": 3.172516632080078, "step": 3900 }, { "epoch": 0.1483506357884391, "grad_norm": 0.28125, "learning_rate": 0.0003952185056155966, "loss": 3.1702529907226564, "step": 3910 }, { "epoch": 0.14873004917920238, "grad_norm": 0.28125, "learning_rate": 0.00039519243973211974, "loss": 3.158992576599121, "step": 3920 }, { "epoch": 0.14910946256996563, "grad_norm": 0.26953125, "learning_rate": 0.00039516630385800914, "loss": 3.148736572265625, "step": 3930 }, { "epoch": 0.1494888759607289, "grad_norm": 0.271484375, "learning_rate": 0.0003951400980026364, "loss": 3.1911731719970704, "step": 3940 }, { "epoch": 0.14986828935149218, "grad_norm": 0.2578125, "learning_rate": 0.00039511382217539807, "loss": 3.1700101852416993, "step": 3950 }, { "epoch": 0.15024770274225546, "grad_norm": 0.296875, "learning_rate": 0.00039508747638571616, "loss": 3.1607608795166016, "step": 3960 }, { "epoch": 0.1506271161330187, "grad_norm": 0.265625, "learning_rate": 0.0003950610606430374, "loss": 3.144011306762695, "step": 3970 }, { "epoch": 0.151006529523782, "grad_norm": 0.298828125, "learning_rate": 0.0003950345749568337, "loss": 3.1690290451049803, "step": 3980 }, { "epoch": 0.15138594291454527, "grad_norm": 0.267578125, "learning_rate": 0.0003950080193366023, "loss": 3.1579965591430663, "step": 3990 }, { "epoch": 0.15176535630530855, "grad_norm": 0.30078125, "learning_rate": 0.00039498139379186514, "loss": 3.149643135070801, "step": 4000 }, { "epoch": 0.15176535630530855, "eval_loss": 3.1423935890197754, "eval_runtime": 191.7216, "eval_samples_per_second": 19.867, "eval_steps_per_second": 3.312, "step": 4000 }, { "epoch": 0.1521447696960718, "grad_norm": 0.271484375, "learning_rate": 0.0003949546983321695, "loss": 3.147187423706055, "step": 4010 }, { "epoch": 0.15252418308683507, "grad_norm": 0.255859375, "learning_rate": 0.00039492793296708753, "loss": 3.1519872665405275, "step": 4020 }, { "epoch": 0.15290359647759835, "grad_norm": 0.279296875, "learning_rate": 0.00039490109770621667, "loss": 3.1262311935424805, "step": 4030 }, { "epoch": 0.15328300986836163, "grad_norm": 0.267578125, "learning_rate": 0.0003948741925591793, "loss": 3.155449104309082, "step": 4040 }, { "epoch": 0.15366242325912488, "grad_norm": 0.255859375, "learning_rate": 0.00039484721753562274, "loss": 3.1307321548461915, "step": 4050 }, { "epoch": 0.15404183664988816, "grad_norm": 0.279296875, "learning_rate": 0.0003948201726452196, "loss": 3.1360015869140625, "step": 4060 }, { "epoch": 0.15442125004065144, "grad_norm": 0.26953125, "learning_rate": 0.00039479305789766745, "loss": 3.1438663482666014, "step": 4070 }, { "epoch": 0.15480066343141471, "grad_norm": 0.283203125, "learning_rate": 0.0003947658733026888, "loss": 3.1049808502197265, "step": 4080 }, { "epoch": 0.15518007682217796, "grad_norm": 0.2578125, "learning_rate": 0.00039473861887003134, "loss": 3.12929630279541, "step": 4090 }, { "epoch": 0.15555949021294124, "grad_norm": 0.25, "learning_rate": 0.0003947112946094678, "loss": 3.1306598663330076, "step": 4100 }, { "epoch": 0.15593890360370452, "grad_norm": 0.58203125, "learning_rate": 0.0003946839005307958, "loss": 3.1580244064331056, "step": 4110 }, { "epoch": 0.1563183169944678, "grad_norm": 0.267578125, "learning_rate": 0.00039465643664383823, "loss": 3.130876159667969, "step": 4120 }, { "epoch": 0.15669773038523108, "grad_norm": 0.275390625, "learning_rate": 0.0003946289029584429, "loss": 3.1697431564331056, "step": 4130 }, { "epoch": 0.15707714377599433, "grad_norm": 0.265625, "learning_rate": 0.0003946012994844824, "loss": 3.1234243392944334, "step": 4140 }, { "epoch": 0.1574565571667576, "grad_norm": 0.2578125, "learning_rate": 0.00039457362623185484, "loss": 3.12270450592041, "step": 4150 }, { "epoch": 0.15783597055752088, "grad_norm": 0.265625, "learning_rate": 0.000394545883210483, "loss": 3.1169191360473634, "step": 4160 }, { "epoch": 0.15821538394828416, "grad_norm": 0.251953125, "learning_rate": 0.0003945180704303147, "loss": 3.1725353240966796, "step": 4170 }, { "epoch": 0.1585947973390474, "grad_norm": 0.275390625, "learning_rate": 0.00039449018790132286, "loss": 3.153780937194824, "step": 4180 }, { "epoch": 0.1589742107298107, "grad_norm": 0.279296875, "learning_rate": 0.00039446223563350543, "loss": 3.1445758819580076, "step": 4190 }, { "epoch": 0.15935362412057397, "grad_norm": 0.248046875, "learning_rate": 0.0003944342136368853, "loss": 3.1530731201171873, "step": 4200 }, { "epoch": 0.15973303751133724, "grad_norm": 0.267578125, "learning_rate": 0.0003944061219215104, "loss": 3.1206655502319336, "step": 4210 }, { "epoch": 0.1601124509021005, "grad_norm": 0.275390625, "learning_rate": 0.00039437796049745366, "loss": 3.1229215621948243, "step": 4220 }, { "epoch": 0.16049186429286377, "grad_norm": 0.28515625, "learning_rate": 0.0003943497293748129, "loss": 3.122256278991699, "step": 4230 }, { "epoch": 0.16087127768362705, "grad_norm": 0.310546875, "learning_rate": 0.00039432142856371115, "loss": 3.1335176467895507, "step": 4240 }, { "epoch": 0.16125069107439033, "grad_norm": 0.291015625, "learning_rate": 0.0003942930580742962, "loss": 3.135046195983887, "step": 4250 }, { "epoch": 0.16125069107439033, "eval_loss": 3.1235926151275635, "eval_runtime": 189.2363, "eval_samples_per_second": 20.128, "eval_steps_per_second": 3.356, "step": 4250 }, { "epoch": 0.16163010446515358, "grad_norm": 0.26953125, "learning_rate": 0.00039426461791674096, "loss": 3.1341617584228514, "step": 4260 }, { "epoch": 0.16200951785591686, "grad_norm": 0.26171875, "learning_rate": 0.00039423610810124335, "loss": 3.116948890686035, "step": 4270 }, { "epoch": 0.16238893124668013, "grad_norm": 0.26953125, "learning_rate": 0.0003942075286380261, "loss": 3.135293197631836, "step": 4280 }, { "epoch": 0.1627683446374434, "grad_norm": 0.2734375, "learning_rate": 0.0003941788795373371, "loss": 3.1215787887573243, "step": 4290 }, { "epoch": 0.16314775802820666, "grad_norm": 0.291015625, "learning_rate": 0.000394150160809449, "loss": 3.109284019470215, "step": 4300 }, { "epoch": 0.16352717141896994, "grad_norm": 0.28125, "learning_rate": 0.00039412137246465974, "loss": 3.1205610275268554, "step": 4310 }, { "epoch": 0.16390658480973322, "grad_norm": 0.26953125, "learning_rate": 0.0003940925145132919, "loss": 3.1329755783081055, "step": 4320 }, { "epoch": 0.1642859982004965, "grad_norm": 0.265625, "learning_rate": 0.00039406358696569317, "loss": 3.1206809997558596, "step": 4330 }, { "epoch": 0.16466541159125975, "grad_norm": 0.251953125, "learning_rate": 0.0003940345898322362, "loss": 3.1274477005004884, "step": 4340 }, { "epoch": 0.16504482498202303, "grad_norm": 0.265625, "learning_rate": 0.00039400552312331854, "loss": 3.0842920303344727, "step": 4350 }, { "epoch": 0.1654242383727863, "grad_norm": 0.265625, "learning_rate": 0.00039397638684936264, "loss": 3.120190238952637, "step": 4360 }, { "epoch": 0.16580365176354958, "grad_norm": 0.3125, "learning_rate": 0.00039394718102081614, "loss": 3.0977289199829103, "step": 4370 }, { "epoch": 0.16618306515431286, "grad_norm": 0.263671875, "learning_rate": 0.0003939179056481513, "loss": 3.0984542846679686, "step": 4380 }, { "epoch": 0.1665624785450761, "grad_norm": 0.26953125, "learning_rate": 0.0003938885607418655, "loss": 3.130707359313965, "step": 4390 }, { "epoch": 0.1669418919358394, "grad_norm": 0.25, "learning_rate": 0.000393859146312481, "loss": 3.1190866470336913, "step": 4400 }, { "epoch": 0.16732130532660267, "grad_norm": 0.2578125, "learning_rate": 0.000393829662370545, "loss": 3.136975860595703, "step": 4410 }, { "epoch": 0.16770071871736594, "grad_norm": 0.291015625, "learning_rate": 0.00039380010892662974, "loss": 3.131087303161621, "step": 4420 }, { "epoch": 0.1680801321081292, "grad_norm": 0.259765625, "learning_rate": 0.00039377048599133213, "loss": 3.128687858581543, "step": 4430 }, { "epoch": 0.16845954549889247, "grad_norm": 0.28515625, "learning_rate": 0.00039374079357527424, "loss": 3.0863744735717775, "step": 4440 }, { "epoch": 0.16883895888965575, "grad_norm": 0.26171875, "learning_rate": 0.0003937110316891029, "loss": 3.097390365600586, "step": 4450 }, { "epoch": 0.16921837228041903, "grad_norm": 0.255859375, "learning_rate": 0.00039368120034348985, "loss": 3.096174430847168, "step": 4460 }, { "epoch": 0.16959778567118228, "grad_norm": 0.255859375, "learning_rate": 0.0003936512995491319, "loss": 3.138645362854004, "step": 4470 }, { "epoch": 0.16997719906194556, "grad_norm": 0.265625, "learning_rate": 0.0003936213293167506, "loss": 3.129154396057129, "step": 4480 }, { "epoch": 0.17035661245270883, "grad_norm": 0.267578125, "learning_rate": 0.0003935912896570924, "loss": 3.1045516967773437, "step": 4490 }, { "epoch": 0.1707360258434721, "grad_norm": 0.25390625, "learning_rate": 0.00039356118058092875, "loss": 3.1128679275512696, "step": 4500 }, { "epoch": 0.1707360258434721, "eval_loss": 3.105747699737549, "eval_runtime": 188.0796, "eval_samples_per_second": 20.252, "eval_steps_per_second": 3.376, "step": 4500 }, { "epoch": 0.17111543923423536, "grad_norm": 0.267578125, "learning_rate": 0.000393531002099056, "loss": 3.093027687072754, "step": 4510 }, { "epoch": 0.17149485262499864, "grad_norm": 0.251953125, "learning_rate": 0.0003935007542222952, "loss": 3.092297172546387, "step": 4520 }, { "epoch": 0.17187426601576192, "grad_norm": 0.2890625, "learning_rate": 0.00039347043696149253, "loss": 3.1150651931762696, "step": 4530 }, { "epoch": 0.1722536794065252, "grad_norm": 0.27734375, "learning_rate": 0.0003934400503275188, "loss": 3.105441093444824, "step": 4540 }, { "epoch": 0.17263309279728845, "grad_norm": 0.26953125, "learning_rate": 0.00039340959433126985, "loss": 3.0971046447753907, "step": 4550 }, { "epoch": 0.17301250618805172, "grad_norm": 0.26171875, "learning_rate": 0.0003933790689836664, "loss": 3.1036563873291017, "step": 4560 }, { "epoch": 0.173391919578815, "grad_norm": 0.2578125, "learning_rate": 0.000393348474295654, "loss": 3.1101850509643554, "step": 4570 }, { "epoch": 0.17377133296957828, "grad_norm": 0.265625, "learning_rate": 0.00039331781027820306, "loss": 3.123235321044922, "step": 4580 }, { "epoch": 0.17415074636034156, "grad_norm": 0.25390625, "learning_rate": 0.00039328707694230884, "loss": 3.1062808990478517, "step": 4590 }, { "epoch": 0.1745301597511048, "grad_norm": 0.255859375, "learning_rate": 0.00039325627429899143, "loss": 3.1089080810546874, "step": 4600 }, { "epoch": 0.17490957314186809, "grad_norm": 0.265625, "learning_rate": 0.00039322540235929587, "loss": 3.100897026062012, "step": 4610 }, { "epoch": 0.17528898653263136, "grad_norm": 0.267578125, "learning_rate": 0.00039319446113429197, "loss": 3.106528091430664, "step": 4620 }, { "epoch": 0.17566839992339464, "grad_norm": 0.259765625, "learning_rate": 0.00039316345063507437, "loss": 3.113416290283203, "step": 4630 }, { "epoch": 0.1760478133141579, "grad_norm": 0.2734375, "learning_rate": 0.00039313237087276265, "loss": 3.0907567977905273, "step": 4640 }, { "epoch": 0.17642722670492117, "grad_norm": 0.271484375, "learning_rate": 0.00039310122185850106, "loss": 3.091078186035156, "step": 4650 }, { "epoch": 0.17680664009568445, "grad_norm": 0.265625, "learning_rate": 0.00039307000360345887, "loss": 3.125278663635254, "step": 4660 }, { "epoch": 0.17718605348644773, "grad_norm": 0.26171875, "learning_rate": 0.0003930387161188301, "loss": 3.1307973861694336, "step": 4670 }, { "epoch": 0.17756546687721098, "grad_norm": 0.263671875, "learning_rate": 0.0003930073594158334, "loss": 3.121413230895996, "step": 4680 }, { "epoch": 0.17794488026797425, "grad_norm": 0.25, "learning_rate": 0.00039297593350571267, "loss": 3.1079534530639648, "step": 4690 }, { "epoch": 0.17832429365873753, "grad_norm": 0.25390625, "learning_rate": 0.0003929444383997362, "loss": 3.0847034454345703, "step": 4700 }, { "epoch": 0.1787037070495008, "grad_norm": 0.248046875, "learning_rate": 0.00039291287410919733, "loss": 3.097280502319336, "step": 4710 }, { "epoch": 0.17908312044026406, "grad_norm": 0.28515625, "learning_rate": 0.0003928812406454141, "loss": 3.1118825912475585, "step": 4720 }, { "epoch": 0.17946253383102734, "grad_norm": 0.2890625, "learning_rate": 0.00039284953801972954, "loss": 3.1392723083496095, "step": 4730 }, { "epoch": 0.17984194722179062, "grad_norm": 0.263671875, "learning_rate": 0.0003928177662435112, "loss": 3.102610969543457, "step": 4740 }, { "epoch": 0.1802213606125539, "grad_norm": 0.275390625, "learning_rate": 0.00039278592532815155, "loss": 3.123360443115234, "step": 4750 }, { "epoch": 0.1802213606125539, "eval_loss": 3.0948104858398438, "eval_runtime": 189.7506, "eval_samples_per_second": 20.074, "eval_steps_per_second": 3.346, "step": 4750 }, { "epoch": 0.18060077400331714, "grad_norm": 0.25390625, "learning_rate": 0.00039275401528506796, "loss": 3.077744483947754, "step": 4760 }, { "epoch": 0.18098018739408042, "grad_norm": 0.26171875, "learning_rate": 0.00039272203612570244, "loss": 3.0786006927490233, "step": 4770 }, { "epoch": 0.1813596007848437, "grad_norm": 0.259765625, "learning_rate": 0.00039268998786152185, "loss": 3.101948547363281, "step": 4780 }, { "epoch": 0.18173901417560698, "grad_norm": 0.251953125, "learning_rate": 0.00039265787050401775, "loss": 3.0892316818237306, "step": 4790 }, { "epoch": 0.18211842756637026, "grad_norm": 0.244140625, "learning_rate": 0.0003926256840647067, "loss": 3.102895164489746, "step": 4800 }, { "epoch": 0.1824978409571335, "grad_norm": 0.255859375, "learning_rate": 0.00039259342855512975, "loss": 3.092983055114746, "step": 4810 }, { "epoch": 0.18287725434789678, "grad_norm": 0.26171875, "learning_rate": 0.0003925611039868528, "loss": 3.1071754455566407, "step": 4820 }, { "epoch": 0.18325666773866006, "grad_norm": 0.26953125, "learning_rate": 0.0003925287103714666, "loss": 3.0933536529541015, "step": 4830 }, { "epoch": 0.18363608112942334, "grad_norm": 0.279296875, "learning_rate": 0.0003924962477205867, "loss": 3.092901420593262, "step": 4840 }, { "epoch": 0.1840154945201866, "grad_norm": 0.265625, "learning_rate": 0.00039246371604585316, "loss": 3.0926456451416016, "step": 4850 }, { "epoch": 0.18439490791094987, "grad_norm": 0.2734375, "learning_rate": 0.00039243111535893106, "loss": 3.0917062759399414, "step": 4860 }, { "epoch": 0.18477432130171315, "grad_norm": 0.255859375, "learning_rate": 0.00039239844567151, "loss": 3.1093847274780275, "step": 4870 }, { "epoch": 0.18515373469247642, "grad_norm": 0.2734375, "learning_rate": 0.0003923657069953046, "loss": 3.085328483581543, "step": 4880 }, { "epoch": 0.18553314808323967, "grad_norm": 0.251953125, "learning_rate": 0.0003923328993420538, "loss": 3.0880931854248046, "step": 4890 }, { "epoch": 0.18591256147400295, "grad_norm": 0.265625, "learning_rate": 0.00039230002272352174, "loss": 3.1040718078613283, "step": 4900 }, { "epoch": 0.18629197486476623, "grad_norm": 0.259765625, "learning_rate": 0.00039226707715149696, "loss": 3.0825883865356447, "step": 4910 }, { "epoch": 0.1866713882555295, "grad_norm": 0.2470703125, "learning_rate": 0.0003922340626377929, "loss": 3.098618507385254, "step": 4920 }, { "epoch": 0.18705080164629276, "grad_norm": 0.2578125, "learning_rate": 0.0003922009791942476, "loss": 3.094152069091797, "step": 4930 }, { "epoch": 0.18743021503705604, "grad_norm": 0.26953125, "learning_rate": 0.0003921678268327239, "loss": 3.0933414459228517, "step": 4940 }, { "epoch": 0.18780962842781931, "grad_norm": 0.25390625, "learning_rate": 0.00039213460556510927, "loss": 3.1098737716674805, "step": 4950 }, { "epoch": 0.1881890418185826, "grad_norm": 0.2578125, "learning_rate": 0.00039210131540331605, "loss": 3.0880388259887694, "step": 4960 }, { "epoch": 0.18856845520934584, "grad_norm": 0.24609375, "learning_rate": 0.00039206795635928106, "loss": 3.0991273880004884, "step": 4970 }, { "epoch": 0.18894786860010912, "grad_norm": 0.251953125, "learning_rate": 0.00039203452844496604, "loss": 3.088553619384766, "step": 4980 }, { "epoch": 0.1893272819908724, "grad_norm": 0.2451171875, "learning_rate": 0.00039200103167235723, "loss": 3.0754655838012694, "step": 4990 }, { "epoch": 0.18970669538163568, "grad_norm": 0.2421875, "learning_rate": 0.0003919674660534657, "loss": 3.051005744934082, "step": 5000 }, { "epoch": 0.18970669538163568, "eval_loss": 3.0760586261749268, "eval_runtime": 190.4734, "eval_samples_per_second": 19.998, "eval_steps_per_second": 3.334, "step": 5000 }, { "epoch": 0.19008610877239895, "grad_norm": 0.26171875, "learning_rate": 0.00039193383160032714, "loss": 3.1063222885131836, "step": 5010 }, { "epoch": 0.1904655221631622, "grad_norm": 0.25, "learning_rate": 0.000391900128325002, "loss": 3.0939071655273436, "step": 5020 }, { "epoch": 0.19084493555392548, "grad_norm": 0.2578125, "learning_rate": 0.0003918663562395752, "loss": 3.048473930358887, "step": 5030 }, { "epoch": 0.19122434894468876, "grad_norm": 0.265625, "learning_rate": 0.00039183251535615657, "loss": 3.0736146926879884, "step": 5040 }, { "epoch": 0.19160376233545204, "grad_norm": 0.251953125, "learning_rate": 0.0003917986056868806, "loss": 3.075603485107422, "step": 5050 }, { "epoch": 0.1919831757262153, "grad_norm": 0.271484375, "learning_rate": 0.0003917646272439062, "loss": 3.0998844146728515, "step": 5060 }, { "epoch": 0.19236258911697857, "grad_norm": 0.26171875, "learning_rate": 0.00039173058003941723, "loss": 3.0792354583740233, "step": 5070 }, { "epoch": 0.19274200250774184, "grad_norm": 0.24609375, "learning_rate": 0.000391696464085622, "loss": 3.0773239135742188, "step": 5080 }, { "epoch": 0.19312141589850512, "grad_norm": 0.287109375, "learning_rate": 0.0003916622793947536, "loss": 3.069914436340332, "step": 5090 }, { "epoch": 0.19350082928926837, "grad_norm": 0.267578125, "learning_rate": 0.00039162802597906974, "loss": 3.0857072830200196, "step": 5100 }, { "epoch": 0.19388024268003165, "grad_norm": 0.2890625, "learning_rate": 0.0003915937038508527, "loss": 3.09625244140625, "step": 5110 }, { "epoch": 0.19425965607079493, "grad_norm": 0.255859375, "learning_rate": 0.0003915593130224094, "loss": 3.0600091934204103, "step": 5120 }, { "epoch": 0.1946390694615582, "grad_norm": 0.255859375, "learning_rate": 0.0003915248535060716, "loss": 3.092440605163574, "step": 5130 }, { "epoch": 0.19501848285232146, "grad_norm": 0.279296875, "learning_rate": 0.00039149032531419543, "loss": 3.0563852310180666, "step": 5140 }, { "epoch": 0.19539789624308473, "grad_norm": 0.259765625, "learning_rate": 0.0003914557284591617, "loss": 3.0726715087890626, "step": 5150 }, { "epoch": 0.195777309633848, "grad_norm": 0.259765625, "learning_rate": 0.00039142106295337605, "loss": 3.053900146484375, "step": 5160 }, { "epoch": 0.1961567230246113, "grad_norm": 0.263671875, "learning_rate": 0.0003913863288092684, "loss": 3.0832475662231444, "step": 5170 }, { "epoch": 0.19653613641537454, "grad_norm": 0.2490234375, "learning_rate": 0.00039135152603929363, "loss": 3.0559377670288086, "step": 5180 }, { "epoch": 0.19691554980613782, "grad_norm": 0.259765625, "learning_rate": 0.00039131665465593087, "loss": 3.0558164596557615, "step": 5190 }, { "epoch": 0.1972949631969011, "grad_norm": 0.259765625, "learning_rate": 0.00039128171467168413, "loss": 3.0758010864257814, "step": 5200 }, { "epoch": 0.19767437658766437, "grad_norm": 0.248046875, "learning_rate": 0.000391246706099082, "loss": 3.0679378509521484, "step": 5210 }, { "epoch": 0.19805378997842765, "grad_norm": 0.263671875, "learning_rate": 0.00039121162895067743, "loss": 3.0887741088867187, "step": 5220 }, { "epoch": 0.1984332033691909, "grad_norm": 0.265625, "learning_rate": 0.0003911764832390483, "loss": 3.0620540618896483, "step": 5230 }, { "epoch": 0.19881261675995418, "grad_norm": 0.263671875, "learning_rate": 0.0003911412689767967, "loss": 3.1002960205078125, "step": 5240 }, { "epoch": 0.19919203015071746, "grad_norm": 0.271484375, "learning_rate": 0.00039110598617654965, "loss": 3.043157196044922, "step": 5250 }, { "epoch": 0.19919203015071746, "eval_loss": 3.0658929347991943, "eval_runtime": 190.4705, "eval_samples_per_second": 19.998, "eval_steps_per_second": 3.334, "step": 5250 }, { "epoch": 0.19957144354148074, "grad_norm": 0.271484375, "learning_rate": 0.0003910706348509585, "loss": 3.0650875091552736, "step": 5260 }, { "epoch": 0.199950856932244, "grad_norm": 0.271484375, "learning_rate": 0.00039103521501269935, "loss": 3.1026472091674804, "step": 5270 }, { "epoch": 0.20033027032300726, "grad_norm": 0.267578125, "learning_rate": 0.00039099972667447266, "loss": 3.094883918762207, "step": 5280 }, { "epoch": 0.20070968371377054, "grad_norm": 0.26171875, "learning_rate": 0.0003909641698490037, "loss": 3.0660074234008787, "step": 5290 }, { "epoch": 0.20108909710453382, "grad_norm": 0.2578125, "learning_rate": 0.00039092854454904206, "loss": 3.075409507751465, "step": 5300 }, { "epoch": 0.20146851049529707, "grad_norm": 0.2431640625, "learning_rate": 0.00039089285078736205, "loss": 3.04154052734375, "step": 5310 }, { "epoch": 0.20184792388606035, "grad_norm": 0.26953125, "learning_rate": 0.0003908570885767625, "loss": 3.06988639831543, "step": 5320 }, { "epoch": 0.20222733727682363, "grad_norm": 0.263671875, "learning_rate": 0.0003908212579300666, "loss": 3.087972068786621, "step": 5330 }, { "epoch": 0.2026067506675869, "grad_norm": 0.25, "learning_rate": 0.00039078535886012244, "loss": 3.0782989501953124, "step": 5340 }, { "epoch": 0.20298616405835015, "grad_norm": 0.267578125, "learning_rate": 0.0003907493913798023, "loss": 3.0835615158081056, "step": 5350 }, { "epoch": 0.20336557744911343, "grad_norm": 0.26171875, "learning_rate": 0.0003907133555020032, "loss": 3.08081169128418, "step": 5360 }, { "epoch": 0.2037449908398767, "grad_norm": 0.271484375, "learning_rate": 0.0003906772512396466, "loss": 3.0684831619262694, "step": 5370 }, { "epoch": 0.20412440423064, "grad_norm": 0.265625, "learning_rate": 0.0003906410786056784, "loss": 3.0699722290039064, "step": 5380 }, { "epoch": 0.20450381762140324, "grad_norm": 0.265625, "learning_rate": 0.00039060483761306925, "loss": 3.0449508666992187, "step": 5390 }, { "epoch": 0.20488323101216652, "grad_norm": 0.2578125, "learning_rate": 0.0003905685282748141, "loss": 3.0783575057983397, "step": 5400 }, { "epoch": 0.2052626444029298, "grad_norm": 0.263671875, "learning_rate": 0.00039053215060393245, "loss": 3.055019760131836, "step": 5410 }, { "epoch": 0.20564205779369307, "grad_norm": 0.259765625, "learning_rate": 0.0003904957046134684, "loss": 3.0440196990966797, "step": 5420 }, { "epoch": 0.20602147118445632, "grad_norm": 0.2490234375, "learning_rate": 0.0003904591903164904, "loss": 3.058334732055664, "step": 5430 }, { "epoch": 0.2064008845752196, "grad_norm": 0.265625, "learning_rate": 0.00039042260772609153, "loss": 3.029117202758789, "step": 5440 }, { "epoch": 0.20678029796598288, "grad_norm": 0.26953125, "learning_rate": 0.0003903859568553894, "loss": 3.092864418029785, "step": 5450 }, { "epoch": 0.20715971135674616, "grad_norm": 0.26171875, "learning_rate": 0.0003903492377175258, "loss": 3.026767349243164, "step": 5460 }, { "epoch": 0.20753912474750943, "grad_norm": 0.25390625, "learning_rate": 0.0003903124503256673, "loss": 3.052578353881836, "step": 5470 }, { "epoch": 0.20791853813827268, "grad_norm": 0.2578125, "learning_rate": 0.00039027559469300484, "loss": 3.0441753387451174, "step": 5480 }, { "epoch": 0.20829795152903596, "grad_norm": 0.25390625, "learning_rate": 0.0003902386708327538, "loss": 3.059152030944824, "step": 5490 }, { "epoch": 0.20867736491979924, "grad_norm": 0.255859375, "learning_rate": 0.00039020167875815413, "loss": 3.0807073593139647, "step": 5500 }, { "epoch": 0.20867736491979924, "eval_loss": 3.0547823905944824, "eval_runtime": 191.6277, "eval_samples_per_second": 19.877, "eval_steps_per_second": 3.314, "step": 5500 }, { "epoch": 0.20905677831056252, "grad_norm": 0.255859375, "learning_rate": 0.0003901646184824701, "loss": 3.057386779785156, "step": 5510 }, { "epoch": 0.20943619170132577, "grad_norm": 0.259765625, "learning_rate": 0.0003901274900189906, "loss": 3.0476299285888673, "step": 5520 }, { "epoch": 0.20981560509208905, "grad_norm": 0.248046875, "learning_rate": 0.00039009029338102884, "loss": 3.022732162475586, "step": 5530 }, { "epoch": 0.21019501848285232, "grad_norm": 0.271484375, "learning_rate": 0.0003900530285819223, "loss": 3.069116973876953, "step": 5540 }, { "epoch": 0.2105744318736156, "grad_norm": 0.263671875, "learning_rate": 0.00039001569563503353, "loss": 3.0568925857543947, "step": 5550 }, { "epoch": 0.21095384526437885, "grad_norm": 0.267578125, "learning_rate": 0.00038997829455374866, "loss": 3.0618032455444335, "step": 5560 }, { "epoch": 0.21133325865514213, "grad_norm": 0.26171875, "learning_rate": 0.000389940825351479, "loss": 3.063393211364746, "step": 5570 }, { "epoch": 0.2117126720459054, "grad_norm": 0.255859375, "learning_rate": 0.00038990328804165984, "loss": 3.0570919036865236, "step": 5580 }, { "epoch": 0.2120920854366687, "grad_norm": 0.271484375, "learning_rate": 0.00038986568263775105, "loss": 3.01767635345459, "step": 5590 }, { "epoch": 0.21247149882743194, "grad_norm": 0.265625, "learning_rate": 0.00038982800915323687, "loss": 3.074551010131836, "step": 5600 }, { "epoch": 0.21285091221819522, "grad_norm": 0.302734375, "learning_rate": 0.000389790267601626, "loss": 3.056790351867676, "step": 5610 }, { "epoch": 0.2132303256089585, "grad_norm": 0.275390625, "learning_rate": 0.0003897524579964515, "loss": 3.0540019989013674, "step": 5620 }, { "epoch": 0.21360973899972177, "grad_norm": 0.2734375, "learning_rate": 0.0003897145803512709, "loss": 3.0578676223754884, "step": 5630 }, { "epoch": 0.21398915239048502, "grad_norm": 0.26953125, "learning_rate": 0.000389676634679666, "loss": 3.0529884338378905, "step": 5640 }, { "epoch": 0.2143685657812483, "grad_norm": 0.2734375, "learning_rate": 0.0003896386209952431, "loss": 3.029806900024414, "step": 5650 }, { "epoch": 0.21474797917201158, "grad_norm": 0.279296875, "learning_rate": 0.000389600539311633, "loss": 3.05550537109375, "step": 5660 }, { "epoch": 0.21512739256277486, "grad_norm": 0.267578125, "learning_rate": 0.00038956238964249056, "loss": 3.0260597229003907, "step": 5670 }, { "epoch": 0.21550680595353813, "grad_norm": 0.28125, "learning_rate": 0.0003895241720014952, "loss": 3.04888858795166, "step": 5680 }, { "epoch": 0.21588621934430138, "grad_norm": 0.265625, "learning_rate": 0.0003894858864023509, "loss": 3.0761295318603517, "step": 5690 }, { "epoch": 0.21626563273506466, "grad_norm": 0.255859375, "learning_rate": 0.0003894475328587856, "loss": 3.0505929946899415, "step": 5700 }, { "epoch": 0.21664504612582794, "grad_norm": 0.267578125, "learning_rate": 0.000389409111384552, "loss": 3.063776206970215, "step": 5710 }, { "epoch": 0.21702445951659122, "grad_norm": 0.255859375, "learning_rate": 0.00038937062199342686, "loss": 3.0979862213134766, "step": 5720 }, { "epoch": 0.21740387290735447, "grad_norm": 0.275390625, "learning_rate": 0.0003893320646992114, "loss": 3.1099876403808593, "step": 5730 }, { "epoch": 0.21778328629811775, "grad_norm": 0.27734375, "learning_rate": 0.0003892934395157314, "loss": 3.0678497314453126, "step": 5740 }, { "epoch": 0.21816269968888102, "grad_norm": 0.267578125, "learning_rate": 0.0003892547464568366, "loss": 3.0380367279052733, "step": 5750 }, { "epoch": 0.21816269968888102, "eval_loss": 3.0460784435272217, "eval_runtime": 190.9281, "eval_samples_per_second": 19.95, "eval_steps_per_second": 3.326, "step": 5750 }, { "epoch": 0.2185421130796443, "grad_norm": 0.265625, "learning_rate": 0.0003892159855364013, "loss": 3.050878715515137, "step": 5760 }, { "epoch": 0.21892152647040755, "grad_norm": 0.255859375, "learning_rate": 0.0003891771567683241, "loss": 3.041689872741699, "step": 5770 }, { "epoch": 0.21930093986117083, "grad_norm": 0.265625, "learning_rate": 0.00038913826016652805, "loss": 3.006458282470703, "step": 5780 }, { "epoch": 0.2196803532519341, "grad_norm": 0.26953125, "learning_rate": 0.0003890992957449602, "loss": 3.0653383255004885, "step": 5790 }, { "epoch": 0.22005976664269739, "grad_norm": 0.2734375, "learning_rate": 0.00038906026351759223, "loss": 3.0421953201293945, "step": 5800 }, { "epoch": 0.22043918003346064, "grad_norm": 0.25, "learning_rate": 0.00038902116349842, "loss": 3.051733207702637, "step": 5810 }, { "epoch": 0.2208185934242239, "grad_norm": 0.25, "learning_rate": 0.00038898199570146377, "loss": 3.071940040588379, "step": 5820 }, { "epoch": 0.2211980068149872, "grad_norm": 0.2578125, "learning_rate": 0.0003889427601407679, "loss": 3.0515012741088867, "step": 5830 }, { "epoch": 0.22157742020575047, "grad_norm": 0.25390625, "learning_rate": 0.00038890345683040127, "loss": 3.0541091918945313, "step": 5840 }, { "epoch": 0.22195683359651372, "grad_norm": 0.26171875, "learning_rate": 0.00038886408578445695, "loss": 3.032663345336914, "step": 5850 }, { "epoch": 0.222336246987277, "grad_norm": 0.328125, "learning_rate": 0.00038882464701705226, "loss": 3.0498470306396483, "step": 5860 }, { "epoch": 0.22271566037804028, "grad_norm": 0.26171875, "learning_rate": 0.000388785140542329, "loss": 3.033738899230957, "step": 5870 }, { "epoch": 0.22309507376880355, "grad_norm": 0.255859375, "learning_rate": 0.000388745566374453, "loss": 3.0626750946044923, "step": 5880 }, { "epoch": 0.22347448715956683, "grad_norm": 0.279296875, "learning_rate": 0.00038870592452761444, "loss": 3.052654838562012, "step": 5890 }, { "epoch": 0.22385390055033008, "grad_norm": 0.27734375, "learning_rate": 0.00038866621501602774, "loss": 3.0278879165649415, "step": 5900 }, { "epoch": 0.22423331394109336, "grad_norm": 0.2734375, "learning_rate": 0.00038862643785393187, "loss": 3.0542728424072267, "step": 5910 }, { "epoch": 0.22461272733185664, "grad_norm": 0.255859375, "learning_rate": 0.0003885865930555896, "loss": 3.0882007598876955, "step": 5920 }, { "epoch": 0.22499214072261992, "grad_norm": 0.263671875, "learning_rate": 0.0003885466806352883, "loss": 3.048355484008789, "step": 5930 }, { "epoch": 0.22537155411338317, "grad_norm": 0.76171875, "learning_rate": 0.00038850670060733945, "loss": 3.0425302505493166, "step": 5940 }, { "epoch": 0.22575096750414644, "grad_norm": 0.259765625, "learning_rate": 0.0003884666529860787, "loss": 3.0434797286987303, "step": 5950 }, { "epoch": 0.22613038089490972, "grad_norm": 0.259765625, "learning_rate": 0.0003884265377858661, "loss": 3.0820554733276366, "step": 5960 }, { "epoch": 0.226509794285673, "grad_norm": 0.24609375, "learning_rate": 0.00038838635502108583, "loss": 3.039723205566406, "step": 5970 }, { "epoch": 0.22688920767643625, "grad_norm": 0.259765625, "learning_rate": 0.0003883461047061464, "loss": 3.0626552581787108, "step": 5980 }, { "epoch": 0.22726862106719953, "grad_norm": 0.25390625, "learning_rate": 0.0003883057868554803, "loss": 3.069881057739258, "step": 5990 }, { "epoch": 0.2276480344579628, "grad_norm": 0.4140625, "learning_rate": 0.0003882654014835446, "loss": 3.022809600830078, "step": 6000 }, { "epoch": 0.2276480344579628, "eval_loss": 3.0358755588531494, "eval_runtime": 190.4336, "eval_samples_per_second": 20.002, "eval_steps_per_second": 3.334, "step": 6000 }, { "epoch": 0.22802744784872608, "grad_norm": 0.26171875, "learning_rate": 0.00038822494860482024, "loss": 3.0434757232666017, "step": 6010 }, { "epoch": 0.22840686123948933, "grad_norm": 0.2490234375, "learning_rate": 0.0003881844282338126, "loss": 3.0322269439697265, "step": 6020 }, { "epoch": 0.2287862746302526, "grad_norm": 0.251953125, "learning_rate": 0.000388143840385051, "loss": 3.0294137954711915, "step": 6030 }, { "epoch": 0.2291656880210159, "grad_norm": 0.27734375, "learning_rate": 0.00038810318507308933, "loss": 3.0562149047851563, "step": 6040 }, { "epoch": 0.22954510141177917, "grad_norm": 0.26953125, "learning_rate": 0.0003880624623125054, "loss": 3.078971099853516, "step": 6050 }, { "epoch": 0.22992451480254242, "grad_norm": 0.26953125, "learning_rate": 0.0003880216721179012, "loss": 3.0268815994262694, "step": 6060 }, { "epoch": 0.2303039281933057, "grad_norm": 0.2890625, "learning_rate": 0.000387980814503903, "loss": 3.0286006927490234, "step": 6070 }, { "epoch": 0.23068334158406897, "grad_norm": 0.259765625, "learning_rate": 0.0003879398894851614, "loss": 3.054275894165039, "step": 6080 }, { "epoch": 0.23106275497483225, "grad_norm": 0.25390625, "learning_rate": 0.0003878988970763506, "loss": 3.044801712036133, "step": 6090 }, { "epoch": 0.23144216836559553, "grad_norm": 0.267578125, "learning_rate": 0.00038785783729216976, "loss": 3.0150835037231447, "step": 6100 }, { "epoch": 0.23182158175635878, "grad_norm": 0.287109375, "learning_rate": 0.00038781671014734146, "loss": 3.060421371459961, "step": 6110 }, { "epoch": 0.23220099514712206, "grad_norm": 0.2734375, "learning_rate": 0.000387775515656613, "loss": 3.0595746994018556, "step": 6120 }, { "epoch": 0.23258040853788534, "grad_norm": 0.2578125, "learning_rate": 0.0003877342538347554, "loss": 3.010902214050293, "step": 6130 }, { "epoch": 0.2329598219286486, "grad_norm": 0.26171875, "learning_rate": 0.00038769292469656414, "loss": 3.030173683166504, "step": 6140 }, { "epoch": 0.23333923531941186, "grad_norm": 0.255859375, "learning_rate": 0.0003876515282568587, "loss": 3.067148208618164, "step": 6150 }, { "epoch": 0.23371864871017514, "grad_norm": 0.255859375, "learning_rate": 0.00038761006453048267, "loss": 3.0377218246459963, "step": 6160 }, { "epoch": 0.23409806210093842, "grad_norm": 0.2578125, "learning_rate": 0.00038756853353230376, "loss": 3.0612133026123045, "step": 6170 }, { "epoch": 0.2344774754917017, "grad_norm": 0.265625, "learning_rate": 0.00038752693527721393, "loss": 3.0273324966430666, "step": 6180 }, { "epoch": 0.23485688888246495, "grad_norm": 0.25390625, "learning_rate": 0.00038748526978012914, "loss": 3.018521308898926, "step": 6190 }, { "epoch": 0.23523630227322823, "grad_norm": 0.25390625, "learning_rate": 0.0003874435370559895, "loss": 3.0423206329345702, "step": 6200 }, { "epoch": 0.2356157156639915, "grad_norm": 0.251953125, "learning_rate": 0.0003874017371197591, "loss": 3.046007537841797, "step": 6210 }, { "epoch": 0.23599512905475478, "grad_norm": 0.2578125, "learning_rate": 0.00038735986998642637, "loss": 3.01108455657959, "step": 6220 }, { "epoch": 0.23637454244551803, "grad_norm": 0.267578125, "learning_rate": 0.00038731793567100376, "loss": 3.046634292602539, "step": 6230 }, { "epoch": 0.2367539558362813, "grad_norm": 0.326171875, "learning_rate": 0.0003872759341885276, "loss": 3.022522735595703, "step": 6240 }, { "epoch": 0.2371333692270446, "grad_norm": 0.27734375, "learning_rate": 0.00038723386555405867, "loss": 3.0167903900146484, "step": 6250 }, { "epoch": 0.2371333692270446, "eval_loss": 3.027198314666748, "eval_runtime": 189.6678, "eval_samples_per_second": 20.082, "eval_steps_per_second": 3.348, "step": 6250 }, { "epoch": 0.23751278261780787, "grad_norm": 0.2578125, "learning_rate": 0.0003871917297826814, "loss": 3.0325300216674806, "step": 6260 }, { "epoch": 0.23789219600857112, "grad_norm": 0.2470703125, "learning_rate": 0.00038714952688950456, "loss": 3.053480339050293, "step": 6270 }, { "epoch": 0.2382716093993344, "grad_norm": 0.2490234375, "learning_rate": 0.0003871072568896611, "loss": 2.9979526519775392, "step": 6280 }, { "epoch": 0.23865102279009767, "grad_norm": 0.265625, "learning_rate": 0.00038706491979830773, "loss": 3.0213884353637694, "step": 6290 }, { "epoch": 0.23903043618086095, "grad_norm": 0.275390625, "learning_rate": 0.0003870225156306255, "loss": 3.015676498413086, "step": 6300 }, { "epoch": 0.23940984957162423, "grad_norm": 0.28515625, "learning_rate": 0.00038698004440181923, "loss": 3.0345455169677735, "step": 6310 }, { "epoch": 0.23978926296238748, "grad_norm": 0.248046875, "learning_rate": 0.000386937506127118, "loss": 3.038931655883789, "step": 6320 }, { "epoch": 0.24016867635315076, "grad_norm": 0.263671875, "learning_rate": 0.00038689490082177485, "loss": 3.071193504333496, "step": 6330 }, { "epoch": 0.24054808974391403, "grad_norm": 0.263671875, "learning_rate": 0.0003868522285010669, "loss": 3.019996452331543, "step": 6340 }, { "epoch": 0.2409275031346773, "grad_norm": 0.251953125, "learning_rate": 0.0003868094891802953, "loss": 3.047524261474609, "step": 6350 }, { "epoch": 0.24130691652544056, "grad_norm": 0.255859375, "learning_rate": 0.00038676668287478507, "loss": 3.0281131744384764, "step": 6360 }, { "epoch": 0.24168632991620384, "grad_norm": 0.26171875, "learning_rate": 0.00038672380959988545, "loss": 2.9860763549804688, "step": 6370 }, { "epoch": 0.24206574330696712, "grad_norm": 0.263671875, "learning_rate": 0.0003866808693709696, "loss": 3.0220035552978515, "step": 6380 }, { "epoch": 0.2424451566977304, "grad_norm": 0.26171875, "learning_rate": 0.0003866378622034348, "loss": 3.0200822830200194, "step": 6390 }, { "epoch": 0.24282457008849365, "grad_norm": 0.263671875, "learning_rate": 0.0003865947881127021, "loss": 3.0201095581054687, "step": 6400 }, { "epoch": 0.24320398347925692, "grad_norm": 0.25, "learning_rate": 0.00038655164711421666, "loss": 3.0351734161376953, "step": 6410 }, { "epoch": 0.2435833968700202, "grad_norm": 0.2578125, "learning_rate": 0.00038650843922344785, "loss": 3.04272403717041, "step": 6420 }, { "epoch": 0.24396281026078348, "grad_norm": 0.255859375, "learning_rate": 0.00038646516445588864, "loss": 3.005428504943848, "step": 6430 }, { "epoch": 0.24434222365154673, "grad_norm": 0.255859375, "learning_rate": 0.00038642182282705624, "loss": 3.020662307739258, "step": 6440 }, { "epoch": 0.24472163704231, "grad_norm": 0.2578125, "learning_rate": 0.0003863784143524917, "loss": 3.0340269088745115, "step": 6450 }, { "epoch": 0.2451010504330733, "grad_norm": 0.2578125, "learning_rate": 0.0003863349390477603, "loss": 3.0278892517089844, "step": 6460 }, { "epoch": 0.24548046382383656, "grad_norm": 0.259765625, "learning_rate": 0.00038629139692845086, "loss": 3.029013824462891, "step": 6470 }, { "epoch": 0.24585987721459981, "grad_norm": 0.26171875, "learning_rate": 0.0003862477880101765, "loss": 3.015666389465332, "step": 6480 }, { "epoch": 0.2462392906053631, "grad_norm": 0.2490234375, "learning_rate": 0.0003862041123085742, "loss": 3.0166141510009767, "step": 6490 }, { "epoch": 0.24661870399612637, "grad_norm": 0.267578125, "learning_rate": 0.0003861603698393048, "loss": 3.027622604370117, "step": 6500 }, { "epoch": 0.24661870399612637, "eval_loss": 3.0193469524383545, "eval_runtime": 189.3526, "eval_samples_per_second": 20.116, "eval_steps_per_second": 3.354, "step": 6500 }, { "epoch": 0.24699811738688965, "grad_norm": 0.2578125, "learning_rate": 0.0003861165606180531, "loss": 3.0049823760986327, "step": 6510 }, { "epoch": 0.24737753077765293, "grad_norm": 0.251953125, "learning_rate": 0.000386072684660528, "loss": 3.02755069732666, "step": 6520 }, { "epoch": 0.24775694416841618, "grad_norm": 0.25, "learning_rate": 0.0003860287419824621, "loss": 3.0094528198242188, "step": 6530 }, { "epoch": 0.24813635755917945, "grad_norm": 0.265625, "learning_rate": 0.00038598473259961213, "loss": 3.014842987060547, "step": 6540 }, { "epoch": 0.24851577094994273, "grad_norm": 0.2470703125, "learning_rate": 0.0003859406565277586, "loss": 2.998270606994629, "step": 6550 }, { "epoch": 0.248895184340706, "grad_norm": 0.26171875, "learning_rate": 0.000385896513782706, "loss": 3.0502750396728517, "step": 6560 }, { "epoch": 0.24927459773146926, "grad_norm": 0.26953125, "learning_rate": 0.00038585230438028265, "loss": 3.011553955078125, "step": 6570 }, { "epoch": 0.24965401112223254, "grad_norm": 0.275390625, "learning_rate": 0.0003858080283363409, "loss": 3.000467300415039, "step": 6580 }, { "epoch": 0.2500334245129958, "grad_norm": 0.265625, "learning_rate": 0.0003857636856667568, "loss": 3.0071460723876955, "step": 6590 }, { "epoch": 0.25041283790375907, "grad_norm": 0.251953125, "learning_rate": 0.0003857192763874305, "loss": 3.0259674072265623, "step": 6600 }, { "epoch": 0.2507922512945224, "grad_norm": 0.248046875, "learning_rate": 0.0003856748005142859, "loss": 2.9950077056884767, "step": 6610 }, { "epoch": 0.2511716646852856, "grad_norm": 0.265625, "learning_rate": 0.000385630258063271, "loss": 3.0170356750488283, "step": 6620 }, { "epoch": 0.2515510780760489, "grad_norm": 0.255859375, "learning_rate": 0.00038558564905035716, "loss": 3.0212745666503906, "step": 6630 }, { "epoch": 0.2519304914668122, "grad_norm": 0.251953125, "learning_rate": 0.0003855409734915402, "loss": 3.0228708267211912, "step": 6640 }, { "epoch": 0.25230990485757543, "grad_norm": 0.255859375, "learning_rate": 0.00038549623140283953, "loss": 3.0243675231933596, "step": 6650 }, { "epoch": 0.25268931824833873, "grad_norm": 0.265625, "learning_rate": 0.00038545142280029833, "loss": 2.999564361572266, "step": 6660 }, { "epoch": 0.253068731639102, "grad_norm": 0.267578125, "learning_rate": 0.0003854065476999838, "loss": 3.011488914489746, "step": 6670 }, { "epoch": 0.25344814502986523, "grad_norm": 0.2578125, "learning_rate": 0.0003853616061179869, "loss": 3.022006034851074, "step": 6680 }, { "epoch": 0.25382755842062854, "grad_norm": 0.267578125, "learning_rate": 0.0003853165980704224, "loss": 3.018946075439453, "step": 6690 }, { "epoch": 0.2542069718113918, "grad_norm": 0.265625, "learning_rate": 0.000385271523573429, "loss": 3.0323249816894533, "step": 6700 }, { "epoch": 0.25458638520215504, "grad_norm": 0.255859375, "learning_rate": 0.00038522638264316917, "loss": 2.9912452697753906, "step": 6710 }, { "epoch": 0.25496579859291835, "grad_norm": 0.26171875, "learning_rate": 0.0003851811752958292, "loss": 2.978348731994629, "step": 6720 }, { "epoch": 0.2553452119836816, "grad_norm": 0.263671875, "learning_rate": 0.00038513590154761916, "loss": 2.985123634338379, "step": 6730 }, { "epoch": 0.2557246253744449, "grad_norm": 0.251953125, "learning_rate": 0.000385090561414773, "loss": 2.9994382858276367, "step": 6740 }, { "epoch": 0.25610403876520815, "grad_norm": 0.26171875, "learning_rate": 0.0003850451549135485, "loss": 3.00350284576416, "step": 6750 }, { "epoch": 0.25610403876520815, "eval_loss": 3.008527994155884, "eval_runtime": 188.711, "eval_samples_per_second": 20.184, "eval_steps_per_second": 3.365, "step": 6750 }, { "epoch": 0.2564834521559714, "grad_norm": 0.267578125, "learning_rate": 0.00038499968206022705, "loss": 2.9969425201416016, "step": 6760 }, { "epoch": 0.2568628655467347, "grad_norm": 0.2578125, "learning_rate": 0.0003849541428711141, "loss": 3.0168811798095705, "step": 6770 }, { "epoch": 0.25724227893749796, "grad_norm": 0.251953125, "learning_rate": 0.00038490853736253863, "loss": 3.000655937194824, "step": 6780 }, { "epoch": 0.25762169232826126, "grad_norm": 0.25390625, "learning_rate": 0.0003848628655508537, "loss": 3.013784980773926, "step": 6790 }, { "epoch": 0.2580011057190245, "grad_norm": 0.25390625, "learning_rate": 0.0003848171274524357, "loss": 2.9875741958618165, "step": 6800 }, { "epoch": 0.25838051910978777, "grad_norm": 0.279296875, "learning_rate": 0.00038477132308368525, "loss": 2.9930179595947264, "step": 6810 }, { "epoch": 0.25875993250055107, "grad_norm": 0.255859375, "learning_rate": 0.00038472545246102653, "loss": 3.0150022506713867, "step": 6820 }, { "epoch": 0.2591393458913143, "grad_norm": 0.275390625, "learning_rate": 0.00038467951560090737, "loss": 3.0006885528564453, "step": 6830 }, { "epoch": 0.25951875928207757, "grad_norm": 0.259765625, "learning_rate": 0.0003846335125197995, "loss": 2.9965118408203124, "step": 6840 }, { "epoch": 0.2598981726728409, "grad_norm": 0.26171875, "learning_rate": 0.0003845874432341984, "loss": 2.965704345703125, "step": 6850 }, { "epoch": 0.2602775860636041, "grad_norm": 0.271484375, "learning_rate": 0.00038454130776062326, "loss": 3.002559280395508, "step": 6860 }, { "epoch": 0.26065699945436743, "grad_norm": 0.25390625, "learning_rate": 0.00038449510611561694, "loss": 2.9910932540893556, "step": 6870 }, { "epoch": 0.2610364128451307, "grad_norm": 0.27734375, "learning_rate": 0.0003844488383157461, "loss": 3.028472328186035, "step": 6880 }, { "epoch": 0.26141582623589393, "grad_norm": 0.384765625, "learning_rate": 0.00038440250437760115, "loss": 2.998325157165527, "step": 6890 }, { "epoch": 0.26179523962665724, "grad_norm": 0.2470703125, "learning_rate": 0.000384356104317796, "loss": 3.00711612701416, "step": 6900 }, { "epoch": 0.2621746530174205, "grad_norm": 0.259765625, "learning_rate": 0.0003843096381529686, "loss": 2.9970094680786135, "step": 6910 }, { "epoch": 0.26255406640818374, "grad_norm": 0.255859375, "learning_rate": 0.0003842631058997804, "loss": 3.0075603485107423, "step": 6920 }, { "epoch": 0.26293347979894705, "grad_norm": 0.2578125, "learning_rate": 0.0003842165075749166, "loss": 2.9960258483886717, "step": 6930 }, { "epoch": 0.2633128931897103, "grad_norm": 0.2578125, "learning_rate": 0.000384169843195086, "loss": 3.0225095748901367, "step": 6940 }, { "epoch": 0.2636923065804736, "grad_norm": 0.26171875, "learning_rate": 0.0003841231127770212, "loss": 2.998221588134766, "step": 6950 }, { "epoch": 0.26407171997123685, "grad_norm": 0.251953125, "learning_rate": 0.0003840763163374784, "loss": 3.0349639892578124, "step": 6960 }, { "epoch": 0.2644511333620001, "grad_norm": 0.27734375, "learning_rate": 0.00038402945389323764, "loss": 2.963156318664551, "step": 6970 }, { "epoch": 0.2648305467527634, "grad_norm": 0.267578125, "learning_rate": 0.00038398252546110236, "loss": 2.973239517211914, "step": 6980 }, { "epoch": 0.26520996014352666, "grad_norm": 0.25, "learning_rate": 0.0003839355310578999, "loss": 3.011526870727539, "step": 6990 }, { "epoch": 0.26558937353428996, "grad_norm": 0.24609375, "learning_rate": 0.0003838884707004811, "loss": 3.001215171813965, "step": 7000 }, { "epoch": 0.26558937353428996, "eval_loss": 3.000521659851074, "eval_runtime": 188.5206, "eval_samples_per_second": 20.205, "eval_steps_per_second": 3.368, "step": 7000 }, { "epoch": 0.2659687869250532, "grad_norm": 0.263671875, "learning_rate": 0.0003838413444057205, "loss": 3.002371406555176, "step": 7010 }, { "epoch": 0.26634820031581646, "grad_norm": 0.275390625, "learning_rate": 0.00038379415219051645, "loss": 2.9903669357299805, "step": 7020 }, { "epoch": 0.26672761370657977, "grad_norm": 0.24609375, "learning_rate": 0.00038374689407179053, "loss": 3.002288818359375, "step": 7030 }, { "epoch": 0.267107027097343, "grad_norm": 0.259765625, "learning_rate": 0.00038369957006648836, "loss": 2.9953590393066407, "step": 7040 }, { "epoch": 0.26748644048810627, "grad_norm": 0.298828125, "learning_rate": 0.000383652180191579, "loss": 3.015280342102051, "step": 7050 }, { "epoch": 0.2678658538788696, "grad_norm": 0.248046875, "learning_rate": 0.0003836047244640551, "loss": 2.9982473373413088, "step": 7060 }, { "epoch": 0.2682452672696328, "grad_norm": 0.25, "learning_rate": 0.0003835572029009331, "loss": 2.9682384490966798, "step": 7070 }, { "epoch": 0.26862468066039613, "grad_norm": 0.263671875, "learning_rate": 0.0003835096155192528, "loss": 3.0237173080444335, "step": 7080 }, { "epoch": 0.2690040940511594, "grad_norm": 0.2470703125, "learning_rate": 0.0003834619623360777, "loss": 2.994947052001953, "step": 7090 }, { "epoch": 0.26938350744192263, "grad_norm": 0.283203125, "learning_rate": 0.0003834142433684951, "loss": 3.0062509536743165, "step": 7100 }, { "epoch": 0.26976292083268594, "grad_norm": 0.267578125, "learning_rate": 0.00038336645863361557, "loss": 3.041251373291016, "step": 7110 }, { "epoch": 0.2701423342234492, "grad_norm": 0.251953125, "learning_rate": 0.0003833186081485734, "loss": 3.0251136779785157, "step": 7120 }, { "epoch": 0.27052174761421244, "grad_norm": 0.26953125, "learning_rate": 0.0003832706919305265, "loss": 3.028840446472168, "step": 7130 }, { "epoch": 0.27090116100497574, "grad_norm": 0.25390625, "learning_rate": 0.00038322270999665626, "loss": 3.008879280090332, "step": 7140 }, { "epoch": 0.271280574395739, "grad_norm": 0.275390625, "learning_rate": 0.0003831746623641677, "loss": 3.0107467651367186, "step": 7150 }, { "epoch": 0.2716599877865023, "grad_norm": 0.251953125, "learning_rate": 0.00038312654905028935, "loss": 3.013968658447266, "step": 7160 }, { "epoch": 0.27203940117726555, "grad_norm": 0.259765625, "learning_rate": 0.00038307837007227347, "loss": 3.0311410903930662, "step": 7170 }, { "epoch": 0.2724188145680288, "grad_norm": 0.259765625, "learning_rate": 0.00038303012544739555, "loss": 2.9807315826416017, "step": 7180 }, { "epoch": 0.2727982279587921, "grad_norm": 0.25, "learning_rate": 0.00038298181519295476, "loss": 2.988920211791992, "step": 7190 }, { "epoch": 0.27317764134955536, "grad_norm": 0.306640625, "learning_rate": 0.00038293343932627394, "loss": 2.99883975982666, "step": 7200 }, { "epoch": 0.27355705474031866, "grad_norm": 0.322265625, "learning_rate": 0.0003828849978646994, "loss": 2.998494529724121, "step": 7210 }, { "epoch": 0.2739364681310819, "grad_norm": 0.25, "learning_rate": 0.0003828364908256007, "loss": 3.002914619445801, "step": 7220 }, { "epoch": 0.27431588152184516, "grad_norm": 0.255859375, "learning_rate": 0.0003827879182263713, "loss": 2.9851640701293944, "step": 7230 }, { "epoch": 0.27469529491260847, "grad_norm": 0.2578125, "learning_rate": 0.00038273928008442787, "loss": 3.0035892486572267, "step": 7240 }, { "epoch": 0.2750747083033717, "grad_norm": 0.25390625, "learning_rate": 0.0003826905764172109, "loss": 3.0013660430908202, "step": 7250 }, { "epoch": 0.2750747083033717, "eval_loss": 2.994936943054199, "eval_runtime": 187.8492, "eval_samples_per_second": 20.277, "eval_steps_per_second": 3.38, "step": 7250 }, { "epoch": 0.27545412169413497, "grad_norm": 0.2490234375, "learning_rate": 0.000382641807242184, "loss": 3.0024702072143556, "step": 7260 }, { "epoch": 0.2758335350848983, "grad_norm": 0.25390625, "learning_rate": 0.0003825929725768345, "loss": 2.9864158630371094, "step": 7270 }, { "epoch": 0.2762129484756615, "grad_norm": 0.267578125, "learning_rate": 0.00038254407243867324, "loss": 3.0124532699584963, "step": 7280 }, { "epoch": 0.27659236186642483, "grad_norm": 0.259765625, "learning_rate": 0.0003824951068452344, "loss": 3.0020378112792967, "step": 7290 }, { "epoch": 0.2769717752571881, "grad_norm": 0.265625, "learning_rate": 0.00038244607581407566, "loss": 2.9993045806884764, "step": 7300 }, { "epoch": 0.27735118864795133, "grad_norm": 0.26171875, "learning_rate": 0.00038239697936277823, "loss": 2.984160804748535, "step": 7310 }, { "epoch": 0.27773060203871464, "grad_norm": 0.259765625, "learning_rate": 0.00038234781750894677, "loss": 3.011053466796875, "step": 7320 }, { "epoch": 0.2781100154294779, "grad_norm": 0.259765625, "learning_rate": 0.0003822985902702094, "loss": 2.9880607604980467, "step": 7330 }, { "epoch": 0.27848942882024114, "grad_norm": 0.26171875, "learning_rate": 0.0003822492976642175, "loss": 2.9903610229492186, "step": 7340 }, { "epoch": 0.27886884221100444, "grad_norm": 0.2578125, "learning_rate": 0.00038219993970864624, "loss": 2.999019432067871, "step": 7350 }, { "epoch": 0.2792482556017677, "grad_norm": 0.25390625, "learning_rate": 0.0003821505164211939, "loss": 3.026545524597168, "step": 7360 }, { "epoch": 0.279627668992531, "grad_norm": 0.28125, "learning_rate": 0.00038210102781958234, "loss": 3.0079200744628904, "step": 7370 }, { "epoch": 0.28000708238329425, "grad_norm": 0.255859375, "learning_rate": 0.0003820514739215568, "loss": 2.996330451965332, "step": 7380 }, { "epoch": 0.2803864957740575, "grad_norm": 0.2578125, "learning_rate": 0.00038200185474488596, "loss": 2.9758481979370117, "step": 7390 }, { "epoch": 0.2807659091648208, "grad_norm": 0.25, "learning_rate": 0.0003819521703073619, "loss": 3.014063835144043, "step": 7400 }, { "epoch": 0.28114532255558405, "grad_norm": 0.251953125, "learning_rate": 0.0003819024206268, "loss": 3.023019027709961, "step": 7410 }, { "epoch": 0.28152473594634736, "grad_norm": 0.26171875, "learning_rate": 0.00038185260572103926, "loss": 2.988862228393555, "step": 7420 }, { "epoch": 0.2819041493371106, "grad_norm": 0.244140625, "learning_rate": 0.00038180272560794183, "loss": 2.998501014709473, "step": 7430 }, { "epoch": 0.28228356272787386, "grad_norm": 0.25390625, "learning_rate": 0.0003817527803053934, "loss": 2.992526054382324, "step": 7440 }, { "epoch": 0.28266297611863717, "grad_norm": 0.25390625, "learning_rate": 0.000381702769831303, "loss": 2.9803266525268555, "step": 7450 }, { "epoch": 0.2830423895094004, "grad_norm": 0.26171875, "learning_rate": 0.0003816526942036029, "loss": 2.9752931594848633, "step": 7460 }, { "epoch": 0.28342180290016367, "grad_norm": 0.267578125, "learning_rate": 0.000381602553440249, "loss": 2.9878353118896483, "step": 7470 }, { "epoch": 0.28380121629092697, "grad_norm": 0.271484375, "learning_rate": 0.0003815523475592203, "loss": 2.9940359115600588, "step": 7480 }, { "epoch": 0.2841806296816902, "grad_norm": 0.26171875, "learning_rate": 0.0003815020765785192, "loss": 2.9945470809936525, "step": 7490 }, { "epoch": 0.28456004307245353, "grad_norm": 0.25, "learning_rate": 0.00038145174051617165, "loss": 3.009478759765625, "step": 7500 }, { "epoch": 0.28456004307245353, "eval_loss": 2.986814022064209, "eval_runtime": 188.212, "eval_samples_per_second": 20.238, "eval_steps_per_second": 3.374, "step": 7500 }, { "epoch": 0.2849394564632168, "grad_norm": 0.271484375, "learning_rate": 0.00038140133939022655, "loss": 3.0304492950439452, "step": 7510 }, { "epoch": 0.28531886985398003, "grad_norm": 0.263671875, "learning_rate": 0.0003813508732187566, "loss": 3.0003053665161135, "step": 7520 }, { "epoch": 0.28569828324474333, "grad_norm": 0.255859375, "learning_rate": 0.0003813003420198574, "loss": 3.0187971115112306, "step": 7530 }, { "epoch": 0.2860776966355066, "grad_norm": 0.26171875, "learning_rate": 0.0003812497458116481, "loss": 2.9801219940185546, "step": 7540 }, { "epoch": 0.28645711002626983, "grad_norm": 0.265625, "learning_rate": 0.00038119908461227116, "loss": 2.977411651611328, "step": 7550 }, { "epoch": 0.28683652341703314, "grad_norm": 0.251953125, "learning_rate": 0.00038114835843989223, "loss": 2.9921607971191406, "step": 7560 }, { "epoch": 0.2872159368077964, "grad_norm": 0.263671875, "learning_rate": 0.0003810975673127003, "loss": 2.9867013931274413, "step": 7570 }, { "epoch": 0.2875953501985597, "grad_norm": 0.259765625, "learning_rate": 0.0003810467112489077, "loss": 3.001681327819824, "step": 7580 }, { "epoch": 0.28797476358932295, "grad_norm": 0.2734375, "learning_rate": 0.00038099579026675, "loss": 2.985209846496582, "step": 7590 }, { "epoch": 0.2883541769800862, "grad_norm": 0.265625, "learning_rate": 0.00038094480438448613, "loss": 3.00831298828125, "step": 7600 }, { "epoch": 0.2887335903708495, "grad_norm": 0.271484375, "learning_rate": 0.00038089375362039817, "loss": 2.9684032440185546, "step": 7610 }, { "epoch": 0.28911300376161275, "grad_norm": 0.275390625, "learning_rate": 0.0003808426379927915, "loss": 3.0032602310180665, "step": 7620 }, { "epoch": 0.28949241715237606, "grad_norm": 0.26171875, "learning_rate": 0.0003807914575199948, "loss": 2.987193298339844, "step": 7630 }, { "epoch": 0.2898718305431393, "grad_norm": 0.2578125, "learning_rate": 0.00038074021222036, "loss": 2.9869512557983398, "step": 7640 }, { "epoch": 0.29025124393390256, "grad_norm": 0.240234375, "learning_rate": 0.0003806889021122622, "loss": 2.9818756103515627, "step": 7650 }, { "epoch": 0.29063065732466586, "grad_norm": 0.2578125, "learning_rate": 0.0003806375272140998, "loss": 2.970410919189453, "step": 7660 }, { "epoch": 0.2910100707154291, "grad_norm": 0.265625, "learning_rate": 0.0003805860875442945, "loss": 3.0024997711181642, "step": 7670 }, { "epoch": 0.29138948410619236, "grad_norm": 0.271484375, "learning_rate": 0.000380534583121291, "loss": 3.0034530639648436, "step": 7680 }, { "epoch": 0.29176889749695567, "grad_norm": 0.26171875, "learning_rate": 0.00038048301396355756, "loss": 3.0075313568115236, "step": 7690 }, { "epoch": 0.2921483108877189, "grad_norm": 0.255859375, "learning_rate": 0.0003804313800895853, "loss": 2.974080276489258, "step": 7700 }, { "epoch": 0.2925277242784822, "grad_norm": 0.265625, "learning_rate": 0.0003803796815178888, "loss": 2.993200492858887, "step": 7710 }, { "epoch": 0.2929071376692455, "grad_norm": 0.251953125, "learning_rate": 0.0003803279182670057, "loss": 2.967024803161621, "step": 7720 }, { "epoch": 0.2932865510600087, "grad_norm": 0.267578125, "learning_rate": 0.0003802760903554969, "loss": 2.9748504638671873, "step": 7730 }, { "epoch": 0.29366596445077203, "grad_norm": 0.2578125, "learning_rate": 0.00038022419780194645, "loss": 2.9818119049072265, "step": 7740 }, { "epoch": 0.2940453778415353, "grad_norm": 0.2578125, "learning_rate": 0.00038017224062496154, "loss": 2.9740381240844727, "step": 7750 }, { "epoch": 0.2940453778415353, "eval_loss": 2.979459047317505, "eval_runtime": 189.344, "eval_samples_per_second": 20.117, "eval_steps_per_second": 3.354, "step": 7750 }, { "epoch": 0.29442479123229853, "grad_norm": 0.24609375, "learning_rate": 0.00038012021884317265, "loss": 2.9567392349243162, "step": 7760 }, { "epoch": 0.29480420462306184, "grad_norm": 0.255859375, "learning_rate": 0.0003800681324752334, "loss": 2.968921661376953, "step": 7770 }, { "epoch": 0.2951836180138251, "grad_norm": 0.26171875, "learning_rate": 0.00038001598153982044, "loss": 3.001677131652832, "step": 7780 }, { "epoch": 0.2955630314045884, "grad_norm": 0.26953125, "learning_rate": 0.00037996376605563357, "loss": 2.9762947082519533, "step": 7790 }, { "epoch": 0.29594244479535164, "grad_norm": 0.2470703125, "learning_rate": 0.0003799114860413961, "loss": 3.004165840148926, "step": 7800 }, { "epoch": 0.2963218581861149, "grad_norm": 0.24609375, "learning_rate": 0.00037985914151585396, "loss": 2.9875755310058594, "step": 7810 }, { "epoch": 0.2967012715768782, "grad_norm": 0.353515625, "learning_rate": 0.0003798067324977765, "loss": 2.9743780136108398, "step": 7820 }, { "epoch": 0.29708068496764145, "grad_norm": 0.275390625, "learning_rate": 0.00037975425900595633, "loss": 2.989496421813965, "step": 7830 }, { "epoch": 0.29746009835840476, "grad_norm": 0.259765625, "learning_rate": 0.00037970172105920874, "loss": 2.985903739929199, "step": 7840 }, { "epoch": 0.297839511749168, "grad_norm": 0.255859375, "learning_rate": 0.0003796491186763726, "loss": 3.001186752319336, "step": 7850 }, { "epoch": 0.29821892513993126, "grad_norm": 0.255859375, "learning_rate": 0.00037959645187630954, "loss": 2.986353302001953, "step": 7860 }, { "epoch": 0.29859833853069456, "grad_norm": 0.2578125, "learning_rate": 0.00037954372067790443, "loss": 2.991337013244629, "step": 7870 }, { "epoch": 0.2989777519214578, "grad_norm": 0.26953125, "learning_rate": 0.0003794909251000653, "loss": 2.975544548034668, "step": 7880 }, { "epoch": 0.29935716531222106, "grad_norm": 0.271484375, "learning_rate": 0.0003794380651617232, "loss": 2.9473474502563475, "step": 7890 }, { "epoch": 0.29973657870298437, "grad_norm": 0.26171875, "learning_rate": 0.0003793851408818322, "loss": 2.948260498046875, "step": 7900 }, { "epoch": 0.3001159920937476, "grad_norm": 0.267578125, "learning_rate": 0.00037933215227936947, "loss": 2.991917037963867, "step": 7910 }, { "epoch": 0.3004954054845109, "grad_norm": 0.25, "learning_rate": 0.0003792790993733353, "loss": 2.9717828750610353, "step": 7920 }, { "epoch": 0.3008748188752742, "grad_norm": 0.267578125, "learning_rate": 0.00037922598218275295, "loss": 2.9592878341674806, "step": 7930 }, { "epoch": 0.3012542322660374, "grad_norm": 0.259765625, "learning_rate": 0.00037917280072666883, "loss": 2.963702392578125, "step": 7940 }, { "epoch": 0.30163364565680073, "grad_norm": 0.248046875, "learning_rate": 0.00037911955502415237, "loss": 2.9732229232788088, "step": 7950 }, { "epoch": 0.302013059047564, "grad_norm": 0.25390625, "learning_rate": 0.00037906624509429594, "loss": 2.96759090423584, "step": 7960 }, { "epoch": 0.30239247243832723, "grad_norm": 0.26171875, "learning_rate": 0.00037901287095621505, "loss": 3.030630111694336, "step": 7970 }, { "epoch": 0.30277188582909054, "grad_norm": 0.259765625, "learning_rate": 0.00037895943262904826, "loss": 2.9968584060668944, "step": 7980 }, { "epoch": 0.3031512992198538, "grad_norm": 0.265625, "learning_rate": 0.00037890593013195693, "loss": 2.983555221557617, "step": 7990 }, { "epoch": 0.3035307126106171, "grad_norm": 0.267578125, "learning_rate": 0.00037885236348412577, "loss": 2.945888710021973, "step": 8000 }, { "epoch": 0.3035307126106171, "eval_loss": 2.9727110862731934, "eval_runtime": 189.4565, "eval_samples_per_second": 20.105, "eval_steps_per_second": 3.352, "step": 8000 }, { "epoch": 0.30391012600138034, "grad_norm": 0.2490234375, "learning_rate": 0.0003787987327047621, "loss": 2.956469917297363, "step": 8010 }, { "epoch": 0.3042895393921436, "grad_norm": 0.25390625, "learning_rate": 0.00037874503781309667, "loss": 2.9931638717651365, "step": 8020 }, { "epoch": 0.3046689527829069, "grad_norm": 0.27734375, "learning_rate": 0.0003786912788283828, "loss": 2.9602277755737303, "step": 8030 }, { "epoch": 0.30504836617367015, "grad_norm": 0.255859375, "learning_rate": 0.0003786374557698971, "loss": 2.996146392822266, "step": 8040 }, { "epoch": 0.30542777956443345, "grad_norm": 0.265625, "learning_rate": 0.00037858356865693896, "loss": 2.9844852447509767, "step": 8050 }, { "epoch": 0.3058071929551967, "grad_norm": 0.259765625, "learning_rate": 0.0003785296175088308, "loss": 2.977220916748047, "step": 8060 }, { "epoch": 0.30618660634595996, "grad_norm": 0.267578125, "learning_rate": 0.0003784756023449181, "loss": 2.981614875793457, "step": 8070 }, { "epoch": 0.30656601973672326, "grad_norm": 0.251953125, "learning_rate": 0.00037842152318456917, "loss": 2.9601680755615236, "step": 8080 }, { "epoch": 0.3069454331274865, "grad_norm": 0.26953125, "learning_rate": 0.00037836738004717527, "loss": 2.9674659729003907, "step": 8090 }, { "epoch": 0.30732484651824976, "grad_norm": 0.26171875, "learning_rate": 0.0003783131729521507, "loss": 2.9853858947753906, "step": 8100 }, { "epoch": 0.30770425990901307, "grad_norm": 0.255859375, "learning_rate": 0.00037825890191893263, "loss": 2.9626253128051756, "step": 8110 }, { "epoch": 0.3080836732997763, "grad_norm": 0.26953125, "learning_rate": 0.0003782045669669811, "loss": 2.9855104446411134, "step": 8120 }, { "epoch": 0.3084630866905396, "grad_norm": 0.25, "learning_rate": 0.0003781501681157791, "loss": 2.9517257690429686, "step": 8130 }, { "epoch": 0.3088425000813029, "grad_norm": 0.259765625, "learning_rate": 0.00037809570538483274, "loss": 2.974495506286621, "step": 8140 }, { "epoch": 0.3092219134720661, "grad_norm": 0.2578125, "learning_rate": 0.0003780411787936707, "loss": 2.9476388931274413, "step": 8150 }, { "epoch": 0.30960132686282943, "grad_norm": 0.2578125, "learning_rate": 0.00037798658836184473, "loss": 2.964387321472168, "step": 8160 }, { "epoch": 0.3099807402535927, "grad_norm": 0.263671875, "learning_rate": 0.0003779319341089295, "loss": 2.9908618927001953, "step": 8170 }, { "epoch": 0.31036015364435593, "grad_norm": 0.267578125, "learning_rate": 0.00037787721605452246, "loss": 2.9662683486938475, "step": 8180 }, { "epoch": 0.31073956703511924, "grad_norm": 0.2470703125, "learning_rate": 0.0003778224342182441, "loss": 2.9819658279418944, "step": 8190 }, { "epoch": 0.3111189804258825, "grad_norm": 0.251953125, "learning_rate": 0.00037776758861973747, "loss": 2.9464126586914063, "step": 8200 }, { "epoch": 0.3114983938166458, "grad_norm": 0.2392578125, "learning_rate": 0.000377712679278669, "loss": 2.949176025390625, "step": 8210 }, { "epoch": 0.31187780720740904, "grad_norm": 0.25, "learning_rate": 0.00037765770621472737, "loss": 2.9558231353759767, "step": 8220 }, { "epoch": 0.3122572205981723, "grad_norm": 0.265625, "learning_rate": 0.0003776026694476246, "loss": 3.002948760986328, "step": 8230 }, { "epoch": 0.3126366339889356, "grad_norm": 0.2734375, "learning_rate": 0.00037754756899709526, "loss": 2.979281997680664, "step": 8240 }, { "epoch": 0.31301604737969885, "grad_norm": 0.2470703125, "learning_rate": 0.0003774924048828969, "loss": 2.9571189880371094, "step": 8250 }, { "epoch": 0.31301604737969885, "eval_loss": 2.9668333530426025, "eval_runtime": 189.4531, "eval_samples_per_second": 20.105, "eval_steps_per_second": 3.352, "step": 8250 }, { "epoch": 0.31339546077046215, "grad_norm": 0.255859375, "learning_rate": 0.0003774371771248099, "loss": 2.9641178131103514, "step": 8260 }, { "epoch": 0.3137748741612254, "grad_norm": 0.259765625, "learning_rate": 0.0003773818857426373, "loss": 2.9648052215576173, "step": 8270 }, { "epoch": 0.31415428755198865, "grad_norm": 0.251953125, "learning_rate": 0.00037732653075620514, "loss": 2.961200141906738, "step": 8280 }, { "epoch": 0.31453370094275196, "grad_norm": 0.263671875, "learning_rate": 0.00037727111218536217, "loss": 2.9613069534301757, "step": 8290 }, { "epoch": 0.3149131143335152, "grad_norm": 0.27734375, "learning_rate": 0.00037721563004998, "loss": 2.9919605255126953, "step": 8300 }, { "epoch": 0.31529252772427846, "grad_norm": 0.2578125, "learning_rate": 0.00037716008436995295, "loss": 3.0031129837036135, "step": 8310 }, { "epoch": 0.31567194111504177, "grad_norm": 0.50390625, "learning_rate": 0.00037710447516519816, "loss": 2.994892120361328, "step": 8320 }, { "epoch": 0.316051354505805, "grad_norm": 0.2734375, "learning_rate": 0.00037704880245565563, "loss": 3.0254392623901367, "step": 8330 }, { "epoch": 0.3164307678965683, "grad_norm": 0.26171875, "learning_rate": 0.00037699306626128796, "loss": 2.994435119628906, "step": 8340 }, { "epoch": 0.31681018128733157, "grad_norm": 0.255859375, "learning_rate": 0.00037693726660208073, "loss": 2.953019714355469, "step": 8350 }, { "epoch": 0.3171895946780948, "grad_norm": 0.265625, "learning_rate": 0.000376881403498042, "loss": 2.9760093688964844, "step": 8360 }, { "epoch": 0.3175690080688581, "grad_norm": 0.244140625, "learning_rate": 0.000376825476969203, "loss": 2.9607412338256838, "step": 8370 }, { "epoch": 0.3179484214596214, "grad_norm": 0.279296875, "learning_rate": 0.0003767694870356172, "loss": 2.9652837753295898, "step": 8380 }, { "epoch": 0.31832783485038463, "grad_norm": 0.251953125, "learning_rate": 0.00037671343371736116, "loss": 2.945034980773926, "step": 8390 }, { "epoch": 0.31870724824114793, "grad_norm": 0.263671875, "learning_rate": 0.000376657317034534, "loss": 2.9599498748779296, "step": 8400 }, { "epoch": 0.3190866616319112, "grad_norm": 0.2734375, "learning_rate": 0.0003766011370072577, "loss": 2.9925575256347656, "step": 8410 }, { "epoch": 0.3194660750226745, "grad_norm": 0.279296875, "learning_rate": 0.0003765448936556768, "loss": 2.9391435623168944, "step": 8420 }, { "epoch": 0.31984548841343774, "grad_norm": 0.263671875, "learning_rate": 0.0003764885869999586, "loss": 2.9872079849243165, "step": 8430 }, { "epoch": 0.320224901804201, "grad_norm": 0.255859375, "learning_rate": 0.0003764322170602932, "loss": 2.9514612197875976, "step": 8440 }, { "epoch": 0.3206043151949643, "grad_norm": 0.255859375, "learning_rate": 0.00037637578385689316, "loss": 2.9579376220703124, "step": 8450 }, { "epoch": 0.32098372858572755, "grad_norm": 0.2451171875, "learning_rate": 0.0003763192874099941, "loss": 2.9618675231933596, "step": 8460 }, { "epoch": 0.3213631419764908, "grad_norm": 0.26171875, "learning_rate": 0.00037626272773985393, "loss": 2.9844648361206056, "step": 8470 }, { "epoch": 0.3217425553672541, "grad_norm": 0.26171875, "learning_rate": 0.0003762061048667534, "loss": 2.9733015060424806, "step": 8480 }, { "epoch": 0.32212196875801735, "grad_norm": 0.263671875, "learning_rate": 0.00037614941881099597, "loss": 2.982850456237793, "step": 8490 }, { "epoch": 0.32250138214878066, "grad_norm": 0.265625, "learning_rate": 0.0003760926695929076, "loss": 2.9512081146240234, "step": 8500 }, { "epoch": 0.32250138214878066, "eval_loss": 2.963548421859741, "eval_runtime": 189.5139, "eval_samples_per_second": 20.099, "eval_steps_per_second": 3.351, "step": 8500 }, { "epoch": 0.3228807955395439, "grad_norm": 0.28125, "learning_rate": 0.0004, "loss": 2.944461631774902, "step": 8510 }, { "epoch": 0.32326020893030716, "grad_norm": 0.275390625, "learning_rate": 0.0004, "loss": 2.982776641845703, "step": 8520 }, { "epoch": 0.32363962232107046, "grad_norm": 0.275390625, "learning_rate": 0.0004, "loss": 2.9751667022705077, "step": 8530 }, { "epoch": 0.3240190357118337, "grad_norm": 0.265625, "learning_rate": 0.0004, "loss": 2.973505973815918, "step": 8540 }, { "epoch": 0.324398449102597, "grad_norm": 0.26953125, "learning_rate": 0.0004, "loss": 2.961160659790039, "step": 8550 }, { "epoch": 0.32477786249336027, "grad_norm": 0.263671875, "learning_rate": 0.0004, "loss": 3.0011922836303713, "step": 8560 }, { "epoch": 0.3251572758841235, "grad_norm": 0.251953125, "learning_rate": 0.0004, "loss": 2.9393442153930662, "step": 8570 }, { "epoch": 0.3255366892748868, "grad_norm": 0.283203125, "learning_rate": 0.0004, "loss": 2.973066711425781, "step": 8580 }, { "epoch": 0.3259161026656501, "grad_norm": 0.2578125, "learning_rate": 0.0004, "loss": 2.974162292480469, "step": 8590 }, { "epoch": 0.3262955160564133, "grad_norm": 0.26171875, "learning_rate": 0.0004, "loss": 2.998489570617676, "step": 8600 }, { "epoch": 0.32667492944717663, "grad_norm": 0.255859375, "learning_rate": 0.0004, "loss": 2.960371398925781, "step": 8610 }, { "epoch": 0.3270543428379399, "grad_norm": 0.255859375, "learning_rate": 0.0004, "loss": 2.9562782287597655, "step": 8620 }, { "epoch": 0.3274337562287032, "grad_norm": 0.251953125, "learning_rate": 0.0004, "loss": 2.9719661712646483, "step": 8630 }, { "epoch": 0.32781316961946644, "grad_norm": 0.263671875, "learning_rate": 0.0004, "loss": 2.9714967727661135, "step": 8640 }, { "epoch": 0.3281925830102297, "grad_norm": 0.255859375, "learning_rate": 0.0004, "loss": 2.9389328002929687, "step": 8650 }, { "epoch": 0.328571996400993, "grad_norm": 0.275390625, "learning_rate": 0.0004, "loss": 2.9809543609619142, "step": 8660 }, { "epoch": 0.32895140979175624, "grad_norm": 0.296875, "learning_rate": 0.0004, "loss": 2.9486549377441404, "step": 8670 }, { "epoch": 0.3293308231825195, "grad_norm": 0.29296875, "learning_rate": 0.0004, "loss": 3.01328067779541, "step": 8680 }, { "epoch": 0.3297102365732828, "grad_norm": 0.2578125, "learning_rate": 0.0004, "loss": 2.981152153015137, "step": 8690 }, { "epoch": 0.33008964996404605, "grad_norm": 0.25390625, "learning_rate": 0.0004, "loss": 2.9361000061035156, "step": 8700 }, { "epoch": 0.33046906335480936, "grad_norm": 0.279296875, "learning_rate": 0.0004, "loss": 3.0119516372680666, "step": 8710 }, { "epoch": 0.3308484767455726, "grad_norm": 0.2578125, "learning_rate": 0.0004, "loss": 2.9414594650268553, "step": 8720 }, { "epoch": 0.33122789013633586, "grad_norm": 0.267578125, "learning_rate": 0.0004, "loss": 2.9568466186523437, "step": 8730 }, { "epoch": 0.33160730352709916, "grad_norm": 0.267578125, "learning_rate": 0.0004, "loss": 2.9699321746826173, "step": 8740 }, { "epoch": 0.3319867169178624, "grad_norm": 0.259765625, "learning_rate": 0.0004, "loss": 2.968779754638672, "step": 8750 }, { "epoch": 0.3319867169178624, "eval_loss": 2.9637811183929443, "eval_runtime": 227.8751, "eval_samples_per_second": 16.715, "eval_steps_per_second": 2.787, "step": 8750 }, { "epoch": 0.3323661303086257, "grad_norm": 0.484375, "learning_rate": 0.00037336378710722953, "loss": 2.971649169921875, "step": 8760 }, { "epoch": 0.33274554369938897, "grad_norm": 0.265625, "learning_rate": 0.00037330432351704337, "loss": 2.977306938171387, "step": 8770 }, { "epoch": 0.3331249570901522, "grad_norm": 0.2734375, "learning_rate": 0.000373244798372729, "loss": 2.9803157806396485, "step": 8780 }, { "epoch": 0.3335043704809155, "grad_norm": 0.28125, "learning_rate": 0.0003731852116954284, "loss": 2.9495565414428713, "step": 8790 }, { "epoch": 0.3338837838716788, "grad_norm": 0.255859375, "learning_rate": 0.0003731255635063055, "loss": 2.9604524612426757, "step": 8800 }, { "epoch": 0.334263197262442, "grad_norm": 0.240234375, "learning_rate": 0.0003730658538265462, "loss": 2.9864070892333983, "step": 8810 }, { "epoch": 0.33464261065320533, "grad_norm": 0.26171875, "learning_rate": 0.0003730060826773581, "loss": 2.9442163467407227, "step": 8820 }, { "epoch": 0.3350220240439686, "grad_norm": 0.302734375, "learning_rate": 0.00037294625007997076, "loss": 2.9576677322387694, "step": 8830 }, { "epoch": 0.3354014374347319, "grad_norm": 0.26171875, "learning_rate": 0.0003728863560556354, "loss": 2.9716320037841797, "step": 8840 }, { "epoch": 0.33578085082549514, "grad_norm": 0.2578125, "learning_rate": 0.00037282640062562516, "loss": 2.9644914627075196, "step": 8850 }, { "epoch": 0.3361602642162584, "grad_norm": 0.244140625, "learning_rate": 0.000372766383811235, "loss": 2.971399688720703, "step": 8860 }, { "epoch": 0.3365396776070217, "grad_norm": 0.271484375, "learning_rate": 0.00037270630563378157, "loss": 2.956666946411133, "step": 8870 }, { "epoch": 0.33691909099778494, "grad_norm": 0.2734375, "learning_rate": 0.00037264616611460354, "loss": 2.938750457763672, "step": 8880 }, { "epoch": 0.3372985043885482, "grad_norm": 0.3203125, "learning_rate": 0.00037258596527506115, "loss": 2.9612655639648438, "step": 8890 }, { "epoch": 0.3376779177793115, "grad_norm": 0.263671875, "learning_rate": 0.0003725257031365365, "loss": 2.9767482757568358, "step": 8900 }, { "epoch": 0.33805733117007475, "grad_norm": 0.2734375, "learning_rate": 0.0003724653797204335, "loss": 2.9531389236450196, "step": 8910 }, { "epoch": 0.33843674456083805, "grad_norm": 0.2578125, "learning_rate": 0.0003724049950481777, "loss": 2.978731155395508, "step": 8920 }, { "epoch": 0.3388161579516013, "grad_norm": 0.25390625, "learning_rate": 0.0003723445491412166, "loss": 2.923661994934082, "step": 8930 }, { "epoch": 0.33919557134236455, "grad_norm": 0.255859375, "learning_rate": 0.0003722840420210193, "loss": 2.9372861862182615, "step": 8940 }, { "epoch": 0.33957498473312786, "grad_norm": 0.255859375, "learning_rate": 0.00037222347370907666, "loss": 2.978684425354004, "step": 8950 }, { "epoch": 0.3399543981238911, "grad_norm": 0.26171875, "learning_rate": 0.00037216284422690127, "loss": 2.952480506896973, "step": 8960 }, { "epoch": 0.3403338115146544, "grad_norm": 0.275390625, "learning_rate": 0.0003721021535960276, "loss": 2.965733528137207, "step": 8970 }, { "epoch": 0.34071322490541767, "grad_norm": 0.263671875, "learning_rate": 0.0003720414018380115, "loss": 2.919559288024902, "step": 8980 }, { "epoch": 0.3410926382961809, "grad_norm": 0.267578125, "learning_rate": 0.000371980588974431, "loss": 2.981487274169922, "step": 8990 }, { "epoch": 0.3414720516869442, "grad_norm": 0.271484375, "learning_rate": 0.0003719197150268854, "loss": 2.9381824493408204, "step": 9000 }, { "epoch": 0.3414720516869442, "eval_loss": 2.9513282775878906, "eval_runtime": 190.4835, "eval_samples_per_second": 19.996, "eval_steps_per_second": 3.334, "step": 9000 }, { "epoch": 0.3418514650777075, "grad_norm": 0.263671875, "learning_rate": 0.00037185878001699596, "loss": 2.9717939376831053, "step": 9010 }, { "epoch": 0.3422308784684707, "grad_norm": 0.259765625, "learning_rate": 0.0003717977839664055, "loss": 2.989269828796387, "step": 9020 }, { "epoch": 0.34261029185923403, "grad_norm": 0.2578125, "learning_rate": 0.00037173672689677856, "loss": 2.973894500732422, "step": 9030 }, { "epoch": 0.3429897052499973, "grad_norm": 0.251953125, "learning_rate": 0.0003716756088298013, "loss": 2.9347436904907225, "step": 9040 }, { "epoch": 0.3433691186407606, "grad_norm": 0.255859375, "learning_rate": 0.00037161442978718174, "loss": 2.9861948013305666, "step": 9050 }, { "epoch": 0.34374853203152383, "grad_norm": 0.2578125, "learning_rate": 0.00037155318979064934, "loss": 2.953886604309082, "step": 9060 }, { "epoch": 0.3441279454222871, "grad_norm": 0.255859375, "learning_rate": 0.00037149188886195523, "loss": 2.948365020751953, "step": 9070 }, { "epoch": 0.3445073588130504, "grad_norm": 0.2578125, "learning_rate": 0.00037143052702287235, "loss": 2.968613624572754, "step": 9080 }, { "epoch": 0.34488677220381364, "grad_norm": 0.251953125, "learning_rate": 0.00037136910429519507, "loss": 2.9434391021728517, "step": 9090 }, { "epoch": 0.3452661855945769, "grad_norm": 0.2578125, "learning_rate": 0.0003713076207007395, "loss": 2.989210510253906, "step": 9100 }, { "epoch": 0.3456455989853402, "grad_norm": 0.265625, "learning_rate": 0.00037124607626134337, "loss": 2.951618957519531, "step": 9110 }, { "epoch": 0.34602501237610345, "grad_norm": 0.255859375, "learning_rate": 0.000371184470998866, "loss": 2.952468681335449, "step": 9120 }, { "epoch": 0.34640442576686675, "grad_norm": 0.25, "learning_rate": 0.0003711228049351883, "loss": 2.973964309692383, "step": 9130 }, { "epoch": 0.34678383915763, "grad_norm": 0.2451171875, "learning_rate": 0.00037106107809221277, "loss": 2.9298295974731445, "step": 9140 }, { "epoch": 0.34716325254839325, "grad_norm": 0.255859375, "learning_rate": 0.00037099929049186353, "loss": 2.958415985107422, "step": 9150 }, { "epoch": 0.34754266593915656, "grad_norm": 0.29296875, "learning_rate": 0.0003709374421560862, "loss": 2.9392539978027346, "step": 9160 }, { "epoch": 0.3479220793299198, "grad_norm": 0.259765625, "learning_rate": 0.0003708755331068482, "loss": 2.9267000198364257, "step": 9170 }, { "epoch": 0.3483014927206831, "grad_norm": 0.263671875, "learning_rate": 0.0003708135633661382, "loss": 2.9709936141967774, "step": 9180 }, { "epoch": 0.34868090611144636, "grad_norm": 0.267578125, "learning_rate": 0.0003707515329559667, "loss": 2.963185691833496, "step": 9190 }, { "epoch": 0.3490603195022096, "grad_norm": 0.26953125, "learning_rate": 0.00037068944189836545, "loss": 2.9721851348876953, "step": 9200 }, { "epoch": 0.3494397328929729, "grad_norm": 0.2451171875, "learning_rate": 0.00037062729021538807, "loss": 2.963479995727539, "step": 9210 }, { "epoch": 0.34981914628373617, "grad_norm": 0.2578125, "learning_rate": 0.0003705650779291095, "loss": 2.9468564987182617, "step": 9220 }, { "epoch": 0.3501985596744994, "grad_norm": 0.287109375, "learning_rate": 0.0003705028050616262, "loss": 2.9500274658203125, "step": 9230 }, { "epoch": 0.3505779730652627, "grad_norm": 0.291015625, "learning_rate": 0.00037044047163505635, "loss": 2.9579553604125977, "step": 9240 }, { "epoch": 0.350957386456026, "grad_norm": 0.265625, "learning_rate": 0.00037037807767153943, "loss": 2.9453290939331054, "step": 9250 }, { "epoch": 0.350957386456026, "eval_loss": 2.944139242172241, "eval_runtime": 204.4523, "eval_samples_per_second": 18.63, "eval_steps_per_second": 3.106, "step": 9250 }, { "epoch": 0.3513367998467893, "grad_norm": 0.2578125, "learning_rate": 0.0003703156231932365, "loss": 2.9403099060058593, "step": 9260 }, { "epoch": 0.35171621323755253, "grad_norm": 0.251953125, "learning_rate": 0.00037025310822233004, "loss": 2.961874580383301, "step": 9270 }, { "epoch": 0.3520956266283158, "grad_norm": 0.26171875, "learning_rate": 0.0003701905327810241, "loss": 2.9436532974243166, "step": 9280 }, { "epoch": 0.3524750400190791, "grad_norm": 0.251953125, "learning_rate": 0.00037012789689154437, "loss": 2.9633541107177734, "step": 9290 }, { "epoch": 0.35285445340984234, "grad_norm": 0.265625, "learning_rate": 0.00037006520057613756, "loss": 2.9275421142578124, "step": 9300 }, { "epoch": 0.3532338668006056, "grad_norm": 0.2490234375, "learning_rate": 0.0003700024438570723, "loss": 2.943824577331543, "step": 9310 }, { "epoch": 0.3536132801913689, "grad_norm": 0.255859375, "learning_rate": 0.00036993962675663835, "loss": 2.9443172454833983, "step": 9320 }, { "epoch": 0.35399269358213215, "grad_norm": 0.2470703125, "learning_rate": 0.0003698767492971472, "loss": 2.9746564865112304, "step": 9330 }, { "epoch": 0.35437210697289545, "grad_norm": 0.271484375, "learning_rate": 0.0003698138115009314, "loss": 2.9807477951049806, "step": 9340 }, { "epoch": 0.3547515203636587, "grad_norm": 0.255859375, "learning_rate": 0.0003697508133903455, "loss": 2.9493165969848634, "step": 9350 }, { "epoch": 0.35513093375442195, "grad_norm": 0.265625, "learning_rate": 0.0003696877549877648, "loss": 2.948213577270508, "step": 9360 }, { "epoch": 0.35551034714518526, "grad_norm": 0.2490234375, "learning_rate": 0.0003696246363155866, "loss": 2.981030082702637, "step": 9370 }, { "epoch": 0.3558897605359485, "grad_norm": 0.25390625, "learning_rate": 0.00036956145739622915, "loss": 2.9171833038330077, "step": 9380 }, { "epoch": 0.3562691739267118, "grad_norm": 0.25390625, "learning_rate": 0.0003694982182521324, "loss": 2.9224538803100586, "step": 9390 }, { "epoch": 0.35664858731747506, "grad_norm": 0.263671875, "learning_rate": 0.0003694349189057576, "loss": 2.960048866271973, "step": 9400 }, { "epoch": 0.3570280007082383, "grad_norm": 0.251953125, "learning_rate": 0.0003693715593795874, "loss": 2.9462575912475586, "step": 9410 }, { "epoch": 0.3574074140990016, "grad_norm": 0.263671875, "learning_rate": 0.0003693081396961257, "loss": 2.931777763366699, "step": 9420 }, { "epoch": 0.35778682748976487, "grad_norm": 0.859375, "learning_rate": 0.000369244659877898, "loss": 2.9580799102783204, "step": 9430 }, { "epoch": 0.3581662408805281, "grad_norm": 0.267578125, "learning_rate": 0.00036918111994745084, "loss": 2.924503517150879, "step": 9440 }, { "epoch": 0.3585456542712914, "grad_norm": 0.263671875, "learning_rate": 0.00036911751992735244, "loss": 2.9450254440307617, "step": 9450 }, { "epoch": 0.3589250676620547, "grad_norm": 0.265625, "learning_rate": 0.0003690538598401922, "loss": 2.9612974166870116, "step": 9460 }, { "epoch": 0.359304481052818, "grad_norm": 0.255859375, "learning_rate": 0.0003689901397085809, "loss": 2.9254526138305663, "step": 9470 }, { "epoch": 0.35968389444358123, "grad_norm": 0.25390625, "learning_rate": 0.00036892635955515046, "loss": 2.9104738235473633, "step": 9480 }, { "epoch": 0.3600633078343445, "grad_norm": 0.259765625, "learning_rate": 0.0003688625194025545, "loss": 2.97820987701416, "step": 9490 }, { "epoch": 0.3604427212251078, "grad_norm": 0.265625, "learning_rate": 0.0003687986192734676, "loss": 2.9628326416015627, "step": 9500 }, { "epoch": 0.3604427212251078, "eval_loss": 2.9406282901763916, "eval_runtime": 199.8066, "eval_samples_per_second": 19.063, "eval_steps_per_second": 3.178, "step": 9500 }, { "epoch": 0.36082213461587104, "grad_norm": 0.26953125, "learning_rate": 0.00036873465919058575, "loss": 2.980794334411621, "step": 9510 }, { "epoch": 0.3612015480066343, "grad_norm": 0.251953125, "learning_rate": 0.0003686706391766263, "loss": 2.972036361694336, "step": 9520 }, { "epoch": 0.3615809613973976, "grad_norm": 0.255859375, "learning_rate": 0.00036860655925432785, "loss": 2.9253971099853517, "step": 9530 }, { "epoch": 0.36196037478816084, "grad_norm": 0.263671875, "learning_rate": 0.00036854241944645025, "loss": 2.9479751586914062, "step": 9540 }, { "epoch": 0.36233978817892415, "grad_norm": 0.275390625, "learning_rate": 0.00036847821977577466, "loss": 2.9366695404052736, "step": 9550 }, { "epoch": 0.3627192015696874, "grad_norm": 0.279296875, "learning_rate": 0.00036841396026510343, "loss": 2.952723503112793, "step": 9560 }, { "epoch": 0.36309861496045065, "grad_norm": 0.251953125, "learning_rate": 0.0003683496409372603, "loss": 2.939288520812988, "step": 9570 }, { "epoch": 0.36347802835121396, "grad_norm": 0.271484375, "learning_rate": 0.0003682852618150901, "loss": 2.9211965560913087, "step": 9580 }, { "epoch": 0.3638574417419772, "grad_norm": 0.271484375, "learning_rate": 0.000368220822921459, "loss": 2.927013969421387, "step": 9590 }, { "epoch": 0.3642368551327405, "grad_norm": 0.267578125, "learning_rate": 0.00036815632427925434, "loss": 2.9252603530883787, "step": 9600 }, { "epoch": 0.36461626852350376, "grad_norm": 0.26171875, "learning_rate": 0.00036809176591138474, "loss": 2.948748016357422, "step": 9610 }, { "epoch": 0.364995681914267, "grad_norm": 0.267578125, "learning_rate": 0.00036802714784078003, "loss": 2.9560102462768554, "step": 9620 }, { "epoch": 0.3653750953050303, "grad_norm": 0.2412109375, "learning_rate": 0.0003679624700903911, "loss": 2.928248977661133, "step": 9630 }, { "epoch": 0.36575450869579357, "grad_norm": 0.251953125, "learning_rate": 0.0003678977326831903, "loss": 2.923115348815918, "step": 9640 }, { "epoch": 0.3661339220865568, "grad_norm": 0.271484375, "learning_rate": 0.0003678329356421709, "loss": 2.9400506973266602, "step": 9650 }, { "epoch": 0.3665133354773201, "grad_norm": 0.26171875, "learning_rate": 0.0003677680789903476, "loss": 2.9266820907592774, "step": 9660 }, { "epoch": 0.3668927488680834, "grad_norm": 0.25390625, "learning_rate": 0.0003677031627507561, "loss": 2.936360549926758, "step": 9670 }, { "epoch": 0.3672721622588467, "grad_norm": 0.267578125, "learning_rate": 0.00036763818694645326, "loss": 2.9592258453369142, "step": 9680 }, { "epoch": 0.36765157564960993, "grad_norm": 0.263671875, "learning_rate": 0.0003675731516005172, "loss": 2.925436592102051, "step": 9690 }, { "epoch": 0.3680309890403732, "grad_norm": 0.2451171875, "learning_rate": 0.0003675080567360471, "loss": 2.9398685455322267, "step": 9700 }, { "epoch": 0.3684104024311365, "grad_norm": 0.2578125, "learning_rate": 0.00036744290237616337, "loss": 2.9428474426269533, "step": 9710 }, { "epoch": 0.36878981582189974, "grad_norm": 0.25, "learning_rate": 0.0003673776885440074, "loss": 2.9310712814331055, "step": 9720 }, { "epoch": 0.369169229212663, "grad_norm": 0.26171875, "learning_rate": 0.0003673124152627419, "loss": 2.907465934753418, "step": 9730 }, { "epoch": 0.3695486426034263, "grad_norm": 0.255859375, "learning_rate": 0.0003672470825555506, "loss": 2.938343048095703, "step": 9740 }, { "epoch": 0.36992805599418954, "grad_norm": 0.263671875, "learning_rate": 0.0003671816904456383, "loss": 2.9491209030151366, "step": 9750 }, { "epoch": 0.36992805599418954, "eval_loss": 2.934532403945923, "eval_runtime": 192.2572, "eval_samples_per_second": 19.812, "eval_steps_per_second": 3.303, "step": 9750 }, { "epoch": 0.37030746938495285, "grad_norm": 0.25, "learning_rate": 0.00036711623895623085, "loss": 2.932324028015137, "step": 9760 }, { "epoch": 0.3706868827757161, "grad_norm": 0.259765625, "learning_rate": 0.00036705072811057536, "loss": 2.9536481857299806, "step": 9770 }, { "epoch": 0.37106629616647935, "grad_norm": 0.279296875, "learning_rate": 0.00036698515793194, "loss": 2.9389116287231447, "step": 9780 }, { "epoch": 0.37144570955724265, "grad_norm": 0.263671875, "learning_rate": 0.0003669195284436138, "loss": 2.9477493286132814, "step": 9790 }, { "epoch": 0.3718251229480059, "grad_norm": 0.267578125, "learning_rate": 0.0003668538396689071, "loss": 2.9434080123901367, "step": 9800 }, { "epoch": 0.3722045363387692, "grad_norm": 0.271484375, "learning_rate": 0.0003667880916311512, "loss": 2.9402935028076174, "step": 9810 }, { "epoch": 0.37258394972953246, "grad_norm": 0.26171875, "learning_rate": 0.00036672228435369835, "loss": 2.9382415771484376, "step": 9820 }, { "epoch": 0.3729633631202957, "grad_norm": 0.2578125, "learning_rate": 0.00036665641785992205, "loss": 2.9386869430541993, "step": 9830 }, { "epoch": 0.373342776511059, "grad_norm": 0.251953125, "learning_rate": 0.00036659049217321665, "loss": 2.9607875823974608, "step": 9840 }, { "epoch": 0.37372218990182227, "grad_norm": 0.2890625, "learning_rate": 0.00036652450731699767, "loss": 2.946341323852539, "step": 9850 }, { "epoch": 0.3741016032925855, "grad_norm": 0.26171875, "learning_rate": 0.00036645846331470155, "loss": 2.919050407409668, "step": 9860 }, { "epoch": 0.3744810166833488, "grad_norm": 0.263671875, "learning_rate": 0.0003663923601897857, "loss": 2.9329030990600584, "step": 9870 }, { "epoch": 0.37486043007411207, "grad_norm": 0.26171875, "learning_rate": 0.00036632619796572865, "loss": 2.9455646514892577, "step": 9880 }, { "epoch": 0.3752398434648754, "grad_norm": 0.2578125, "learning_rate": 0.00036625997666602986, "loss": 2.931719207763672, "step": 9890 }, { "epoch": 0.37561925685563863, "grad_norm": 0.259765625, "learning_rate": 0.00036619369631420975, "loss": 2.9186126708984377, "step": 9900 }, { "epoch": 0.3759986702464019, "grad_norm": 0.255859375, "learning_rate": 0.00036612735693380977, "loss": 2.927076530456543, "step": 9910 }, { "epoch": 0.3763780836371652, "grad_norm": 0.26953125, "learning_rate": 0.0003660609585483922, "loss": 2.964809608459473, "step": 9920 }, { "epoch": 0.37675749702792843, "grad_norm": 0.27734375, "learning_rate": 0.0003659945011815405, "loss": 2.9434158325195314, "step": 9930 }, { "epoch": 0.3771369104186917, "grad_norm": 0.255859375, "learning_rate": 0.00036592798485685893, "loss": 2.919833946228027, "step": 9940 }, { "epoch": 0.377516323809455, "grad_norm": 0.275390625, "learning_rate": 0.00036586140959797277, "loss": 2.9378026962280273, "step": 9950 }, { "epoch": 0.37789573720021824, "grad_norm": 0.259765625, "learning_rate": 0.000365794775428528, "loss": 2.9173181533813475, "step": 9960 }, { "epoch": 0.37827515059098155, "grad_norm": 0.251953125, "learning_rate": 0.00036572808237219186, "loss": 2.9628374099731447, "step": 9970 }, { "epoch": 0.3786545639817448, "grad_norm": 0.271484375, "learning_rate": 0.00036566133045265236, "loss": 2.9470558166503906, "step": 9980 }, { "epoch": 0.37903397737250805, "grad_norm": 0.279296875, "learning_rate": 0.0003655945196936183, "loss": 2.9353002548217773, "step": 9990 }, { "epoch": 0.37941339076327135, "grad_norm": 0.26171875, "learning_rate": 0.00036552765011881955, "loss": 2.9345333099365236, "step": 10000 }, { "epoch": 0.37941339076327135, "eval_loss": 2.9306557178497314, "eval_runtime": 190.1448, "eval_samples_per_second": 20.032, "eval_steps_per_second": 3.34, "step": 10000 }, { "epoch": 0.3797928041540346, "grad_norm": 0.271484375, "learning_rate": 0.0003654607217520068, "loss": 2.936832046508789, "step": 10010 }, { "epoch": 0.3801722175447979, "grad_norm": 0.26953125, "learning_rate": 0.0003653937346169516, "loss": 2.9309356689453123, "step": 10020 }, { "epoch": 0.38055163093556116, "grad_norm": 0.275390625, "learning_rate": 0.0003653266887374465, "loss": 2.9501039505004885, "step": 10030 }, { "epoch": 0.3809310443263244, "grad_norm": 0.25390625, "learning_rate": 0.00036525958413730466, "loss": 2.9269783020019533, "step": 10040 }, { "epoch": 0.3813104577170877, "grad_norm": 0.26171875, "learning_rate": 0.00036519242084036047, "loss": 2.9478010177612304, "step": 10050 }, { "epoch": 0.38168987110785096, "grad_norm": 0.255859375, "learning_rate": 0.00036512519887046876, "loss": 2.931568717956543, "step": 10060 }, { "epoch": 0.3820692844986142, "grad_norm": 0.259765625, "learning_rate": 0.0003650579182515054, "loss": 2.9387348175048826, "step": 10070 }, { "epoch": 0.3824486978893775, "grad_norm": 0.2578125, "learning_rate": 0.0003649905790073672, "loss": 2.947910118103027, "step": 10080 }, { "epoch": 0.38282811128014077, "grad_norm": 0.2431640625, "learning_rate": 0.00036492318116197156, "loss": 2.936557960510254, "step": 10090 }, { "epoch": 0.3832075246709041, "grad_norm": 0.259765625, "learning_rate": 0.00036485572473925686, "loss": 2.963297653198242, "step": 10100 }, { "epoch": 0.3835869380616673, "grad_norm": 0.28125, "learning_rate": 0.00036478820976318225, "loss": 2.9529930114746095, "step": 10110 }, { "epoch": 0.3839663514524306, "grad_norm": 0.265625, "learning_rate": 0.0003647206362577276, "loss": 2.924796485900879, "step": 10120 }, { "epoch": 0.3843457648431939, "grad_norm": 0.298828125, "learning_rate": 0.0003646530042468937, "loss": 2.9366310119628904, "step": 10130 }, { "epoch": 0.38472517823395713, "grad_norm": 0.25390625, "learning_rate": 0.000364585313754702, "loss": 2.9302928924560545, "step": 10140 }, { "epoch": 0.3851045916247204, "grad_norm": 0.259765625, "learning_rate": 0.00036451756480519477, "loss": 2.9516401290893555, "step": 10150 }, { "epoch": 0.3854840050154837, "grad_norm": 0.26171875, "learning_rate": 0.000364449757422435, "loss": 2.9218185424804686, "step": 10160 }, { "epoch": 0.38586341840624694, "grad_norm": 0.25390625, "learning_rate": 0.0003643818916305066, "loss": 2.9451824188232423, "step": 10170 }, { "epoch": 0.38624283179701024, "grad_norm": 0.287109375, "learning_rate": 0.00036431396745351406, "loss": 2.9229101181030273, "step": 10180 }, { "epoch": 0.3866222451877735, "grad_norm": 0.2734375, "learning_rate": 0.00036424598491558254, "loss": 2.931039237976074, "step": 10190 }, { "epoch": 0.38700165857853674, "grad_norm": 0.271484375, "learning_rate": 0.00036417794404085816, "loss": 2.940766143798828, "step": 10200 }, { "epoch": 0.38738107196930005, "grad_norm": 0.26953125, "learning_rate": 0.0003641098448535076, "loss": 2.9501554489135744, "step": 10210 }, { "epoch": 0.3877604853600633, "grad_norm": 0.265625, "learning_rate": 0.00036404168737771834, "loss": 2.9412832260131836, "step": 10220 }, { "epoch": 0.3881398987508266, "grad_norm": 0.26171875, "learning_rate": 0.00036397347163769837, "loss": 2.958669090270996, "step": 10230 }, { "epoch": 0.38851931214158986, "grad_norm": 0.2578125, "learning_rate": 0.00036390519765767666, "loss": 2.917603874206543, "step": 10240 }, { "epoch": 0.3888987255323531, "grad_norm": 0.259765625, "learning_rate": 0.00036383686546190264, "loss": 2.940292739868164, "step": 10250 }, { "epoch": 0.3888987255323531, "eval_loss": 2.925140380859375, "eval_runtime": 189.5875, "eval_samples_per_second": 20.091, "eval_steps_per_second": 3.349, "step": 10250 }, { "epoch": 0.3892781389231164, "grad_norm": 0.265625, "learning_rate": 0.00036376847507464655, "loss": 2.952968215942383, "step": 10260 }, { "epoch": 0.38965755231387966, "grad_norm": 0.259765625, "learning_rate": 0.0003637000265201992, "loss": 2.961767387390137, "step": 10270 }, { "epoch": 0.3900369657046429, "grad_norm": 0.265625, "learning_rate": 0.00036363151982287214, "loss": 2.9187992095947264, "step": 10280 }, { "epoch": 0.3904163790954062, "grad_norm": 0.26953125, "learning_rate": 0.00036356295500699753, "loss": 2.9383487701416016, "step": 10290 }, { "epoch": 0.39079579248616947, "grad_norm": 0.25390625, "learning_rate": 0.0003634943320969282, "loss": 2.940970802307129, "step": 10300 }, { "epoch": 0.3911752058769328, "grad_norm": 0.28515625, "learning_rate": 0.0003634256511170376, "loss": 2.9222970962524415, "step": 10310 }, { "epoch": 0.391554619267696, "grad_norm": 0.263671875, "learning_rate": 0.0003633569120917197, "loss": 2.930615234375, "step": 10320 }, { "epoch": 0.3919340326584593, "grad_norm": 0.271484375, "learning_rate": 0.0003632881150453893, "loss": 2.9487403869628905, "step": 10330 }, { "epoch": 0.3923134460492226, "grad_norm": 0.248046875, "learning_rate": 0.0003632192600024817, "loss": 2.9499155044555665, "step": 10340 }, { "epoch": 0.39269285943998583, "grad_norm": 0.259765625, "learning_rate": 0.00036315034698745273, "loss": 2.9251806259155275, "step": 10350 }, { "epoch": 0.3930722728307491, "grad_norm": 0.29296875, "learning_rate": 0.00036308137602477893, "loss": 2.9446983337402344, "step": 10360 }, { "epoch": 0.3934516862215124, "grad_norm": 0.25390625, "learning_rate": 0.00036301234713895724, "loss": 2.9323333740234374, "step": 10370 }, { "epoch": 0.39383109961227564, "grad_norm": 0.26171875, "learning_rate": 0.00036294326035450544, "loss": 2.921908378601074, "step": 10380 }, { "epoch": 0.39421051300303894, "grad_norm": 0.271484375, "learning_rate": 0.0003628741156959617, "loss": 2.9559097290039062, "step": 10390 }, { "epoch": 0.3945899263938022, "grad_norm": 0.2578125, "learning_rate": 0.00036280491318788484, "loss": 2.930178642272949, "step": 10400 }, { "epoch": 0.39496933978456544, "grad_norm": 0.267578125, "learning_rate": 0.000362735652854854, "loss": 2.9360036849975586, "step": 10410 }, { "epoch": 0.39534875317532875, "grad_norm": 0.271484375, "learning_rate": 0.0003626663347214691, "loss": 2.9215581893920897, "step": 10420 }, { "epoch": 0.395728166566092, "grad_norm": 0.263671875, "learning_rate": 0.0003625969588123506, "loss": 2.9333879470825197, "step": 10430 }, { "epoch": 0.3961075799568553, "grad_norm": 0.26171875, "learning_rate": 0.00036252752515213924, "loss": 2.9367956161499023, "step": 10440 }, { "epoch": 0.39648699334761855, "grad_norm": 0.263671875, "learning_rate": 0.00036245803376549666, "loss": 2.927289581298828, "step": 10450 }, { "epoch": 0.3968664067383818, "grad_norm": 0.251953125, "learning_rate": 0.0003623884846771045, "loss": 2.884408187866211, "step": 10460 }, { "epoch": 0.3972458201291451, "grad_norm": 0.26171875, "learning_rate": 0.0003623188779116653, "loss": 2.949008750915527, "step": 10470 }, { "epoch": 0.39762523351990836, "grad_norm": 0.26171875, "learning_rate": 0.00036224921349390195, "loss": 2.913135528564453, "step": 10480 }, { "epoch": 0.3980046469106716, "grad_norm": 0.30078125, "learning_rate": 0.00036217949144855786, "loss": 2.924331855773926, "step": 10490 }, { "epoch": 0.3983840603014349, "grad_norm": 0.263671875, "learning_rate": 0.0003621097118003967, "loss": 2.959138870239258, "step": 10500 }, { "epoch": 0.3983840603014349, "eval_loss": 2.922297954559326, "eval_runtime": 190.0377, "eval_samples_per_second": 20.043, "eval_steps_per_second": 3.341, "step": 10500 }, { "epoch": 0.39876347369219817, "grad_norm": 0.2734375, "learning_rate": 0.00036203987457420297, "loss": 2.9179882049560546, "step": 10510 }, { "epoch": 0.3991428870829615, "grad_norm": 0.263671875, "learning_rate": 0.0003619699797947813, "loss": 2.965084266662598, "step": 10520 }, { "epoch": 0.3995223004737247, "grad_norm": 0.28515625, "learning_rate": 0.0003619000274869569, "loss": 2.9392547607421875, "step": 10530 }, { "epoch": 0.399901713864488, "grad_norm": 0.259765625, "learning_rate": 0.0003618300176755754, "loss": 2.9220325469970705, "step": 10540 }, { "epoch": 0.4002811272552513, "grad_norm": 0.26171875, "learning_rate": 0.0003617599503855028, "loss": 2.9164180755615234, "step": 10550 }, { "epoch": 0.40066054064601453, "grad_norm": 0.2578125, "learning_rate": 0.0003616898256416256, "loss": 2.9272010803222654, "step": 10560 }, { "epoch": 0.4010399540367778, "grad_norm": 0.265625, "learning_rate": 0.0003616196434688506, "loss": 2.9185226440429686, "step": 10570 }, { "epoch": 0.4014193674275411, "grad_norm": 0.259765625, "learning_rate": 0.0003615494038921052, "loss": 2.938305473327637, "step": 10580 }, { "epoch": 0.40179878081830434, "grad_norm": 0.265625, "learning_rate": 0.00036147910693633696, "loss": 2.933705711364746, "step": 10590 }, { "epoch": 0.40217819420906764, "grad_norm": 0.267578125, "learning_rate": 0.0003614087526265139, "loss": 2.9181568145751955, "step": 10600 }, { "epoch": 0.4025576075998309, "grad_norm": 0.263671875, "learning_rate": 0.0003613383409876245, "loss": 2.8871971130371095, "step": 10610 }, { "epoch": 0.40293702099059414, "grad_norm": 0.27734375, "learning_rate": 0.00036126787204467743, "loss": 2.916217041015625, "step": 10620 }, { "epoch": 0.40331643438135745, "grad_norm": 0.2578125, "learning_rate": 0.0003611973458227018, "loss": 2.9202529907226564, "step": 10630 }, { "epoch": 0.4036958477721207, "grad_norm": 0.2578125, "learning_rate": 0.00036112676234674717, "loss": 2.926866912841797, "step": 10640 }, { "epoch": 0.404075261162884, "grad_norm": 0.26953125, "learning_rate": 0.0003610561216418833, "loss": 2.9230756759643555, "step": 10650 }, { "epoch": 0.40445467455364725, "grad_norm": 0.26171875, "learning_rate": 0.00036098542373320025, "loss": 2.9172157287597655, "step": 10660 }, { "epoch": 0.4048340879444105, "grad_norm": 0.263671875, "learning_rate": 0.0003609146686458086, "loss": 2.9190036773681642, "step": 10670 }, { "epoch": 0.4052135013351738, "grad_norm": 0.259765625, "learning_rate": 0.00036084385640483893, "loss": 2.966244125366211, "step": 10680 }, { "epoch": 0.40559291472593706, "grad_norm": 0.255859375, "learning_rate": 0.0003607729870354424, "loss": 2.92929630279541, "step": 10690 }, { "epoch": 0.4059723281167003, "grad_norm": 0.26171875, "learning_rate": 0.0003607020605627903, "loss": 2.9079761505126953, "step": 10700 }, { "epoch": 0.4063517415074636, "grad_norm": 0.291015625, "learning_rate": 0.0003606310770120743, "loss": 2.942063331604004, "step": 10710 }, { "epoch": 0.40673115489822687, "grad_norm": 0.26171875, "learning_rate": 0.0003605600364085063, "loss": 2.9416435241699217, "step": 10720 }, { "epoch": 0.40711056828899017, "grad_norm": 0.271484375, "learning_rate": 0.0003604889387773184, "loss": 2.926756477355957, "step": 10730 }, { "epoch": 0.4074899816797534, "grad_norm": 0.251953125, "learning_rate": 0.0003604177841437631, "loss": 2.891950798034668, "step": 10740 }, { "epoch": 0.40786939507051667, "grad_norm": 0.26171875, "learning_rate": 0.000360346572533113, "loss": 2.8844213485717773, "step": 10750 }, { "epoch": 0.40786939507051667, "eval_loss": 2.9197380542755127, "eval_runtime": 189.7261, "eval_samples_per_second": 20.076, "eval_steps_per_second": 3.347, "step": 10750 }, { "epoch": 0.40824880846128, "grad_norm": 0.26171875, "learning_rate": 0.00036027530397066094, "loss": 2.9018552780151365, "step": 10760 }, { "epoch": 0.4086282218520432, "grad_norm": 0.265625, "learning_rate": 0.0003602039784817202, "loss": 2.9313064575195313, "step": 10770 }, { "epoch": 0.4090076352428065, "grad_norm": 0.25390625, "learning_rate": 0.000360132596091624, "loss": 2.909392547607422, "step": 10780 }, { "epoch": 0.4093870486335698, "grad_norm": 0.25390625, "learning_rate": 0.00036006115682572597, "loss": 2.9223072052001955, "step": 10790 }, { "epoch": 0.40976646202433303, "grad_norm": 0.25390625, "learning_rate": 0.0003599896607093998, "loss": 2.937965965270996, "step": 10800 }, { "epoch": 0.41014587541509634, "grad_norm": 0.298828125, "learning_rate": 0.00035991810776803953, "loss": 2.9522382736206056, "step": 10810 }, { "epoch": 0.4105252888058596, "grad_norm": 0.275390625, "learning_rate": 0.0003598464980270592, "loss": 2.9194883346557616, "step": 10820 }, { "epoch": 0.41090470219662284, "grad_norm": 0.271484375, "learning_rate": 0.00035977483151189313, "loss": 2.9343536376953123, "step": 10830 }, { "epoch": 0.41128411558738615, "grad_norm": 0.26171875, "learning_rate": 0.00035970310824799575, "loss": 2.939601707458496, "step": 10840 }, { "epoch": 0.4116635289781494, "grad_norm": 0.271484375, "learning_rate": 0.00035963132826084186, "loss": 2.8936939239501953, "step": 10850 }, { "epoch": 0.41204294236891265, "grad_norm": 0.271484375, "learning_rate": 0.0003595594915759261, "loss": 2.906690216064453, "step": 10860 }, { "epoch": 0.41242235575967595, "grad_norm": 0.255859375, "learning_rate": 0.00035948759821876334, "loss": 2.8999923706054687, "step": 10870 }, { "epoch": 0.4128017691504392, "grad_norm": 0.259765625, "learning_rate": 0.00035941564821488875, "loss": 2.941736030578613, "step": 10880 }, { "epoch": 0.4131811825412025, "grad_norm": 0.267578125, "learning_rate": 0.0003593436415898574, "loss": 2.9108877182006836, "step": 10890 }, { "epoch": 0.41356059593196576, "grad_norm": 0.26171875, "learning_rate": 0.0003592715783692446, "loss": 2.9032255172729493, "step": 10900 }, { "epoch": 0.413940009322729, "grad_norm": 0.2578125, "learning_rate": 0.0003591994585786457, "loss": 2.891794204711914, "step": 10910 }, { "epoch": 0.4143194227134923, "grad_norm": 0.2890625, "learning_rate": 0.0003591272822436762, "loss": 2.9108421325683596, "step": 10920 }, { "epoch": 0.41469883610425556, "grad_norm": 0.259765625, "learning_rate": 0.00035905504938997155, "loss": 2.905393600463867, "step": 10930 }, { "epoch": 0.41507824949501887, "grad_norm": 0.259765625, "learning_rate": 0.0003589827600431875, "loss": 2.915001106262207, "step": 10940 }, { "epoch": 0.4154576628857821, "grad_norm": 0.259765625, "learning_rate": 0.00035891041422899967, "loss": 2.930599594116211, "step": 10950 }, { "epoch": 0.41583707627654537, "grad_norm": 0.25390625, "learning_rate": 0.0003588380119731039, "loss": 2.8779735565185547, "step": 10960 }, { "epoch": 0.4162164896673087, "grad_norm": 0.259765625, "learning_rate": 0.0003587655533012157, "loss": 2.9335803985595703, "step": 10970 }, { "epoch": 0.4165959030580719, "grad_norm": 0.26171875, "learning_rate": 0.00035869303823907124, "loss": 2.9322763442993165, "step": 10980 }, { "epoch": 0.4169753164488352, "grad_norm": 0.251953125, "learning_rate": 0.0003586204668124262, "loss": 2.9123533248901365, "step": 10990 }, { "epoch": 0.4173547298395985, "grad_norm": 0.259765625, "learning_rate": 0.0003585478390470565, "loss": 2.8951160430908205, "step": 11000 }, { "epoch": 0.4173547298395985, "eval_loss": 2.9135656356811523, "eval_runtime": 190.119, "eval_samples_per_second": 20.035, "eval_steps_per_second": 3.34, "step": 11000 }, { "epoch": 0.41773414323036173, "grad_norm": 0.25390625, "learning_rate": 0.000358475154968758, "loss": 2.900168609619141, "step": 11010 }, { "epoch": 0.41811355662112504, "grad_norm": 0.267578125, "learning_rate": 0.0003584024146033467, "loss": 2.888751220703125, "step": 11020 }, { "epoch": 0.4184929700118883, "grad_norm": 0.255859375, "learning_rate": 0.00035832961797665824, "loss": 2.924108123779297, "step": 11030 }, { "epoch": 0.41887238340265154, "grad_norm": 0.2734375, "learning_rate": 0.0003582567651145487, "loss": 2.9153217315673827, "step": 11040 }, { "epoch": 0.41925179679341484, "grad_norm": 0.259765625, "learning_rate": 0.00035818385604289383, "loss": 2.904195785522461, "step": 11050 }, { "epoch": 0.4196312101841781, "grad_norm": 0.275390625, "learning_rate": 0.0003581108907875895, "loss": 2.9108715057373047, "step": 11060 }, { "epoch": 0.42001062357494134, "grad_norm": 0.267578125, "learning_rate": 0.00035803786937455137, "loss": 2.9022924423217775, "step": 11070 }, { "epoch": 0.42039003696570465, "grad_norm": 0.279296875, "learning_rate": 0.0003579647918297152, "loss": 2.9129371643066406, "step": 11080 }, { "epoch": 0.4207694503564679, "grad_norm": 0.2734375, "learning_rate": 0.0003578916581790366, "loss": 2.908809471130371, "step": 11090 }, { "epoch": 0.4211488637472312, "grad_norm": 0.27734375, "learning_rate": 0.0003578184684484912, "loss": 2.8997686386108397, "step": 11100 }, { "epoch": 0.42152827713799446, "grad_norm": 0.27734375, "learning_rate": 0.0003577452226640744, "loss": 2.9490257263183595, "step": 11110 }, { "epoch": 0.4219076905287577, "grad_norm": 0.265625, "learning_rate": 0.0003576719208518016, "loss": 2.9063838958740233, "step": 11120 }, { "epoch": 0.422287103919521, "grad_norm": 0.265625, "learning_rate": 0.0003575985630377082, "loss": 2.930912399291992, "step": 11130 }, { "epoch": 0.42266651731028426, "grad_norm": 0.263671875, "learning_rate": 0.0003575251492478493, "loss": 2.913425827026367, "step": 11140 }, { "epoch": 0.42304593070104757, "grad_norm": 0.265625, "learning_rate": 0.00035745167950829995, "loss": 2.9097667694091798, "step": 11150 }, { "epoch": 0.4234253440918108, "grad_norm": 0.26171875, "learning_rate": 0.00035737815384515514, "loss": 2.902098846435547, "step": 11160 }, { "epoch": 0.42380475748257407, "grad_norm": 0.28515625, "learning_rate": 0.0003573045722845297, "loss": 2.899471664428711, "step": 11170 }, { "epoch": 0.4241841708733374, "grad_norm": 0.267578125, "learning_rate": 0.00035723093485255815, "loss": 2.9432998657226563, "step": 11180 }, { "epoch": 0.4245635842641006, "grad_norm": 0.255859375, "learning_rate": 0.00035715724157539514, "loss": 2.8967912673950194, "step": 11190 }, { "epoch": 0.4249429976548639, "grad_norm": 0.251953125, "learning_rate": 0.0003570834924792149, "loss": 2.894640350341797, "step": 11200 }, { "epoch": 0.4253224110456272, "grad_norm": 0.25390625, "learning_rate": 0.0003570096875902117, "loss": 2.9175010681152345, "step": 11210 }, { "epoch": 0.42570182443639043, "grad_norm": 0.26171875, "learning_rate": 0.0003569358269345994, "loss": 2.8961219787597656, "step": 11220 }, { "epoch": 0.42608123782715374, "grad_norm": 0.26171875, "learning_rate": 0.0003568619105386119, "loss": 2.8964942932128905, "step": 11230 }, { "epoch": 0.426460651217917, "grad_norm": 0.267578125, "learning_rate": 0.00035678793842850267, "loss": 2.8604612350463867, "step": 11240 }, { "epoch": 0.42684006460868024, "grad_norm": 0.279296875, "learning_rate": 0.00035671391063054513, "loss": 2.901065635681152, "step": 11250 }, { "epoch": 0.42684006460868024, "eval_loss": 2.90970778465271, "eval_runtime": 190.4634, "eval_samples_per_second": 19.999, "eval_steps_per_second": 3.334, "step": 11250 }, { "epoch": 0.42721947799944354, "grad_norm": 0.265625, "learning_rate": 0.0003566398271710324, "loss": 2.9069660186767576, "step": 11260 }, { "epoch": 0.4275988913902068, "grad_norm": 0.265625, "learning_rate": 0.0003565656880762775, "loss": 2.919877052307129, "step": 11270 }, { "epoch": 0.42797830478097004, "grad_norm": 0.267578125, "learning_rate": 0.000356491493372613, "loss": 2.9148868560791015, "step": 11280 }, { "epoch": 0.42835771817173335, "grad_norm": 0.26953125, "learning_rate": 0.0003564172430863914, "loss": 2.8851621627807615, "step": 11290 }, { "epoch": 0.4287371315624966, "grad_norm": 0.259765625, "learning_rate": 0.00035634293724398484, "loss": 2.9204538345336912, "step": 11300 }, { "epoch": 0.4291165449532599, "grad_norm": 0.267578125, "learning_rate": 0.00035626857587178523, "loss": 2.911776542663574, "step": 11310 }, { "epoch": 0.42949595834402315, "grad_norm": 0.2578125, "learning_rate": 0.0003561941589962042, "loss": 2.9245588302612306, "step": 11320 }, { "epoch": 0.4298753717347864, "grad_norm": 0.265625, "learning_rate": 0.00035611968664367303, "loss": 2.947029876708984, "step": 11330 }, { "epoch": 0.4302547851255497, "grad_norm": 0.265625, "learning_rate": 0.00035604515884064293, "loss": 2.935041618347168, "step": 11340 }, { "epoch": 0.43063419851631296, "grad_norm": 0.2578125, "learning_rate": 0.0003559705756135845, "loss": 2.9298534393310547, "step": 11350 }, { "epoch": 0.43101361190707627, "grad_norm": 0.255859375, "learning_rate": 0.0003558959369889882, "loss": 2.922953987121582, "step": 11360 }, { "epoch": 0.4313930252978395, "grad_norm": 0.267578125, "learning_rate": 0.0003558212429933641, "loss": 2.9104801177978517, "step": 11370 }, { "epoch": 0.43177243868860277, "grad_norm": 0.2734375, "learning_rate": 0.0003557464936532421, "loss": 2.8907087326049803, "step": 11380 }, { "epoch": 0.4321518520793661, "grad_norm": 0.27734375, "learning_rate": 0.00035567168899517147, "loss": 2.9319765090942385, "step": 11390 }, { "epoch": 0.4325312654701293, "grad_norm": 0.275390625, "learning_rate": 0.0003555968290457213, "loss": 2.9274923324584963, "step": 11400 }, { "epoch": 0.4329106788608926, "grad_norm": 0.28125, "learning_rate": 0.00035552191383148046, "loss": 2.8976131439208985, "step": 11410 }, { "epoch": 0.4332900922516559, "grad_norm": 0.263671875, "learning_rate": 0.00035544694337905715, "loss": 2.8738916397094725, "step": 11420 }, { "epoch": 0.43366950564241913, "grad_norm": 0.2578125, "learning_rate": 0.0003553719177150794, "loss": 2.8901580810546874, "step": 11430 }, { "epoch": 0.43404891903318243, "grad_norm": 0.267578125, "learning_rate": 0.0003552968368661947, "loss": 2.915904998779297, "step": 11440 }, { "epoch": 0.4344283324239457, "grad_norm": 0.271484375, "learning_rate": 0.0003552217008590703, "loss": 2.9298679351806642, "step": 11450 }, { "epoch": 0.43480774581470893, "grad_norm": 0.28125, "learning_rate": 0.0003551465097203929, "loss": 2.8828752517700194, "step": 11460 }, { "epoch": 0.43518715920547224, "grad_norm": 0.283203125, "learning_rate": 0.00035507126347686895, "loss": 2.9016469955444335, "step": 11470 }, { "epoch": 0.4355665725962355, "grad_norm": 0.263671875, "learning_rate": 0.00035499596215522424, "loss": 2.9544342041015623, "step": 11480 }, { "epoch": 0.43594598598699874, "grad_norm": 0.275390625, "learning_rate": 0.0003549206057822043, "loss": 2.908851432800293, "step": 11490 }, { "epoch": 0.43632539937776205, "grad_norm": 0.2890625, "learning_rate": 0.00035484519438457415, "loss": 2.9138717651367188, "step": 11500 }, { "epoch": 0.43632539937776205, "eval_loss": 2.9092581272125244, "eval_runtime": 189.8564, "eval_samples_per_second": 20.063, "eval_steps_per_second": 3.345, "step": 11500 }, { "epoch": 0.4367048127685253, "grad_norm": 0.26171875, "learning_rate": 0.00035476972798911843, "loss": 2.8893253326416017, "step": 11510 }, { "epoch": 0.4370842261592886, "grad_norm": 0.26171875, "learning_rate": 0.0003546942066226412, "loss": 2.868338203430176, "step": 11520 }, { "epoch": 0.43746363955005185, "grad_norm": 0.259765625, "learning_rate": 0.00035461863031196604, "loss": 2.927603912353516, "step": 11530 }, { "epoch": 0.4378430529408151, "grad_norm": 0.267578125, "learning_rate": 0.00035454299908393625, "loss": 2.9076528549194336, "step": 11540 }, { "epoch": 0.4382224663315784, "grad_norm": 0.2578125, "learning_rate": 0.00035446731296541435, "loss": 2.9177932739257812, "step": 11550 }, { "epoch": 0.43860187972234166, "grad_norm": 0.265625, "learning_rate": 0.0003543915719832826, "loss": 2.915171241760254, "step": 11560 }, { "epoch": 0.43898129311310496, "grad_norm": 0.259765625, "learning_rate": 0.00035431577616444246, "loss": 2.903510856628418, "step": 11570 }, { "epoch": 0.4393607065038682, "grad_norm": 0.279296875, "learning_rate": 0.0003542399255358152, "loss": 2.8930639266967773, "step": 11580 }, { "epoch": 0.43974011989463146, "grad_norm": 0.28125, "learning_rate": 0.0003541640201243414, "loss": 2.910004806518555, "step": 11590 }, { "epoch": 0.44011953328539477, "grad_norm": 0.2578125, "learning_rate": 0.000354088059956981, "loss": 2.892741012573242, "step": 11600 }, { "epoch": 0.440498946676158, "grad_norm": 0.275390625, "learning_rate": 0.00035401204506071364, "loss": 2.8916036605834963, "step": 11610 }, { "epoch": 0.44087836006692127, "grad_norm": 0.275390625, "learning_rate": 0.00035393597546253805, "loss": 2.8900167465209963, "step": 11620 }, { "epoch": 0.4412577734576846, "grad_norm": 0.26953125, "learning_rate": 0.00035385985118947273, "loss": 2.939387321472168, "step": 11630 }, { "epoch": 0.4416371868484478, "grad_norm": 0.26953125, "learning_rate": 0.0003537836722685555, "loss": 2.885766792297363, "step": 11640 }, { "epoch": 0.44201660023921113, "grad_norm": 0.271484375, "learning_rate": 0.0003537074387268433, "loss": 2.923074150085449, "step": 11650 }, { "epoch": 0.4423960136299744, "grad_norm": 0.251953125, "learning_rate": 0.00035363115059141296, "loss": 2.917004203796387, "step": 11660 }, { "epoch": 0.44277542702073763, "grad_norm": 0.283203125, "learning_rate": 0.00035355480788936035, "loss": 2.9097877502441407, "step": 11670 }, { "epoch": 0.44315484041150094, "grad_norm": 0.25390625, "learning_rate": 0.00035347841064780087, "loss": 2.9321573257446287, "step": 11680 }, { "epoch": 0.4435342538022642, "grad_norm": 0.2578125, "learning_rate": 0.0003534019588938692, "loss": 2.9241443634033204, "step": 11690 }, { "epoch": 0.44391366719302744, "grad_norm": 0.255859375, "learning_rate": 0.00035332545265471946, "loss": 2.893677520751953, "step": 11700 }, { "epoch": 0.44429308058379074, "grad_norm": 0.263671875, "learning_rate": 0.0003532488919575251, "loss": 2.942158317565918, "step": 11710 }, { "epoch": 0.444672493974554, "grad_norm": 0.275390625, "learning_rate": 0.00035317227682947884, "loss": 2.9495027542114256, "step": 11720 }, { "epoch": 0.4450519073653173, "grad_norm": 0.26953125, "learning_rate": 0.00035309560729779294, "loss": 2.929840660095215, "step": 11730 }, { "epoch": 0.44543132075608055, "grad_norm": 0.27734375, "learning_rate": 0.0003530188833896986, "loss": 2.925653839111328, "step": 11740 }, { "epoch": 0.4458107341468438, "grad_norm": 0.30078125, "learning_rate": 0.0003529421051324469, "loss": 2.916109085083008, "step": 11750 }, { "epoch": 0.4458107341468438, "eval_loss": 2.9037134647369385, "eval_runtime": 189.8764, "eval_samples_per_second": 20.06, "eval_steps_per_second": 3.344, "step": 11750 }, { "epoch": 0.4461901475376071, "grad_norm": 0.2734375, "learning_rate": 0.00035286527255330756, "loss": 2.953069305419922, "step": 11760 }, { "epoch": 0.44656956092837036, "grad_norm": 0.279296875, "learning_rate": 0.00035278838567957013, "loss": 2.903864097595215, "step": 11770 }, { "epoch": 0.44694897431913366, "grad_norm": 0.26953125, "learning_rate": 0.0003527114445385432, "loss": 2.909090042114258, "step": 11780 }, { "epoch": 0.4473283877098969, "grad_norm": 0.26953125, "learning_rate": 0.0003526344491575547, "loss": 2.9179216384887696, "step": 11790 }, { "epoch": 0.44770780110066016, "grad_norm": 0.26171875, "learning_rate": 0.00035255739956395173, "loss": 2.9061370849609376, "step": 11800 }, { "epoch": 0.44808721449142347, "grad_norm": 0.2578125, "learning_rate": 0.0003524802957851008, "loss": 2.8820926666259767, "step": 11810 }, { "epoch": 0.4484666278821867, "grad_norm": 0.271484375, "learning_rate": 0.00035240313784838754, "loss": 2.9247768402099608, "step": 11820 }, { "epoch": 0.44884604127294997, "grad_norm": 0.267578125, "learning_rate": 0.00035232592578121684, "loss": 2.9411748886108398, "step": 11830 }, { "epoch": 0.4492254546637133, "grad_norm": 0.267578125, "learning_rate": 0.000352248659611013, "loss": 2.9075206756591796, "step": 11840 }, { "epoch": 0.4496048680544765, "grad_norm": 0.28515625, "learning_rate": 0.0003521713393652191, "loss": 2.9161445617675783, "step": 11850 }, { "epoch": 0.44998428144523983, "grad_norm": 0.259765625, "learning_rate": 0.00035209396507129793, "loss": 2.920516777038574, "step": 11860 }, { "epoch": 0.4503636948360031, "grad_norm": 0.26953125, "learning_rate": 0.00035201653675673116, "loss": 2.87536563873291, "step": 11870 }, { "epoch": 0.45074310822676633, "grad_norm": 0.2734375, "learning_rate": 0.00035193905444901973, "loss": 2.866943359375, "step": 11880 }, { "epoch": 0.45112252161752964, "grad_norm": 0.265625, "learning_rate": 0.00035186151817568376, "loss": 2.8700429916381838, "step": 11890 }, { "epoch": 0.4515019350082929, "grad_norm": 0.259765625, "learning_rate": 0.00035178392796426255, "loss": 2.9141921997070312, "step": 11900 }, { "epoch": 0.45188134839905614, "grad_norm": 0.267578125, "learning_rate": 0.00035170628384231453, "loss": 2.916742134094238, "step": 11910 }, { "epoch": 0.45226076178981944, "grad_norm": 0.267578125, "learning_rate": 0.00035162858583741736, "loss": 2.901100921630859, "step": 11920 }, { "epoch": 0.4526401751805827, "grad_norm": 0.263671875, "learning_rate": 0.00035155083397716766, "loss": 2.9013051986694336, "step": 11930 }, { "epoch": 0.453019588571346, "grad_norm": 0.263671875, "learning_rate": 0.00035147302828918137, "loss": 2.910516357421875, "step": 11940 }, { "epoch": 0.45339900196210925, "grad_norm": 0.271484375, "learning_rate": 0.00035139516880109344, "loss": 2.9013132095336913, "step": 11950 }, { "epoch": 0.4537784153528725, "grad_norm": 0.2890625, "learning_rate": 0.000351317255540558, "loss": 2.886016845703125, "step": 11960 }, { "epoch": 0.4541578287436358, "grad_norm": 0.267578125, "learning_rate": 0.00035123928853524815, "loss": 2.919955825805664, "step": 11970 }, { "epoch": 0.45453724213439906, "grad_norm": 0.265625, "learning_rate": 0.0003511612678128562, "loss": 2.911751937866211, "step": 11980 }, { "epoch": 0.45491665552516236, "grad_norm": 0.439453125, "learning_rate": 0.0003510831934010936, "loss": 2.9212507247924804, "step": 11990 }, { "epoch": 0.4552960689159256, "grad_norm": 0.271484375, "learning_rate": 0.0003510050653276906, "loss": 2.901380729675293, "step": 12000 }, { "epoch": 0.4552960689159256, "eval_loss": 2.901052236557007, "eval_runtime": 196.735, "eval_samples_per_second": 19.361, "eval_steps_per_second": 3.228, "step": 12000 }, { "epoch": 0.45567548230668886, "grad_norm": 0.298828125, "learning_rate": 0.00035092688362039684, "loss": 2.8962764739990234, "step": 12010 }, { "epoch": 0.45605489569745217, "grad_norm": 0.263671875, "learning_rate": 0.0003508486483069808, "loss": 2.882288360595703, "step": 12020 }, { "epoch": 0.4564343090882154, "grad_norm": 0.2734375, "learning_rate": 0.00035077035941523003, "loss": 2.908757972717285, "step": 12030 }, { "epoch": 0.45681372247897867, "grad_norm": 0.26171875, "learning_rate": 0.00035069201697295114, "loss": 2.9177759170532225, "step": 12040 }, { "epoch": 0.457193135869742, "grad_norm": 0.259765625, "learning_rate": 0.00035061362100796976, "loss": 2.9017444610595704, "step": 12050 }, { "epoch": 0.4575725492605052, "grad_norm": 0.259765625, "learning_rate": 0.0003505351715481305, "loss": 2.9235309600830077, "step": 12060 }, { "epoch": 0.45795196265126853, "grad_norm": 0.3046875, "learning_rate": 0.000350456668621297, "loss": 2.911620330810547, "step": 12070 }, { "epoch": 0.4583313760420318, "grad_norm": 0.275390625, "learning_rate": 0.00035037811225535197, "loss": 2.9152334213256834, "step": 12080 }, { "epoch": 0.45871078943279503, "grad_norm": 0.25390625, "learning_rate": 0.0003502995024781969, "loss": 2.9247509002685548, "step": 12090 }, { "epoch": 0.45909020282355834, "grad_norm": 0.267578125, "learning_rate": 0.0003502208393177524, "loss": 2.9247882843017576, "step": 12100 }, { "epoch": 0.4594696162143216, "grad_norm": 0.28125, "learning_rate": 0.00035014212280195807, "loss": 2.88739013671875, "step": 12110 }, { "epoch": 0.45984902960508484, "grad_norm": 0.279296875, "learning_rate": 0.0003500633529587723, "loss": 2.8954135894775392, "step": 12120 }, { "epoch": 0.46022844299584814, "grad_norm": 0.28515625, "learning_rate": 0.00034998452981617256, "loss": 2.8863954544067383, "step": 12130 }, { "epoch": 0.4606078563866114, "grad_norm": 0.263671875, "learning_rate": 0.00034990565340215515, "loss": 2.9145456314086915, "step": 12140 }, { "epoch": 0.4609872697773747, "grad_norm": 0.28125, "learning_rate": 0.00034982672374473544, "loss": 2.918132209777832, "step": 12150 }, { "epoch": 0.46136668316813795, "grad_norm": 0.275390625, "learning_rate": 0.00034974774087194764, "loss": 2.9065771102905273, "step": 12160 }, { "epoch": 0.4617460965589012, "grad_norm": 0.25390625, "learning_rate": 0.00034966870481184466, "loss": 2.8814773559570312, "step": 12170 }, { "epoch": 0.4621255099496645, "grad_norm": 0.27734375, "learning_rate": 0.0003495896155924987, "loss": 2.92382869720459, "step": 12180 }, { "epoch": 0.46250492334042775, "grad_norm": 0.287109375, "learning_rate": 0.0003495104732420005, "loss": 2.8972146987915037, "step": 12190 }, { "epoch": 0.46288433673119106, "grad_norm": 0.265625, "learning_rate": 0.00034943127778845986, "loss": 2.90264892578125, "step": 12200 }, { "epoch": 0.4632637501219543, "grad_norm": 0.2578125, "learning_rate": 0.0003493520292600053, "loss": 2.8896596908569334, "step": 12210 }, { "epoch": 0.46364316351271756, "grad_norm": 0.259765625, "learning_rate": 0.0003492727276847843, "loss": 2.8881372451782226, "step": 12220 }, { "epoch": 0.46402257690348087, "grad_norm": 0.26953125, "learning_rate": 0.00034919337309096316, "loss": 2.8884620666503906, "step": 12230 }, { "epoch": 0.4644019902942441, "grad_norm": 0.26953125, "learning_rate": 0.000349113965506727, "loss": 2.922469139099121, "step": 12240 }, { "epoch": 0.46478140368500737, "grad_norm": 0.265625, "learning_rate": 0.0003490345049602797, "loss": 2.906755256652832, "step": 12250 }, { "epoch": 0.46478140368500737, "eval_loss": 2.8956754207611084, "eval_runtime": 189.455, "eval_samples_per_second": 20.105, "eval_steps_per_second": 3.352, "step": 12250 }, { "epoch": 0.46516081707577067, "grad_norm": 0.283203125, "learning_rate": 0.0003489549914798441, "loss": 2.9177387237548826, "step": 12260 }, { "epoch": 0.4655402304665339, "grad_norm": 0.267578125, "learning_rate": 0.00034887542509366167, "loss": 2.9163440704345702, "step": 12270 }, { "epoch": 0.4659196438572972, "grad_norm": 0.26171875, "learning_rate": 0.00034879580582999285, "loss": 2.911612129211426, "step": 12280 }, { "epoch": 0.4662990572480605, "grad_norm": 0.263671875, "learning_rate": 0.00034871613371711664, "loss": 2.8895673751831055, "step": 12290 }, { "epoch": 0.46667847063882373, "grad_norm": 0.302734375, "learning_rate": 0.00034863640878333107, "loss": 2.9211931228637695, "step": 12300 }, { "epoch": 0.46705788402958703, "grad_norm": 0.265625, "learning_rate": 0.00034855663105695265, "loss": 2.936489486694336, "step": 12310 }, { "epoch": 0.4674372974203503, "grad_norm": 0.26171875, "learning_rate": 0.0003484768005663169, "loss": 2.9057601928710937, "step": 12320 }, { "epoch": 0.46781671081111353, "grad_norm": 0.27734375, "learning_rate": 0.00034839691733977793, "loss": 2.8885934829711912, "step": 12330 }, { "epoch": 0.46819612420187684, "grad_norm": 0.28515625, "learning_rate": 0.0003483169814057086, "loss": 2.899825668334961, "step": 12340 }, { "epoch": 0.4685755375926401, "grad_norm": 0.28125, "learning_rate": 0.0003482369927925006, "loss": 2.9121450424194335, "step": 12350 }, { "epoch": 0.4689549509834034, "grad_norm": 0.26171875, "learning_rate": 0.000348156951528564, "loss": 2.879154014587402, "step": 12360 }, { "epoch": 0.46933436437416665, "grad_norm": 0.263671875, "learning_rate": 0.0003480768576423281, "loss": 2.8822216033935546, "step": 12370 }, { "epoch": 0.4697137777649299, "grad_norm": 0.2734375, "learning_rate": 0.0003479967111622404, "loss": 2.897949981689453, "step": 12380 }, { "epoch": 0.4700931911556932, "grad_norm": 0.26953125, "learning_rate": 0.00034791651211676743, "loss": 2.8716941833496095, "step": 12390 }, { "epoch": 0.47047260454645645, "grad_norm": 0.271484375, "learning_rate": 0.0003478362605343941, "loss": 2.8937549591064453, "step": 12400 }, { "epoch": 0.47085201793721976, "grad_norm": 0.26953125, "learning_rate": 0.0003477559564436242, "loss": 2.8700630187988283, "step": 12410 }, { "epoch": 0.471231431327983, "grad_norm": 0.267578125, "learning_rate": 0.0003476755998729801, "loss": 2.898262786865234, "step": 12420 }, { "epoch": 0.47161084471874626, "grad_norm": 0.283203125, "learning_rate": 0.0003475951908510028, "loss": 2.9243221282958984, "step": 12430 }, { "epoch": 0.47199025810950956, "grad_norm": 0.26953125, "learning_rate": 0.0003475147294062518, "loss": 2.8873620986938477, "step": 12440 }, { "epoch": 0.4723696715002728, "grad_norm": 0.267578125, "learning_rate": 0.00034743421556730547, "loss": 2.901606559753418, "step": 12450 }, { "epoch": 0.47274908489103606, "grad_norm": 0.265625, "learning_rate": 0.00034735364936276067, "loss": 2.9185293197631834, "step": 12460 }, { "epoch": 0.47312849828179937, "grad_norm": 0.27734375, "learning_rate": 0.0003472730308212328, "loss": 2.9146888732910154, "step": 12470 }, { "epoch": 0.4735079116725626, "grad_norm": 0.2734375, "learning_rate": 0.000347192359971356, "loss": 2.8683282852172853, "step": 12480 }, { "epoch": 0.4738873250633259, "grad_norm": 0.2578125, "learning_rate": 0.0003471116368417828, "loss": 2.8802421569824217, "step": 12490 }, { "epoch": 0.4742667384540892, "grad_norm": 0.259765625, "learning_rate": 0.00034703086146118447, "loss": 2.8718109130859375, "step": 12500 }, { "epoch": 0.4742667384540892, "eval_loss": 2.8921854496002197, "eval_runtime": 281.8551, "eval_samples_per_second": 13.514, "eval_steps_per_second": 2.253, "step": 12500 }, { "epoch": 0.4746461518448524, "grad_norm": 0.271484375, "learning_rate": 0.00034695003385825073, "loss": 2.901880645751953, "step": 12510 }, { "epoch": 0.47502556523561573, "grad_norm": 0.265625, "learning_rate": 0.0003468691540616898, "loss": 2.921336364746094, "step": 12520 }, { "epoch": 0.475404978626379, "grad_norm": 0.263671875, "learning_rate": 0.00034678822210022867, "loss": 2.9002851486206054, "step": 12530 }, { "epoch": 0.47578439201714223, "grad_norm": 0.265625, "learning_rate": 0.0003467072380026126, "loss": 2.8984004974365236, "step": 12540 }, { "epoch": 0.47616380540790554, "grad_norm": 0.255859375, "learning_rate": 0.0003466262017976055, "loss": 2.9022024154663084, "step": 12550 }, { "epoch": 0.4765432187986688, "grad_norm": 0.263671875, "learning_rate": 0.0003465451135139898, "loss": 2.909968376159668, "step": 12560 }, { "epoch": 0.4769226321894321, "grad_norm": 0.265625, "learning_rate": 0.00034646397318056633, "loss": 2.891847038269043, "step": 12570 }, { "epoch": 0.47730204558019534, "grad_norm": 0.2578125, "learning_rate": 0.0003463827808261545, "loss": 2.887018013000488, "step": 12580 }, { "epoch": 0.4776814589709586, "grad_norm": 0.2578125, "learning_rate": 0.0003463015364795921, "loss": 2.8730905532836912, "step": 12590 }, { "epoch": 0.4780608723617219, "grad_norm": 0.26953125, "learning_rate": 0.00034622024016973556, "loss": 2.8928932189941405, "step": 12600 }, { "epoch": 0.47844028575248515, "grad_norm": 0.26953125, "learning_rate": 0.00034613889192545955, "loss": 2.877326011657715, "step": 12610 }, { "epoch": 0.47881969914324846, "grad_norm": 0.2578125, "learning_rate": 0.00034605749177565745, "loss": 2.9068271636962892, "step": 12620 }, { "epoch": 0.4791991125340117, "grad_norm": 0.265625, "learning_rate": 0.00034597603974924075, "loss": 2.907305145263672, "step": 12630 }, { "epoch": 0.47957852592477496, "grad_norm": 0.29296875, "learning_rate": 0.0003458945358751396, "loss": 2.8910600662231447, "step": 12640 }, { "epoch": 0.47995793931553826, "grad_norm": 0.263671875, "learning_rate": 0.00034581298018230243, "loss": 2.893414115905762, "step": 12650 }, { "epoch": 0.4803373527063015, "grad_norm": 0.275390625, "learning_rate": 0.00034573137269969625, "loss": 2.900663948059082, "step": 12660 }, { "epoch": 0.48071676609706476, "grad_norm": 0.275390625, "learning_rate": 0.0003456497134563063, "loss": 2.86352653503418, "step": 12670 }, { "epoch": 0.48109617948782807, "grad_norm": 0.2734375, "learning_rate": 0.0003455680024811363, "loss": 2.894476890563965, "step": 12680 }, { "epoch": 0.4814755928785913, "grad_norm": 0.28125, "learning_rate": 0.00034548623980320826, "loss": 2.899493408203125, "step": 12690 }, { "epoch": 0.4818550062693546, "grad_norm": 0.26171875, "learning_rate": 0.0003454044254515626, "loss": 2.885264015197754, "step": 12700 }, { "epoch": 0.4822344196601179, "grad_norm": 0.279296875, "learning_rate": 0.0003453225594552581, "loss": 2.9041810989379884, "step": 12710 }, { "epoch": 0.4826138330508811, "grad_norm": 0.271484375, "learning_rate": 0.000345240641843372, "loss": 2.8803989410400392, "step": 12720 }, { "epoch": 0.48299324644164443, "grad_norm": 0.263671875, "learning_rate": 0.0003451586726449995, "loss": 2.8760604858398438, "step": 12730 }, { "epoch": 0.4833726598324077, "grad_norm": 0.2890625, "learning_rate": 0.00034507665188925464, "loss": 2.8881351470947267, "step": 12740 }, { "epoch": 0.48375207322317093, "grad_norm": 0.279296875, "learning_rate": 0.00034499457960526927, "loss": 2.8899887084960936, "step": 12750 }, { "epoch": 0.48375207322317093, "eval_loss": 2.88950252532959, "eval_runtime": 205.1987, "eval_samples_per_second": 18.562, "eval_steps_per_second": 3.095, "step": 12750 }, { "epoch": 0.48413148661393424, "grad_norm": 0.275390625, "learning_rate": 0.00034491245582219396, "loss": 2.888238525390625, "step": 12760 }, { "epoch": 0.4845109000046975, "grad_norm": 0.291015625, "learning_rate": 0.0003448302805691973, "loss": 2.9086492538452147, "step": 12770 }, { "epoch": 0.4848903133954608, "grad_norm": 0.27734375, "learning_rate": 0.00034474805387546624, "loss": 2.898469161987305, "step": 12780 }, { "epoch": 0.48526972678622404, "grad_norm": 0.310546875, "learning_rate": 0.00034466577577020607, "loss": 2.9102657318115233, "step": 12790 }, { "epoch": 0.4856491401769873, "grad_norm": 0.267578125, "learning_rate": 0.0003445834462826402, "loss": 2.9056089401245115, "step": 12800 }, { "epoch": 0.4860285535677506, "grad_norm": 0.26953125, "learning_rate": 0.00034450106544201036, "loss": 2.885097694396973, "step": 12810 }, { "epoch": 0.48640796695851385, "grad_norm": 0.271484375, "learning_rate": 0.0003444186332775766, "loss": 2.8820755004882814, "step": 12820 }, { "epoch": 0.48678738034927715, "grad_norm": 0.265625, "learning_rate": 0.00034433614981861714, "loss": 2.839862823486328, "step": 12830 }, { "epoch": 0.4871667937400404, "grad_norm": 0.259765625, "learning_rate": 0.00034425361509442834, "loss": 2.886923599243164, "step": 12840 }, { "epoch": 0.48754620713080365, "grad_norm": 0.259765625, "learning_rate": 0.0003441710291343249, "loss": 2.8864770889282227, "step": 12850 }, { "epoch": 0.48792562052156696, "grad_norm": 0.259765625, "learning_rate": 0.0003440883919676396, "loss": 2.8808919906616213, "step": 12860 }, { "epoch": 0.4883050339123302, "grad_norm": 0.26953125, "learning_rate": 0.00034400570362372345, "loss": 2.900347900390625, "step": 12870 }, { "epoch": 0.48868444730309346, "grad_norm": 0.283203125, "learning_rate": 0.00034392296413194575, "loss": 2.897147560119629, "step": 12880 }, { "epoch": 0.48906386069385677, "grad_norm": 0.275390625, "learning_rate": 0.0003438401735216937, "loss": 2.9063608169555666, "step": 12890 }, { "epoch": 0.48944327408462, "grad_norm": 0.271484375, "learning_rate": 0.00034375733182237295, "loss": 2.882866859436035, "step": 12900 }, { "epoch": 0.4898226874753833, "grad_norm": 0.26953125, "learning_rate": 0.0003436744390634071, "loss": 2.9291515350341797, "step": 12910 }, { "epoch": 0.4902021008661466, "grad_norm": 0.267578125, "learning_rate": 0.00034359149527423804, "loss": 2.9063337326049803, "step": 12920 }, { "epoch": 0.4905815142569098, "grad_norm": 0.283203125, "learning_rate": 0.00034350850048432555, "loss": 2.906305503845215, "step": 12930 }, { "epoch": 0.49096092764767313, "grad_norm": 0.275390625, "learning_rate": 0.0003434254547231478, "loss": 2.9235183715820314, "step": 12940 }, { "epoch": 0.4913403410384364, "grad_norm": 0.283203125, "learning_rate": 0.00034334235802020095, "loss": 2.8807069778442385, "step": 12950 }, { "epoch": 0.49171975442919963, "grad_norm": 0.267578125, "learning_rate": 0.0003432592104049991, "loss": 2.893384552001953, "step": 12960 }, { "epoch": 0.49209916781996293, "grad_norm": 0.275390625, "learning_rate": 0.0003431760119070747, "loss": 2.9080455780029295, "step": 12970 }, { "epoch": 0.4924785812107262, "grad_norm": 0.271484375, "learning_rate": 0.0003430927625559781, "loss": 2.8736438751220703, "step": 12980 }, { "epoch": 0.4928579946014895, "grad_norm": 0.28125, "learning_rate": 0.00034300946238127766, "loss": 2.897304153442383, "step": 12990 }, { "epoch": 0.49323740799225274, "grad_norm": 0.287109375, "learning_rate": 0.00034292611141256, "loss": 2.882571220397949, "step": 13000 }, { "epoch": 0.49323740799225274, "eval_loss": 2.88731050491333, "eval_runtime": 196.7278, "eval_samples_per_second": 19.362, "eval_steps_per_second": 3.228, "step": 13000 }, { "epoch": 0.493616821383016, "grad_norm": 0.279296875, "learning_rate": 0.00034284270967942975, "loss": 2.8934207916259767, "step": 13010 }, { "epoch": 0.4939962347737793, "grad_norm": 0.2578125, "learning_rate": 0.0003427592572115093, "loss": 2.8786611557006836, "step": 13020 }, { "epoch": 0.49437564816454255, "grad_norm": 0.26171875, "learning_rate": 0.00034267575403843935, "loss": 2.882377052307129, "step": 13030 }, { "epoch": 0.49475506155530585, "grad_norm": 0.263671875, "learning_rate": 0.0003425922001898785, "loss": 2.89440975189209, "step": 13040 }, { "epoch": 0.4951344749460691, "grad_norm": 0.26953125, "learning_rate": 0.00034250859569550334, "loss": 2.906833839416504, "step": 13050 }, { "epoch": 0.49551388833683235, "grad_norm": 0.271484375, "learning_rate": 0.0003424249405850085, "loss": 2.902480125427246, "step": 13060 }, { "epoch": 0.49589330172759566, "grad_norm": 0.265625, "learning_rate": 0.00034234123488810644, "loss": 2.859716796875, "step": 13070 }, { "epoch": 0.4962727151183589, "grad_norm": 0.27734375, "learning_rate": 0.0003422574786345279, "loss": 2.900169563293457, "step": 13080 }, { "epoch": 0.49665212850912216, "grad_norm": 0.31640625, "learning_rate": 0.0003421736718540212, "loss": 2.8748201370239257, "step": 13090 }, { "epoch": 0.49703154189988547, "grad_norm": 0.275390625, "learning_rate": 0.0003420898145763529, "loss": 2.8805509567260743, "step": 13100 }, { "epoch": 0.4974109552906487, "grad_norm": 0.28515625, "learning_rate": 0.0003420059068313073, "loss": 2.9107852935791017, "step": 13110 }, { "epoch": 0.497790368681412, "grad_norm": 0.291015625, "learning_rate": 0.0003419219486486867, "loss": 2.8872940063476564, "step": 13120 }, { "epoch": 0.49816978207217527, "grad_norm": 0.271484375, "learning_rate": 0.00034183794005831136, "loss": 2.876520347595215, "step": 13130 }, { "epoch": 0.4985491954629385, "grad_norm": 0.291015625, "learning_rate": 0.00034175388109001945, "loss": 2.910264778137207, "step": 13140 }, { "epoch": 0.4989286088537018, "grad_norm": 0.26953125, "learning_rate": 0.00034166977177366685, "loss": 2.9189374923706053, "step": 13150 }, { "epoch": 0.4993080222444651, "grad_norm": 0.271484375, "learning_rate": 0.0003415856121391276, "loss": 2.8703514099121095, "step": 13160 }, { "epoch": 0.4996874356352283, "grad_norm": 0.27734375, "learning_rate": 0.0003415014022162934, "loss": 2.890447235107422, "step": 13170 }, { "epoch": 0.5000668490259916, "grad_norm": 0.267578125, "learning_rate": 0.0003414171420350738, "loss": 2.8978090286254883, "step": 13180 }, { "epoch": 0.5004462624167549, "grad_norm": 0.255859375, "learning_rate": 0.00034133283162539644, "loss": 2.9094972610473633, "step": 13190 }, { "epoch": 0.5008256758075181, "grad_norm": 0.296875, "learning_rate": 0.00034124847101720657, "loss": 2.9067922592163087, "step": 13200 }, { "epoch": 0.5012050891982814, "grad_norm": 0.26953125, "learning_rate": 0.0003411640602404673, "loss": 2.8628549575805664, "step": 13210 }, { "epoch": 0.5015845025890447, "grad_norm": 0.265625, "learning_rate": 0.0003410795993251597, "loss": 2.867296028137207, "step": 13220 }, { "epoch": 0.5019639159798079, "grad_norm": 0.275390625, "learning_rate": 0.0003409950883012825, "loss": 2.8983026504516602, "step": 13230 }, { "epoch": 0.5023433293705712, "grad_norm": 0.267578125, "learning_rate": 0.0003409105271988522, "loss": 2.901285743713379, "step": 13240 }, { "epoch": 0.5027227427613346, "grad_norm": 0.27734375, "learning_rate": 0.0003408259160479032, "loss": 2.8835704803466795, "step": 13250 }, { "epoch": 0.5027227427613346, "eval_loss": 2.885258674621582, "eval_runtime": 189.3384, "eval_samples_per_second": 20.117, "eval_steps_per_second": 3.354, "step": 13250 }, { "epoch": 0.5031021561520977, "grad_norm": 0.2578125, "learning_rate": 0.0003407412548784878, "loss": 2.9079912185668944, "step": 13260 }, { "epoch": 0.503481569542861, "grad_norm": 0.263671875, "learning_rate": 0.0003406565437206757, "loss": 2.876902389526367, "step": 13270 }, { "epoch": 0.5038609829336244, "grad_norm": 0.265625, "learning_rate": 0.0003405717826045546, "loss": 2.860718536376953, "step": 13280 }, { "epoch": 0.5042403963243877, "grad_norm": 0.388671875, "learning_rate": 0.0003404869715602299, "loss": 2.8659982681274414, "step": 13290 }, { "epoch": 0.5046198097151509, "grad_norm": 0.28125, "learning_rate": 0.00034040211061782486, "loss": 2.9216646194458007, "step": 13300 }, { "epoch": 0.5049992231059142, "grad_norm": 0.265625, "learning_rate": 0.00034031719980748027, "loss": 2.884727668762207, "step": 13310 }, { "epoch": 0.5053786364966775, "grad_norm": 0.263671875, "learning_rate": 0.0003402322391593546, "loss": 2.8895191192626952, "step": 13320 }, { "epoch": 0.5057580498874407, "grad_norm": 0.302734375, "learning_rate": 0.0003401472287036243, "loss": 2.864694595336914, "step": 13330 }, { "epoch": 0.506137463278204, "grad_norm": 0.265625, "learning_rate": 0.0003400621684704832, "loss": 2.891633415222168, "step": 13340 }, { "epoch": 0.5065168766689673, "grad_norm": 0.267578125, "learning_rate": 0.000339977058490143, "loss": 2.8838836669921877, "step": 13350 }, { "epoch": 0.5068962900597305, "grad_norm": 0.287109375, "learning_rate": 0.00033989189879283295, "loss": 2.8656589508056642, "step": 13360 }, { "epoch": 0.5072757034504938, "grad_norm": 0.283203125, "learning_rate": 0.0003398066894088001, "loss": 2.8747596740722656, "step": 13370 }, { "epoch": 0.5076551168412571, "grad_norm": 0.28125, "learning_rate": 0.00033972143036830905, "loss": 2.866610527038574, "step": 13380 }, { "epoch": 0.5080345302320203, "grad_norm": 0.26953125, "learning_rate": 0.00033963612170164197, "loss": 2.9004764556884766, "step": 13390 }, { "epoch": 0.5084139436227836, "grad_norm": 0.28125, "learning_rate": 0.0003395507634390989, "loss": 2.88348445892334, "step": 13400 }, { "epoch": 0.5087933570135469, "grad_norm": 0.267578125, "learning_rate": 0.00033946535561099725, "loss": 2.88613338470459, "step": 13410 }, { "epoch": 0.5091727704043101, "grad_norm": 0.2734375, "learning_rate": 0.00033937989824767216, "loss": 2.89587516784668, "step": 13420 }, { "epoch": 0.5095521837950734, "grad_norm": 0.287109375, "learning_rate": 0.00033929439137947627, "loss": 2.8566915512084963, "step": 13430 }, { "epoch": 0.5099315971858367, "grad_norm": 0.26953125, "learning_rate": 0.0003392088350367798, "loss": 2.8892871856689455, "step": 13440 }, { "epoch": 0.5103110105766, "grad_norm": 0.26953125, "learning_rate": 0.00033912322924997077, "loss": 2.873052215576172, "step": 13450 }, { "epoch": 0.5106904239673632, "grad_norm": 0.25390625, "learning_rate": 0.0003390375740494545, "loss": 2.9161916732788087, "step": 13460 }, { "epoch": 0.5110698373581265, "grad_norm": 0.2578125, "learning_rate": 0.0003389518694656539, "loss": 2.887952423095703, "step": 13470 }, { "epoch": 0.5114492507488898, "grad_norm": 0.26953125, "learning_rate": 0.00033886611552900965, "loss": 2.8651264190673826, "step": 13480 }, { "epoch": 0.511828664139653, "grad_norm": 0.26953125, "learning_rate": 0.00033878031226997963, "loss": 2.882272148132324, "step": 13490 }, { "epoch": 0.5122080775304163, "grad_norm": 0.26953125, "learning_rate": 0.0003386944597190394, "loss": 2.859777069091797, "step": 13500 }, { "epoch": 0.5122080775304163, "eval_loss": 2.882176160812378, "eval_runtime": 189.4423, "eval_samples_per_second": 20.106, "eval_steps_per_second": 3.352, "step": 13500 }, { "epoch": 0.5125874909211796, "grad_norm": 0.2578125, "learning_rate": 0.00033860855790668207, "loss": 2.891082763671875, "step": 13510 }, { "epoch": 0.5129669043119428, "grad_norm": 0.26953125, "learning_rate": 0.0003385226068634182, "loss": 2.863940620422363, "step": 13520 }, { "epoch": 0.5133463177027061, "grad_norm": 0.275390625, "learning_rate": 0.00033843660661977574, "loss": 2.8671083450317383, "step": 13530 }, { "epoch": 0.5137257310934694, "grad_norm": 0.29296875, "learning_rate": 0.0003383505572063003, "loss": 2.9107154846191405, "step": 13540 }, { "epoch": 0.5141051444842326, "grad_norm": 0.26953125, "learning_rate": 0.0003382644586535549, "loss": 2.8834100723266602, "step": 13550 }, { "epoch": 0.5144845578749959, "grad_norm": 0.283203125, "learning_rate": 0.0003381783109921199, "loss": 2.888962745666504, "step": 13560 }, { "epoch": 0.5148639712657592, "grad_norm": 0.28125, "learning_rate": 0.00033809211425259313, "loss": 2.8695201873779297, "step": 13570 }, { "epoch": 0.5152433846565225, "grad_norm": 0.27734375, "learning_rate": 0.0003380058684655899, "loss": 2.853811264038086, "step": 13580 }, { "epoch": 0.5156227980472857, "grad_norm": 0.27734375, "learning_rate": 0.00033791957366174306, "loss": 2.8526182174682617, "step": 13590 }, { "epoch": 0.516002211438049, "grad_norm": 0.28515625, "learning_rate": 0.0003378332298717027, "loss": 2.862035942077637, "step": 13600 }, { "epoch": 0.5163816248288123, "grad_norm": 0.263671875, "learning_rate": 0.00033774683712613625, "loss": 2.8514022827148438, "step": 13610 }, { "epoch": 0.5167610382195755, "grad_norm": 0.267578125, "learning_rate": 0.0003376603954557288, "loss": 2.8904678344726564, "step": 13620 }, { "epoch": 0.5171404516103388, "grad_norm": 0.2734375, "learning_rate": 0.0003375739048911826, "loss": 2.8794185638427736, "step": 13630 }, { "epoch": 0.5175198650011021, "grad_norm": 0.271484375, "learning_rate": 0.0003374873654632172, "loss": 2.893148994445801, "step": 13640 }, { "epoch": 0.5178992783918653, "grad_norm": 0.2734375, "learning_rate": 0.00033740077720256973, "loss": 2.8848283767700194, "step": 13650 }, { "epoch": 0.5182786917826286, "grad_norm": 0.27734375, "learning_rate": 0.00033731414013999457, "loss": 2.9167131423950194, "step": 13660 }, { "epoch": 0.518658105173392, "grad_norm": 0.267578125, "learning_rate": 0.00033722745430626335, "loss": 2.877553367614746, "step": 13670 }, { "epoch": 0.5190375185641551, "grad_norm": 0.275390625, "learning_rate": 0.0003371407197321652, "loss": 2.8776737213134767, "step": 13680 }, { "epoch": 0.5194169319549184, "grad_norm": 0.275390625, "learning_rate": 0.00033705393644850633, "loss": 2.8811872482299803, "step": 13690 }, { "epoch": 0.5197963453456818, "grad_norm": 0.265625, "learning_rate": 0.00033696710448611044, "loss": 2.8920482635498046, "step": 13700 }, { "epoch": 0.5201757587364451, "grad_norm": 0.26953125, "learning_rate": 0.00033688022387581856, "loss": 2.883253288269043, "step": 13710 }, { "epoch": 0.5205551721272083, "grad_norm": 0.271484375, "learning_rate": 0.00033679329464848866, "loss": 2.8703418731689454, "step": 13720 }, { "epoch": 0.5209345855179716, "grad_norm": 0.26953125, "learning_rate": 0.0003367063168349964, "loss": 2.8462244033813477, "step": 13730 }, { "epoch": 0.5213139989087349, "grad_norm": 0.28125, "learning_rate": 0.0003366192904662344, "loss": 2.9028038024902343, "step": 13740 }, { "epoch": 0.5216934122994981, "grad_norm": 0.29296875, "learning_rate": 0.00033653221557311276, "loss": 2.85524959564209, "step": 13750 }, { "epoch": 0.5216934122994981, "eval_loss": 2.878309965133667, "eval_runtime": 189.4968, "eval_samples_per_second": 20.101, "eval_steps_per_second": 3.351, "step": 13750 }, { "epoch": 0.5220728256902614, "grad_norm": 0.275390625, "learning_rate": 0.00033644509218655857, "loss": 2.8745561599731446, "step": 13760 }, { "epoch": 0.5224522390810247, "grad_norm": 0.28515625, "learning_rate": 0.0003363579203375163, "loss": 2.8722911834716798, "step": 13770 }, { "epoch": 0.5228316524717879, "grad_norm": 0.2734375, "learning_rate": 0.0003362707000569476, "loss": 2.8912513732910154, "step": 13780 }, { "epoch": 0.5232110658625512, "grad_norm": 0.263671875, "learning_rate": 0.00033618343137583126, "loss": 2.863412284851074, "step": 13790 }, { "epoch": 0.5235904792533145, "grad_norm": 0.2734375, "learning_rate": 0.00033609611432516346, "loss": 2.8992868423461915, "step": 13800 }, { "epoch": 0.5239698926440777, "grad_norm": 0.27734375, "learning_rate": 0.00033600874893595736, "loss": 2.870191192626953, "step": 13810 }, { "epoch": 0.524349306034841, "grad_norm": 0.279296875, "learning_rate": 0.00033592133523924325, "loss": 2.8700557708740235, "step": 13820 }, { "epoch": 0.5247287194256043, "grad_norm": 0.28125, "learning_rate": 0.0003358338732660688, "loss": 2.914918899536133, "step": 13830 }, { "epoch": 0.5251081328163675, "grad_norm": 0.267578125, "learning_rate": 0.0003357463630474987, "loss": 2.8640468597412108, "step": 13840 }, { "epoch": 0.5254875462071308, "grad_norm": 0.267578125, "learning_rate": 0.0003356588046146146, "loss": 2.8855056762695312, "step": 13850 }, { "epoch": 0.5258669595978941, "grad_norm": 0.26953125, "learning_rate": 0.0003355711979985157, "loss": 2.8945425033569334, "step": 13860 }, { "epoch": 0.5262463729886574, "grad_norm": 0.265625, "learning_rate": 0.0003354835432303179, "loss": 2.9081464767456056, "step": 13870 }, { "epoch": 0.5266257863794206, "grad_norm": 0.28125, "learning_rate": 0.00033539584034115446, "loss": 2.874899482727051, "step": 13880 }, { "epoch": 0.5270051997701839, "grad_norm": 0.291015625, "learning_rate": 0.0003353080893621756, "loss": 2.8627292633056642, "step": 13890 }, { "epoch": 0.5273846131609472, "grad_norm": 0.267578125, "learning_rate": 0.00033522029032454864, "loss": 2.904581832885742, "step": 13900 }, { "epoch": 0.5277640265517104, "grad_norm": 0.263671875, "learning_rate": 0.000335132443259458, "loss": 2.915811538696289, "step": 13910 }, { "epoch": 0.5281434399424737, "grad_norm": 0.2578125, "learning_rate": 0.00033504454819810524, "loss": 2.8640283584594726, "step": 13920 }, { "epoch": 0.528522853333237, "grad_norm": 0.27734375, "learning_rate": 0.0003349566051717087, "loss": 2.8480043411254883, "step": 13930 }, { "epoch": 0.5289022667240002, "grad_norm": 0.279296875, "learning_rate": 0.0003348686142115041, "loss": 2.887401580810547, "step": 13940 }, { "epoch": 0.5292816801147635, "grad_norm": 0.26171875, "learning_rate": 0.00033478057534874387, "loss": 2.8980428695678713, "step": 13950 }, { "epoch": 0.5296610935055268, "grad_norm": 0.26953125, "learning_rate": 0.0003346924886146978, "loss": 2.8903518676757813, "step": 13960 }, { "epoch": 0.53004050689629, "grad_norm": 0.28125, "learning_rate": 0.00033460435404065233, "loss": 2.897095489501953, "step": 13970 }, { "epoch": 0.5304199202870533, "grad_norm": 0.29296875, "learning_rate": 0.00033451617165791106, "loss": 2.8625276565551756, "step": 13980 }, { "epoch": 0.5307993336778166, "grad_norm": 0.263671875, "learning_rate": 0.0003344279414977946, "loss": 2.894123840332031, "step": 13990 }, { "epoch": 0.5311787470685799, "grad_norm": 0.287109375, "learning_rate": 0.0003343396635916404, "loss": 2.8842851638793947, "step": 14000 }, { "epoch": 0.5311787470685799, "eval_loss": 2.8747148513793945, "eval_runtime": 189.5063, "eval_samples_per_second": 20.1, "eval_steps_per_second": 3.351, "step": 14000 }, { "epoch": 0.5315581604593431, "grad_norm": 0.283203125, "learning_rate": 0.0003342513379708031, "loss": 2.8951347351074217, "step": 14010 }, { "epoch": 0.5319375738501064, "grad_norm": 0.283203125, "learning_rate": 0.0003341629646666541, "loss": 2.897861862182617, "step": 14020 }, { "epoch": 0.5323169872408697, "grad_norm": 0.28515625, "learning_rate": 0.00033407454371058167, "loss": 2.8755237579345705, "step": 14030 }, { "epoch": 0.5326964006316329, "grad_norm": 0.28125, "learning_rate": 0.0003339860751339912, "loss": 2.865498924255371, "step": 14040 }, { "epoch": 0.5330758140223962, "grad_norm": 0.26953125, "learning_rate": 0.0003338975589683049, "loss": 2.8785919189453124, "step": 14050 }, { "epoch": 0.5334552274131595, "grad_norm": 0.30078125, "learning_rate": 0.0003338089952449619, "loss": 2.889093780517578, "step": 14060 }, { "epoch": 0.5338346408039227, "grad_norm": 0.275390625, "learning_rate": 0.00033372038399541817, "loss": 2.8988523483276367, "step": 14070 }, { "epoch": 0.534214054194686, "grad_norm": 0.26171875, "learning_rate": 0.0003336317252511466, "loss": 2.8755828857421877, "step": 14080 }, { "epoch": 0.5345934675854493, "grad_norm": 0.263671875, "learning_rate": 0.00033354301904363706, "loss": 2.879496955871582, "step": 14090 }, { "epoch": 0.5349728809762125, "grad_norm": 0.2734375, "learning_rate": 0.00033345426540439606, "loss": 2.884984016418457, "step": 14100 }, { "epoch": 0.5353522943669758, "grad_norm": 0.267578125, "learning_rate": 0.00033336546436494707, "loss": 2.8811933517456056, "step": 14110 }, { "epoch": 0.5357317077577392, "grad_norm": 0.275390625, "learning_rate": 0.0003332766159568305, "loss": 2.845806884765625, "step": 14120 }, { "epoch": 0.5361111211485025, "grad_norm": 0.279296875, "learning_rate": 0.0003331877202116033, "loss": 2.8757980346679686, "step": 14130 }, { "epoch": 0.5364905345392657, "grad_norm": 0.28125, "learning_rate": 0.0003330987771608395, "loss": 2.889781379699707, "step": 14140 }, { "epoch": 0.536869947930029, "grad_norm": 0.275390625, "learning_rate": 0.0003330097868361299, "loss": 2.8539974212646486, "step": 14150 }, { "epoch": 0.5372493613207923, "grad_norm": 0.26953125, "learning_rate": 0.0003329207492690819, "loss": 2.865687942504883, "step": 14160 }, { "epoch": 0.5376287747115555, "grad_norm": 0.28515625, "learning_rate": 0.00033283166449131994, "loss": 2.8844167709350588, "step": 14170 }, { "epoch": 0.5380081881023188, "grad_norm": 0.279296875, "learning_rate": 0.00033274253253448495, "loss": 2.8674524307250975, "step": 14180 }, { "epoch": 0.5383876014930821, "grad_norm": 0.279296875, "learning_rate": 0.000332653353430235, "loss": 2.864447593688965, "step": 14190 }, { "epoch": 0.5387670148838453, "grad_norm": 0.27734375, "learning_rate": 0.00033256412721024444, "loss": 2.8396934509277343, "step": 14200 }, { "epoch": 0.5391464282746086, "grad_norm": 0.283203125, "learning_rate": 0.0003324748539062047, "loss": 2.856190299987793, "step": 14210 }, { "epoch": 0.5395258416653719, "grad_norm": 0.291015625, "learning_rate": 0.00033238553354982377, "loss": 2.879445266723633, "step": 14220 }, { "epoch": 0.5399052550561351, "grad_norm": 0.271484375, "learning_rate": 0.00033229616617282647, "loss": 2.8780479431152344, "step": 14230 }, { "epoch": 0.5402846684468984, "grad_norm": 0.291015625, "learning_rate": 0.0003322067518069542, "loss": 2.8638593673706056, "step": 14240 }, { "epoch": 0.5406640818376617, "grad_norm": 0.259765625, "learning_rate": 0.0003321172904839651, "loss": 2.850797653198242, "step": 14250 }, { "epoch": 0.5406640818376617, "eval_loss": 2.874237298965454, "eval_runtime": 189.575, "eval_samples_per_second": 20.092, "eval_steps_per_second": 3.35, "step": 14250 }, { "epoch": 0.5410434952284249, "grad_norm": 0.28125, "learning_rate": 0.00033202778223563405, "loss": 2.8592456817626952, "step": 14260 }, { "epoch": 0.5414229086191882, "grad_norm": 0.27734375, "learning_rate": 0.00033193822709375243, "loss": 2.866367149353027, "step": 14270 }, { "epoch": 0.5418023220099515, "grad_norm": 0.265625, "learning_rate": 0.00033184862509012846, "loss": 2.8618499755859377, "step": 14280 }, { "epoch": 0.5421817354007148, "grad_norm": 0.271484375, "learning_rate": 0.00033175897625658694, "loss": 2.870400810241699, "step": 14290 }, { "epoch": 0.542561148791478, "grad_norm": 0.267578125, "learning_rate": 0.0003316692806249692, "loss": 2.900579833984375, "step": 14300 }, { "epoch": 0.5429405621822413, "grad_norm": 0.302734375, "learning_rate": 0.0003315795382271334, "loss": 2.919598388671875, "step": 14310 }, { "epoch": 0.5433199755730046, "grad_norm": 0.27734375, "learning_rate": 0.0003314897490949541, "loss": 2.8571475982666015, "step": 14320 }, { "epoch": 0.5436993889637678, "grad_norm": 0.275390625, "learning_rate": 0.0003313999132603227, "loss": 2.89233283996582, "step": 14330 }, { "epoch": 0.5440788023545311, "grad_norm": 0.27734375, "learning_rate": 0.00033131003075514677, "loss": 2.856412887573242, "step": 14340 }, { "epoch": 0.5444582157452944, "grad_norm": 0.29296875, "learning_rate": 0.00033122010161135097, "loss": 2.8699941635131836, "step": 14350 }, { "epoch": 0.5448376291360576, "grad_norm": 0.2734375, "learning_rate": 0.0003311301258608762, "loss": 2.905893898010254, "step": 14360 }, { "epoch": 0.5452170425268209, "grad_norm": 0.2734375, "learning_rate": 0.00033104010353568, "loss": 2.853104591369629, "step": 14370 }, { "epoch": 0.5455964559175842, "grad_norm": 0.2734375, "learning_rate": 0.00033095003466773634, "loss": 2.915267753601074, "step": 14380 }, { "epoch": 0.5459758693083474, "grad_norm": 0.26953125, "learning_rate": 0.000330859919289036, "loss": 2.8816125869750975, "step": 14390 }, { "epoch": 0.5463552826991107, "grad_norm": 0.259765625, "learning_rate": 0.000330769757431586, "loss": 2.861949157714844, "step": 14400 }, { "epoch": 0.546734696089874, "grad_norm": 0.271484375, "learning_rate": 0.00033067954912741006, "loss": 2.914785385131836, "step": 14410 }, { "epoch": 0.5471141094806373, "grad_norm": 0.267578125, "learning_rate": 0.0003305892944085482, "loss": 2.861470031738281, "step": 14420 }, { "epoch": 0.5474935228714005, "grad_norm": 0.267578125, "learning_rate": 0.00033049899330705714, "loss": 2.8760744094848634, "step": 14430 }, { "epoch": 0.5478729362621638, "grad_norm": 0.27734375, "learning_rate": 0.0003304086458550099, "loss": 2.8782703399658205, "step": 14440 }, { "epoch": 0.5482523496529271, "grad_norm": 0.263671875, "learning_rate": 0.0003303182520844961, "loss": 2.8577857971191407, "step": 14450 }, { "epoch": 0.5486317630436903, "grad_norm": 0.283203125, "learning_rate": 0.0003302278120276217, "loss": 2.88181209564209, "step": 14460 }, { "epoch": 0.5490111764344536, "grad_norm": 0.2734375, "learning_rate": 0.0003301373257165091, "loss": 2.9021276473999023, "step": 14470 }, { "epoch": 0.5493905898252169, "grad_norm": 0.27734375, "learning_rate": 0.00033004679318329727, "loss": 2.866931915283203, "step": 14480 }, { "epoch": 0.5497700032159801, "grad_norm": 0.2734375, "learning_rate": 0.00032995621446014144, "loss": 2.8773681640625, "step": 14490 }, { "epoch": 0.5501494166067434, "grad_norm": 0.265625, "learning_rate": 0.0003298655895792134, "loss": 2.8274993896484375, "step": 14500 }, { "epoch": 0.5501494166067434, "eval_loss": 2.8709304332733154, "eval_runtime": 189.5978, "eval_samples_per_second": 20.09, "eval_steps_per_second": 3.349, "step": 14500 }, { "epoch": 0.5505288299975067, "grad_norm": 0.291015625, "learning_rate": 0.00032977491857270116, "loss": 2.8536571502685546, "step": 14510 }, { "epoch": 0.5509082433882699, "grad_norm": 0.27734375, "learning_rate": 0.00032968420147280915, "loss": 2.8773061752319338, "step": 14520 }, { "epoch": 0.5512876567790332, "grad_norm": 0.27734375, "learning_rate": 0.0003295934383117583, "loss": 2.8733261108398436, "step": 14530 }, { "epoch": 0.5516670701697965, "grad_norm": 0.271484375, "learning_rate": 0.0003295026291217858, "loss": 2.837931442260742, "step": 14540 }, { "epoch": 0.5520464835605597, "grad_norm": 0.267578125, "learning_rate": 0.00032941177393514524, "loss": 2.8510292053222654, "step": 14550 }, { "epoch": 0.552425896951323, "grad_norm": 0.265625, "learning_rate": 0.00032932087278410646, "loss": 2.905380439758301, "step": 14560 }, { "epoch": 0.5528053103420864, "grad_norm": 0.27734375, "learning_rate": 0.00032922992570095567, "loss": 2.8851261138916016, "step": 14570 }, { "epoch": 0.5531847237328497, "grad_norm": 0.283203125, "learning_rate": 0.00032913893271799543, "loss": 2.8903032302856446, "step": 14580 }, { "epoch": 0.5535641371236129, "grad_norm": 0.27734375, "learning_rate": 0.00032904789386754455, "loss": 2.8438880920410154, "step": 14590 }, { "epoch": 0.5539435505143762, "grad_norm": 0.28125, "learning_rate": 0.0003289568091819382, "loss": 2.870359992980957, "step": 14600 }, { "epoch": 0.5543229639051395, "grad_norm": 0.2578125, "learning_rate": 0.00032886567869352766, "loss": 2.8737766265869142, "step": 14610 }, { "epoch": 0.5547023772959027, "grad_norm": 0.259765625, "learning_rate": 0.00032877450243468077, "loss": 2.8555835723876952, "step": 14620 }, { "epoch": 0.555081790686666, "grad_norm": 0.28515625, "learning_rate": 0.00032868328043778137, "loss": 2.8544408798217775, "step": 14630 }, { "epoch": 0.5554612040774293, "grad_norm": 0.267578125, "learning_rate": 0.0003285920127352297, "loss": 2.8652313232421873, "step": 14640 }, { "epoch": 0.5558406174681925, "grad_norm": 0.271484375, "learning_rate": 0.00032850069935944206, "loss": 2.872265625, "step": 14650 }, { "epoch": 0.5562200308589558, "grad_norm": 0.271484375, "learning_rate": 0.0003284093403428512, "loss": 2.8706295013427736, "step": 14660 }, { "epoch": 0.5565994442497191, "grad_norm": 0.27734375, "learning_rate": 0.0003283179357179059, "loss": 2.8901254653930666, "step": 14670 }, { "epoch": 0.5569788576404823, "grad_norm": 0.26953125, "learning_rate": 0.0003282264855170712, "loss": 2.874666213989258, "step": 14680 }, { "epoch": 0.5573582710312456, "grad_norm": 0.275390625, "learning_rate": 0.00032813498977282837, "loss": 2.8554288864135744, "step": 14690 }, { "epoch": 0.5577376844220089, "grad_norm": 0.279296875, "learning_rate": 0.00032804344851767475, "loss": 2.8764556884765624, "step": 14700 }, { "epoch": 0.5581170978127722, "grad_norm": 0.275390625, "learning_rate": 0.0003279518617841239, "loss": 2.901401710510254, "step": 14710 }, { "epoch": 0.5584965112035354, "grad_norm": 0.267578125, "learning_rate": 0.00032786022960470566, "loss": 2.891232490539551, "step": 14720 }, { "epoch": 0.5588759245942987, "grad_norm": 0.275390625, "learning_rate": 0.00032776855201196574, "loss": 2.8985879898071287, "step": 14730 }, { "epoch": 0.559255337985062, "grad_norm": 0.28125, "learning_rate": 0.0003276768290384663, "loss": 2.8581264495849608, "step": 14740 }, { "epoch": 0.5596347513758252, "grad_norm": 0.2734375, "learning_rate": 0.0003275850607167853, "loss": 2.8644105911254885, "step": 14750 }, { "epoch": 0.5596347513758252, "eval_loss": 2.868222713470459, "eval_runtime": 302.831, "eval_samples_per_second": 12.578, "eval_steps_per_second": 2.097, "step": 14750 }, { "epoch": 0.5600141647665885, "grad_norm": 0.259765625, "learning_rate": 0.0003274932470795171, "loss": 2.8466159820556642, "step": 14760 }, { "epoch": 0.5603935781573518, "grad_norm": 0.26953125, "learning_rate": 0.0003274013881592719, "loss": 2.9147262573242188, "step": 14770 }, { "epoch": 0.560772991548115, "grad_norm": 0.283203125, "learning_rate": 0.0003273094839886761, "loss": 2.8623329162597657, "step": 14780 }, { "epoch": 0.5611524049388783, "grad_norm": 0.26953125, "learning_rate": 0.0003272175346003722, "loss": 2.8627138137817383, "step": 14790 }, { "epoch": 0.5615318183296416, "grad_norm": 0.275390625, "learning_rate": 0.00032712554002701876, "loss": 2.8598634719848635, "step": 14800 }, { "epoch": 0.5619112317204048, "grad_norm": 0.287109375, "learning_rate": 0.0003270335003012903, "loss": 2.913101577758789, "step": 14810 }, { "epoch": 0.5622906451111681, "grad_norm": 0.27734375, "learning_rate": 0.0003269414154558775, "loss": 2.8931283950805664, "step": 14820 }, { "epoch": 0.5626700585019314, "grad_norm": 0.283203125, "learning_rate": 0.000326849285523487, "loss": 2.8532665252685545, "step": 14830 }, { "epoch": 0.5630494718926947, "grad_norm": 0.28515625, "learning_rate": 0.0003267571105368414, "loss": 2.8981000900268556, "step": 14840 }, { "epoch": 0.5634288852834579, "grad_norm": 0.26953125, "learning_rate": 0.0003266648905286794, "loss": 2.884189796447754, "step": 14850 }, { "epoch": 0.5638082986742212, "grad_norm": 0.283203125, "learning_rate": 0.0003265726255317556, "loss": 2.8672811508178713, "step": 14860 }, { "epoch": 0.5641877120649845, "grad_norm": 0.74609375, "learning_rate": 0.0003264803155788407, "loss": 2.864910125732422, "step": 14870 }, { "epoch": 0.5645671254557477, "grad_norm": 0.27734375, "learning_rate": 0.0003263879607027212, "loss": 2.8861379623413086, "step": 14880 }, { "epoch": 0.564946538846511, "grad_norm": 0.26953125, "learning_rate": 0.00032629556093619977, "loss": 2.863608169555664, "step": 14890 }, { "epoch": 0.5653259522372743, "grad_norm": 0.275390625, "learning_rate": 0.00032620311631209484, "loss": 2.876792144775391, "step": 14900 }, { "epoch": 0.5657053656280375, "grad_norm": 0.275390625, "learning_rate": 0.0003261106268632408, "loss": 2.894768714904785, "step": 14910 }, { "epoch": 0.5660847790188008, "grad_norm": 0.279296875, "learning_rate": 0.000326018092622488, "loss": 2.8893054962158202, "step": 14920 }, { "epoch": 0.5664641924095641, "grad_norm": 0.28125, "learning_rate": 0.0003259255136227028, "loss": 2.8600311279296875, "step": 14930 }, { "epoch": 0.5668436058003273, "grad_norm": 0.287109375, "learning_rate": 0.0003258328898967673, "loss": 2.862089729309082, "step": 14940 }, { "epoch": 0.5672230191910906, "grad_norm": 0.26171875, "learning_rate": 0.00032574022147757953, "loss": 2.8550548553466797, "step": 14950 }, { "epoch": 0.5676024325818539, "grad_norm": 0.28125, "learning_rate": 0.0003256475083980534, "loss": 2.8683755874633787, "step": 14960 }, { "epoch": 0.5679818459726171, "grad_norm": 0.265625, "learning_rate": 0.0003255547506911186, "loss": 2.913424491882324, "step": 14970 }, { "epoch": 0.5683612593633804, "grad_norm": 0.275390625, "learning_rate": 0.00032546194838972096, "loss": 2.877158546447754, "step": 14980 }, { "epoch": 0.5687406727541438, "grad_norm": 0.2734375, "learning_rate": 0.00032536910152682183, "loss": 2.8812665939331055, "step": 14990 }, { "epoch": 0.5691200861449071, "grad_norm": 0.287109375, "learning_rate": 0.00032527621013539844, "loss": 2.862234115600586, "step": 15000 }, { "epoch": 0.5691200861449071, "eval_loss": 2.865715265274048, "eval_runtime": 188.6118, "eval_samples_per_second": 20.195, "eval_steps_per_second": 3.367, "step": 15000 }, { "epoch": 0.5694994995356703, "grad_norm": 0.29296875, "learning_rate": 0.00032518327424844404, "loss": 2.8443960189819335, "step": 15010 }, { "epoch": 0.5698789129264336, "grad_norm": 0.28515625, "learning_rate": 0.00032509029389896744, "loss": 2.8964349746704103, "step": 15020 }, { "epoch": 0.5702583263171969, "grad_norm": 0.2734375, "learning_rate": 0.0003249972691199934, "loss": 2.8797428131103517, "step": 15030 }, { "epoch": 0.5706377397079601, "grad_norm": 0.271484375, "learning_rate": 0.0003249041999445624, "loss": 2.8718358993530275, "step": 15040 }, { "epoch": 0.5710171530987234, "grad_norm": 0.28125, "learning_rate": 0.0003248110864057307, "loss": 2.885179901123047, "step": 15050 }, { "epoch": 0.5713965664894867, "grad_norm": 0.291015625, "learning_rate": 0.0003247179285365703, "loss": 2.8658910751342774, "step": 15060 }, { "epoch": 0.5717759798802499, "grad_norm": 0.2734375, "learning_rate": 0.00032462472637016896, "loss": 2.871013069152832, "step": 15070 }, { "epoch": 0.5721553932710132, "grad_norm": 0.267578125, "learning_rate": 0.0003245314799396301, "loss": 2.86574764251709, "step": 15080 }, { "epoch": 0.5725348066617765, "grad_norm": 0.271484375, "learning_rate": 0.000324438189278073, "loss": 2.8825565338134767, "step": 15090 }, { "epoch": 0.5729142200525397, "grad_norm": 0.26953125, "learning_rate": 0.00032434485441863264, "loss": 2.8229644775390623, "step": 15100 }, { "epoch": 0.573293633443303, "grad_norm": 0.271484375, "learning_rate": 0.0003242514753944596, "loss": 2.8549177169799806, "step": 15110 }, { "epoch": 0.5736730468340663, "grad_norm": 0.2890625, "learning_rate": 0.0003241580522387201, "loss": 2.8536392211914063, "step": 15120 }, { "epoch": 0.5740524602248296, "grad_norm": 0.283203125, "learning_rate": 0.00032406458498459624, "loss": 2.8830379486083983, "step": 15130 }, { "epoch": 0.5744318736155928, "grad_norm": 0.28125, "learning_rate": 0.00032397107366528556, "loss": 2.878239631652832, "step": 15140 }, { "epoch": 0.5748112870063561, "grad_norm": 0.279296875, "learning_rate": 0.0003238775183140015, "loss": 2.873665428161621, "step": 15150 }, { "epoch": 0.5751907003971194, "grad_norm": 0.28515625, "learning_rate": 0.0003237839189639728, "loss": 2.877823066711426, "step": 15160 }, { "epoch": 0.5755701137878826, "grad_norm": 0.2734375, "learning_rate": 0.0003236902756484442, "loss": 2.866100311279297, "step": 15170 }, { "epoch": 0.5759495271786459, "grad_norm": 0.27734375, "learning_rate": 0.0003235965884006758, "loss": 2.8416263580322267, "step": 15180 }, { "epoch": 0.5763289405694092, "grad_norm": 0.26953125, "learning_rate": 0.00032350285725394343, "loss": 2.8999397277832033, "step": 15190 }, { "epoch": 0.5767083539601724, "grad_norm": 0.275390625, "learning_rate": 0.0003234090822415383, "loss": 2.8650712966918945, "step": 15200 }, { "epoch": 0.5770877673509357, "grad_norm": 0.28125, "learning_rate": 0.00032331526339676763, "loss": 2.8877965927124025, "step": 15210 }, { "epoch": 0.577467180741699, "grad_norm": 0.287109375, "learning_rate": 0.0003232214007529537, "loss": 2.8510343551635744, "step": 15220 }, { "epoch": 0.5778465941324622, "grad_norm": 0.275390625, "learning_rate": 0.0003231274943434347, "loss": 2.8617279052734377, "step": 15230 }, { "epoch": 0.5782260075232255, "grad_norm": 0.279296875, "learning_rate": 0.00032303354420156426, "loss": 2.8749416351318358, "step": 15240 }, { "epoch": 0.5786054209139888, "grad_norm": 0.271484375, "learning_rate": 0.00032293955036071155, "loss": 2.874312973022461, "step": 15250 }, { "epoch": 0.5786054209139888, "eval_loss": 2.864137649536133, "eval_runtime": 188.9781, "eval_samples_per_second": 20.156, "eval_steps_per_second": 3.36, "step": 15250 }, { "epoch": 0.5789848343047521, "grad_norm": 0.27734375, "learning_rate": 0.0003228455128542612, "loss": 2.8795766830444336, "step": 15260 }, { "epoch": 0.5793642476955153, "grad_norm": 0.279296875, "learning_rate": 0.00032275143171561337, "loss": 2.859217071533203, "step": 15270 }, { "epoch": 0.5797436610862786, "grad_norm": 0.27734375, "learning_rate": 0.0003226573069781838, "loss": 2.8758039474487305, "step": 15280 }, { "epoch": 0.5801230744770419, "grad_norm": 0.298828125, "learning_rate": 0.0003225631386754037, "loss": 2.8386903762817384, "step": 15290 }, { "epoch": 0.5805024878678051, "grad_norm": 0.28125, "learning_rate": 0.00032246892684071964, "loss": 2.8756906509399416, "step": 15300 }, { "epoch": 0.5808819012585684, "grad_norm": 0.279296875, "learning_rate": 0.00032237467150759367, "loss": 2.823270797729492, "step": 15310 }, { "epoch": 0.5812613146493317, "grad_norm": 0.275390625, "learning_rate": 0.0003222803727095035, "loss": 2.850119400024414, "step": 15320 }, { "epoch": 0.5816407280400949, "grad_norm": 0.267578125, "learning_rate": 0.000322186030479942, "loss": 2.8578369140625, "step": 15330 }, { "epoch": 0.5820201414308582, "grad_norm": 0.275390625, "learning_rate": 0.00032209164485241766, "loss": 2.849884033203125, "step": 15340 }, { "epoch": 0.5823995548216215, "grad_norm": 0.267578125, "learning_rate": 0.0003219972158604542, "loss": 2.864615249633789, "step": 15350 }, { "epoch": 0.5827789682123847, "grad_norm": 0.28125, "learning_rate": 0.00032190274353759103, "loss": 2.886347198486328, "step": 15360 }, { "epoch": 0.583158381603148, "grad_norm": 0.271484375, "learning_rate": 0.00032180822791738265, "loss": 2.8679729461669923, "step": 15370 }, { "epoch": 0.5835377949939113, "grad_norm": 0.287109375, "learning_rate": 0.00032171366903339905, "loss": 2.8295059204101562, "step": 15380 }, { "epoch": 0.5839172083846745, "grad_norm": 0.287109375, "learning_rate": 0.0003216190669192257, "loss": 2.863005447387695, "step": 15390 }, { "epoch": 0.5842966217754378, "grad_norm": 0.28515625, "learning_rate": 0.0003215244216084632, "loss": 2.8774999618530273, "step": 15400 }, { "epoch": 0.5846760351662011, "grad_norm": 0.283203125, "learning_rate": 0.0003214297331347277, "loss": 2.8905261993408202, "step": 15410 }, { "epoch": 0.5850554485569645, "grad_norm": 0.28515625, "learning_rate": 0.0003213350015316506, "loss": 2.871562957763672, "step": 15420 }, { "epoch": 0.5854348619477276, "grad_norm": 0.2734375, "learning_rate": 0.0003212402268328786, "loss": 2.8770557403564454, "step": 15430 }, { "epoch": 0.585814275338491, "grad_norm": 0.28125, "learning_rate": 0.00032114540907207363, "loss": 2.862154006958008, "step": 15440 }, { "epoch": 0.5861936887292543, "grad_norm": 0.271484375, "learning_rate": 0.0003210505482829132, "loss": 2.857028007507324, "step": 15450 }, { "epoch": 0.5865731021200175, "grad_norm": 0.28125, "learning_rate": 0.00032095564449908967, "loss": 2.8387771606445313, "step": 15460 }, { "epoch": 0.5869525155107808, "grad_norm": 0.2578125, "learning_rate": 0.0003208606977543111, "loss": 2.8571983337402345, "step": 15470 }, { "epoch": 0.5873319289015441, "grad_norm": 0.2734375, "learning_rate": 0.0003207657080823006, "loss": 2.824666404724121, "step": 15480 }, { "epoch": 0.5877113422923073, "grad_norm": 0.27734375, "learning_rate": 0.00032067067551679653, "loss": 2.8616317749023437, "step": 15490 }, { "epoch": 0.5880907556830706, "grad_norm": 0.291015625, "learning_rate": 0.0003205756000915524, "loss": 2.8443761825561524, "step": 15500 }, { "epoch": 0.5880907556830706, "eval_loss": 2.8611600399017334, "eval_runtime": 189.8559, "eval_samples_per_second": 20.063, "eval_steps_per_second": 3.345, "step": 15500 }, { "epoch": 0.5884701690738339, "grad_norm": 0.28125, "learning_rate": 0.0003204804818403371, "loss": 2.8767419815063477, "step": 15510 }, { "epoch": 0.5888495824645971, "grad_norm": 0.2890625, "learning_rate": 0.00032038532079693477, "loss": 2.8587039947509765, "step": 15520 }, { "epoch": 0.5892289958553604, "grad_norm": 0.275390625, "learning_rate": 0.0003202901169951445, "loss": 2.8834741592407225, "step": 15530 }, { "epoch": 0.5896084092461237, "grad_norm": 0.279296875, "learning_rate": 0.0003201948704687809, "loss": 2.881742477416992, "step": 15540 }, { "epoch": 0.589987822636887, "grad_norm": 0.265625, "learning_rate": 0.0003200995812516734, "loss": 2.8662744522094727, "step": 15550 }, { "epoch": 0.5903672360276502, "grad_norm": 0.2734375, "learning_rate": 0.00032000424937766693, "loss": 2.858658790588379, "step": 15560 }, { "epoch": 0.5907466494184135, "grad_norm": 0.287109375, "learning_rate": 0.0003199088748806212, "loss": 2.8815244674682616, "step": 15570 }, { "epoch": 0.5911260628091768, "grad_norm": 0.29296875, "learning_rate": 0.0003198134577944115, "loss": 2.8601444244384764, "step": 15580 }, { "epoch": 0.59150547619994, "grad_norm": 0.294921875, "learning_rate": 0.00031971799815292784, "loss": 2.8463006973266602, "step": 15590 }, { "epoch": 0.5918848895907033, "grad_norm": 0.28515625, "learning_rate": 0.00031962249599007565, "loss": 2.8536121368408205, "step": 15600 }, { "epoch": 0.5922643029814666, "grad_norm": 0.431640625, "learning_rate": 0.0003195269513397752, "loss": 2.8714309692382813, "step": 15610 }, { "epoch": 0.5926437163722298, "grad_norm": 0.267578125, "learning_rate": 0.00031943136423596204, "loss": 2.8685651779174806, "step": 15620 }, { "epoch": 0.5930231297629931, "grad_norm": 0.2734375, "learning_rate": 0.00031933573471258685, "loss": 2.834940719604492, "step": 15630 }, { "epoch": 0.5934025431537564, "grad_norm": 0.33984375, "learning_rate": 0.00031924006280361505, "loss": 2.8724153518676756, "step": 15640 }, { "epoch": 0.5937819565445196, "grad_norm": 0.283203125, "learning_rate": 0.00031914434854302755, "loss": 2.8475362777709963, "step": 15650 }, { "epoch": 0.5941613699352829, "grad_norm": 0.287109375, "learning_rate": 0.00031904859196482, "loss": 2.8415719985961916, "step": 15660 }, { "epoch": 0.5945407833260462, "grad_norm": 0.298828125, "learning_rate": 0.00031895279310300304, "loss": 2.8470132827758787, "step": 15670 }, { "epoch": 0.5949201967168095, "grad_norm": 0.2734375, "learning_rate": 0.00031885695199160264, "loss": 2.875628662109375, "step": 15680 }, { "epoch": 0.5952996101075727, "grad_norm": 0.2890625, "learning_rate": 0.0003187610686646595, "loss": 2.872110366821289, "step": 15690 }, { "epoch": 0.595679023498336, "grad_norm": 0.291015625, "learning_rate": 0.0003186651431562295, "loss": 2.8807865142822267, "step": 15700 }, { "epoch": 0.5960584368890993, "grad_norm": 0.265625, "learning_rate": 0.0003185691755003832, "loss": 2.868916130065918, "step": 15710 }, { "epoch": 0.5964378502798625, "grad_norm": 0.28515625, "learning_rate": 0.00031847316573120653, "loss": 2.851094627380371, "step": 15720 }, { "epoch": 0.5968172636706258, "grad_norm": 0.287109375, "learning_rate": 0.0003183771138828001, "loss": 2.838751029968262, "step": 15730 }, { "epoch": 0.5971966770613891, "grad_norm": 0.287109375, "learning_rate": 0.0003182810199892796, "loss": 2.8507759094238283, "step": 15740 }, { "epoch": 0.5975760904521523, "grad_norm": 0.28515625, "learning_rate": 0.00031818488408477555, "loss": 2.8495500564575194, "step": 15750 }, { "epoch": 0.5975760904521523, "eval_loss": 2.8591833114624023, "eval_runtime": 188.5941, "eval_samples_per_second": 20.197, "eval_steps_per_second": 3.367, "step": 15750 }, { "epoch": 0.5979555038429156, "grad_norm": 0.271484375, "learning_rate": 0.0003180887062034335, "loss": 2.872244453430176, "step": 15760 }, { "epoch": 0.5983349172336789, "grad_norm": 0.2890625, "learning_rate": 0.00031799248637941376, "loss": 2.862942695617676, "step": 15770 }, { "epoch": 0.5987143306244421, "grad_norm": 0.275390625, "learning_rate": 0.0003178962246468917, "loss": 2.8675159454345702, "step": 15780 }, { "epoch": 0.5990937440152054, "grad_norm": 0.28125, "learning_rate": 0.0003177999210400575, "loss": 2.8549287796020506, "step": 15790 }, { "epoch": 0.5994731574059687, "grad_norm": 0.287109375, "learning_rate": 0.0003177035755931162, "loss": 2.8659143447875977, "step": 15800 }, { "epoch": 0.5998525707967319, "grad_norm": 0.302734375, "learning_rate": 0.0003176071883402878, "loss": 2.8736696243286133, "step": 15810 }, { "epoch": 0.6002319841874952, "grad_norm": 0.287109375, "learning_rate": 0.0003175107593158069, "loss": 2.8427661895751952, "step": 15820 }, { "epoch": 0.6006113975782585, "grad_norm": 0.30078125, "learning_rate": 0.0003174142885539233, "loss": 2.8578039169311524, "step": 15830 }, { "epoch": 0.6009908109690218, "grad_norm": 0.302734375, "learning_rate": 0.00031731777608890127, "loss": 2.8604175567626955, "step": 15840 }, { "epoch": 0.601370224359785, "grad_norm": 0.2890625, "learning_rate": 0.00031722122195502014, "loss": 2.8726522445678713, "step": 15850 }, { "epoch": 0.6017496377505483, "grad_norm": 0.271484375, "learning_rate": 0.0003171246261865739, "loss": 2.875540924072266, "step": 15860 }, { "epoch": 0.6021290511413117, "grad_norm": 0.287109375, "learning_rate": 0.00031702798881787144, "loss": 2.863857078552246, "step": 15870 }, { "epoch": 0.6025084645320748, "grad_norm": 0.28515625, "learning_rate": 0.00031693130988323637, "loss": 2.843907356262207, "step": 15880 }, { "epoch": 0.6028878779228382, "grad_norm": 0.28515625, "learning_rate": 0.0003168345894170069, "loss": 2.8708282470703126, "step": 15890 }, { "epoch": 0.6032672913136015, "grad_norm": 0.283203125, "learning_rate": 0.0003167378274535364, "loss": 2.841860771179199, "step": 15900 }, { "epoch": 0.6036467047043647, "grad_norm": 0.29296875, "learning_rate": 0.0003166410240271925, "loss": 2.8552379608154297, "step": 15910 }, { "epoch": 0.604026118095128, "grad_norm": 0.283203125, "learning_rate": 0.00031654417917235787, "loss": 2.848531723022461, "step": 15920 }, { "epoch": 0.6044055314858913, "grad_norm": 0.26953125, "learning_rate": 0.00031644729292342984, "loss": 2.8708791732788086, "step": 15930 }, { "epoch": 0.6047849448766545, "grad_norm": 0.283203125, "learning_rate": 0.00031635036531482046, "loss": 2.853035545349121, "step": 15940 }, { "epoch": 0.6051643582674178, "grad_norm": 0.26953125, "learning_rate": 0.0003162533963809562, "loss": 2.8781625747680666, "step": 15950 }, { "epoch": 0.6055437716581811, "grad_norm": 0.287109375, "learning_rate": 0.0003161563861562787, "loss": 2.862479019165039, "step": 15960 }, { "epoch": 0.6059231850489444, "grad_norm": 0.27734375, "learning_rate": 0.0003160593346752439, "loss": 2.8837371826171876, "step": 15970 }, { "epoch": 0.6063025984397076, "grad_norm": 0.287109375, "learning_rate": 0.0003159622419723223, "loss": 2.8702125549316406, "step": 15980 }, { "epoch": 0.6066820118304709, "grad_norm": 0.26953125, "learning_rate": 0.00031586510808199954, "loss": 2.859238624572754, "step": 15990 }, { "epoch": 0.6070614252212342, "grad_norm": 0.271484375, "learning_rate": 0.00031576793303877537, "loss": 2.864008903503418, "step": 16000 }, { "epoch": 0.6070614252212342, "eval_loss": 2.856475591659546, "eval_runtime": 189.4965, "eval_samples_per_second": 20.101, "eval_steps_per_second": 3.351, "step": 16000 }, { "epoch": 0.6074408386119974, "grad_norm": 0.275390625, "learning_rate": 0.0003156707168771644, "loss": 2.8809368133544924, "step": 16010 }, { "epoch": 0.6078202520027607, "grad_norm": 0.29296875, "learning_rate": 0.00031557345963169586, "loss": 2.8594266891479494, "step": 16020 }, { "epoch": 0.608199665393524, "grad_norm": 0.3046875, "learning_rate": 0.0003154761613369135, "loss": 2.8480293273925783, "step": 16030 }, { "epoch": 0.6085790787842872, "grad_norm": 0.291015625, "learning_rate": 0.00031537882202737564, "loss": 2.8472461700439453, "step": 16040 }, { "epoch": 0.6089584921750505, "grad_norm": 0.287109375, "learning_rate": 0.00031528144173765524, "loss": 2.850267791748047, "step": 16050 }, { "epoch": 0.6093379055658138, "grad_norm": 0.310546875, "learning_rate": 0.00031518402050233977, "loss": 2.8746599197387694, "step": 16060 }, { "epoch": 0.609717318956577, "grad_norm": 0.291015625, "learning_rate": 0.0003150865583560313, "loss": 2.874387741088867, "step": 16070 }, { "epoch": 0.6100967323473403, "grad_norm": 0.2890625, "learning_rate": 0.0003149890553333462, "loss": 2.8706796646118162, "step": 16080 }, { "epoch": 0.6104761457381036, "grad_norm": 0.28125, "learning_rate": 0.00031489151146891565, "loss": 2.8547346115112306, "step": 16090 }, { "epoch": 0.6108555591288669, "grad_norm": 0.298828125, "learning_rate": 0.0003147939267973853, "loss": 2.866359329223633, "step": 16100 }, { "epoch": 0.6112349725196301, "grad_norm": 0.271484375, "learning_rate": 0.00031469630135341515, "loss": 2.8777492523193358, "step": 16110 }, { "epoch": 0.6116143859103934, "grad_norm": 0.26953125, "learning_rate": 0.0003145986351716797, "loss": 2.8464981079101563, "step": 16120 }, { "epoch": 0.6119937993011567, "grad_norm": 0.28125, "learning_rate": 0.00031450092828686795, "loss": 2.8429281234741213, "step": 16130 }, { "epoch": 0.6123732126919199, "grad_norm": 0.2734375, "learning_rate": 0.0003144031807336835, "loss": 2.8568445205688477, "step": 16140 }, { "epoch": 0.6127526260826832, "grad_norm": 0.298828125, "learning_rate": 0.00031430539254684417, "loss": 2.867697334289551, "step": 16150 }, { "epoch": 0.6131320394734465, "grad_norm": 0.271484375, "learning_rate": 0.0003142075637610824, "loss": 2.868274688720703, "step": 16160 }, { "epoch": 0.6135114528642097, "grad_norm": 0.2734375, "learning_rate": 0.0003141096944111449, "loss": 2.8530643463134764, "step": 16170 }, { "epoch": 0.613890866254973, "grad_norm": 0.28125, "learning_rate": 0.0003140117845317928, "loss": 2.850116157531738, "step": 16180 }, { "epoch": 0.6142702796457363, "grad_norm": 0.263671875, "learning_rate": 0.0003139138341578017, "loss": 2.8635419845581054, "step": 16190 }, { "epoch": 0.6146496930364995, "grad_norm": 0.279296875, "learning_rate": 0.0003138158433239616, "loss": 2.845871353149414, "step": 16200 }, { "epoch": 0.6150291064272628, "grad_norm": 0.275390625, "learning_rate": 0.00031371781206507684, "loss": 2.848284912109375, "step": 16210 }, { "epoch": 0.6154085198180261, "grad_norm": 0.283203125, "learning_rate": 0.00031361974041596607, "loss": 2.870209312438965, "step": 16220 }, { "epoch": 0.6157879332087893, "grad_norm": 0.275390625, "learning_rate": 0.0003135216284114623, "loss": 2.8611610412597654, "step": 16230 }, { "epoch": 0.6161673465995526, "grad_norm": 0.291015625, "learning_rate": 0.00031342347608641287, "loss": 2.8667055130004884, "step": 16240 }, { "epoch": 0.6165467599903159, "grad_norm": 0.275390625, "learning_rate": 0.0003133252834756796, "loss": 2.863849639892578, "step": 16250 }, { "epoch": 0.6165467599903159, "eval_loss": 2.8543365001678467, "eval_runtime": 189.35, "eval_samples_per_second": 20.116, "eval_steps_per_second": 3.354, "step": 16250 }, { "epoch": 0.6169261733810792, "grad_norm": 0.287109375, "learning_rate": 0.0003132270506141384, "loss": 2.8525089263916015, "step": 16260 }, { "epoch": 0.6173055867718424, "grad_norm": 0.279296875, "learning_rate": 0.00031312877753667955, "loss": 2.86279296875, "step": 16270 }, { "epoch": 0.6176850001626057, "grad_norm": 0.265625, "learning_rate": 0.00031303046427820757, "loss": 2.9094947814941405, "step": 16280 }, { "epoch": 0.618064413553369, "grad_norm": 0.271484375, "learning_rate": 0.0003129321108736414, "loss": 2.892106628417969, "step": 16290 }, { "epoch": 0.6184438269441322, "grad_norm": 0.287109375, "learning_rate": 0.00031283371735791416, "loss": 2.8745920181274416, "step": 16300 }, { "epoch": 0.6188232403348956, "grad_norm": 0.283203125, "learning_rate": 0.0003127352837659731, "loss": 2.8670230865478517, "step": 16310 }, { "epoch": 0.6192026537256589, "grad_norm": 0.298828125, "learning_rate": 0.00031263681013277985, "loss": 2.8772226333618165, "step": 16320 }, { "epoch": 0.619582067116422, "grad_norm": 0.28125, "learning_rate": 0.00031253829649331027, "loss": 2.8513071060180666, "step": 16330 }, { "epoch": 0.6199614805071854, "grad_norm": 0.271484375, "learning_rate": 0.00031243974288255427, "loss": 2.884710121154785, "step": 16340 }, { "epoch": 0.6203408938979487, "grad_norm": 0.27734375, "learning_rate": 0.00031234114933551617, "loss": 2.901080513000488, "step": 16350 }, { "epoch": 0.6207203072887119, "grad_norm": 0.283203125, "learning_rate": 0.00031224251588721436, "loss": 2.8679794311523437, "step": 16360 }, { "epoch": 0.6210997206794752, "grad_norm": 0.291015625, "learning_rate": 0.00031214384257268135, "loss": 2.8664682388305662, "step": 16370 }, { "epoch": 0.6214791340702385, "grad_norm": 0.28125, "learning_rate": 0.0003120451294269639, "loss": 2.8127660751342773, "step": 16380 }, { "epoch": 0.6218585474610018, "grad_norm": 0.28515625, "learning_rate": 0.0003119463764851228, "loss": 2.882552146911621, "step": 16390 }, { "epoch": 0.622237960851765, "grad_norm": 0.283203125, "learning_rate": 0.00031184758378223325, "loss": 2.8761360168457033, "step": 16400 }, { "epoch": 0.6226173742425283, "grad_norm": 0.283203125, "learning_rate": 0.0003117487513533843, "loss": 2.8749584197998046, "step": 16410 }, { "epoch": 0.6229967876332916, "grad_norm": 0.2734375, "learning_rate": 0.0003116498792336791, "loss": 2.8521270751953125, "step": 16420 }, { "epoch": 0.6233762010240548, "grad_norm": 0.28125, "learning_rate": 0.00031155096745823506, "loss": 2.889888954162598, "step": 16430 }, { "epoch": 0.6237556144148181, "grad_norm": 0.26953125, "learning_rate": 0.0003114520160621837, "loss": 2.8673093795776365, "step": 16440 }, { "epoch": 0.6241350278055814, "grad_norm": 0.283203125, "learning_rate": 0.00031135302508067034, "loss": 2.8684797286987305, "step": 16450 }, { "epoch": 0.6245144411963446, "grad_norm": 0.29296875, "learning_rate": 0.0003112539945488546, "loss": 2.829659271240234, "step": 16460 }, { "epoch": 0.6248938545871079, "grad_norm": 0.28125, "learning_rate": 0.0003111549245019102, "loss": 2.857742691040039, "step": 16470 }, { "epoch": 0.6252732679778712, "grad_norm": 0.2890625, "learning_rate": 0.00031105581497502477, "loss": 2.850091552734375, "step": 16480 }, { "epoch": 0.6256526813686344, "grad_norm": 0.28125, "learning_rate": 0.00031095666600339976, "loss": 2.8780717849731445, "step": 16490 }, { "epoch": 0.6260320947593977, "grad_norm": 0.27734375, "learning_rate": 0.0003108574776222511, "loss": 2.8638471603393554, "step": 16500 }, { "epoch": 0.6260320947593977, "eval_loss": 2.852236270904541, "eval_runtime": 189.5108, "eval_samples_per_second": 20.099, "eval_steps_per_second": 3.351, "step": 16500 }, { "epoch": 0.626411508150161, "grad_norm": 0.26953125, "learning_rate": 0.0003107582498668083, "loss": 2.845009422302246, "step": 16510 }, { "epoch": 0.6267909215409243, "grad_norm": 0.27734375, "learning_rate": 0.00031065898277231503, "loss": 2.859386444091797, "step": 16520 }, { "epoch": 0.6271703349316875, "grad_norm": 0.287109375, "learning_rate": 0.00031055967637402896, "loss": 2.8485748291015627, "step": 16530 }, { "epoch": 0.6275497483224508, "grad_norm": 0.28125, "learning_rate": 0.00031046033070722173, "loss": 2.8729970932006834, "step": 16540 }, { "epoch": 0.6279291617132141, "grad_norm": 0.283203125, "learning_rate": 0.00031036094580717875, "loss": 2.8484254837036134, "step": 16550 }, { "epoch": 0.6283085751039773, "grad_norm": 0.279296875, "learning_rate": 0.0003102615217091996, "loss": 2.84204216003418, "step": 16560 }, { "epoch": 0.6286879884947406, "grad_norm": 0.2890625, "learning_rate": 0.0003101620584485976, "loss": 2.846792221069336, "step": 16570 }, { "epoch": 0.6290674018855039, "grad_norm": 0.27734375, "learning_rate": 0.0003100625560607001, "loss": 2.8719287872314454, "step": 16580 }, { "epoch": 0.6294468152762671, "grad_norm": 0.28125, "learning_rate": 0.0003099630145808483, "loss": 2.8647319793701174, "step": 16590 }, { "epoch": 0.6298262286670304, "grad_norm": 0.294921875, "learning_rate": 0.0003098634340443972, "loss": 2.8751007080078126, "step": 16600 }, { "epoch": 0.6302056420577937, "grad_norm": 0.27734375, "learning_rate": 0.00030976381448671593, "loss": 2.8518255233764647, "step": 16610 }, { "epoch": 0.6305850554485569, "grad_norm": 0.294921875, "learning_rate": 0.0003096641559431871, "loss": 2.850174331665039, "step": 16620 }, { "epoch": 0.6309644688393202, "grad_norm": 0.294921875, "learning_rate": 0.00030956445844920754, "loss": 2.827975845336914, "step": 16630 }, { "epoch": 0.6313438822300835, "grad_norm": 0.283203125, "learning_rate": 0.00030946472204018764, "loss": 2.829604911804199, "step": 16640 }, { "epoch": 0.6317232956208467, "grad_norm": 0.28125, "learning_rate": 0.0003093649467515518, "loss": 2.842541313171387, "step": 16650 }, { "epoch": 0.63210270901161, "grad_norm": 0.27734375, "learning_rate": 0.00030926513261873807, "loss": 2.8068611145019533, "step": 16660 }, { "epoch": 0.6324821224023733, "grad_norm": 0.2890625, "learning_rate": 0.00030916527967719847, "loss": 2.839783477783203, "step": 16670 }, { "epoch": 0.6328615357931366, "grad_norm": 0.26953125, "learning_rate": 0.0003090653879623986, "loss": 2.8520534515380858, "step": 16680 }, { "epoch": 0.6332409491838998, "grad_norm": 0.283203125, "learning_rate": 0.00030896545750981803, "loss": 2.873077392578125, "step": 16690 }, { "epoch": 0.6336203625746631, "grad_norm": 0.2890625, "learning_rate": 0.00030886548835495, "loss": 2.8750072479248048, "step": 16700 }, { "epoch": 0.6339997759654264, "grad_norm": 0.279296875, "learning_rate": 0.0003087654805333013, "loss": 2.8364614486694335, "step": 16710 }, { "epoch": 0.6343791893561896, "grad_norm": 0.3125, "learning_rate": 0.000308665434080393, "loss": 2.8427942276000975, "step": 16720 }, { "epoch": 0.634758602746953, "grad_norm": 0.279296875, "learning_rate": 0.0003085653490317593, "loss": 2.835987854003906, "step": 16730 }, { "epoch": 0.6351380161377163, "grad_norm": 0.28515625, "learning_rate": 0.0003084652254229484, "loss": 2.863676643371582, "step": 16740 }, { "epoch": 0.6355174295284794, "grad_norm": 0.279296875, "learning_rate": 0.000308365063289522, "loss": 2.826494026184082, "step": 16750 }, { "epoch": 0.6355174295284794, "eval_loss": 2.849806785583496, "eval_runtime": 189.5229, "eval_samples_per_second": 20.098, "eval_steps_per_second": 3.351, "step": 16750 }, { "epoch": 0.6358968429192428, "grad_norm": 0.306640625, "learning_rate": 0.000308264862667056, "loss": 2.8596004486083983, "step": 16760 }, { "epoch": 0.6362762563100061, "grad_norm": 0.291015625, "learning_rate": 0.0003081646235911392, "loss": 2.886943817138672, "step": 16770 }, { "epoch": 0.6366556697007693, "grad_norm": 0.296875, "learning_rate": 0.00030806434609737476, "loss": 2.876194953918457, "step": 16780 }, { "epoch": 0.6370350830915326, "grad_norm": 0.28515625, "learning_rate": 0.000307964030221379, "loss": 2.8607683181762695, "step": 16790 }, { "epoch": 0.6374144964822959, "grad_norm": 0.2734375, "learning_rate": 0.0003078636759987821, "loss": 2.8458583831787108, "step": 16800 }, { "epoch": 0.6377939098730592, "grad_norm": 0.279296875, "learning_rate": 0.00030776328346522776, "loss": 2.8839059829711915, "step": 16810 }, { "epoch": 0.6381733232638224, "grad_norm": 0.28125, "learning_rate": 0.00030766285265637345, "loss": 2.853803253173828, "step": 16820 }, { "epoch": 0.6385527366545857, "grad_norm": 0.271484375, "learning_rate": 0.0003075623836078901, "loss": 2.834617805480957, "step": 16830 }, { "epoch": 0.638932150045349, "grad_norm": 0.279296875, "learning_rate": 0.00030746187635546213, "loss": 2.8319101333618164, "step": 16840 }, { "epoch": 0.6393115634361122, "grad_norm": 0.2734375, "learning_rate": 0.00030736133093478785, "loss": 2.8655315399169923, "step": 16850 }, { "epoch": 0.6396909768268755, "grad_norm": 0.283203125, "learning_rate": 0.0003072607473815787, "loss": 2.856521415710449, "step": 16860 }, { "epoch": 0.6400703902176388, "grad_norm": 0.314453125, "learning_rate": 0.0003071601257315601, "loss": 2.8634801864624024, "step": 16870 }, { "epoch": 0.640449803608402, "grad_norm": 0.27734375, "learning_rate": 0.0003070594660204707, "loss": 2.8506980895996095, "step": 16880 }, { "epoch": 0.6408292169991653, "grad_norm": 0.275390625, "learning_rate": 0.0003069587682840627, "loss": 2.8323442459106447, "step": 16890 }, { "epoch": 0.6412086303899286, "grad_norm": 0.275390625, "learning_rate": 0.000306858032558102, "loss": 2.813701629638672, "step": 16900 }, { "epoch": 0.6415880437806918, "grad_norm": 0.279296875, "learning_rate": 0.00030675725887836764, "loss": 2.8478343963623045, "step": 16910 }, { "epoch": 0.6419674571714551, "grad_norm": 0.279296875, "learning_rate": 0.00030665644728065255, "loss": 2.8432132720947267, "step": 16920 }, { "epoch": 0.6423468705622184, "grad_norm": 0.275390625, "learning_rate": 0.00030655559780076285, "loss": 2.857428550720215, "step": 16930 }, { "epoch": 0.6427262839529816, "grad_norm": 0.28515625, "learning_rate": 0.00030645471047451825, "loss": 2.848370361328125, "step": 16940 }, { "epoch": 0.6431056973437449, "grad_norm": 0.296875, "learning_rate": 0.00030635378533775174, "loss": 2.844516372680664, "step": 16950 }, { "epoch": 0.6434851107345082, "grad_norm": 0.27734375, "learning_rate": 0.00030625282242631, "loss": 2.8908987045288086, "step": 16960 }, { "epoch": 0.6438645241252715, "grad_norm": 0.28515625, "learning_rate": 0.00030615182177605286, "loss": 2.8498754501342773, "step": 16970 }, { "epoch": 0.6442439375160347, "grad_norm": 0.279296875, "learning_rate": 0.00030605078342285376, "loss": 2.8651054382324217, "step": 16980 }, { "epoch": 0.644623350906798, "grad_norm": 0.275390625, "learning_rate": 0.00030594970740259946, "loss": 2.886288070678711, "step": 16990 }, { "epoch": 0.6450027642975613, "grad_norm": 0.27734375, "learning_rate": 0.00030584859375118993, "loss": 2.855835723876953, "step": 17000 }, { "epoch": 0.6450027642975613, "eval_loss": 2.8478152751922607, "eval_runtime": 189.3894, "eval_samples_per_second": 20.112, "eval_steps_per_second": 3.353, "step": 17000 }, { "epoch": 0.6453821776883245, "grad_norm": 0.27734375, "learning_rate": 0.0003057474425045388, "loss": 2.87125301361084, "step": 17010 }, { "epoch": 0.6457615910790878, "grad_norm": 0.2734375, "learning_rate": 0.0003056462536985728, "loss": 2.849117469787598, "step": 17020 }, { "epoch": 0.6461410044698511, "grad_norm": 0.29296875, "learning_rate": 0.00030554502736923226, "loss": 2.8516828536987306, "step": 17030 }, { "epoch": 0.6465204178606143, "grad_norm": 0.271484375, "learning_rate": 0.0003054437635524705, "loss": 2.8402036666870116, "step": 17040 }, { "epoch": 0.6468998312513776, "grad_norm": 0.271484375, "learning_rate": 0.00030534246228425456, "loss": 2.844297409057617, "step": 17050 }, { "epoch": 0.6472792446421409, "grad_norm": 0.29296875, "learning_rate": 0.0003052411236005644, "loss": 2.857746696472168, "step": 17060 }, { "epoch": 0.6476586580329041, "grad_norm": 0.28125, "learning_rate": 0.0003051397475373935, "loss": 2.8394149780273437, "step": 17070 }, { "epoch": 0.6480380714236674, "grad_norm": 0.283203125, "learning_rate": 0.00030503833413074845, "loss": 2.8606050491333006, "step": 17080 }, { "epoch": 0.6484174848144307, "grad_norm": 0.28125, "learning_rate": 0.00030493688341664947, "loss": 2.8837812423706053, "step": 17090 }, { "epoch": 0.648796898205194, "grad_norm": 0.28125, "learning_rate": 0.0003048353954311294, "loss": 2.8216346740722655, "step": 17100 }, { "epoch": 0.6491763115959572, "grad_norm": 0.267578125, "learning_rate": 0.00030473387021023494, "loss": 2.8385391235351562, "step": 17110 }, { "epoch": 0.6495557249867205, "grad_norm": 0.279296875, "learning_rate": 0.0003046323077900257, "loss": 2.843626594543457, "step": 17120 }, { "epoch": 0.6499351383774838, "grad_norm": 0.302734375, "learning_rate": 0.00030453070820657456, "loss": 2.834467315673828, "step": 17130 }, { "epoch": 0.650314551768247, "grad_norm": 0.298828125, "learning_rate": 0.00030442907149596756, "loss": 2.825262260437012, "step": 17140 }, { "epoch": 0.6506939651590103, "grad_norm": 0.29296875, "learning_rate": 0.00030432739769430404, "loss": 2.882927131652832, "step": 17150 }, { "epoch": 0.6510733785497737, "grad_norm": 0.287109375, "learning_rate": 0.00030422568683769635, "loss": 2.8531671524047852, "step": 17160 }, { "epoch": 0.6514527919405368, "grad_norm": 0.275390625, "learning_rate": 0.00030412393896227024, "loss": 2.874087905883789, "step": 17170 }, { "epoch": 0.6518322053313002, "grad_norm": 0.275390625, "learning_rate": 0.00030402215410416425, "loss": 2.840306282043457, "step": 17180 }, { "epoch": 0.6522116187220635, "grad_norm": 0.2890625, "learning_rate": 0.0003039203322995304, "loss": 2.8538528442382813, "step": 17190 }, { "epoch": 0.6525910321128267, "grad_norm": 0.283203125, "learning_rate": 0.0003038184735845337, "loss": 2.848603439331055, "step": 17200 }, { "epoch": 0.65297044550359, "grad_norm": 0.287109375, "learning_rate": 0.00030371657799535225, "loss": 2.8631414413452148, "step": 17210 }, { "epoch": 0.6533498588943533, "grad_norm": 0.28125, "learning_rate": 0.00030361464556817727, "loss": 2.873928260803223, "step": 17220 }, { "epoch": 0.6537292722851166, "grad_norm": 0.279296875, "learning_rate": 0.000303512676339213, "loss": 2.8599525451660157, "step": 17230 }, { "epoch": 0.6541086856758798, "grad_norm": 0.291015625, "learning_rate": 0.0003034106703446769, "loss": 2.806654167175293, "step": 17240 }, { "epoch": 0.6544880990666431, "grad_norm": 0.298828125, "learning_rate": 0.00030330862762079937, "loss": 2.8355342864990236, "step": 17250 }, { "epoch": 0.6544880990666431, "eval_loss": 2.8471953868865967, "eval_runtime": 189.3266, "eval_samples_per_second": 20.119, "eval_steps_per_second": 3.354, "step": 17250 }, { "epoch": 0.6548675124574064, "grad_norm": 0.291015625, "learning_rate": 0.0003032065482038239, "loss": 2.8736204147338866, "step": 17260 }, { "epoch": 0.6552469258481696, "grad_norm": 0.291015625, "learning_rate": 0.00030310443213000697, "loss": 2.826099395751953, "step": 17270 }, { "epoch": 0.6556263392389329, "grad_norm": 0.26953125, "learning_rate": 0.00030300227943561806, "loss": 2.844767379760742, "step": 17280 }, { "epoch": 0.6560057526296962, "grad_norm": 0.859375, "learning_rate": 0.0003029000901569398, "loss": 2.859658050537109, "step": 17290 }, { "epoch": 0.6563851660204594, "grad_norm": 0.306640625, "learning_rate": 0.00030279786433026767, "loss": 2.8495527267456056, "step": 17300 }, { "epoch": 0.6567645794112227, "grad_norm": 0.318359375, "learning_rate": 0.0003026956019919101, "loss": 2.8650754928588866, "step": 17310 }, { "epoch": 0.657143992801986, "grad_norm": 0.283203125, "learning_rate": 0.0003025933031781888, "loss": 2.888534355163574, "step": 17320 }, { "epoch": 0.6575234061927492, "grad_norm": 0.296875, "learning_rate": 0.00030249096792543803, "loss": 2.8577701568603517, "step": 17330 }, { "epoch": 0.6579028195835125, "grad_norm": 0.279296875, "learning_rate": 0.0003023885962700051, "loss": 2.8624366760253905, "step": 17340 }, { "epoch": 0.6582822329742758, "grad_norm": 0.27734375, "learning_rate": 0.00030228618824825046, "loss": 2.8659784317016603, "step": 17350 }, { "epoch": 0.658661646365039, "grad_norm": 0.298828125, "learning_rate": 0.00030218374389654726, "loss": 2.841023826599121, "step": 17360 }, { "epoch": 0.6590410597558023, "grad_norm": 0.275390625, "learning_rate": 0.00030208126325128166, "loss": 2.849595069885254, "step": 17370 }, { "epoch": 0.6594204731465656, "grad_norm": 0.271484375, "learning_rate": 0.0003019787463488527, "loss": 2.859141159057617, "step": 17380 }, { "epoch": 0.6597998865373289, "grad_norm": 0.27734375, "learning_rate": 0.00030187619322567214, "loss": 2.8626054763793944, "step": 17390 }, { "epoch": 0.6601792999280921, "grad_norm": 0.291015625, "learning_rate": 0.00030177360391816485, "loss": 2.8370893478393553, "step": 17400 }, { "epoch": 0.6605587133188554, "grad_norm": 0.28125, "learning_rate": 0.0003016709784627684, "loss": 2.8804157257080076, "step": 17410 }, { "epoch": 0.6609381267096187, "grad_norm": 0.283203125, "learning_rate": 0.0003015683168959333, "loss": 2.86740665435791, "step": 17420 }, { "epoch": 0.6613175401003819, "grad_norm": 0.275390625, "learning_rate": 0.00030146561925412286, "loss": 2.856495475769043, "step": 17430 }, { "epoch": 0.6616969534911452, "grad_norm": 0.2734375, "learning_rate": 0.000301362885573813, "loss": 2.8919273376464845, "step": 17440 }, { "epoch": 0.6620763668819085, "grad_norm": 0.28515625, "learning_rate": 0.0003012601158914927, "loss": 2.8809577941894533, "step": 17450 }, { "epoch": 0.6624557802726717, "grad_norm": 0.283203125, "learning_rate": 0.00030115731024366363, "loss": 2.8517337799072267, "step": 17460 }, { "epoch": 0.662835193663435, "grad_norm": 0.275390625, "learning_rate": 0.0003010544686668403, "loss": 2.8490699768066405, "step": 17470 }, { "epoch": 0.6632146070541983, "grad_norm": 0.27734375, "learning_rate": 0.00030095159119754983, "loss": 2.813203048706055, "step": 17480 }, { "epoch": 0.6635940204449615, "grad_norm": 0.28515625, "learning_rate": 0.0003008486778723322, "loss": 2.865022659301758, "step": 17490 }, { "epoch": 0.6639734338357248, "grad_norm": 0.28515625, "learning_rate": 0.00030074572872774015, "loss": 2.825586128234863, "step": 17500 }, { "epoch": 0.6639734338357248, "eval_loss": 2.8451895713806152, "eval_runtime": 188.8701, "eval_samples_per_second": 20.167, "eval_steps_per_second": 3.362, "step": 17500 }, { "epoch": 0.6643528472264881, "grad_norm": 0.287109375, "learning_rate": 0.0003006427438003391, "loss": 2.8478397369384765, "step": 17510 }, { "epoch": 0.6647322606172514, "grad_norm": 0.271484375, "learning_rate": 0.00030053972312670706, "loss": 2.862868309020996, "step": 17520 }, { "epoch": 0.6651116740080146, "grad_norm": 0.271484375, "learning_rate": 0.000300436666743435, "loss": 2.8119140625, "step": 17530 }, { "epoch": 0.6654910873987779, "grad_norm": 0.26953125, "learning_rate": 0.00030033357468712635, "loss": 2.8425262451171873, "step": 17540 }, { "epoch": 0.6658705007895412, "grad_norm": 0.27734375, "learning_rate": 0.00030023044699439735, "loss": 2.84405574798584, "step": 17550 }, { "epoch": 0.6662499141803044, "grad_norm": 0.26953125, "learning_rate": 0.0003001272837018767, "loss": 2.865619087219238, "step": 17560 }, { "epoch": 0.6666293275710677, "grad_norm": 0.29296875, "learning_rate": 0.000300024084846206, "loss": 2.820926856994629, "step": 17570 }, { "epoch": 0.667008740961831, "grad_norm": 0.3046875, "learning_rate": 0.0002999208504640393, "loss": 2.867827796936035, "step": 17580 }, { "epoch": 0.6673881543525942, "grad_norm": 0.291015625, "learning_rate": 0.0002998175805920433, "loss": 2.8387537002563477, "step": 17590 }, { "epoch": 0.6677675677433575, "grad_norm": 0.298828125, "learning_rate": 0.0002997142752668975, "loss": 2.869516944885254, "step": 17600 }, { "epoch": 0.6681469811341209, "grad_norm": 0.29296875, "learning_rate": 0.0002996109345252937, "loss": 2.810620880126953, "step": 17610 }, { "epoch": 0.668526394524884, "grad_norm": 0.294921875, "learning_rate": 0.00029950755840393634, "loss": 2.8605844497680666, "step": 17620 }, { "epoch": 0.6689058079156474, "grad_norm": 0.28125, "learning_rate": 0.00029940414693954253, "loss": 2.8580718994140626, "step": 17630 }, { "epoch": 0.6692852213064107, "grad_norm": 0.287109375, "learning_rate": 0.00029930070016884197, "loss": 2.863025665283203, "step": 17640 }, { "epoch": 0.669664634697174, "grad_norm": 0.2890625, "learning_rate": 0.00029919721812857685, "loss": 2.8394569396972655, "step": 17650 }, { "epoch": 0.6700440480879372, "grad_norm": 0.3046875, "learning_rate": 0.0002990937008555017, "loss": 2.8609491348266602, "step": 17660 }, { "epoch": 0.6704234614787005, "grad_norm": 0.271484375, "learning_rate": 0.0002989901483863838, "loss": 2.8628490447998045, "step": 17670 }, { "epoch": 0.6708028748694638, "grad_norm": 0.275390625, "learning_rate": 0.00029888656075800286, "loss": 2.8258832931518554, "step": 17680 }, { "epoch": 0.671182288260227, "grad_norm": 0.279296875, "learning_rate": 0.00029878293800715116, "loss": 2.833903503417969, "step": 17690 }, { "epoch": 0.6715617016509903, "grad_norm": 0.27734375, "learning_rate": 0.0002986792801706332, "loss": 2.8433979034423826, "step": 17700 }, { "epoch": 0.6719411150417536, "grad_norm": 0.271484375, "learning_rate": 0.0002985755872852663, "loss": 2.862748908996582, "step": 17710 }, { "epoch": 0.6723205284325168, "grad_norm": 0.28515625, "learning_rate": 0.0002984718593878799, "loss": 2.862959289550781, "step": 17720 }, { "epoch": 0.6726999418232801, "grad_norm": 0.279296875, "learning_rate": 0.000298368096515316, "loss": 2.827947807312012, "step": 17730 }, { "epoch": 0.6730793552140434, "grad_norm": 0.29296875, "learning_rate": 0.0002982642987044291, "loss": 2.8332679748535154, "step": 17740 }, { "epoch": 0.6734587686048066, "grad_norm": 0.3125, "learning_rate": 0.0002981604659920861, "loss": 2.811772346496582, "step": 17750 }, { "epoch": 0.6734587686048066, "eval_loss": 2.842341661453247, "eval_runtime": 188.2788, "eval_samples_per_second": 20.231, "eval_steps_per_second": 3.373, "step": 17750 }, { "epoch": 0.6738381819955699, "grad_norm": 0.28125, "learning_rate": 0.0002980565984151661, "loss": 2.8387832641601562, "step": 17760 }, { "epoch": 0.6742175953863332, "grad_norm": 0.279296875, "learning_rate": 0.00029795269601056085, "loss": 2.825750732421875, "step": 17770 }, { "epoch": 0.6745970087770964, "grad_norm": 0.28125, "learning_rate": 0.0002978487588151743, "loss": 2.832627868652344, "step": 17780 }, { "epoch": 0.6749764221678597, "grad_norm": 0.28125, "learning_rate": 0.0002977447868659227, "loss": 2.839649772644043, "step": 17790 }, { "epoch": 0.675355835558623, "grad_norm": 0.291015625, "learning_rate": 0.00029764078019973495, "loss": 2.857250785827637, "step": 17800 }, { "epoch": 0.6757352489493863, "grad_norm": 0.28515625, "learning_rate": 0.00029753673885355195, "loss": 2.819707679748535, "step": 17810 }, { "epoch": 0.6761146623401495, "grad_norm": 0.2734375, "learning_rate": 0.00029743266286432705, "loss": 2.853423500061035, "step": 17820 }, { "epoch": 0.6764940757309128, "grad_norm": 0.287109375, "learning_rate": 0.000297328552269026, "loss": 2.853765106201172, "step": 17830 }, { "epoch": 0.6768734891216761, "grad_norm": 0.28515625, "learning_rate": 0.00029722440710462655, "loss": 2.8453187942504883, "step": 17840 }, { "epoch": 0.6772529025124393, "grad_norm": 0.28125, "learning_rate": 0.0002971202274081191, "loss": 2.8403207778930666, "step": 17850 }, { "epoch": 0.6776323159032026, "grad_norm": 0.275390625, "learning_rate": 0.00029701601321650595, "loss": 2.8519256591796873, "step": 17860 }, { "epoch": 0.6780117292939659, "grad_norm": 0.283203125, "learning_rate": 0.00029691176456680203, "loss": 2.8483776092529296, "step": 17870 }, { "epoch": 0.6783911426847291, "grad_norm": 0.26953125, "learning_rate": 0.0002968074814960342, "loss": 2.840516471862793, "step": 17880 }, { "epoch": 0.6787705560754924, "grad_norm": 0.283203125, "learning_rate": 0.0002967031640412417, "loss": 2.87567024230957, "step": 17890 }, { "epoch": 0.6791499694662557, "grad_norm": 0.279296875, "learning_rate": 0.0002965988122394759, "loss": 2.849530792236328, "step": 17900 }, { "epoch": 0.6795293828570189, "grad_norm": 0.291015625, "learning_rate": 0.0002964944261278005, "loss": 2.8557878494262696, "step": 17910 }, { "epoch": 0.6799087962477822, "grad_norm": 0.275390625, "learning_rate": 0.0002963900057432912, "loss": 2.880356788635254, "step": 17920 }, { "epoch": 0.6802882096385455, "grad_norm": 0.283203125, "learning_rate": 0.00029628555112303596, "loss": 2.8443933486938477, "step": 17930 }, { "epoch": 0.6806676230293088, "grad_norm": 0.279296875, "learning_rate": 0.0002961810623041351, "loss": 2.8750314712524414, "step": 17940 }, { "epoch": 0.681047036420072, "grad_norm": 0.279296875, "learning_rate": 0.00029607653932370065, "loss": 2.8263986587524412, "step": 17950 }, { "epoch": 0.6814264498108353, "grad_norm": 0.30078125, "learning_rate": 0.00029597198221885715, "loss": 2.832116889953613, "step": 17960 }, { "epoch": 0.6818058632015986, "grad_norm": 0.306640625, "learning_rate": 0.0002958673910267411, "loss": 2.826722526550293, "step": 17970 }, { "epoch": 0.6821852765923618, "grad_norm": 0.287109375, "learning_rate": 0.0002957627657845012, "loss": 2.82626953125, "step": 17980 }, { "epoch": 0.6825646899831251, "grad_norm": 0.271484375, "learning_rate": 0.0002956581065292981, "loss": 2.834086608886719, "step": 17990 }, { "epoch": 0.6829441033738884, "grad_norm": 0.287109375, "learning_rate": 0.0002955534132983047, "loss": 2.8805280685424806, "step": 18000 }, { "epoch": 0.6829441033738884, "eval_loss": 2.842597246170044, "eval_runtime": 187.9587, "eval_samples_per_second": 20.265, "eval_steps_per_second": 3.378, "step": 18000 }, { "epoch": 0.6833235167646516, "grad_norm": 0.291015625, "learning_rate": 0.00029544868612870574, "loss": 2.839202117919922, "step": 18010 }, { "epoch": 0.683702930155415, "grad_norm": 0.30859375, "learning_rate": 0.0002953439250576983, "loss": 2.8265419006347656, "step": 18020 }, { "epoch": 0.6840823435461783, "grad_norm": 0.3046875, "learning_rate": 0.0002952391301224913, "loss": 2.847299003601074, "step": 18030 }, { "epoch": 0.6844617569369414, "grad_norm": 0.287109375, "learning_rate": 0.0002951343013603057, "loss": 2.830337333679199, "step": 18040 }, { "epoch": 0.6848411703277048, "grad_norm": 0.2890625, "learning_rate": 0.0002950294388083746, "loss": 2.8686609268188477, "step": 18050 }, { "epoch": 0.6852205837184681, "grad_norm": 0.291015625, "learning_rate": 0.000294924542503943, "loss": 2.8182771682739256, "step": 18060 }, { "epoch": 0.6855999971092314, "grad_norm": 0.28125, "learning_rate": 0.0002948196124842678, "loss": 2.804250717163086, "step": 18070 }, { "epoch": 0.6859794104999946, "grad_norm": 0.28125, "learning_rate": 0.00029471464878661814, "loss": 2.8864715576171873, "step": 18080 }, { "epoch": 0.6863588238907579, "grad_norm": 0.283203125, "learning_rate": 0.0002946096514482749, "loss": 2.8418336868286134, "step": 18090 }, { "epoch": 0.6867382372815212, "grad_norm": 0.275390625, "learning_rate": 0.00029450462050653093, "loss": 2.8468902587890623, "step": 18100 }, { "epoch": 0.6871176506722844, "grad_norm": 0.28515625, "learning_rate": 0.0002943995559986912, "loss": 2.8303152084350587, "step": 18110 }, { "epoch": 0.6874970640630477, "grad_norm": 0.279296875, "learning_rate": 0.00029429445796207225, "loss": 2.8226877212524415, "step": 18120 }, { "epoch": 0.687876477453811, "grad_norm": 0.28515625, "learning_rate": 0.00029418932643400286, "loss": 2.8317996978759767, "step": 18130 }, { "epoch": 0.6882558908445742, "grad_norm": 0.2734375, "learning_rate": 0.0002940841614518236, "loss": 2.858957290649414, "step": 18140 }, { "epoch": 0.6886353042353375, "grad_norm": 0.279296875, "learning_rate": 0.00029397896305288686, "loss": 2.8384653091430665, "step": 18150 }, { "epoch": 0.6890147176261008, "grad_norm": 0.28125, "learning_rate": 0.00029387373127455696, "loss": 2.812555122375488, "step": 18160 }, { "epoch": 0.689394131016864, "grad_norm": 0.28125, "learning_rate": 0.00029376846615421015, "loss": 2.8521038055419923, "step": 18170 }, { "epoch": 0.6897735444076273, "grad_norm": 0.2890625, "learning_rate": 0.00029366316772923425, "loss": 2.841378402709961, "step": 18180 }, { "epoch": 0.6901529577983906, "grad_norm": 0.271484375, "learning_rate": 0.0002935578360370292, "loss": 2.8483720779418946, "step": 18190 }, { "epoch": 0.6905323711891538, "grad_norm": 0.275390625, "learning_rate": 0.0002934524711150067, "loss": 2.8145652770996095, "step": 18200 }, { "epoch": 0.6909117845799171, "grad_norm": 0.291015625, "learning_rate": 0.00029334707300059007, "loss": 2.8503517150878905, "step": 18210 }, { "epoch": 0.6912911979706804, "grad_norm": 0.27734375, "learning_rate": 0.0002932416417312147, "loss": 2.85811767578125, "step": 18220 }, { "epoch": 0.6916706113614437, "grad_norm": 0.291015625, "learning_rate": 0.0002931361773443275, "loss": 2.8697847366333007, "step": 18230 }, { "epoch": 0.6920500247522069, "grad_norm": 0.2734375, "learning_rate": 0.00029303067987738725, "loss": 2.8477672576904296, "step": 18240 }, { "epoch": 0.6924294381429702, "grad_norm": 0.275390625, "learning_rate": 0.00029292514936786457, "loss": 2.8621906280517577, "step": 18250 }, { "epoch": 0.6924294381429702, "eval_loss": 2.83903169631958, "eval_runtime": 191.4787, "eval_samples_per_second": 19.893, "eval_steps_per_second": 3.316, "step": 18250 }, { "epoch": 0.6928088515337335, "grad_norm": 0.291015625, "learning_rate": 0.0002928195858532416, "loss": 2.8217830657958984, "step": 18260 }, { "epoch": 0.6931882649244967, "grad_norm": 0.287109375, "learning_rate": 0.0002927139893710124, "loss": 2.8488996505737303, "step": 18270 }, { "epoch": 0.69356767831526, "grad_norm": 0.283203125, "learning_rate": 0.0002926083599586827, "loss": 2.8436622619628906, "step": 18280 }, { "epoch": 0.6939470917060233, "grad_norm": 0.291015625, "learning_rate": 0.0002925026976537698, "loss": 2.8574934005737305, "step": 18290 }, { "epoch": 0.6943265050967865, "grad_norm": 0.283203125, "learning_rate": 0.00029239700249380287, "loss": 2.8374170303344726, "step": 18300 }, { "epoch": 0.6947059184875498, "grad_norm": 0.294921875, "learning_rate": 0.0002922912745163226, "loss": 2.8409601211547852, "step": 18310 }, { "epoch": 0.6950853318783131, "grad_norm": 0.296875, "learning_rate": 0.0002921855137588814, "loss": 2.8211803436279297, "step": 18320 }, { "epoch": 0.6954647452690763, "grad_norm": 0.2890625, "learning_rate": 0.0002920797202590433, "loss": 2.8444395065307617, "step": 18330 }, { "epoch": 0.6958441586598396, "grad_norm": 0.287109375, "learning_rate": 0.000291973894054384, "loss": 2.8281766891479494, "step": 18340 }, { "epoch": 0.6962235720506029, "grad_norm": 0.283203125, "learning_rate": 0.0002918680351824908, "loss": 2.8393838882446287, "step": 18350 }, { "epoch": 0.6966029854413662, "grad_norm": 0.279296875, "learning_rate": 0.00029176214368096257, "loss": 2.8611684799194337, "step": 18360 }, { "epoch": 0.6969823988321294, "grad_norm": 0.306640625, "learning_rate": 0.0002916562195874097, "loss": 2.856645965576172, "step": 18370 }, { "epoch": 0.6973618122228927, "grad_norm": 0.2890625, "learning_rate": 0.00029155026293945446, "loss": 2.844033432006836, "step": 18380 }, { "epoch": 0.697741225613656, "grad_norm": 0.283203125, "learning_rate": 0.00029144427377473034, "loss": 2.8422388076782226, "step": 18390 }, { "epoch": 0.6981206390044192, "grad_norm": 0.283203125, "learning_rate": 0.0002913382521308825, "loss": 2.8538515090942385, "step": 18400 }, { "epoch": 0.6985000523951825, "grad_norm": 0.28515625, "learning_rate": 0.0002912321980455676, "loss": 2.8075870513916015, "step": 18410 }, { "epoch": 0.6988794657859458, "grad_norm": 0.287109375, "learning_rate": 0.000291126111556454, "loss": 2.831045913696289, "step": 18420 }, { "epoch": 0.699258879176709, "grad_norm": 0.28125, "learning_rate": 0.0002910199927012214, "loss": 2.8712352752685546, "step": 18430 }, { "epoch": 0.6996382925674723, "grad_norm": 0.294921875, "learning_rate": 0.00029091384151756094, "loss": 2.831806755065918, "step": 18440 }, { "epoch": 0.7000177059582356, "grad_norm": 0.29296875, "learning_rate": 0.0002908076580431755, "loss": 2.825814056396484, "step": 18450 }, { "epoch": 0.7003971193489988, "grad_norm": 0.2890625, "learning_rate": 0.00029070144231577904, "loss": 2.8337087631225586, "step": 18460 }, { "epoch": 0.7007765327397621, "grad_norm": 0.294921875, "learning_rate": 0.0002905951943730974, "loss": 2.7858200073242188, "step": 18470 }, { "epoch": 0.7011559461305255, "grad_norm": 0.294921875, "learning_rate": 0.00029048891425286757, "loss": 2.823362350463867, "step": 18480 }, { "epoch": 0.7015353595212888, "grad_norm": 0.2890625, "learning_rate": 0.00029038260199283806, "loss": 2.84307861328125, "step": 18490 }, { "epoch": 0.701914772912052, "grad_norm": 0.283203125, "learning_rate": 0.00029027625763076883, "loss": 2.8268238067626954, "step": 18500 }, { "epoch": 0.701914772912052, "eval_loss": 2.8373358249664307, "eval_runtime": 189.5562, "eval_samples_per_second": 20.094, "eval_steps_per_second": 3.35, "step": 18500 }, { "epoch": 0.7022941863028153, "grad_norm": 0.275390625, "learning_rate": 0.0002901698812044312, "loss": 2.8149555206298826, "step": 18510 }, { "epoch": 0.7026735996935786, "grad_norm": 0.287109375, "learning_rate": 0.0002900634727516079, "loss": 2.838625907897949, "step": 18520 }, { "epoch": 0.7030530130843418, "grad_norm": 0.29296875, "learning_rate": 0.00028995703231009294, "loss": 2.837445831298828, "step": 18530 }, { "epoch": 0.7034324264751051, "grad_norm": 0.291015625, "learning_rate": 0.0002898505599176919, "loss": 2.8390596389770506, "step": 18540 }, { "epoch": 0.7038118398658684, "grad_norm": 0.30078125, "learning_rate": 0.00028974405561222146, "loss": 2.830565643310547, "step": 18550 }, { "epoch": 0.7041912532566316, "grad_norm": 0.28125, "learning_rate": 0.0002896375194315098, "loss": 2.8395450592041014, "step": 18560 }, { "epoch": 0.7045706666473949, "grad_norm": 0.28125, "learning_rate": 0.00028953095141339645, "loss": 2.83322811126709, "step": 18570 }, { "epoch": 0.7049500800381582, "grad_norm": 0.291015625, "learning_rate": 0.0002894243515957321, "loss": 2.8391801834106447, "step": 18580 }, { "epoch": 0.7053294934289214, "grad_norm": 0.287109375, "learning_rate": 0.00028931772001637886, "loss": 2.860108184814453, "step": 18590 }, { "epoch": 0.7057089068196847, "grad_norm": 0.2890625, "learning_rate": 0.00028921105671320997, "loss": 2.839351272583008, "step": 18600 }, { "epoch": 0.706088320210448, "grad_norm": 0.287109375, "learning_rate": 0.0002891043617241102, "loss": 2.8471586227416994, "step": 18610 }, { "epoch": 0.7064677336012112, "grad_norm": 0.28515625, "learning_rate": 0.00028899763508697524, "loss": 2.865244674682617, "step": 18620 }, { "epoch": 0.7068471469919745, "grad_norm": 0.29296875, "learning_rate": 0.00028889087683971236, "loss": 2.8295398712158204, "step": 18630 }, { "epoch": 0.7072265603827378, "grad_norm": 0.294921875, "learning_rate": 0.0002887840870202397, "loss": 2.8376352310180666, "step": 18640 }, { "epoch": 0.7076059737735011, "grad_norm": 0.298828125, "learning_rate": 0.000288677265666487, "loss": 2.8331228256225587, "step": 18650 }, { "epoch": 0.7079853871642643, "grad_norm": 0.3125, "learning_rate": 0.0002885704128163949, "loss": 2.819348907470703, "step": 18660 }, { "epoch": 0.7083648005550276, "grad_norm": 0.28125, "learning_rate": 0.0002884635285079154, "loss": 2.8310998916625976, "step": 18670 }, { "epoch": 0.7087442139457909, "grad_norm": 0.28125, "learning_rate": 0.0002883566127790115, "loss": 2.827471160888672, "step": 18680 }, { "epoch": 0.7091236273365541, "grad_norm": 0.291015625, "learning_rate": 0.0002882496656676576, "loss": 2.839517021179199, "step": 18690 }, { "epoch": 0.7095030407273174, "grad_norm": 0.294921875, "learning_rate": 0.00028814268721183894, "loss": 2.8756292343139647, "step": 18700 }, { "epoch": 0.7098824541180807, "grad_norm": 0.294921875, "learning_rate": 0.00028803567744955226, "loss": 2.8398534774780275, "step": 18710 }, { "epoch": 0.7102618675088439, "grad_norm": 0.291015625, "learning_rate": 0.00028792863641880504, "loss": 2.8589815139770507, "step": 18720 }, { "epoch": 0.7106412808996072, "grad_norm": 0.27734375, "learning_rate": 0.0002878215641576162, "loss": 2.8332136154174803, "step": 18730 }, { "epoch": 0.7110206942903705, "grad_norm": 0.28125, "learning_rate": 0.0002877144607040156, "loss": 2.8517847061157227, "step": 18740 }, { "epoch": 0.7114001076811337, "grad_norm": 0.28515625, "learning_rate": 0.00028760732609604405, "loss": 2.834649085998535, "step": 18750 }, { "epoch": 0.7114001076811337, "eval_loss": 2.835092067718506, "eval_runtime": 195.8031, "eval_samples_per_second": 19.453, "eval_steps_per_second": 3.243, "step": 18750 }, { "epoch": 0.711779521071897, "grad_norm": 0.28515625, "learning_rate": 0.0002875001603717537, "loss": 2.821250343322754, "step": 18760 }, { "epoch": 0.7121589344626603, "grad_norm": 0.26953125, "learning_rate": 0.0002873929635692074, "loss": 2.8351776123046877, "step": 18770 }, { "epoch": 0.7125383478534236, "grad_norm": 0.2890625, "learning_rate": 0.00028728573572647955, "loss": 2.842291069030762, "step": 18780 }, { "epoch": 0.7129177612441868, "grad_norm": 0.275390625, "learning_rate": 0.000287178476881655, "loss": 2.852596664428711, "step": 18790 }, { "epoch": 0.7132971746349501, "grad_norm": 0.27734375, "learning_rate": 0.00028707118707283006, "loss": 2.809381294250488, "step": 18800 }, { "epoch": 0.7136765880257134, "grad_norm": 0.27734375, "learning_rate": 0.00028696386633811176, "loss": 2.8471042633056642, "step": 18810 }, { "epoch": 0.7140560014164766, "grad_norm": 0.28515625, "learning_rate": 0.0002868565147156183, "loss": 2.852587890625, "step": 18820 }, { "epoch": 0.7144354148072399, "grad_norm": 0.2890625, "learning_rate": 0.0002867491322434787, "loss": 2.845592498779297, "step": 18830 }, { "epoch": 0.7148148281980032, "grad_norm": 0.29296875, "learning_rate": 0.00028664171895983304, "loss": 2.844218063354492, "step": 18840 }, { "epoch": 0.7151942415887664, "grad_norm": 0.29296875, "learning_rate": 0.0002865342749028323, "loss": 2.8539018630981445, "step": 18850 }, { "epoch": 0.7155736549795297, "grad_norm": 0.279296875, "learning_rate": 0.0002864268001106384, "loss": 2.8142133712768556, "step": 18860 }, { "epoch": 0.715953068370293, "grad_norm": 0.291015625, "learning_rate": 0.0002863192946214241, "loss": 2.8509246826171877, "step": 18870 }, { "epoch": 0.7163324817610562, "grad_norm": 0.2890625, "learning_rate": 0.0002862117584733732, "loss": 2.8208890914916993, "step": 18880 }, { "epoch": 0.7167118951518195, "grad_norm": 0.29296875, "learning_rate": 0.00028610419170468037, "loss": 2.8371509552001952, "step": 18890 }, { "epoch": 0.7170913085425829, "grad_norm": 0.294921875, "learning_rate": 0.000285996594353551, "loss": 2.8365650177001953, "step": 18900 }, { "epoch": 0.7174707219333462, "grad_norm": 0.279296875, "learning_rate": 0.00028588896645820165, "loss": 2.8556594848632812, "step": 18910 }, { "epoch": 0.7178501353241094, "grad_norm": 0.287109375, "learning_rate": 0.0002857813080568593, "loss": 2.8263576507568358, "step": 18920 }, { "epoch": 0.7182295487148727, "grad_norm": 0.29296875, "learning_rate": 0.00028567361918776217, "loss": 2.855146026611328, "step": 18930 }, { "epoch": 0.718608962105636, "grad_norm": 0.283203125, "learning_rate": 0.000285565899889159, "loss": 2.862105369567871, "step": 18940 }, { "epoch": 0.7189883754963992, "grad_norm": 0.287109375, "learning_rate": 0.0002854581501993096, "loss": 2.835872268676758, "step": 18950 }, { "epoch": 0.7193677888871625, "grad_norm": 0.296875, "learning_rate": 0.0002853503701564843, "loss": 2.826953887939453, "step": 18960 }, { "epoch": 0.7197472022779258, "grad_norm": 0.287109375, "learning_rate": 0.00028524255979896446, "loss": 2.8699466705322267, "step": 18970 }, { "epoch": 0.720126615668689, "grad_norm": 0.2890625, "learning_rate": 0.00028513471916504206, "loss": 2.7957389831542967, "step": 18980 }, { "epoch": 0.7205060290594523, "grad_norm": 0.287109375, "learning_rate": 0.0002850268482930199, "loss": 2.8703914642333985, "step": 18990 }, { "epoch": 0.7208854424502156, "grad_norm": 0.294921875, "learning_rate": 0.00028491894722121146, "loss": 2.85618896484375, "step": 19000 }, { "epoch": 0.7208854424502156, "eval_loss": 2.8346750736236572, "eval_runtime": 189.4562, "eval_samples_per_second": 20.105, "eval_steps_per_second": 3.352, "step": 19000 }, { "epoch": 0.7212648558409788, "grad_norm": 0.29296875, "learning_rate": 0.000284811015987941, "loss": 2.840407371520996, "step": 19010 }, { "epoch": 0.7216442692317421, "grad_norm": 0.294921875, "learning_rate": 0.00028470305463154346, "loss": 2.83505802154541, "step": 19020 }, { "epoch": 0.7220236826225054, "grad_norm": 0.35546875, "learning_rate": 0.00028459506319036446, "loss": 2.8421875, "step": 19030 }, { "epoch": 0.7224030960132686, "grad_norm": 0.28515625, "learning_rate": 0.00028448704170276043, "loss": 2.823431396484375, "step": 19040 }, { "epoch": 0.7227825094040319, "grad_norm": 0.27734375, "learning_rate": 0.0002843789902070983, "loss": 2.8444503784179687, "step": 19050 }, { "epoch": 0.7231619227947952, "grad_norm": 0.29296875, "learning_rate": 0.00028427090874175577, "loss": 2.8459449768066407, "step": 19060 }, { "epoch": 0.7235413361855585, "grad_norm": 0.294921875, "learning_rate": 0.0002841627973451211, "loss": 2.834813690185547, "step": 19070 }, { "epoch": 0.7239207495763217, "grad_norm": 0.296875, "learning_rate": 0.00028405465605559326, "loss": 2.8475931167602537, "step": 19080 }, { "epoch": 0.724300162967085, "grad_norm": 0.287109375, "learning_rate": 0.0002839464849115819, "loss": 2.8481513977050783, "step": 19090 }, { "epoch": 0.7246795763578483, "grad_norm": 0.2890625, "learning_rate": 0.000283838283951507, "loss": 2.815979766845703, "step": 19100 }, { "epoch": 0.7250589897486115, "grad_norm": 0.330078125, "learning_rate": 0.0002837300532137996, "loss": 2.792364311218262, "step": 19110 }, { "epoch": 0.7254384031393748, "grad_norm": 0.283203125, "learning_rate": 0.00028362179273690066, "loss": 2.796684455871582, "step": 19120 }, { "epoch": 0.7258178165301381, "grad_norm": 0.28515625, "learning_rate": 0.0002835135025592623, "loss": 2.8131391525268556, "step": 19130 }, { "epoch": 0.7261972299209013, "grad_norm": 0.28515625, "learning_rate": 0.000283405182719347, "loss": 2.8304702758789064, "step": 19140 }, { "epoch": 0.7265766433116646, "grad_norm": 0.287109375, "learning_rate": 0.00028329683325562763, "loss": 2.8203174591064455, "step": 19150 }, { "epoch": 0.7269560567024279, "grad_norm": 0.2890625, "learning_rate": 0.00028318845420658773, "loss": 2.843124198913574, "step": 19160 }, { "epoch": 0.7273354700931911, "grad_norm": 0.287109375, "learning_rate": 0.00028308004561072126, "loss": 2.825535202026367, "step": 19170 }, { "epoch": 0.7277148834839544, "grad_norm": 0.29296875, "learning_rate": 0.0002829716075065328, "loss": 2.834905815124512, "step": 19180 }, { "epoch": 0.7280942968747177, "grad_norm": 0.291015625, "learning_rate": 0.00028286313993253724, "loss": 2.8406700134277343, "step": 19190 }, { "epoch": 0.728473710265481, "grad_norm": 0.2890625, "learning_rate": 0.0002827546429272602, "loss": 2.8604175567626955, "step": 19200 }, { "epoch": 0.7288531236562442, "grad_norm": 0.287109375, "learning_rate": 0.00028264611652923727, "loss": 2.845260810852051, "step": 19210 }, { "epoch": 0.7292325370470075, "grad_norm": 0.27734375, "learning_rate": 0.0002825375607770151, "loss": 2.838621139526367, "step": 19220 }, { "epoch": 0.7296119504377708, "grad_norm": 0.302734375, "learning_rate": 0.00028242897570915034, "loss": 2.7911308288574217, "step": 19230 }, { "epoch": 0.729991363828534, "grad_norm": 0.29296875, "learning_rate": 0.0002823203613642103, "loss": 2.8166969299316404, "step": 19240 }, { "epoch": 0.7303707772192973, "grad_norm": 0.291015625, "learning_rate": 0.0002822117177807723, "loss": 2.828019714355469, "step": 19250 }, { "epoch": 0.7303707772192973, "eval_loss": 2.8315625190734863, "eval_runtime": 189.4827, "eval_samples_per_second": 20.102, "eval_steps_per_second": 3.351, "step": 19250 }, { "epoch": 0.7307501906100606, "grad_norm": 0.27734375, "learning_rate": 0.0002821030449974244, "loss": 2.860991668701172, "step": 19260 }, { "epoch": 0.7311296040008238, "grad_norm": 0.283203125, "learning_rate": 0.0002819943430527651, "loss": 2.868585968017578, "step": 19270 }, { "epoch": 0.7315090173915871, "grad_norm": 0.27734375, "learning_rate": 0.0002818856119854029, "loss": 2.8236278533935546, "step": 19280 }, { "epoch": 0.7318884307823504, "grad_norm": 0.283203125, "learning_rate": 0.0002817768518339569, "loss": 2.8754623413085936, "step": 19290 }, { "epoch": 0.7322678441731136, "grad_norm": 0.28515625, "learning_rate": 0.00028166806263705646, "loss": 2.8300878524780275, "step": 19300 }, { "epoch": 0.7326472575638769, "grad_norm": 0.28125, "learning_rate": 0.0002815592444333413, "loss": 2.8325267791748048, "step": 19310 }, { "epoch": 0.7330266709546402, "grad_norm": 0.29296875, "learning_rate": 0.0002814503972614614, "loss": 2.8168893814086915, "step": 19320 }, { "epoch": 0.7334060843454034, "grad_norm": 0.291015625, "learning_rate": 0.00028134152116007706, "loss": 2.842405319213867, "step": 19330 }, { "epoch": 0.7337854977361667, "grad_norm": 0.287109375, "learning_rate": 0.0002812326161678587, "loss": 2.8239648818969725, "step": 19340 }, { "epoch": 0.73416491112693, "grad_norm": 0.279296875, "learning_rate": 0.0002811236823234873, "loss": 2.8245887756347656, "step": 19350 }, { "epoch": 0.7345443245176934, "grad_norm": 0.287109375, "learning_rate": 0.0002810147196656538, "loss": 2.841105651855469, "step": 19360 }, { "epoch": 0.7349237379084566, "grad_norm": 0.29296875, "learning_rate": 0.00028090572823305954, "loss": 2.85068302154541, "step": 19370 }, { "epoch": 0.7353031512992199, "grad_norm": 0.310546875, "learning_rate": 0.000280796708064416, "loss": 2.8632137298583986, "step": 19380 }, { "epoch": 0.7356825646899832, "grad_norm": 0.30078125, "learning_rate": 0.0002806876591984449, "loss": 2.840615653991699, "step": 19390 }, { "epoch": 0.7360619780807464, "grad_norm": 0.283203125, "learning_rate": 0.00028057858167387825, "loss": 2.8517189025878906, "step": 19400 }, { "epoch": 0.7364413914715097, "grad_norm": 0.28125, "learning_rate": 0.00028046947552945803, "loss": 2.856273651123047, "step": 19410 }, { "epoch": 0.736820804862273, "grad_norm": 0.296875, "learning_rate": 0.00028036034080393656, "loss": 2.8452922821044924, "step": 19420 }, { "epoch": 0.7372002182530362, "grad_norm": 0.29296875, "learning_rate": 0.0002802511775360762, "loss": 2.854081916809082, "step": 19430 }, { "epoch": 0.7375796316437995, "grad_norm": 0.2734375, "learning_rate": 0.0002801419857646495, "loss": 2.8550865173339846, "step": 19440 }, { "epoch": 0.7379590450345628, "grad_norm": 0.294921875, "learning_rate": 0.00028003276552843924, "loss": 2.825750541687012, "step": 19450 }, { "epoch": 0.738338458425326, "grad_norm": 0.275390625, "learning_rate": 0.000279923516866238, "loss": 2.825227165222168, "step": 19460 }, { "epoch": 0.7387178718160893, "grad_norm": 0.279296875, "learning_rate": 0.0002798142398168488, "loss": 2.814388656616211, "step": 19470 }, { "epoch": 0.7390972852068526, "grad_norm": 0.2890625, "learning_rate": 0.00027970493441908457, "loss": 2.8172780990600588, "step": 19480 }, { "epoch": 0.7394766985976159, "grad_norm": 0.279296875, "learning_rate": 0.00027959560071176836, "loss": 2.842550849914551, "step": 19490 }, { "epoch": 0.7398561119883791, "grad_norm": 0.2890625, "learning_rate": 0.00027948623873373315, "loss": 2.8473268508911134, "step": 19500 }, { "epoch": 0.7398561119883791, "eval_loss": 2.8308820724487305, "eval_runtime": 189.5668, "eval_samples_per_second": 20.093, "eval_steps_per_second": 3.35, "step": 19500 }, { "epoch": 0.7402355253791424, "grad_norm": 0.291015625, "learning_rate": 0.00027937684852382224, "loss": 2.8403640747070313, "step": 19510 }, { "epoch": 0.7406149387699057, "grad_norm": 0.31640625, "learning_rate": 0.00027926743012088854, "loss": 2.815464401245117, "step": 19520 }, { "epoch": 0.7409943521606689, "grad_norm": 0.2890625, "learning_rate": 0.0002791579835637954, "loss": 2.810526466369629, "step": 19530 }, { "epoch": 0.7413737655514322, "grad_norm": 0.291015625, "learning_rate": 0.000279048508891416, "loss": 2.8216562271118164, "step": 19540 }, { "epoch": 0.7417531789421955, "grad_norm": 0.29296875, "learning_rate": 0.0002789390061426334, "loss": 2.8264965057373046, "step": 19550 }, { "epoch": 0.7421325923329587, "grad_norm": 0.298828125, "learning_rate": 0.00027882947535634074, "loss": 2.8317073822021483, "step": 19560 }, { "epoch": 0.742512005723722, "grad_norm": 0.27734375, "learning_rate": 0.00027871991657144106, "loss": 2.8591283798217773, "step": 19570 }, { "epoch": 0.7428914191144853, "grad_norm": 0.314453125, "learning_rate": 0.0002786103298268475, "loss": 2.826180267333984, "step": 19580 }, { "epoch": 0.7432708325052485, "grad_norm": 0.298828125, "learning_rate": 0.0002785007151614829, "loss": 2.8348676681518556, "step": 19590 }, { "epoch": 0.7436502458960118, "grad_norm": 0.283203125, "learning_rate": 0.00027839107261428027, "loss": 2.7994956970214844, "step": 19600 }, { "epoch": 0.7440296592867751, "grad_norm": 0.291015625, "learning_rate": 0.0002782814022241822, "loss": 2.837409782409668, "step": 19610 }, { "epoch": 0.7444090726775384, "grad_norm": 0.275390625, "learning_rate": 0.00027817170403014155, "loss": 2.831551170349121, "step": 19620 }, { "epoch": 0.7447884860683016, "grad_norm": 0.28515625, "learning_rate": 0.00027806197807112064, "loss": 2.819396209716797, "step": 19630 }, { "epoch": 0.7451678994590649, "grad_norm": 0.28515625, "learning_rate": 0.0002779522243860921, "loss": 2.8127923965454102, "step": 19640 }, { "epoch": 0.7455473128498282, "grad_norm": 0.283203125, "learning_rate": 0.000277842443014038, "loss": 2.8613935470581056, "step": 19650 }, { "epoch": 0.7459267262405914, "grad_norm": 0.279296875, "learning_rate": 0.00027773263399395054, "loss": 2.831452751159668, "step": 19660 }, { "epoch": 0.7463061396313547, "grad_norm": 0.291015625, "learning_rate": 0.0002776227973648316, "loss": 2.8509660720825196, "step": 19670 }, { "epoch": 0.746685553022118, "grad_norm": 0.314453125, "learning_rate": 0.0002775129331656928, "loss": 2.840987777709961, "step": 19680 }, { "epoch": 0.7470649664128812, "grad_norm": 0.29296875, "learning_rate": 0.0002774030414355558, "loss": 2.833748435974121, "step": 19690 }, { "epoch": 0.7474443798036445, "grad_norm": 0.2890625, "learning_rate": 0.0002772931222134517, "loss": 2.8443593978881836, "step": 19700 }, { "epoch": 0.7478237931944078, "grad_norm": 0.287109375, "learning_rate": 0.0002771831755384217, "loss": 2.8268220901489256, "step": 19710 }, { "epoch": 0.748203206585171, "grad_norm": 0.28125, "learning_rate": 0.00027707320144951654, "loss": 2.8133056640625, "step": 19720 }, { "epoch": 0.7485826199759343, "grad_norm": 0.296875, "learning_rate": 0.0002769631999857968, "loss": 2.8273521423339845, "step": 19730 }, { "epoch": 0.7489620333666976, "grad_norm": 0.28125, "learning_rate": 0.00027685317118633255, "loss": 2.8408124923706053, "step": 19740 }, { "epoch": 0.7493414467574608, "grad_norm": 0.291015625, "learning_rate": 0.00027674311509020394, "loss": 2.80834903717041, "step": 19750 }, { "epoch": 0.7493414467574608, "eval_loss": 2.8295862674713135, "eval_runtime": 189.5772, "eval_samples_per_second": 20.092, "eval_steps_per_second": 3.35, "step": 19750 }, { "epoch": 0.7497208601482241, "grad_norm": 0.287109375, "learning_rate": 0.00027663303173650067, "loss": 2.831757736206055, "step": 19760 }, { "epoch": 0.7501002735389874, "grad_norm": 0.291015625, "learning_rate": 0.00027652292116432187, "loss": 2.850638008117676, "step": 19770 }, { "epoch": 0.7504796869297508, "grad_norm": 0.283203125, "learning_rate": 0.0002764127834127768, "loss": 2.846861457824707, "step": 19780 }, { "epoch": 0.750859100320514, "grad_norm": 0.28125, "learning_rate": 0.0002763026185209838, "loss": 2.8378061294555663, "step": 19790 }, { "epoch": 0.7512385137112773, "grad_norm": 0.287109375, "learning_rate": 0.0002761924265280715, "loss": 2.8273887634277344, "step": 19800 }, { "epoch": 0.7516179271020406, "grad_norm": 0.283203125, "learning_rate": 0.00027608220747317767, "loss": 2.8930660247802735, "step": 19810 }, { "epoch": 0.7519973404928038, "grad_norm": 0.2890625, "learning_rate": 0.00027597196139544987, "loss": 2.8303878784179686, "step": 19820 }, { "epoch": 0.7523767538835671, "grad_norm": 0.2890625, "learning_rate": 0.0002758616883340452, "loss": 2.8258609771728516, "step": 19830 }, { "epoch": 0.7527561672743304, "grad_norm": 0.3125, "learning_rate": 0.00027575138832813047, "loss": 2.8410013198852537, "step": 19840 }, { "epoch": 0.7531355806650936, "grad_norm": 0.287109375, "learning_rate": 0.00027564106141688186, "loss": 2.81165771484375, "step": 19850 }, { "epoch": 0.7535149940558569, "grad_norm": 0.291015625, "learning_rate": 0.00027553070763948527, "loss": 2.848034477233887, "step": 19860 }, { "epoch": 0.7538944074466202, "grad_norm": 0.294921875, "learning_rate": 0.0002754203270351362, "loss": 2.8248828887939452, "step": 19870 }, { "epoch": 0.7542738208373834, "grad_norm": 0.291015625, "learning_rate": 0.00027530991964303935, "loss": 2.8402008056640624, "step": 19880 }, { "epoch": 0.7546532342281467, "grad_norm": 0.279296875, "learning_rate": 0.00027519948550240933, "loss": 2.820825958251953, "step": 19890 }, { "epoch": 0.75503264761891, "grad_norm": 0.283203125, "learning_rate": 0.00027508902465247, "loss": 2.833135986328125, "step": 19900 }, { "epoch": 0.7554120610096733, "grad_norm": 0.28125, "learning_rate": 0.00027497853713245483, "loss": 2.827299880981445, "step": 19910 }, { "epoch": 0.7557914744004365, "grad_norm": 0.28125, "learning_rate": 0.00027486802298160663, "loss": 2.8122819900512694, "step": 19920 }, { "epoch": 0.7561708877911998, "grad_norm": 0.296875, "learning_rate": 0.00027475748223917783, "loss": 2.8429855346679687, "step": 19930 }, { "epoch": 0.7565503011819631, "grad_norm": 0.28515625, "learning_rate": 0.0002746469149444301, "loss": 2.8249982833862304, "step": 19940 }, { "epoch": 0.7569297145727263, "grad_norm": 0.283203125, "learning_rate": 0.0002745363211366349, "loss": 2.8412599563598633, "step": 19950 }, { "epoch": 0.7573091279634896, "grad_norm": 0.302734375, "learning_rate": 0.00027442570085507256, "loss": 2.8427074432373045, "step": 19960 }, { "epoch": 0.7576885413542529, "grad_norm": 0.298828125, "learning_rate": 0.00027431505413903344, "loss": 2.842500114440918, "step": 19970 }, { "epoch": 0.7580679547450161, "grad_norm": 0.322265625, "learning_rate": 0.00027420438102781683, "loss": 2.833033561706543, "step": 19980 }, { "epoch": 0.7584473681357794, "grad_norm": 0.298828125, "learning_rate": 0.0002740936815607315, "loss": 2.8500452041625977, "step": 19990 }, { "epoch": 0.7588267815265427, "grad_norm": 0.283203125, "learning_rate": 0.0002739829557770957, "loss": 2.842819404602051, "step": 20000 }, { "epoch": 0.7588267815265427, "eval_loss": 2.8291022777557373, "eval_runtime": 189.5778, "eval_samples_per_second": 20.092, "eval_steps_per_second": 3.35, "step": 20000 }, { "epoch": 0.7592061949173059, "grad_norm": 0.28515625, "learning_rate": 0.0002738722037162369, "loss": 2.840434265136719, "step": 20010 }, { "epoch": 0.7595856083080692, "grad_norm": 0.29296875, "learning_rate": 0.000273761425417492, "loss": 2.814021873474121, "step": 20020 }, { "epoch": 0.7599650216988325, "grad_norm": 0.29296875, "learning_rate": 0.00027365062092020713, "loss": 2.8240612030029295, "step": 20030 }, { "epoch": 0.7603444350895958, "grad_norm": 0.291015625, "learning_rate": 0.00027353979026373775, "loss": 2.8289724349975587, "step": 20040 }, { "epoch": 0.760723848480359, "grad_norm": 0.302734375, "learning_rate": 0.0002734289334874487, "loss": 2.8466779708862306, "step": 20050 }, { "epoch": 0.7611032618711223, "grad_norm": 0.28515625, "learning_rate": 0.00027331805063071396, "loss": 2.8211931228637694, "step": 20060 }, { "epoch": 0.7614826752618856, "grad_norm": 0.28125, "learning_rate": 0.0002732071417329169, "loss": 2.8048480987548827, "step": 20070 }, { "epoch": 0.7618620886526488, "grad_norm": 0.27734375, "learning_rate": 0.00027309620683344996, "loss": 2.8195369720458983, "step": 20080 }, { "epoch": 0.7622415020434121, "grad_norm": 0.291015625, "learning_rate": 0.0002729852459717151, "loss": 2.837724494934082, "step": 20090 }, { "epoch": 0.7626209154341754, "grad_norm": 0.27734375, "learning_rate": 0.00027287425918712314, "loss": 2.8414817810058595, "step": 20100 }, { "epoch": 0.7630003288249386, "grad_norm": 0.3203125, "learning_rate": 0.0002727632465190945, "loss": 2.823287582397461, "step": 20110 }, { "epoch": 0.7633797422157019, "grad_norm": 0.2890625, "learning_rate": 0.00027265220800705843, "loss": 2.821257972717285, "step": 20120 }, { "epoch": 0.7637591556064652, "grad_norm": 0.30859375, "learning_rate": 0.0002725411436904536, "loss": 2.828666877746582, "step": 20130 }, { "epoch": 0.7641385689972284, "grad_norm": 0.29296875, "learning_rate": 0.0002724300536087278, "loss": 2.820822334289551, "step": 20140 }, { "epoch": 0.7645179823879917, "grad_norm": 0.287109375, "learning_rate": 0.00027231893780133785, "loss": 2.841429328918457, "step": 20150 }, { "epoch": 0.764897395778755, "grad_norm": 0.294921875, "learning_rate": 0.00027220779630774994, "loss": 2.819386291503906, "step": 20160 }, { "epoch": 0.7652768091695182, "grad_norm": 0.279296875, "learning_rate": 0.000272096629167439, "loss": 2.8012435913085936, "step": 20170 }, { "epoch": 0.7656562225602815, "grad_norm": 0.29296875, "learning_rate": 0.0002719854364198896, "loss": 2.8296445846557616, "step": 20180 }, { "epoch": 0.7660356359510448, "grad_norm": 0.3125, "learning_rate": 0.00027187421810459495, "loss": 2.8163501739501955, "step": 20190 }, { "epoch": 0.7664150493418082, "grad_norm": 0.302734375, "learning_rate": 0.00027176297426105756, "loss": 2.8328012466430663, "step": 20200 }, { "epoch": 0.7667944627325713, "grad_norm": 0.28125, "learning_rate": 0.0002716517049287889, "loss": 2.8075027465820312, "step": 20210 }, { "epoch": 0.7671738761233347, "grad_norm": 0.28515625, "learning_rate": 0.0002715404101473095, "loss": 2.8260358810424804, "step": 20220 }, { "epoch": 0.767553289514098, "grad_norm": 0.296875, "learning_rate": 0.00027142908995614914, "loss": 2.8369245529174805, "step": 20230 }, { "epoch": 0.7679327029048612, "grad_norm": 0.287109375, "learning_rate": 0.00027131774439484636, "loss": 2.8057952880859376, "step": 20240 }, { "epoch": 0.7683121162956245, "grad_norm": 0.294921875, "learning_rate": 0.00027120637350294875, "loss": 2.840958023071289, "step": 20250 }, { "epoch": 0.7683121162956245, "eval_loss": 2.8261659145355225, "eval_runtime": 189.5756, "eval_samples_per_second": 20.092, "eval_steps_per_second": 3.35, "step": 20250 }, { "epoch": 0.7686915296863878, "grad_norm": 0.28515625, "learning_rate": 0.0002710949773200131, "loss": 2.810008239746094, "step": 20260 }, { "epoch": 0.769070943077151, "grad_norm": 0.2890625, "learning_rate": 0.000270983555885605, "loss": 2.8287727355957033, "step": 20270 }, { "epoch": 0.7694503564679143, "grad_norm": 0.294921875, "learning_rate": 0.0002708721092392989, "loss": 2.832901954650879, "step": 20280 }, { "epoch": 0.7698297698586776, "grad_norm": 0.29296875, "learning_rate": 0.00027076063742067857, "loss": 2.7896402359008787, "step": 20290 }, { "epoch": 0.7702091832494408, "grad_norm": 0.291015625, "learning_rate": 0.00027064914046933633, "loss": 2.8057125091552733, "step": 20300 }, { "epoch": 0.7705885966402041, "grad_norm": 0.29296875, "learning_rate": 0.0002705376184248738, "loss": 2.8437219619750977, "step": 20310 }, { "epoch": 0.7709680100309674, "grad_norm": 0.30078125, "learning_rate": 0.0002704260713269011, "loss": 2.8146692276000977, "step": 20320 }, { "epoch": 0.7713474234217307, "grad_norm": 0.29296875, "learning_rate": 0.0002703144992150376, "loss": 2.778013229370117, "step": 20330 }, { "epoch": 0.7717268368124939, "grad_norm": 0.279296875, "learning_rate": 0.0002702029021289113, "loss": 2.812320518493652, "step": 20340 }, { "epoch": 0.7721062502032572, "grad_norm": 0.287109375, "learning_rate": 0.0002700912801081594, "loss": 2.8271387100219725, "step": 20350 }, { "epoch": 0.7724856635940205, "grad_norm": 0.2890625, "learning_rate": 0.00026997963319242747, "loss": 2.8339481353759766, "step": 20360 }, { "epoch": 0.7728650769847837, "grad_norm": 0.28125, "learning_rate": 0.00026986796142137044, "loss": 2.8310049057006834, "step": 20370 }, { "epoch": 0.773244490375547, "grad_norm": 0.294921875, "learning_rate": 0.0002697562648346517, "loss": 2.813351631164551, "step": 20380 }, { "epoch": 0.7736239037663103, "grad_norm": 0.28515625, "learning_rate": 0.00026964454347194354, "loss": 2.842812728881836, "step": 20390 }, { "epoch": 0.7740033171570735, "grad_norm": 0.30078125, "learning_rate": 0.0002695327973729272, "loss": 2.8266927719116213, "step": 20400 }, { "epoch": 0.7743827305478368, "grad_norm": 0.283203125, "learning_rate": 0.00026942102657729244, "loss": 2.848155975341797, "step": 20410 }, { "epoch": 0.7747621439386001, "grad_norm": 0.322265625, "learning_rate": 0.0002693092311247382, "loss": 2.83429012298584, "step": 20420 }, { "epoch": 0.7751415573293633, "grad_norm": 0.29296875, "learning_rate": 0.0002691974110549717, "loss": 2.8356674194335936, "step": 20430 }, { "epoch": 0.7755209707201266, "grad_norm": 0.3046875, "learning_rate": 0.00026908556640770916, "loss": 2.845441436767578, "step": 20440 }, { "epoch": 0.7759003841108899, "grad_norm": 0.2890625, "learning_rate": 0.0002689736972226756, "loss": 2.8432796478271483, "step": 20450 }, { "epoch": 0.7762797975016532, "grad_norm": 0.30078125, "learning_rate": 0.00026886180353960455, "loss": 2.8328813552856444, "step": 20460 }, { "epoch": 0.7766592108924164, "grad_norm": 0.29296875, "learning_rate": 0.0002687498853982383, "loss": 2.8444040298461912, "step": 20470 }, { "epoch": 0.7770386242831797, "grad_norm": 0.28125, "learning_rate": 0.00026863794283832804, "loss": 2.8170892715454103, "step": 20480 }, { "epoch": 0.777418037673943, "grad_norm": 0.291015625, "learning_rate": 0.0002685259758996334, "loss": 2.8331844329833986, "step": 20490 }, { "epoch": 0.7777974510647062, "grad_norm": 0.283203125, "learning_rate": 0.0002684139846219226, "loss": 2.8161619186401365, "step": 20500 }, { "epoch": 0.7777974510647062, "eval_loss": 2.8241872787475586, "eval_runtime": 189.591, "eval_samples_per_second": 20.091, "eval_steps_per_second": 3.349, "step": 20500 }, { "epoch": 0.7781768644554695, "grad_norm": 0.298828125, "learning_rate": 0.0002683019690449728, "loss": 2.7996942520141603, "step": 20510 }, { "epoch": 0.7785562778462328, "grad_norm": 0.29296875, "learning_rate": 0.0002681899292085695, "loss": 2.8082834243774415, "step": 20520 }, { "epoch": 0.778935691236996, "grad_norm": 0.318359375, "learning_rate": 0.0002680778651525071, "loss": 2.8322864532470704, "step": 20530 }, { "epoch": 0.7793151046277593, "grad_norm": 0.3125, "learning_rate": 0.0002679657769165883, "loss": 2.8021427154541017, "step": 20540 }, { "epoch": 0.7796945180185226, "grad_norm": 0.30859375, "learning_rate": 0.0002678536645406246, "loss": 2.823679733276367, "step": 20550 }, { "epoch": 0.7800739314092858, "grad_norm": 0.2890625, "learning_rate": 0.00026774152806443603, "loss": 2.792985534667969, "step": 20560 }, { "epoch": 0.7804533448000491, "grad_norm": 0.291015625, "learning_rate": 0.00026762936752785106, "loss": 2.826543617248535, "step": 20570 }, { "epoch": 0.7808327581908124, "grad_norm": 0.26953125, "learning_rate": 0.00026751718297070694, "loss": 2.8159751892089844, "step": 20580 }, { "epoch": 0.7812121715815756, "grad_norm": 0.28125, "learning_rate": 0.0002674049744328492, "loss": 2.8222383499145507, "step": 20590 }, { "epoch": 0.7815915849723389, "grad_norm": 0.3046875, "learning_rate": 0.0002672927419541321, "loss": 2.806051826477051, "step": 20600 }, { "epoch": 0.7819709983631022, "grad_norm": 0.29296875, "learning_rate": 0.0002671804855744182, "loss": 2.834523582458496, "step": 20610 }, { "epoch": 0.7823504117538655, "grad_norm": 0.2890625, "learning_rate": 0.0002670682053335787, "loss": 2.8298624038696287, "step": 20620 }, { "epoch": 0.7827298251446287, "grad_norm": 0.279296875, "learning_rate": 0.0002669559012714933, "loss": 2.8258914947509766, "step": 20630 }, { "epoch": 0.783109238535392, "grad_norm": 0.30859375, "learning_rate": 0.00026684357342804995, "loss": 2.7977367401123048, "step": 20640 }, { "epoch": 0.7834886519261554, "grad_norm": 0.29296875, "learning_rate": 0.0002667312218431453, "loss": 2.803066444396973, "step": 20650 }, { "epoch": 0.7838680653169185, "grad_norm": 0.29296875, "learning_rate": 0.0002666188465566843, "loss": 2.8096298217773437, "step": 20660 }, { "epoch": 0.7842474787076819, "grad_norm": 0.283203125, "learning_rate": 0.0002665064476085803, "loss": 2.7910829544067384, "step": 20670 }, { "epoch": 0.7846268920984452, "grad_norm": 0.306640625, "learning_rate": 0.0002663940250387552, "loss": 2.836410140991211, "step": 20680 }, { "epoch": 0.7850063054892084, "grad_norm": 0.291015625, "learning_rate": 0.00026628157888713906, "loss": 2.8330690383911135, "step": 20690 }, { "epoch": 0.7853857188799717, "grad_norm": 0.279296875, "learning_rate": 0.00026616910919367046, "loss": 2.848998260498047, "step": 20700 }, { "epoch": 0.785765132270735, "grad_norm": 0.2890625, "learning_rate": 0.00026605661599829635, "loss": 2.7876108169555662, "step": 20710 }, { "epoch": 0.7861445456614982, "grad_norm": 0.27734375, "learning_rate": 0.00026594409934097197, "loss": 2.8279096603393556, "step": 20720 }, { "epoch": 0.7865239590522615, "grad_norm": 0.27734375, "learning_rate": 0.0002658315592616609, "loss": 2.8320310592651365, "step": 20730 }, { "epoch": 0.7869033724430248, "grad_norm": 0.291015625, "learning_rate": 0.00026571899580033517, "loss": 2.785831069946289, "step": 20740 }, { "epoch": 0.7872827858337881, "grad_norm": 0.28515625, "learning_rate": 0.0002656064089969749, "loss": 2.8424457550048827, "step": 20750 }, { "epoch": 0.7872827858337881, "eval_loss": 2.8224668502807617, "eval_runtime": 231.8246, "eval_samples_per_second": 16.431, "eval_steps_per_second": 2.739, "step": 20750 }, { "epoch": 0.7876621992245513, "grad_norm": 0.28125, "learning_rate": 0.0002654937988915687, "loss": 2.799355125427246, "step": 20760 }, { "epoch": 0.7880416126153146, "grad_norm": 0.2890625, "learning_rate": 0.00026538116552411316, "loss": 2.8535146713256836, "step": 20770 }, { "epoch": 0.7884210260060779, "grad_norm": 0.291015625, "learning_rate": 0.0002652685089346136, "loss": 2.8430795669555664, "step": 20780 }, { "epoch": 0.7888004393968411, "grad_norm": 0.30078125, "learning_rate": 0.00026515582916308313, "loss": 2.8293634414672852, "step": 20790 }, { "epoch": 0.7891798527876044, "grad_norm": 0.291015625, "learning_rate": 0.00026504312624954344, "loss": 2.8356050491333007, "step": 20800 }, { "epoch": 0.7895592661783677, "grad_norm": 0.2890625, "learning_rate": 0.00026493040023402406, "loss": 2.831799125671387, "step": 20810 }, { "epoch": 0.7899386795691309, "grad_norm": 0.291015625, "learning_rate": 0.0002648176511565632, "loss": 2.8236278533935546, "step": 20820 }, { "epoch": 0.7903180929598942, "grad_norm": 0.3125, "learning_rate": 0.00026470487905720685, "loss": 2.840847969055176, "step": 20830 }, { "epoch": 0.7906975063506575, "grad_norm": 0.298828125, "learning_rate": 0.0002645920839760094, "loss": 2.8238954544067383, "step": 20840 }, { "epoch": 0.7910769197414207, "grad_norm": 0.279296875, "learning_rate": 0.0002644792659530333, "loss": 2.8423715591430665, "step": 20850 }, { "epoch": 0.791456333132184, "grad_norm": 0.283203125, "learning_rate": 0.0002643664250283492, "loss": 2.818684768676758, "step": 20860 }, { "epoch": 0.7918357465229473, "grad_norm": 0.3046875, "learning_rate": 0.0002642535612420359, "loss": 2.832457160949707, "step": 20870 }, { "epoch": 0.7922151599137106, "grad_norm": 0.296875, "learning_rate": 0.0002641406746341803, "loss": 2.8144353866577148, "step": 20880 }, { "epoch": 0.7925945733044738, "grad_norm": 0.29296875, "learning_rate": 0.0002640277652448774, "loss": 2.812615966796875, "step": 20890 }, { "epoch": 0.7929739866952371, "grad_norm": 0.29296875, "learning_rate": 0.00026391483311423024, "loss": 2.8087089538574217, "step": 20900 }, { "epoch": 0.7933534000860004, "grad_norm": 0.3046875, "learning_rate": 0.00026380187828235004, "loss": 2.7845991134643553, "step": 20910 }, { "epoch": 0.7937328134767636, "grad_norm": 0.28515625, "learning_rate": 0.00026368890078935596, "loss": 2.872048568725586, "step": 20920 }, { "epoch": 0.7941122268675269, "grad_norm": 0.306640625, "learning_rate": 0.0002635759006753754, "loss": 2.845913887023926, "step": 20930 }, { "epoch": 0.7944916402582902, "grad_norm": 0.294921875, "learning_rate": 0.0002634628779805436, "loss": 2.813768196105957, "step": 20940 }, { "epoch": 0.7948710536490534, "grad_norm": 0.28515625, "learning_rate": 0.0002633498327450039, "loss": 2.820099449157715, "step": 20950 }, { "epoch": 0.7952504670398167, "grad_norm": 0.29296875, "learning_rate": 0.00026323676500890754, "loss": 2.8603954315185547, "step": 20960 }, { "epoch": 0.79562988043058, "grad_norm": 0.298828125, "learning_rate": 0.0002631236748124141, "loss": 2.8106000900268553, "step": 20970 }, { "epoch": 0.7960092938213432, "grad_norm": 0.29296875, "learning_rate": 0.0002630105621956906, "loss": 2.8494731903076174, "step": 20980 }, { "epoch": 0.7963887072121065, "grad_norm": 0.302734375, "learning_rate": 0.0002628974271989126, "loss": 2.8505922317504884, "step": 20990 }, { "epoch": 0.7967681206028698, "grad_norm": 0.2890625, "learning_rate": 0.00026278426986226313, "loss": 2.823233985900879, "step": 21000 }, { "epoch": 0.7967681206028698, "eval_loss": 2.8217785358428955, "eval_runtime": 231.4416, "eval_samples_per_second": 16.458, "eval_steps_per_second": 2.744, "step": 21000 }, { "epoch": 0.797147533993633, "grad_norm": 0.283203125, "learning_rate": 0.0002626710902259333, "loss": 2.836376953125, "step": 21010 }, { "epoch": 0.7975269473843963, "grad_norm": 0.298828125, "learning_rate": 0.0002625578883301224, "loss": 2.808627891540527, "step": 21020 }, { "epoch": 0.7979063607751596, "grad_norm": 0.28515625, "learning_rate": 0.0002624446642150372, "loss": 2.8449344635009766, "step": 21030 }, { "epoch": 0.798285774165923, "grad_norm": 0.291015625, "learning_rate": 0.0002623314179208928, "loss": 2.8363460540771483, "step": 21040 }, { "epoch": 0.7986651875566861, "grad_norm": 0.291015625, "learning_rate": 0.00026221814948791173, "loss": 2.815275955200195, "step": 21050 }, { "epoch": 0.7990446009474494, "grad_norm": 0.28125, "learning_rate": 0.0002621048589563247, "loss": 2.823078727722168, "step": 21060 }, { "epoch": 0.7994240143382128, "grad_norm": 0.302734375, "learning_rate": 0.0002619915463663702, "loss": 2.822665214538574, "step": 21070 }, { "epoch": 0.799803427728976, "grad_norm": 0.2890625, "learning_rate": 0.00026187821175829455, "loss": 2.8267452239990236, "step": 21080 }, { "epoch": 0.8001828411197393, "grad_norm": 0.271484375, "learning_rate": 0.00026176485517235177, "loss": 2.8310998916625976, "step": 21090 }, { "epoch": 0.8005622545105026, "grad_norm": 0.279296875, "learning_rate": 0.0002616514766488038, "loss": 2.8480579376220705, "step": 21100 }, { "epoch": 0.8009416679012658, "grad_norm": 0.28515625, "learning_rate": 0.0002615380762279204, "loss": 2.8272424697875977, "step": 21110 }, { "epoch": 0.8013210812920291, "grad_norm": 0.283203125, "learning_rate": 0.00026142465394997914, "loss": 2.822281837463379, "step": 21120 }, { "epoch": 0.8017004946827924, "grad_norm": 0.310546875, "learning_rate": 0.0002613112098552652, "loss": 2.816415214538574, "step": 21130 }, { "epoch": 0.8020799080735556, "grad_norm": 0.287109375, "learning_rate": 0.00026119774398407147, "loss": 2.8340606689453125, "step": 21140 }, { "epoch": 0.8024593214643189, "grad_norm": 0.287109375, "learning_rate": 0.0002610842563766989, "loss": 2.8259275436401365, "step": 21150 }, { "epoch": 0.8028387348550822, "grad_norm": 0.287109375, "learning_rate": 0.00026097074707345585, "loss": 2.804387664794922, "step": 21160 }, { "epoch": 0.8032181482458455, "grad_norm": 0.30078125, "learning_rate": 0.00026085721611465853, "loss": 2.8325439453125, "step": 21170 }, { "epoch": 0.8035975616366087, "grad_norm": 0.28515625, "learning_rate": 0.0002607436635406307, "loss": 2.8575441360473635, "step": 21180 }, { "epoch": 0.803976975027372, "grad_norm": 0.314453125, "learning_rate": 0.00026063008939170403, "loss": 2.8278781890869142, "step": 21190 }, { "epoch": 0.8043563884181353, "grad_norm": 0.287109375, "learning_rate": 0.0002605164937082176, "loss": 2.851171875, "step": 21200 }, { "epoch": 0.8047358018088985, "grad_norm": 0.28125, "learning_rate": 0.0002604028765305183, "loss": 2.830158805847168, "step": 21210 }, { "epoch": 0.8051152151996618, "grad_norm": 0.287109375, "learning_rate": 0.0002602892378989607, "loss": 2.8502878189086913, "step": 21220 }, { "epoch": 0.8054946285904251, "grad_norm": 0.294921875, "learning_rate": 0.0002601755778539068, "loss": 2.8458099365234375, "step": 21230 }, { "epoch": 0.8058740419811883, "grad_norm": 1.1953125, "learning_rate": 0.00026006189643572634, "loss": 2.827349090576172, "step": 21240 }, { "epoch": 0.8062534553719516, "grad_norm": 0.298828125, "learning_rate": 0.0002599481936847966, "loss": 2.809520149230957, "step": 21250 }, { "epoch": 0.8062534553719516, "eval_loss": 2.8284757137298584, "eval_runtime": 231.4418, "eval_samples_per_second": 16.458, "eval_steps_per_second": 2.744, "step": 21250 }, { "epoch": 0.8066328687627149, "grad_norm": 0.3046875, "learning_rate": 0.00025983446964150247, "loss": 2.83636474609375, "step": 21260 }, { "epoch": 0.8070122821534781, "grad_norm": 0.298828125, "learning_rate": 0.00025972072434623643, "loss": 2.7849830627441405, "step": 21270 }, { "epoch": 0.8073916955442414, "grad_norm": 0.28515625, "learning_rate": 0.0002596069578393984, "loss": 2.8209251403808593, "step": 21280 }, { "epoch": 0.8077711089350047, "grad_norm": 0.28125, "learning_rate": 0.0002594931701613959, "loss": 2.8431129455566406, "step": 21290 }, { "epoch": 0.808150522325768, "grad_norm": 0.283203125, "learning_rate": 0.00025937936135264405, "loss": 2.8173896789550783, "step": 21300 }, { "epoch": 0.8085299357165312, "grad_norm": 0.2890625, "learning_rate": 0.00025926553145356535, "loss": 2.829300880432129, "step": 21310 }, { "epoch": 0.8089093491072945, "grad_norm": 0.29296875, "learning_rate": 0.00025915168050458985, "loss": 2.823215866088867, "step": 21320 }, { "epoch": 0.8092887624980578, "grad_norm": 0.267578125, "learning_rate": 0.00025903780854615504, "loss": 2.795969009399414, "step": 21330 }, { "epoch": 0.809668175888821, "grad_norm": 0.27734375, "learning_rate": 0.00025892391561870584, "loss": 2.799988555908203, "step": 21340 }, { "epoch": 0.8100475892795843, "grad_norm": 0.283203125, "learning_rate": 0.0002588100017626948, "loss": 2.8391725540161135, "step": 21350 }, { "epoch": 0.8104270026703476, "grad_norm": 0.287109375, "learning_rate": 0.00025869606701858176, "loss": 2.828306198120117, "step": 21360 }, { "epoch": 0.8108064160611108, "grad_norm": 0.314453125, "learning_rate": 0.00025858211142683384, "loss": 2.8552818298339844, "step": 21370 }, { "epoch": 0.8111858294518741, "grad_norm": 0.302734375, "learning_rate": 0.00025846813502792594, "loss": 2.8500526428222654, "step": 21380 }, { "epoch": 0.8115652428426374, "grad_norm": 0.28125, "learning_rate": 0.00025835413786233995, "loss": 2.834260368347168, "step": 21390 }, { "epoch": 0.8119446562334006, "grad_norm": 0.29296875, "learning_rate": 0.0002582401199705654, "loss": 2.8042266845703123, "step": 21400 }, { "epoch": 0.8123240696241639, "grad_norm": 0.279296875, "learning_rate": 0.00025812608139309904, "loss": 2.8302127838134767, "step": 21410 }, { "epoch": 0.8127034830149272, "grad_norm": 0.287109375, "learning_rate": 0.0002580120221704452, "loss": 2.8529947280883787, "step": 21420 }, { "epoch": 0.8130828964056904, "grad_norm": 0.291015625, "learning_rate": 0.00025789794234311515, "loss": 2.8306928634643556, "step": 21430 }, { "epoch": 0.8134623097964537, "grad_norm": 0.306640625, "learning_rate": 0.0002577838419516278, "loss": 2.804641532897949, "step": 21440 }, { "epoch": 0.813841723187217, "grad_norm": 0.29296875, "learning_rate": 0.0002576697210365093, "loss": 2.8015087127685545, "step": 21450 }, { "epoch": 0.8142211365779803, "grad_norm": 0.296875, "learning_rate": 0.0002575555796382929, "loss": 2.8070398330688477, "step": 21460 }, { "epoch": 0.8146005499687435, "grad_norm": 0.30078125, "learning_rate": 0.0002574414177975194, "loss": 2.820473289489746, "step": 21470 }, { "epoch": 0.8149799633595068, "grad_norm": 0.294921875, "learning_rate": 0.0002573272355547368, "loss": 2.8432037353515627, "step": 21480 }, { "epoch": 0.8153593767502701, "grad_norm": 0.294921875, "learning_rate": 0.0002572130329505001, "loss": 2.835975456237793, "step": 21490 }, { "epoch": 0.8157387901410333, "grad_norm": 0.310546875, "learning_rate": 0.0002570988100253718, "loss": 2.812059783935547, "step": 21500 }, { "epoch": 0.8157387901410333, "eval_loss": 2.820070505142212, "eval_runtime": 200.4146, "eval_samples_per_second": 19.006, "eval_steps_per_second": 3.168, "step": 21500 }, { "epoch": 0.8161182035317966, "grad_norm": 0.28515625, "learning_rate": 0.0002569845668199216, "loss": 2.834417533874512, "step": 21510 }, { "epoch": 0.81649761692256, "grad_norm": 0.294921875, "learning_rate": 0.0002568703033747262, "loss": 2.8178552627563476, "step": 21520 }, { "epoch": 0.8168770303133231, "grad_norm": 0.28515625, "learning_rate": 0.00025675601973036976, "loss": 2.8309131622314454, "step": 21530 }, { "epoch": 0.8172564437040865, "grad_norm": 0.29296875, "learning_rate": 0.0002566417159274434, "loss": 2.81756534576416, "step": 21540 }, { "epoch": 0.8176358570948498, "grad_norm": 0.283203125, "learning_rate": 0.0002565273920065455, "loss": 2.8195348739624024, "step": 21550 }, { "epoch": 0.818015270485613, "grad_norm": 0.294921875, "learning_rate": 0.00025641304800828147, "loss": 2.818246841430664, "step": 21560 }, { "epoch": 0.8183946838763763, "grad_norm": 0.30078125, "learning_rate": 0.0002562986839732641, "loss": 2.8261308670043945, "step": 21570 }, { "epoch": 0.8187740972671396, "grad_norm": 0.29296875, "learning_rate": 0.0002561842999421131, "loss": 2.8103879928588866, "step": 21580 }, { "epoch": 0.8191535106579029, "grad_norm": 0.314453125, "learning_rate": 0.0002560698959554552, "loss": 2.793143463134766, "step": 21590 }, { "epoch": 0.8195329240486661, "grad_norm": 0.28515625, "learning_rate": 0.0002559554720539245, "loss": 2.822785758972168, "step": 21600 }, { "epoch": 0.8199123374394294, "grad_norm": 0.32421875, "learning_rate": 0.00025584102827816196, "loss": 2.839263153076172, "step": 21610 }, { "epoch": 0.8202917508301927, "grad_norm": 0.287109375, "learning_rate": 0.0002557265646688157, "loss": 2.840679168701172, "step": 21620 }, { "epoch": 0.8206711642209559, "grad_norm": 0.29296875, "learning_rate": 0.0002556120812665406, "loss": 2.799322509765625, "step": 21630 }, { "epoch": 0.8210505776117192, "grad_norm": 0.296875, "learning_rate": 0.0002554975781119991, "loss": 2.8505029678344727, "step": 21640 }, { "epoch": 0.8214299910024825, "grad_norm": 0.291015625, "learning_rate": 0.0002553830552458602, "loss": 2.8345819473266602, "step": 21650 }, { "epoch": 0.8218094043932457, "grad_norm": 0.28515625, "learning_rate": 0.0002552685127088002, "loss": 2.8482812881469726, "step": 21660 }, { "epoch": 0.822188817784009, "grad_norm": 0.291015625, "learning_rate": 0.0002551539505415021, "loss": 2.813530921936035, "step": 21670 }, { "epoch": 0.8225682311747723, "grad_norm": 0.287109375, "learning_rate": 0.0002550393687846561, "loss": 2.811182975769043, "step": 21680 }, { "epoch": 0.8229476445655355, "grad_norm": 0.29296875, "learning_rate": 0.00025492476747895925, "loss": 2.8088333129882814, "step": 21690 }, { "epoch": 0.8233270579562988, "grad_norm": 0.291015625, "learning_rate": 0.0002548101466651157, "loss": 2.8332136154174803, "step": 21700 }, { "epoch": 0.8237064713470621, "grad_norm": 0.296875, "learning_rate": 0.0002546955063838362, "loss": 2.829046058654785, "step": 21710 }, { "epoch": 0.8240858847378253, "grad_norm": 0.291015625, "learning_rate": 0.0002545808466758388, "loss": 2.835390090942383, "step": 21720 }, { "epoch": 0.8244652981285886, "grad_norm": 0.2890625, "learning_rate": 0.0002544661675818481, "loss": 2.8195852279663085, "step": 21730 }, { "epoch": 0.8248447115193519, "grad_norm": 0.291015625, "learning_rate": 0.00025435146914259586, "loss": 2.800628662109375, "step": 21740 }, { "epoch": 0.8252241249101152, "grad_norm": 0.29296875, "learning_rate": 0.00025423675139882063, "loss": 2.8258083343505858, "step": 21750 }, { "epoch": 0.8252241249101152, "eval_loss": 2.818159818649292, "eval_runtime": 197.9571, "eval_samples_per_second": 19.242, "eval_steps_per_second": 3.208, "step": 21750 }, { "epoch": 0.8256035383008784, "grad_norm": 0.310546875, "learning_rate": 0.0002541220143912677, "loss": 2.8040924072265625, "step": 21760 }, { "epoch": 0.8259829516916417, "grad_norm": 0.2890625, "learning_rate": 0.0002540072581606894, "loss": 2.8257959365844725, "step": 21770 }, { "epoch": 0.826362365082405, "grad_norm": 0.283203125, "learning_rate": 0.0002538924827478446, "loss": 2.8423734664916993, "step": 21780 }, { "epoch": 0.8267417784731682, "grad_norm": 0.28515625, "learning_rate": 0.0002537776881934994, "loss": 2.8142234802246096, "step": 21790 }, { "epoch": 0.8271211918639315, "grad_norm": 0.287109375, "learning_rate": 0.00025366287453842625, "loss": 2.7942794799804687, "step": 21800 }, { "epoch": 0.8275006052546948, "grad_norm": 0.291015625, "learning_rate": 0.0002535480418234048, "loss": 2.8160116195678713, "step": 21810 }, { "epoch": 0.827880018645458, "grad_norm": 0.29296875, "learning_rate": 0.0002534331900892212, "loss": 2.8200733184814455, "step": 21820 }, { "epoch": 0.8282594320362213, "grad_norm": 0.30859375, "learning_rate": 0.0002533183193766682, "loss": 2.837328338623047, "step": 21830 }, { "epoch": 0.8286388454269846, "grad_norm": 0.294921875, "learning_rate": 0.0002532034297265458, "loss": 2.8256731033325195, "step": 21840 }, { "epoch": 0.8290182588177478, "grad_norm": 0.294921875, "learning_rate": 0.0002530885211796603, "loss": 2.8299646377563477, "step": 21850 }, { "epoch": 0.8293976722085111, "grad_norm": 0.279296875, "learning_rate": 0.00025297359377682494, "loss": 2.821303367614746, "step": 21860 }, { "epoch": 0.8297770855992744, "grad_norm": 0.28125, "learning_rate": 0.00025285864755885946, "loss": 2.815560531616211, "step": 21870 }, { "epoch": 0.8301564989900377, "grad_norm": 0.30859375, "learning_rate": 0.0002527436825665905, "loss": 2.8029767990112306, "step": 21880 }, { "epoch": 0.8305359123808009, "grad_norm": 0.28515625, "learning_rate": 0.00025262869884085115, "loss": 2.8176145553588867, "step": 21890 }, { "epoch": 0.8309153257715642, "grad_norm": 0.279296875, "learning_rate": 0.0002525136964224813, "loss": 2.824557304382324, "step": 21900 }, { "epoch": 0.8312947391623275, "grad_norm": 0.294921875, "learning_rate": 0.0002523986753523275, "loss": 2.8147579193115235, "step": 21910 }, { "epoch": 0.8316741525530907, "grad_norm": 0.30078125, "learning_rate": 0.0002522836356712428, "loss": 2.8162479400634766, "step": 21920 }, { "epoch": 0.832053565943854, "grad_norm": 0.294921875, "learning_rate": 0.00025216857742008694, "loss": 2.851650047302246, "step": 21930 }, { "epoch": 0.8324329793346174, "grad_norm": 0.30859375, "learning_rate": 0.0002520535006397263, "loss": 2.8272247314453125, "step": 21940 }, { "epoch": 0.8328123927253805, "grad_norm": 0.287109375, "learning_rate": 0.0002519384053710336, "loss": 2.8423593521118162, "step": 21950 }, { "epoch": 0.8331918061161439, "grad_norm": 0.294921875, "learning_rate": 0.0002518232916548886, "loss": 2.818336296081543, "step": 21960 }, { "epoch": 0.8335712195069072, "grad_norm": 0.2890625, "learning_rate": 0.0002517081595321771, "loss": 2.818313980102539, "step": 21970 }, { "epoch": 0.8339506328976704, "grad_norm": 0.291015625, "learning_rate": 0.00025159300904379175, "loss": 2.810885429382324, "step": 21980 }, { "epoch": 0.8343300462884337, "grad_norm": 0.3125, "learning_rate": 0.00025147784023063155, "loss": 2.7800928115844727, "step": 21990 }, { "epoch": 0.834709459679197, "grad_norm": 0.298828125, "learning_rate": 0.00025136265313360225, "loss": 2.7969608306884766, "step": 22000 }, { "epoch": 0.834709459679197, "eval_loss": 2.81705904006958, "eval_runtime": 189.6667, "eval_samples_per_second": 20.083, "eval_steps_per_second": 3.348, "step": 22000 }, { "epoch": 0.8350888730699603, "grad_norm": 0.302734375, "learning_rate": 0.00025124744779361573, "loss": 2.7963851928710937, "step": 22010 }, { "epoch": 0.8354682864607235, "grad_norm": 0.3046875, "learning_rate": 0.00025113222425159076, "loss": 2.8157909393310545, "step": 22020 }, { "epoch": 0.8358476998514868, "grad_norm": 0.296875, "learning_rate": 0.0002510169825484522, "loss": 2.827780914306641, "step": 22030 }, { "epoch": 0.8362271132422501, "grad_norm": 0.287109375, "learning_rate": 0.0002509017227251317, "loss": 2.839272308349609, "step": 22040 }, { "epoch": 0.8366065266330133, "grad_norm": 0.298828125, "learning_rate": 0.000250786444822567, "loss": 2.815377616882324, "step": 22050 }, { "epoch": 0.8369859400237766, "grad_norm": 0.310546875, "learning_rate": 0.00025067114888170254, "loss": 2.8152305603027346, "step": 22060 }, { "epoch": 0.8373653534145399, "grad_norm": 0.296875, "learning_rate": 0.000250555834943489, "loss": 2.841930389404297, "step": 22070 }, { "epoch": 0.8377447668053031, "grad_norm": 0.298828125, "learning_rate": 0.00025044050304888363, "loss": 2.849019432067871, "step": 22080 }, { "epoch": 0.8381241801960664, "grad_norm": 0.294921875, "learning_rate": 0.0002503251532388498, "loss": 2.8148317337036133, "step": 22090 }, { "epoch": 0.8385035935868297, "grad_norm": 0.298828125, "learning_rate": 0.0002502097855543575, "loss": 2.832468032836914, "step": 22100 }, { "epoch": 0.8388830069775929, "grad_norm": 0.3046875, "learning_rate": 0.00025009440003638295, "loss": 2.8145072937011717, "step": 22110 }, { "epoch": 0.8392624203683562, "grad_norm": 0.28515625, "learning_rate": 0.00024997899672590866, "loss": 2.8000743865966795, "step": 22120 }, { "epoch": 0.8396418337591195, "grad_norm": 0.30078125, "learning_rate": 0.00024986357566392356, "loss": 2.8451200485229493, "step": 22130 }, { "epoch": 0.8400212471498827, "grad_norm": 0.28515625, "learning_rate": 0.0002497481368914228, "loss": 2.7841936111450196, "step": 22140 }, { "epoch": 0.840400660540646, "grad_norm": 0.287109375, "learning_rate": 0.0002496326804494078, "loss": 2.8165719985961912, "step": 22150 }, { "epoch": 0.8407800739314093, "grad_norm": 0.29296875, "learning_rate": 0.0002495172063788864, "loss": 2.837203598022461, "step": 22160 }, { "epoch": 0.8411594873221726, "grad_norm": 0.3046875, "learning_rate": 0.00024940171472087266, "loss": 2.817221832275391, "step": 22170 }, { "epoch": 0.8415389007129358, "grad_norm": 0.29296875, "learning_rate": 0.00024928620551638666, "loss": 2.8315170288085936, "step": 22180 }, { "epoch": 0.8419183141036991, "grad_norm": 0.302734375, "learning_rate": 0.00024917067880645506, "loss": 2.836107063293457, "step": 22190 }, { "epoch": 0.8422977274944624, "grad_norm": 0.298828125, "learning_rate": 0.00024905513463211045, "loss": 2.7984359741210936, "step": 22200 }, { "epoch": 0.8426771408852256, "grad_norm": 0.283203125, "learning_rate": 0.0002489395730343919, "loss": 2.8163103103637694, "step": 22210 }, { "epoch": 0.8430565542759889, "grad_norm": 0.294921875, "learning_rate": 0.0002488239940543444, "loss": 2.8372028350830076, "step": 22220 }, { "epoch": 0.8434359676667522, "grad_norm": 0.283203125, "learning_rate": 0.00024870839773301915, "loss": 2.8186737060546876, "step": 22230 }, { "epoch": 0.8438153810575154, "grad_norm": 0.296875, "learning_rate": 0.00024859278411147373, "loss": 2.839053153991699, "step": 22240 }, { "epoch": 0.8441947944482787, "grad_norm": 0.28515625, "learning_rate": 0.00024847715323077164, "loss": 2.8131637573242188, "step": 22250 }, { "epoch": 0.8441947944482787, "eval_loss": 2.815413475036621, "eval_runtime": 189.4058, "eval_samples_per_second": 20.11, "eval_steps_per_second": 3.353, "step": 22250 }, { "epoch": 0.844574207839042, "grad_norm": 0.283203125, "learning_rate": 0.00024836150513198263, "loss": 2.8224035263061524, "step": 22260 }, { "epoch": 0.8449536212298052, "grad_norm": 0.2890625, "learning_rate": 0.00024824583985618245, "loss": 2.803734016418457, "step": 22270 }, { "epoch": 0.8453330346205685, "grad_norm": 0.296875, "learning_rate": 0.00024813015744445314, "loss": 2.8305343627929687, "step": 22280 }, { "epoch": 0.8457124480113318, "grad_norm": 0.302734375, "learning_rate": 0.00024801445793788256, "loss": 2.819507598876953, "step": 22290 }, { "epoch": 0.8460918614020951, "grad_norm": 0.294921875, "learning_rate": 0.00024789874137756495, "loss": 2.821614456176758, "step": 22300 }, { "epoch": 0.8464712747928583, "grad_norm": 0.2890625, "learning_rate": 0.00024778300780460035, "loss": 2.827231216430664, "step": 22310 }, { "epoch": 0.8468506881836216, "grad_norm": 0.29296875, "learning_rate": 0.000247667257260095, "loss": 2.8115280151367186, "step": 22320 }, { "epoch": 0.8472301015743849, "grad_norm": 0.302734375, "learning_rate": 0.00024755148978516104, "loss": 2.8185523986816405, "step": 22330 }, { "epoch": 0.8476095149651481, "grad_norm": 0.302734375, "learning_rate": 0.0002474357054209168, "loss": 2.798639106750488, "step": 22340 }, { "epoch": 0.8479889283559114, "grad_norm": 0.291015625, "learning_rate": 0.0002473199042084864, "loss": 2.8006723403930662, "step": 22350 }, { "epoch": 0.8483683417466747, "grad_norm": 0.2890625, "learning_rate": 0.00024720408618900006, "loss": 2.8153396606445313, "step": 22360 }, { "epoch": 0.8487477551374379, "grad_norm": 0.28515625, "learning_rate": 0.0002470882514035941, "loss": 2.825265312194824, "step": 22370 }, { "epoch": 0.8491271685282012, "grad_norm": 0.294921875, "learning_rate": 0.00024697239989341036, "loss": 2.8119373321533203, "step": 22380 }, { "epoch": 0.8495065819189646, "grad_norm": 0.2890625, "learning_rate": 0.0002468565316995972, "loss": 2.826456069946289, "step": 22390 }, { "epoch": 0.8498859953097277, "grad_norm": 0.296875, "learning_rate": 0.0002467406468633085, "loss": 2.815421295166016, "step": 22400 }, { "epoch": 0.850265408700491, "grad_norm": 0.291015625, "learning_rate": 0.00024662474542570416, "loss": 2.7837976455688476, "step": 22410 }, { "epoch": 0.8506448220912544, "grad_norm": 0.29296875, "learning_rate": 0.00024650882742794993, "loss": 2.826875686645508, "step": 22420 }, { "epoch": 0.8510242354820177, "grad_norm": 0.29296875, "learning_rate": 0.0002463928929112177, "loss": 2.8170242309570312, "step": 22430 }, { "epoch": 0.8514036488727809, "grad_norm": 0.287109375, "learning_rate": 0.00024627694191668476, "loss": 2.806075096130371, "step": 22440 }, { "epoch": 0.8517830622635442, "grad_norm": 0.279296875, "learning_rate": 0.0002461609744855347, "loss": 2.8101213455200194, "step": 22450 }, { "epoch": 0.8521624756543075, "grad_norm": 0.30078125, "learning_rate": 0.0002460449906589568, "loss": 2.823282814025879, "step": 22460 }, { "epoch": 0.8525418890450707, "grad_norm": 0.296875, "learning_rate": 0.00024592899047814587, "loss": 2.8106924057006837, "step": 22470 }, { "epoch": 0.852921302435834, "grad_norm": 0.2890625, "learning_rate": 0.0002458129739843031, "loss": 2.8409502029418947, "step": 22480 }, { "epoch": 0.8533007158265973, "grad_norm": 0.27734375, "learning_rate": 0.0002456969412186349, "loss": 2.8344343185424803, "step": 22490 }, { "epoch": 0.8536801292173605, "grad_norm": 0.294921875, "learning_rate": 0.00024558089222235385, "loss": 2.814138412475586, "step": 22500 }, { "epoch": 0.8536801292173605, "eval_loss": 2.8137834072113037, "eval_runtime": 189.42, "eval_samples_per_second": 20.109, "eval_steps_per_second": 3.352, "step": 22500 }, { "epoch": 0.8540595426081238, "grad_norm": 0.3046875, "learning_rate": 0.00024546482703667816, "loss": 2.796217918395996, "step": 22510 }, { "epoch": 0.8544389559988871, "grad_norm": 0.302734375, "learning_rate": 0.0002453487457028318, "loss": 2.796310615539551, "step": 22520 }, { "epoch": 0.8548183693896503, "grad_norm": 0.294921875, "learning_rate": 0.0002452326482620444, "loss": 2.8015207290649413, "step": 22530 }, { "epoch": 0.8551977827804136, "grad_norm": 0.306640625, "learning_rate": 0.00024511653475555156, "loss": 2.8194787979125975, "step": 22540 }, { "epoch": 0.8555771961711769, "grad_norm": 0.28515625, "learning_rate": 0.00024500040522459416, "loss": 2.832789421081543, "step": 22550 }, { "epoch": 0.8559566095619401, "grad_norm": 0.283203125, "learning_rate": 0.0002448842597104192, "loss": 2.8189889907836916, "step": 22560 }, { "epoch": 0.8563360229527034, "grad_norm": 0.29296875, "learning_rate": 0.0002447680982542791, "loss": 2.855090522766113, "step": 22570 }, { "epoch": 0.8567154363434667, "grad_norm": 0.294921875, "learning_rate": 0.00024465192089743207, "loss": 2.8199586868286133, "step": 22580 }, { "epoch": 0.85709484973423, "grad_norm": 0.29296875, "learning_rate": 0.0002445357276811418, "loss": 2.8478654861450194, "step": 22590 }, { "epoch": 0.8574742631249932, "grad_norm": 0.291015625, "learning_rate": 0.00024441951864667786, "loss": 2.8049013137817385, "step": 22600 }, { "epoch": 0.8578536765157565, "grad_norm": 0.302734375, "learning_rate": 0.0002443032938353152, "loss": 2.811000633239746, "step": 22610 }, { "epoch": 0.8582330899065198, "grad_norm": 0.294921875, "learning_rate": 0.0002441870532883346, "loss": 2.8057634353637697, "step": 22620 }, { "epoch": 0.858612503297283, "grad_norm": 0.314453125, "learning_rate": 0.00024407079704702228, "loss": 2.821101951599121, "step": 22630 }, { "epoch": 0.8589919166880463, "grad_norm": 0.302734375, "learning_rate": 0.00024395452515267, "loss": 2.8625539779663085, "step": 22640 }, { "epoch": 0.8593713300788096, "grad_norm": 0.294921875, "learning_rate": 0.0002438382376465751, "loss": 2.7928112030029295, "step": 22650 }, { "epoch": 0.8597507434695728, "grad_norm": 0.298828125, "learning_rate": 0.00024372193457004065, "loss": 2.823271369934082, "step": 22660 }, { "epoch": 0.8601301568603361, "grad_norm": 0.291015625, "learning_rate": 0.000243605615964375, "loss": 2.808486557006836, "step": 22670 }, { "epoch": 0.8605095702510994, "grad_norm": 0.298828125, "learning_rate": 0.0002434892818708922, "loss": 2.8147842407226564, "step": 22680 }, { "epoch": 0.8608889836418626, "grad_norm": 0.28125, "learning_rate": 0.0002433729323309117, "loss": 2.8234777450561523, "step": 22690 }, { "epoch": 0.8612683970326259, "grad_norm": 0.294921875, "learning_rate": 0.00024325656738575843, "loss": 2.8336986541748046, "step": 22700 }, { "epoch": 0.8616478104233892, "grad_norm": 0.294921875, "learning_rate": 0.00024314018707676286, "loss": 2.817000961303711, "step": 22710 }, { "epoch": 0.8620272238141525, "grad_norm": 0.314453125, "learning_rate": 0.00024302379144526084, "loss": 2.8184627532958983, "step": 22720 }, { "epoch": 0.8624066372049157, "grad_norm": 0.29296875, "learning_rate": 0.00024290738053259374, "loss": 2.8195777893066407, "step": 22730 }, { "epoch": 0.862786050595679, "grad_norm": 0.283203125, "learning_rate": 0.0002427909543801083, "loss": 2.814330291748047, "step": 22740 }, { "epoch": 0.8631654639864423, "grad_norm": 0.29296875, "learning_rate": 0.00024267451302915674, "loss": 2.8511964797973635, "step": 22750 }, { "epoch": 0.8631654639864423, "eval_loss": 2.813929796218872, "eval_runtime": 189.4904, "eval_samples_per_second": 20.101, "eval_steps_per_second": 3.351, "step": 22750 }, { "epoch": 0.8635448773772055, "grad_norm": 0.326171875, "learning_rate": 0.00024255805652109654, "loss": 2.8116783142089843, "step": 22760 }, { "epoch": 0.8639242907679688, "grad_norm": 0.28125, "learning_rate": 0.00024244158489729074, "loss": 2.848722457885742, "step": 22770 }, { "epoch": 0.8643037041587321, "grad_norm": 0.3046875, "learning_rate": 0.00024232509819910756, "loss": 2.8176342010498048, "step": 22780 }, { "epoch": 0.8646831175494953, "grad_norm": 0.294921875, "learning_rate": 0.0002422085964679208, "loss": 2.8007719039916994, "step": 22790 }, { "epoch": 0.8650625309402586, "grad_norm": 0.296875, "learning_rate": 0.00024209207974510943, "loss": 2.8298677444458007, "step": 22800 }, { "epoch": 0.865441944331022, "grad_norm": 0.30078125, "learning_rate": 0.0002419755480720578, "loss": 2.818860626220703, "step": 22810 }, { "epoch": 0.8658213577217851, "grad_norm": 0.298828125, "learning_rate": 0.00024185900149015547, "loss": 2.8016468048095704, "step": 22820 }, { "epoch": 0.8662007711125485, "grad_norm": 0.29296875, "learning_rate": 0.00024174244004079753, "loss": 2.8250349044799803, "step": 22830 }, { "epoch": 0.8665801845033118, "grad_norm": 0.294921875, "learning_rate": 0.00024162586376538415, "loss": 2.7820178985595705, "step": 22840 }, { "epoch": 0.8669595978940751, "grad_norm": 0.28515625, "learning_rate": 0.00024150927270532075, "loss": 2.7790334701538084, "step": 22850 }, { "epoch": 0.8673390112848383, "grad_norm": 0.291015625, "learning_rate": 0.0002413926669020182, "loss": 2.839372253417969, "step": 22860 }, { "epoch": 0.8677184246756016, "grad_norm": 0.2890625, "learning_rate": 0.0002412760463968924, "loss": 2.805887222290039, "step": 22870 }, { "epoch": 0.8680978380663649, "grad_norm": 0.29296875, "learning_rate": 0.00024115941123136457, "loss": 2.8391019821166994, "step": 22880 }, { "epoch": 0.8684772514571281, "grad_norm": 0.302734375, "learning_rate": 0.00024104276144686106, "loss": 2.8016197204589846, "step": 22890 }, { "epoch": 0.8688566648478914, "grad_norm": 0.298828125, "learning_rate": 0.00024092609708481363, "loss": 2.829619216918945, "step": 22900 }, { "epoch": 0.8692360782386547, "grad_norm": 0.3046875, "learning_rate": 0.00024080941818665886, "loss": 2.804649543762207, "step": 22910 }, { "epoch": 0.8696154916294179, "grad_norm": 0.296875, "learning_rate": 0.0002406927247938389, "loss": 2.809928321838379, "step": 22920 }, { "epoch": 0.8699949050201812, "grad_norm": 0.30078125, "learning_rate": 0.0002405760169478007, "loss": 2.804800796508789, "step": 22930 }, { "epoch": 0.8703743184109445, "grad_norm": 0.302734375, "learning_rate": 0.00024045929468999652, "loss": 2.819566535949707, "step": 22940 }, { "epoch": 0.8707537318017077, "grad_norm": 0.29296875, "learning_rate": 0.0002403425580618837, "loss": 2.8380720138549806, "step": 22950 }, { "epoch": 0.871133145192471, "grad_norm": 0.294921875, "learning_rate": 0.0002402258071049247, "loss": 2.7759305953979494, "step": 22960 }, { "epoch": 0.8715125585832343, "grad_norm": 0.294921875, "learning_rate": 0.00024010904186058706, "loss": 2.8126598358154298, "step": 22970 }, { "epoch": 0.8718919719739975, "grad_norm": 0.302734375, "learning_rate": 0.00023999226237034335, "loss": 2.831007194519043, "step": 22980 }, { "epoch": 0.8722713853647608, "grad_norm": 0.29296875, "learning_rate": 0.00023987546867567131, "loss": 2.801127815246582, "step": 22990 }, { "epoch": 0.8726507987555241, "grad_norm": 0.296875, "learning_rate": 0.0002397586608180536, "loss": 2.823965072631836, "step": 23000 }, { "epoch": 0.8726507987555241, "eval_loss": 2.811549186706543, "eval_runtime": 189.4571, "eval_samples_per_second": 20.105, "eval_steps_per_second": 3.352, "step": 23000 }, { "epoch": 0.8730302121462874, "grad_norm": 0.296875, "learning_rate": 0.00023964183883897798, "loss": 2.7849313735961916, "step": 23010 }, { "epoch": 0.8734096255370506, "grad_norm": 0.298828125, "learning_rate": 0.00023952500277993718, "loss": 2.823659896850586, "step": 23020 }, { "epoch": 0.8737890389278139, "grad_norm": 0.296875, "learning_rate": 0.00023940815268242905, "loss": 2.8423078536987303, "step": 23030 }, { "epoch": 0.8741684523185772, "grad_norm": 0.322265625, "learning_rate": 0.0002392912885879563, "loss": 2.8434043884277345, "step": 23040 }, { "epoch": 0.8745478657093404, "grad_norm": 0.296875, "learning_rate": 0.0002391744105380266, "loss": 2.828703689575195, "step": 23050 }, { "epoch": 0.8749272791001037, "grad_norm": 0.30078125, "learning_rate": 0.00023905751857415268, "loss": 2.856952095031738, "step": 23060 }, { "epoch": 0.875306692490867, "grad_norm": 0.287109375, "learning_rate": 0.00023894061273785214, "loss": 2.8098730087280273, "step": 23070 }, { "epoch": 0.8756861058816302, "grad_norm": 0.306640625, "learning_rate": 0.00023882369307064757, "loss": 2.816156005859375, "step": 23080 }, { "epoch": 0.8760655192723935, "grad_norm": 0.318359375, "learning_rate": 0.00023870675961406632, "loss": 2.782734680175781, "step": 23090 }, { "epoch": 0.8764449326631568, "grad_norm": 0.296875, "learning_rate": 0.00023858981240964088, "loss": 2.8108755111694337, "step": 23100 }, { "epoch": 0.87682434605392, "grad_norm": 0.287109375, "learning_rate": 0.0002384728514989084, "loss": 2.812100410461426, "step": 23110 }, { "epoch": 0.8772037594446833, "grad_norm": 0.291015625, "learning_rate": 0.00023835587692341105, "loss": 2.8306970596313477, "step": 23120 }, { "epoch": 0.8775831728354466, "grad_norm": 0.298828125, "learning_rate": 0.00023823888872469572, "loss": 2.852703666687012, "step": 23130 }, { "epoch": 0.8779625862262099, "grad_norm": 0.294921875, "learning_rate": 0.0002381218869443143, "loss": 2.806318664550781, "step": 23140 }, { "epoch": 0.8783419996169731, "grad_norm": 0.3046875, "learning_rate": 0.00023800487162382327, "loss": 2.803704833984375, "step": 23150 }, { "epoch": 0.8787214130077364, "grad_norm": 0.302734375, "learning_rate": 0.00023788784280478424, "loss": 2.836885452270508, "step": 23160 }, { "epoch": 0.8791008263984997, "grad_norm": 0.298828125, "learning_rate": 0.00023777080052876328, "loss": 2.8114973068237306, "step": 23170 }, { "epoch": 0.8794802397892629, "grad_norm": 0.28515625, "learning_rate": 0.00023765374483733154, "loss": 2.819234275817871, "step": 23180 }, { "epoch": 0.8798596531800262, "grad_norm": 0.287109375, "learning_rate": 0.0002375366757720647, "loss": 2.832800102233887, "step": 23190 }, { "epoch": 0.8802390665707895, "grad_norm": 0.291015625, "learning_rate": 0.00023741959337454324, "loss": 2.8151235580444336, "step": 23200 }, { "epoch": 0.8806184799615527, "grad_norm": 0.2890625, "learning_rate": 0.00023730249768635258, "loss": 2.818841552734375, "step": 23210 }, { "epoch": 0.880997893352316, "grad_norm": 0.310546875, "learning_rate": 0.00023718538874908257, "loss": 2.8148921966552733, "step": 23220 }, { "epoch": 0.8813773067430793, "grad_norm": 0.3046875, "learning_rate": 0.00023706826660432792, "loss": 2.7946456909179687, "step": 23230 }, { "epoch": 0.8817567201338425, "grad_norm": 0.298828125, "learning_rate": 0.00023695113129368797, "loss": 2.811196136474609, "step": 23240 }, { "epoch": 0.8821361335246058, "grad_norm": 0.28515625, "learning_rate": 0.00023683398285876685, "loss": 2.8191627502441405, "step": 23250 }, { "epoch": 0.8821361335246058, "eval_loss": 2.810842275619507, "eval_runtime": 189.542, "eval_samples_per_second": 20.096, "eval_steps_per_second": 3.35, "step": 23250 }, { "epoch": 0.8825155469153692, "grad_norm": 0.2890625, "learning_rate": 0.00023671682134117328, "loss": 2.82583065032959, "step": 23260 }, { "epoch": 0.8828949603061325, "grad_norm": 0.298828125, "learning_rate": 0.0002365996467825205, "loss": 2.8098333358764647, "step": 23270 }, { "epoch": 0.8832743736968957, "grad_norm": 0.287109375, "learning_rate": 0.00023648245922442657, "loss": 2.8001239776611326, "step": 23280 }, { "epoch": 0.883653787087659, "grad_norm": 0.29296875, "learning_rate": 0.0002363652587085141, "loss": 2.825997734069824, "step": 23290 }, { "epoch": 0.8840332004784223, "grad_norm": 0.28515625, "learning_rate": 0.00023624804527641033, "loss": 2.8214107513427735, "step": 23300 }, { "epoch": 0.8844126138691855, "grad_norm": 0.30078125, "learning_rate": 0.00023613081896974706, "loss": 2.825693893432617, "step": 23310 }, { "epoch": 0.8847920272599488, "grad_norm": 0.291015625, "learning_rate": 0.0002360135798301606, "loss": 2.8077987670898437, "step": 23320 }, { "epoch": 0.8851714406507121, "grad_norm": 0.318359375, "learning_rate": 0.00023589632789929187, "loss": 2.7901525497436523, "step": 23330 }, { "epoch": 0.8855508540414753, "grad_norm": 0.296875, "learning_rate": 0.00023577906321878644, "loss": 2.8577621459960936, "step": 23340 }, { "epoch": 0.8859302674322386, "grad_norm": 0.298828125, "learning_rate": 0.00023566178583029425, "loss": 2.828934097290039, "step": 23350 }, { "epoch": 0.8863096808230019, "grad_norm": 0.30078125, "learning_rate": 0.00023554449577546977, "loss": 2.8062675476074217, "step": 23360 }, { "epoch": 0.8866890942137651, "grad_norm": 0.3125, "learning_rate": 0.00023542719309597207, "loss": 2.834072494506836, "step": 23370 }, { "epoch": 0.8870685076045284, "grad_norm": 0.298828125, "learning_rate": 0.00023530987783346467, "loss": 2.8184696197509767, "step": 23380 }, { "epoch": 0.8874479209952917, "grad_norm": 0.28515625, "learning_rate": 0.00023519255002961548, "loss": 2.8377567291259767, "step": 23390 }, { "epoch": 0.8878273343860549, "grad_norm": 0.291015625, "learning_rate": 0.00023507520972609696, "loss": 2.811074447631836, "step": 23400 }, { "epoch": 0.8882067477768182, "grad_norm": 0.291015625, "learning_rate": 0.00023495785696458596, "loss": 2.797775077819824, "step": 23410 }, { "epoch": 0.8885861611675815, "grad_norm": 0.31640625, "learning_rate": 0.0002348404917867637, "loss": 2.8000532150268556, "step": 23420 }, { "epoch": 0.8889655745583448, "grad_norm": 0.28125, "learning_rate": 0.00023472311423431604, "loss": 2.8025671005249024, "step": 23430 }, { "epoch": 0.889344987949108, "grad_norm": 0.29296875, "learning_rate": 0.0002346057243489329, "loss": 2.8018653869628904, "step": 23440 }, { "epoch": 0.8897244013398713, "grad_norm": 0.29296875, "learning_rate": 0.00023448832217230883, "loss": 2.8318992614746095, "step": 23450 }, { "epoch": 0.8901038147306346, "grad_norm": 0.29296875, "learning_rate": 0.00023437090774614268, "loss": 2.818352699279785, "step": 23460 }, { "epoch": 0.8904832281213978, "grad_norm": 0.29296875, "learning_rate": 0.00023425348111213756, "loss": 2.8379709243774416, "step": 23470 }, { "epoch": 0.8908626415121611, "grad_norm": 0.302734375, "learning_rate": 0.00023413604231200107, "loss": 2.8054901123046876, "step": 23480 }, { "epoch": 0.8912420549029244, "grad_norm": 0.294921875, "learning_rate": 0.000234018591387445, "loss": 2.8163021087646483, "step": 23490 }, { "epoch": 0.8916214682936876, "grad_norm": 0.298828125, "learning_rate": 0.00023390112838018556, "loss": 2.8037750244140627, "step": 23500 }, { "epoch": 0.8916214682936876, "eval_loss": 2.8092968463897705, "eval_runtime": 189.5822, "eval_samples_per_second": 20.092, "eval_steps_per_second": 3.349, "step": 23500 }, { "epoch": 0.8920008816844509, "grad_norm": 0.3046875, "learning_rate": 0.00023378365333194317, "loss": 2.801365280151367, "step": 23510 }, { "epoch": 0.8923802950752142, "grad_norm": 0.318359375, "learning_rate": 0.0002336661662844425, "loss": 2.8116554260253905, "step": 23520 }, { "epoch": 0.8927597084659774, "grad_norm": 0.287109375, "learning_rate": 0.0002335486672794126, "loss": 2.800572395324707, "step": 23530 }, { "epoch": 0.8931391218567407, "grad_norm": 0.294921875, "learning_rate": 0.0002334311563585867, "loss": 2.8355554580688476, "step": 23540 }, { "epoch": 0.893518535247504, "grad_norm": 0.314453125, "learning_rate": 0.0002333136335637022, "loss": 2.8275257110595704, "step": 23550 }, { "epoch": 0.8938979486382673, "grad_norm": 0.30859375, "learning_rate": 0.00023319609893650078, "loss": 2.838644599914551, "step": 23560 }, { "epoch": 0.8942773620290305, "grad_norm": 0.28515625, "learning_rate": 0.00023307855251872837, "loss": 2.8086238861083985, "step": 23570 }, { "epoch": 0.8946567754197938, "grad_norm": 0.30078125, "learning_rate": 0.00023296099435213502, "loss": 2.811561393737793, "step": 23580 }, { "epoch": 0.8950361888105571, "grad_norm": 0.298828125, "learning_rate": 0.00023284342447847496, "loss": 2.7996482849121094, "step": 23590 }, { "epoch": 0.8954156022013203, "grad_norm": 0.3046875, "learning_rate": 0.00023272584293950659, "loss": 2.8210895538330076, "step": 23600 }, { "epoch": 0.8957950155920836, "grad_norm": 0.287109375, "learning_rate": 0.00023260824977699244, "loss": 2.8487382888793946, "step": 23610 }, { "epoch": 0.8961744289828469, "grad_norm": 0.294921875, "learning_rate": 0.00023249064503269923, "loss": 2.835400390625, "step": 23620 }, { "epoch": 0.8965538423736101, "grad_norm": 0.302734375, "learning_rate": 0.0002323730287483977, "loss": 2.7942537307739257, "step": 23630 }, { "epoch": 0.8969332557643734, "grad_norm": 0.296875, "learning_rate": 0.0002322554009658627, "loss": 2.788120079040527, "step": 23640 }, { "epoch": 0.8973126691551367, "grad_norm": 0.294921875, "learning_rate": 0.00023213776172687335, "loss": 2.799386215209961, "step": 23650 }, { "epoch": 0.8976920825458999, "grad_norm": 0.287109375, "learning_rate": 0.00023202011107321252, "loss": 2.781049346923828, "step": 23660 }, { "epoch": 0.8980714959366632, "grad_norm": 0.291015625, "learning_rate": 0.00023190244904666734, "loss": 2.8106687545776365, "step": 23670 }, { "epoch": 0.8984509093274266, "grad_norm": 0.296875, "learning_rate": 0.00023178477568902898, "loss": 2.804153823852539, "step": 23680 }, { "epoch": 0.8988303227181899, "grad_norm": 0.29296875, "learning_rate": 0.00023166709104209256, "loss": 2.815953254699707, "step": 23690 }, { "epoch": 0.899209736108953, "grad_norm": 0.294921875, "learning_rate": 0.00023154939514765727, "loss": 2.8042551040649415, "step": 23700 }, { "epoch": 0.8995891494997164, "grad_norm": 0.302734375, "learning_rate": 0.00023143168804752625, "loss": 2.849088668823242, "step": 23710 }, { "epoch": 0.8999685628904797, "grad_norm": 0.283203125, "learning_rate": 0.00023131396978350665, "loss": 2.8071596145629885, "step": 23720 }, { "epoch": 0.9003479762812429, "grad_norm": 0.30078125, "learning_rate": 0.00023119624039740955, "loss": 2.8057788848876952, "step": 23730 }, { "epoch": 0.9007273896720062, "grad_norm": 0.298828125, "learning_rate": 0.00023107849993105002, "loss": 2.789341163635254, "step": 23740 }, { "epoch": 0.9011068030627695, "grad_norm": 0.291015625, "learning_rate": 0.00023096074842624708, "loss": 2.813013458251953, "step": 23750 }, { "epoch": 0.9011068030627695, "eval_loss": 2.807892084121704, "eval_runtime": 240.706, "eval_samples_per_second": 15.824, "eval_steps_per_second": 2.638, "step": 23750 }, { "epoch": 0.9014862164535327, "grad_norm": 0.2890625, "learning_rate": 0.00023084298592482362, "loss": 2.8018653869628904, "step": 23760 }, { "epoch": 0.901865629844296, "grad_norm": 0.296875, "learning_rate": 0.00023072521246860639, "loss": 2.8199172973632813, "step": 23770 }, { "epoch": 0.9022450432350593, "grad_norm": 0.294921875, "learning_rate": 0.00023060742809942624, "loss": 2.825362968444824, "step": 23780 }, { "epoch": 0.9026244566258225, "grad_norm": 0.298828125, "learning_rate": 0.0002304896328591176, "loss": 2.796553039550781, "step": 23790 }, { "epoch": 0.9030038700165858, "grad_norm": 0.294921875, "learning_rate": 0.000230371826789519, "loss": 2.8205041885375977, "step": 23800 }, { "epoch": 0.9033832834073491, "grad_norm": 0.302734375, "learning_rate": 0.00023025400993247278, "loss": 2.8137434005737303, "step": 23810 }, { "epoch": 0.9037626967981123, "grad_norm": 0.302734375, "learning_rate": 0.0002301361823298249, "loss": 2.782440757751465, "step": 23820 }, { "epoch": 0.9041421101888756, "grad_norm": 0.28125, "learning_rate": 0.00023001834402342552, "loss": 2.811214256286621, "step": 23830 }, { "epoch": 0.9045215235796389, "grad_norm": 0.287109375, "learning_rate": 0.00022990049505512813, "loss": 2.8088314056396486, "step": 23840 }, { "epoch": 0.9049009369704022, "grad_norm": 0.296875, "learning_rate": 0.00022978263546679045, "loss": 2.83048038482666, "step": 23850 }, { "epoch": 0.9052803503611654, "grad_norm": 0.2890625, "learning_rate": 0.00022966476530027373, "loss": 2.8388458251953126, "step": 23860 }, { "epoch": 0.9056597637519287, "grad_norm": 0.318359375, "learning_rate": 0.00022954688459744303, "loss": 2.8092269897460938, "step": 23870 }, { "epoch": 0.906039177142692, "grad_norm": 0.302734375, "learning_rate": 0.0002294289934001671, "loss": 2.8241737365722654, "step": 23880 }, { "epoch": 0.9064185905334552, "grad_norm": 0.287109375, "learning_rate": 0.0002293110917503185, "loss": 2.7793020248413085, "step": 23890 }, { "epoch": 0.9067980039242185, "grad_norm": 0.298828125, "learning_rate": 0.00022919317968977346, "loss": 2.802753448486328, "step": 23900 }, { "epoch": 0.9071774173149818, "grad_norm": 0.294921875, "learning_rate": 0.00022907525726041192, "loss": 2.82403564453125, "step": 23910 }, { "epoch": 0.907556830705745, "grad_norm": 0.291015625, "learning_rate": 0.00022895732450411754, "loss": 2.826273727416992, "step": 23920 }, { "epoch": 0.9079362440965083, "grad_norm": 0.306640625, "learning_rate": 0.0002288393814627775, "loss": 2.815776062011719, "step": 23930 }, { "epoch": 0.9083156574872716, "grad_norm": 0.291015625, "learning_rate": 0.00022872142817828293, "loss": 2.7796201705932617, "step": 23940 }, { "epoch": 0.9086950708780348, "grad_norm": 0.302734375, "learning_rate": 0.00022860346469252818, "loss": 2.820523262023926, "step": 23950 }, { "epoch": 0.9090744842687981, "grad_norm": 0.318359375, "learning_rate": 0.00022848549104741164, "loss": 2.8037309646606445, "step": 23960 }, { "epoch": 0.9094538976595614, "grad_norm": 0.302734375, "learning_rate": 0.00022836750728483503, "loss": 2.782582092285156, "step": 23970 }, { "epoch": 0.9098333110503247, "grad_norm": 0.296875, "learning_rate": 0.00022824951344670372, "loss": 2.798220634460449, "step": 23980 }, { "epoch": 0.9102127244410879, "grad_norm": 0.29296875, "learning_rate": 0.00022813150957492678, "loss": 2.8098608016967774, "step": 23990 }, { "epoch": 0.9105921378318512, "grad_norm": 0.29296875, "learning_rate": 0.0002280134957114167, "loss": 2.8130760192871094, "step": 24000 }, { "epoch": 0.9105921378318512, "eval_loss": 2.806929588317871, "eval_runtime": 188.1752, "eval_samples_per_second": 20.242, "eval_steps_per_second": 3.375, "step": 24000 }, { "epoch": 0.9109715512226145, "grad_norm": 0.30078125, "learning_rate": 0.0002278954718980896, "loss": 2.834333610534668, "step": 24010 }, { "epoch": 0.9113509646133777, "grad_norm": 0.298828125, "learning_rate": 0.00022777743817686504, "loss": 2.8338529586791994, "step": 24020 }, { "epoch": 0.911730378004141, "grad_norm": 0.310546875, "learning_rate": 0.00022765939458966634, "loss": 2.8201663970947264, "step": 24030 }, { "epoch": 0.9121097913949043, "grad_norm": 0.287109375, "learning_rate": 0.00022754134117841993, "loss": 2.819229507446289, "step": 24040 }, { "epoch": 0.9124892047856675, "grad_norm": 0.287109375, "learning_rate": 0.00022742327798505612, "loss": 2.812160301208496, "step": 24050 }, { "epoch": 0.9128686181764308, "grad_norm": 0.291015625, "learning_rate": 0.00022730520505150845, "loss": 2.8293107986450194, "step": 24060 }, { "epoch": 0.9132480315671941, "grad_norm": 0.29296875, "learning_rate": 0.00022718712241971405, "loss": 2.8363576889038087, "step": 24070 }, { "epoch": 0.9136274449579573, "grad_norm": 0.314453125, "learning_rate": 0.0002270690301316134, "loss": 2.8148921966552733, "step": 24080 }, { "epoch": 0.9140068583487206, "grad_norm": 0.314453125, "learning_rate": 0.00022695092822915038, "loss": 2.80274658203125, "step": 24090 }, { "epoch": 0.914386271739484, "grad_norm": 0.30078125, "learning_rate": 0.0002268328167542725, "loss": 2.846920394897461, "step": 24100 }, { "epoch": 0.9147656851302471, "grad_norm": 0.30078125, "learning_rate": 0.00022671469574893043, "loss": 2.809992218017578, "step": 24110 }, { "epoch": 0.9151450985210104, "grad_norm": 0.287109375, "learning_rate": 0.00022659656525507834, "loss": 2.8345056533813477, "step": 24120 }, { "epoch": 0.9155245119117738, "grad_norm": 0.2890625, "learning_rate": 0.0002264784253146737, "loss": 2.823945999145508, "step": 24130 }, { "epoch": 0.9159039253025371, "grad_norm": 0.294921875, "learning_rate": 0.00022636027596967755, "loss": 2.8021417617797852, "step": 24140 }, { "epoch": 0.9162833386933003, "grad_norm": 0.29296875, "learning_rate": 0.00022624211726205386, "loss": 2.792042541503906, "step": 24150 }, { "epoch": 0.9166627520840636, "grad_norm": 0.287109375, "learning_rate": 0.00022612394923377042, "loss": 2.818480682373047, "step": 24160 }, { "epoch": 0.9170421654748269, "grad_norm": 0.29296875, "learning_rate": 0.00022600577192679792, "loss": 2.821940231323242, "step": 24170 }, { "epoch": 0.9174215788655901, "grad_norm": 0.294921875, "learning_rate": 0.00022588758538311053, "loss": 2.8114500045776367, "step": 24180 }, { "epoch": 0.9178009922563534, "grad_norm": 0.291015625, "learning_rate": 0.00022576938964468575, "loss": 2.826885795593262, "step": 24190 }, { "epoch": 0.9181804056471167, "grad_norm": 0.291015625, "learning_rate": 0.00022565118475350416, "loss": 2.829610252380371, "step": 24200 }, { "epoch": 0.9185598190378799, "grad_norm": 0.291015625, "learning_rate": 0.0002255329707515498, "loss": 2.7876413345336912, "step": 24210 }, { "epoch": 0.9189392324286432, "grad_norm": 0.29296875, "learning_rate": 0.00022541474768080977, "loss": 2.8143085479736327, "step": 24220 }, { "epoch": 0.9193186458194065, "grad_norm": 0.2890625, "learning_rate": 0.0002252965155832746, "loss": 2.800698661804199, "step": 24230 }, { "epoch": 0.9196980592101697, "grad_norm": 0.2890625, "learning_rate": 0.00022517827450093775, "loss": 2.8335933685302734, "step": 24240 }, { "epoch": 0.920077472600933, "grad_norm": 0.306640625, "learning_rate": 0.0002250600244757961, "loss": 2.7869577407836914, "step": 24250 }, { "epoch": 0.920077472600933, "eval_loss": 2.8055012226104736, "eval_runtime": 228.1026, "eval_samples_per_second": 16.699, "eval_steps_per_second": 2.784, "step": 24250 }, { "epoch": 0.9204568859916963, "grad_norm": 0.302734375, "learning_rate": 0.00022494176554984957, "loss": 2.818916130065918, "step": 24260 }, { "epoch": 0.9208362993824596, "grad_norm": 0.3125, "learning_rate": 0.0002248234977651014, "loss": 2.820851135253906, "step": 24270 }, { "epoch": 0.9212157127732228, "grad_norm": 0.29296875, "learning_rate": 0.0002247052211635578, "loss": 2.8262798309326174, "step": 24280 }, { "epoch": 0.9215951261639861, "grad_norm": 0.298828125, "learning_rate": 0.00022458693578722815, "loss": 2.786672592163086, "step": 24290 }, { "epoch": 0.9219745395547494, "grad_norm": 0.294921875, "learning_rate": 0.00022446864167812507, "loss": 2.8349817276000975, "step": 24300 }, { "epoch": 0.9223539529455126, "grad_norm": 0.2890625, "learning_rate": 0.00022435033887826405, "loss": 2.7938411712646483, "step": 24310 }, { "epoch": 0.9227333663362759, "grad_norm": 0.296875, "learning_rate": 0.000224232027429664, "loss": 2.8359397888183593, "step": 24320 }, { "epoch": 0.9231127797270392, "grad_norm": 0.29296875, "learning_rate": 0.00022411370737434653, "loss": 2.7884441375732423, "step": 24330 }, { "epoch": 0.9234921931178024, "grad_norm": 0.3046875, "learning_rate": 0.00022399537875433665, "loss": 2.7730785369873048, "step": 24340 }, { "epoch": 0.9238716065085657, "grad_norm": 0.3046875, "learning_rate": 0.00022387704161166215, "loss": 2.793855667114258, "step": 24350 }, { "epoch": 0.924251019899329, "grad_norm": 0.31640625, "learning_rate": 0.00022375869598835398, "loss": 2.7928672790527345, "step": 24360 }, { "epoch": 0.9246304332900922, "grad_norm": 0.3125, "learning_rate": 0.00022364034192644603, "loss": 2.7675395965576173, "step": 24370 }, { "epoch": 0.9250098466808555, "grad_norm": 0.294921875, "learning_rate": 0.00022352197946797522, "loss": 2.8061586380004884, "step": 24380 }, { "epoch": 0.9253892600716188, "grad_norm": 0.27734375, "learning_rate": 0.00022340360865498156, "loss": 2.836260223388672, "step": 24390 }, { "epoch": 0.9257686734623821, "grad_norm": 0.298828125, "learning_rate": 0.0002232852295295078, "loss": 2.8436647415161134, "step": 24400 }, { "epoch": 0.9261480868531453, "grad_norm": 0.302734375, "learning_rate": 0.0002231668421335999, "loss": 2.805807113647461, "step": 24410 }, { "epoch": 0.9265275002439086, "grad_norm": 0.29296875, "learning_rate": 0.00022304844650930652, "loss": 2.8382390975952148, "step": 24420 }, { "epoch": 0.9269069136346719, "grad_norm": 0.31640625, "learning_rate": 0.0002229300426986794, "loss": 2.8315176010131835, "step": 24430 }, { "epoch": 0.9272863270254351, "grad_norm": 0.29296875, "learning_rate": 0.00022281163074377308, "loss": 2.8369979858398438, "step": 24440 }, { "epoch": 0.9276657404161984, "grad_norm": 0.291015625, "learning_rate": 0.00022269321068664515, "loss": 2.8068370819091797, "step": 24450 }, { "epoch": 0.9280451538069617, "grad_norm": 0.29296875, "learning_rate": 0.00022257478256935582, "loss": 2.8379735946655273, "step": 24460 }, { "epoch": 0.9284245671977249, "grad_norm": 0.298828125, "learning_rate": 0.00022245634643396853, "loss": 2.8192529678344727, "step": 24470 }, { "epoch": 0.9288039805884882, "grad_norm": 0.310546875, "learning_rate": 0.0002223379023225492, "loss": 2.8332033157348633, "step": 24480 }, { "epoch": 0.9291833939792515, "grad_norm": 0.3046875, "learning_rate": 0.00022221945027716674, "loss": 2.809197425842285, "step": 24490 }, { "epoch": 0.9295628073700147, "grad_norm": 0.326171875, "learning_rate": 0.00022210099033989297, "loss": 2.8035215377807616, "step": 24500 }, { "epoch": 0.9295628073700147, "eval_loss": 2.8044991493225098, "eval_runtime": 190.3289, "eval_samples_per_second": 20.013, "eval_steps_per_second": 3.336, "step": 24500 }, { "epoch": 0.929942220760778, "grad_norm": 0.291015625, "learning_rate": 0.00022198252255280232, "loss": 2.8216976165771483, "step": 24510 }, { "epoch": 0.9303216341515413, "grad_norm": 0.291015625, "learning_rate": 0.00022186404695797226, "loss": 2.8334983825683593, "step": 24520 }, { "epoch": 0.9307010475423045, "grad_norm": 0.296875, "learning_rate": 0.00022174556359748267, "loss": 2.838593864440918, "step": 24530 }, { "epoch": 0.9310804609330678, "grad_norm": 0.294921875, "learning_rate": 0.0002216270725134166, "loss": 2.8050020217895506, "step": 24540 }, { "epoch": 0.9314598743238311, "grad_norm": 0.28515625, "learning_rate": 0.0002215085737478595, "loss": 2.8331624984741213, "step": 24550 }, { "epoch": 0.9318392877145945, "grad_norm": 0.30078125, "learning_rate": 0.00022139006734289978, "loss": 2.7845001220703125, "step": 24560 }, { "epoch": 0.9322187011053576, "grad_norm": 0.28515625, "learning_rate": 0.00022127155334062838, "loss": 2.8292022705078126, "step": 24570 }, { "epoch": 0.932598114496121, "grad_norm": 0.2890625, "learning_rate": 0.00022115303178313916, "loss": 2.8090740203857423, "step": 24580 }, { "epoch": 0.9329775278868843, "grad_norm": 0.322265625, "learning_rate": 0.00022103450271252856, "loss": 2.791348457336426, "step": 24590 }, { "epoch": 0.9333569412776475, "grad_norm": 0.298828125, "learning_rate": 0.00022091596617089545, "loss": 2.77685489654541, "step": 24600 }, { "epoch": 0.9337363546684108, "grad_norm": 0.302734375, "learning_rate": 0.00022079742220034175, "loss": 2.796299171447754, "step": 24610 }, { "epoch": 0.9341157680591741, "grad_norm": 0.287109375, "learning_rate": 0.00022067887084297182, "loss": 2.7972578048706054, "step": 24620 }, { "epoch": 0.9344951814499373, "grad_norm": 0.279296875, "learning_rate": 0.0002205603121408926, "loss": 2.8038463592529297, "step": 24630 }, { "epoch": 0.9348745948407006, "grad_norm": 0.2890625, "learning_rate": 0.00022044174613621373, "loss": 2.796761894226074, "step": 24640 }, { "epoch": 0.9352540082314639, "grad_norm": 0.283203125, "learning_rate": 0.00022032317287104744, "loss": 2.7755054473876952, "step": 24650 }, { "epoch": 0.9356334216222271, "grad_norm": 0.28515625, "learning_rate": 0.00022020459238750845, "loss": 2.8176809310913087, "step": 24660 }, { "epoch": 0.9360128350129904, "grad_norm": 0.2890625, "learning_rate": 0.00022008600472771418, "loss": 2.8378005981445313, "step": 24670 }, { "epoch": 0.9363922484037537, "grad_norm": 0.30078125, "learning_rate": 0.0002199674099337845, "loss": 2.7815645217895506, "step": 24680 }, { "epoch": 0.936771661794517, "grad_norm": 0.298828125, "learning_rate": 0.00021984880804784185, "loss": 2.8148691177368166, "step": 24690 }, { "epoch": 0.9371510751852802, "grad_norm": 0.294921875, "learning_rate": 0.00021973019911201112, "loss": 2.827803039550781, "step": 24700 }, { "epoch": 0.9375304885760435, "grad_norm": 0.29296875, "learning_rate": 0.00021961158316841979, "loss": 2.790256118774414, "step": 24710 }, { "epoch": 0.9379099019668068, "grad_norm": 0.294921875, "learning_rate": 0.00021949296025919787, "loss": 2.800267219543457, "step": 24720 }, { "epoch": 0.93828931535757, "grad_norm": 0.32421875, "learning_rate": 0.0002193743304264777, "loss": 2.778347206115723, "step": 24730 }, { "epoch": 0.9386687287483333, "grad_norm": 0.3125, "learning_rate": 0.00021925569371239418, "loss": 2.840124320983887, "step": 24740 }, { "epoch": 0.9390481421390966, "grad_norm": 0.30859375, "learning_rate": 0.0002191370501590846, "loss": 2.7858560562133787, "step": 24750 }, { "epoch": 0.9390481421390966, "eval_loss": 2.8035686016082764, "eval_runtime": 189.2981, "eval_samples_per_second": 20.122, "eval_steps_per_second": 3.354, "step": 24750 }, { "epoch": 0.9394275555298598, "grad_norm": 0.283203125, "learning_rate": 0.00021901839980868876, "loss": 2.804863929748535, "step": 24760 }, { "epoch": 0.9398069689206231, "grad_norm": 0.29296875, "learning_rate": 0.00021889974270334874, "loss": 2.7924448013305665, "step": 24770 }, { "epoch": 0.9401863823113864, "grad_norm": 0.30078125, "learning_rate": 0.00021878107888520924, "loss": 2.796219253540039, "step": 24780 }, { "epoch": 0.9405657957021496, "grad_norm": 0.302734375, "learning_rate": 0.0002186624083964171, "loss": 2.8135498046875, "step": 24790 }, { "epoch": 0.9409452090929129, "grad_norm": 0.30078125, "learning_rate": 0.00021854373127912165, "loss": 2.8110305786132814, "step": 24800 }, { "epoch": 0.9413246224836762, "grad_norm": 0.30078125, "learning_rate": 0.00021842504757547455, "loss": 2.828930473327637, "step": 24810 }, { "epoch": 0.9417040358744395, "grad_norm": 0.298828125, "learning_rate": 0.00021830635732762979, "loss": 2.7909011840820312, "step": 24820 }, { "epoch": 0.9420834492652027, "grad_norm": 0.314453125, "learning_rate": 0.0002181876605777438, "loss": 2.81960506439209, "step": 24830 }, { "epoch": 0.942462862655966, "grad_norm": 0.29296875, "learning_rate": 0.0002180689573679751, "loss": 2.805502510070801, "step": 24840 }, { "epoch": 0.9428422760467293, "grad_norm": 0.302734375, "learning_rate": 0.00021795024774048477, "loss": 2.8319076538085937, "step": 24850 }, { "epoch": 0.9432216894374925, "grad_norm": 0.302734375, "learning_rate": 0.00021783153173743586, "loss": 2.792451286315918, "step": 24860 }, { "epoch": 0.9436011028282558, "grad_norm": 0.291015625, "learning_rate": 0.00021771280940099396, "loss": 2.800520896911621, "step": 24870 }, { "epoch": 0.9439805162190191, "grad_norm": 0.330078125, "learning_rate": 0.00021759408077332677, "loss": 2.829667663574219, "step": 24880 }, { "epoch": 0.9443599296097823, "grad_norm": 0.3203125, "learning_rate": 0.00021747534589660426, "loss": 2.819064140319824, "step": 24890 }, { "epoch": 0.9447393430005456, "grad_norm": 0.2890625, "learning_rate": 0.00021735660481299863, "loss": 2.815561294555664, "step": 24900 }, { "epoch": 0.9451187563913089, "grad_norm": 0.287109375, "learning_rate": 0.0002172378575646842, "loss": 2.811493682861328, "step": 24910 }, { "epoch": 0.9454981697820721, "grad_norm": 0.298828125, "learning_rate": 0.0002171191041938377, "loss": 2.799086570739746, "step": 24920 }, { "epoch": 0.9458775831728354, "grad_norm": 0.30078125, "learning_rate": 0.00021700034474263764, "loss": 2.8108877182006835, "step": 24930 }, { "epoch": 0.9462569965635987, "grad_norm": 0.291015625, "learning_rate": 0.00021688157925326516, "loss": 2.800168037414551, "step": 24940 }, { "epoch": 0.9466364099543619, "grad_norm": 0.298828125, "learning_rate": 0.0002167628077679032, "loss": 2.818270683288574, "step": 24950 }, { "epoch": 0.9470158233451252, "grad_norm": 0.298828125, "learning_rate": 0.00021664403032873697, "loss": 2.800080108642578, "step": 24960 }, { "epoch": 0.9473952367358885, "grad_norm": 0.294921875, "learning_rate": 0.0002165252469779538, "loss": 2.823975372314453, "step": 24970 }, { "epoch": 0.9477746501266519, "grad_norm": 0.31640625, "learning_rate": 0.00021640645775774308, "loss": 2.7912038803100585, "step": 24980 }, { "epoch": 0.948154063517415, "grad_norm": 0.2890625, "learning_rate": 0.0002162876627102963, "loss": 2.8072330474853517, "step": 24990 }, { "epoch": 0.9485334769081784, "grad_norm": 0.306640625, "learning_rate": 0.00021616886187780692, "loss": 2.8282211303710936, "step": 25000 }, { "epoch": 0.9485334769081784, "eval_loss": 2.8023314476013184, "eval_runtime": 189.2607, "eval_samples_per_second": 20.126, "eval_steps_per_second": 3.355, "step": 25000 }, { "epoch": 0.9489128902989417, "grad_norm": 0.29296875, "learning_rate": 0.00021605005530247069, "loss": 2.78975830078125, "step": 25010 }, { "epoch": 0.9492923036897049, "grad_norm": 0.306640625, "learning_rate": 0.00021593124302648518, "loss": 2.770734977722168, "step": 25020 }, { "epoch": 0.9496717170804682, "grad_norm": 0.30078125, "learning_rate": 0.00021581242509205014, "loss": 2.7679609298706054, "step": 25030 }, { "epoch": 0.9500511304712315, "grad_norm": 0.294921875, "learning_rate": 0.00021569360154136713, "loss": 2.785012626647949, "step": 25040 }, { "epoch": 0.9504305438619947, "grad_norm": 0.28515625, "learning_rate": 0.00021557477241663996, "loss": 2.8065185546875, "step": 25050 }, { "epoch": 0.950809957252758, "grad_norm": 0.28125, "learning_rate": 0.00021545593776007419, "loss": 2.816660499572754, "step": 25060 }, { "epoch": 0.9511893706435213, "grad_norm": 0.294921875, "learning_rate": 0.0002153370976138775, "loss": 2.8009050369262694, "step": 25070 }, { "epoch": 0.9515687840342845, "grad_norm": 0.302734375, "learning_rate": 0.00021521825202025938, "loss": 2.8189300537109374, "step": 25080 }, { "epoch": 0.9519481974250478, "grad_norm": 0.294921875, "learning_rate": 0.0002150994010214315, "loss": 2.838400650024414, "step": 25090 }, { "epoch": 0.9523276108158111, "grad_norm": 0.2890625, "learning_rate": 0.00021498054465960716, "loss": 2.8135093688964843, "step": 25100 }, { "epoch": 0.9527070242065744, "grad_norm": 0.296875, "learning_rate": 0.0002148616829770017, "loss": 2.826569366455078, "step": 25110 }, { "epoch": 0.9530864375973376, "grad_norm": 0.29296875, "learning_rate": 0.0002147428160158324, "loss": 2.7776342391967774, "step": 25120 }, { "epoch": 0.9534658509881009, "grad_norm": 0.294921875, "learning_rate": 0.0002146239438183183, "loss": 2.8110767364501954, "step": 25130 }, { "epoch": 0.9538452643788642, "grad_norm": 0.30078125, "learning_rate": 0.00021450506642668043, "loss": 2.8269105911254884, "step": 25140 }, { "epoch": 0.9542246777696274, "grad_norm": 0.298828125, "learning_rate": 0.0002143861838831415, "loss": 2.7642024993896483, "step": 25150 }, { "epoch": 0.9546040911603907, "grad_norm": 0.294921875, "learning_rate": 0.00021426729622992622, "loss": 2.804811477661133, "step": 25160 }, { "epoch": 0.954983504551154, "grad_norm": 0.291015625, "learning_rate": 0.00021414840350926094, "loss": 2.819792366027832, "step": 25170 }, { "epoch": 0.9553629179419172, "grad_norm": 0.28125, "learning_rate": 0.00021402950576337407, "loss": 2.8017240524291993, "step": 25180 }, { "epoch": 0.9557423313326805, "grad_norm": 0.296875, "learning_rate": 0.0002139106030344955, "loss": 2.8056591033935545, "step": 25190 }, { "epoch": 0.9561217447234438, "grad_norm": 0.296875, "learning_rate": 0.00021379169536485707, "loss": 2.774022102355957, "step": 25200 }, { "epoch": 0.956501158114207, "grad_norm": 0.291015625, "learning_rate": 0.00021367278279669236, "loss": 2.8005224227905274, "step": 25210 }, { "epoch": 0.9568805715049703, "grad_norm": 0.3203125, "learning_rate": 0.0002135538653722366, "loss": 2.813671684265137, "step": 25220 }, { "epoch": 0.9572599848957336, "grad_norm": 0.318359375, "learning_rate": 0.00021343494313372694, "loss": 2.7977697372436525, "step": 25230 }, { "epoch": 0.9576393982864969, "grad_norm": 0.29296875, "learning_rate": 0.00021331601612340197, "loss": 2.8195451736450194, "step": 25240 }, { "epoch": 0.9580188116772601, "grad_norm": 0.294921875, "learning_rate": 0.00021319708438350218, "loss": 2.801590347290039, "step": 25250 }, { "epoch": 0.9580188116772601, "eval_loss": 2.8032116889953613, "eval_runtime": 190.12, "eval_samples_per_second": 20.035, "eval_steps_per_second": 3.34, "step": 25250 }, { "epoch": 0.9583982250680234, "grad_norm": 0.30078125, "learning_rate": 0.0002130781479562697, "loss": 2.7867650985717773, "step": 25260 }, { "epoch": 0.9587776384587867, "grad_norm": 0.302734375, "learning_rate": 0.0002129592068839483, "loss": 2.806438446044922, "step": 25270 }, { "epoch": 0.9591570518495499, "grad_norm": 0.291015625, "learning_rate": 0.00021284026120878328, "loss": 2.800473213195801, "step": 25280 }, { "epoch": 0.9595364652403132, "grad_norm": 0.306640625, "learning_rate": 0.0002127213109730219, "loss": 2.8157548904418945, "step": 25290 }, { "epoch": 0.9599158786310765, "grad_norm": 0.2890625, "learning_rate": 0.00021260235621891268, "loss": 2.7999258041381836, "step": 25300 }, { "epoch": 0.9602952920218397, "grad_norm": 0.291015625, "learning_rate": 0.00021248339698870595, "loss": 2.781154251098633, "step": 25310 }, { "epoch": 0.960674705412603, "grad_norm": 0.29296875, "learning_rate": 0.0002123644333246536, "loss": 2.803557014465332, "step": 25320 }, { "epoch": 0.9610541188033663, "grad_norm": 0.306640625, "learning_rate": 0.000212245465269009, "loss": 2.771169662475586, "step": 25330 }, { "epoch": 0.9614335321941295, "grad_norm": 0.298828125, "learning_rate": 0.00021212649286402724, "loss": 2.803247833251953, "step": 25340 }, { "epoch": 0.9618129455848928, "grad_norm": 0.30078125, "learning_rate": 0.0002120075161519648, "loss": 2.8189264297485352, "step": 25350 }, { "epoch": 0.9621923589756561, "grad_norm": 0.30078125, "learning_rate": 0.00021188853517507982, "loss": 2.825995445251465, "step": 25360 }, { "epoch": 0.9625717723664193, "grad_norm": 0.2890625, "learning_rate": 0.00021176954997563182, "loss": 2.7851356506347655, "step": 25370 }, { "epoch": 0.9629511857571826, "grad_norm": 0.287109375, "learning_rate": 0.00021165056059588202, "loss": 2.7694272994995117, "step": 25380 }, { "epoch": 0.9633305991479459, "grad_norm": 0.3203125, "learning_rate": 0.00021153156707809286, "loss": 2.811687469482422, "step": 25390 }, { "epoch": 0.9637100125387092, "grad_norm": 0.279296875, "learning_rate": 0.00021141256946452844, "loss": 2.8107797622680666, "step": 25400 }, { "epoch": 0.9640894259294724, "grad_norm": 0.296875, "learning_rate": 0.00021129356779745436, "loss": 2.7885562896728517, "step": 25410 }, { "epoch": 0.9644688393202357, "grad_norm": 0.3046875, "learning_rate": 0.00021117456211913735, "loss": 2.790610122680664, "step": 25420 }, { "epoch": 0.964848252710999, "grad_norm": 0.3203125, "learning_rate": 0.000211055552471846, "loss": 2.7978002548217775, "step": 25430 }, { "epoch": 0.9652276661017622, "grad_norm": 0.2890625, "learning_rate": 0.00021093653889784993, "loss": 2.7816003799438476, "step": 25440 }, { "epoch": 0.9656070794925256, "grad_norm": 0.29296875, "learning_rate": 0.0002108175214394204, "loss": 2.802622604370117, "step": 25450 }, { "epoch": 0.9659864928832889, "grad_norm": 0.296875, "learning_rate": 0.00021069850013882987, "loss": 2.77524471282959, "step": 25460 }, { "epoch": 0.9663659062740521, "grad_norm": 0.298828125, "learning_rate": 0.00021057947503835237, "loss": 2.8099132537841798, "step": 25470 }, { "epoch": 0.9667453196648154, "grad_norm": 0.310546875, "learning_rate": 0.00021046044618026305, "loss": 2.8033498764038085, "step": 25480 }, { "epoch": 0.9671247330555787, "grad_norm": 0.291015625, "learning_rate": 0.00021034141360683868, "loss": 2.7808515548706056, "step": 25490 }, { "epoch": 0.9675041464463419, "grad_norm": 0.291015625, "learning_rate": 0.00021022237736035703, "loss": 2.813277816772461, "step": 25500 }, { "epoch": 0.9675041464463419, "eval_loss": 2.800165891647339, "eval_runtime": 189.8544, "eval_samples_per_second": 20.063, "eval_steps_per_second": 3.345, "step": 25500 }, { "epoch": 0.9678835598371052, "grad_norm": 0.29296875, "learning_rate": 0.00021010333748309733, "loss": 2.8114023208618164, "step": 25510 }, { "epoch": 0.9682629732278685, "grad_norm": 0.306640625, "learning_rate": 0.00020998429401734012, "loss": 2.7968488693237306, "step": 25520 }, { "epoch": 0.9686423866186318, "grad_norm": 0.302734375, "learning_rate": 0.0002098652470053672, "loss": 2.805044746398926, "step": 25530 }, { "epoch": 0.969021800009395, "grad_norm": 0.29296875, "learning_rate": 0.00020974619648946163, "loss": 2.7653804779052735, "step": 25540 }, { "epoch": 0.9694012134001583, "grad_norm": 0.298828125, "learning_rate": 0.0002096271425119077, "loss": 2.79105224609375, "step": 25550 }, { "epoch": 0.9697806267909216, "grad_norm": 0.310546875, "learning_rate": 0.00020950808511499092, "loss": 2.806291198730469, "step": 25560 }, { "epoch": 0.9701600401816848, "grad_norm": 0.291015625, "learning_rate": 0.00020938902434099803, "loss": 2.7906667709350588, "step": 25570 }, { "epoch": 0.9705394535724481, "grad_norm": 0.294921875, "learning_rate": 0.00020926996023221692, "loss": 2.8224308013916017, "step": 25580 }, { "epoch": 0.9709188669632114, "grad_norm": 0.291015625, "learning_rate": 0.00020915089283093683, "loss": 2.7958993911743164, "step": 25590 }, { "epoch": 0.9712982803539746, "grad_norm": 0.287109375, "learning_rate": 0.00020903182217944788, "loss": 2.821567153930664, "step": 25600 }, { "epoch": 0.9716776937447379, "grad_norm": 0.298828125, "learning_rate": 0.0002089127483200417, "loss": 2.787550163269043, "step": 25610 }, { "epoch": 0.9720571071355012, "grad_norm": 0.298828125, "learning_rate": 0.0002087936712950107, "loss": 2.778925323486328, "step": 25620 }, { "epoch": 0.9724365205262644, "grad_norm": 0.298828125, "learning_rate": 0.00020867459114664878, "loss": 2.7892807006835936, "step": 25630 }, { "epoch": 0.9728159339170277, "grad_norm": 0.302734375, "learning_rate": 0.00020855550791725055, "loss": 2.7843488693237304, "step": 25640 }, { "epoch": 0.973195347307791, "grad_norm": 0.314453125, "learning_rate": 0.00020843642164911205, "loss": 2.814738082885742, "step": 25650 }, { "epoch": 0.9735747606985543, "grad_norm": 0.28515625, "learning_rate": 0.00020831733238453026, "loss": 2.782101058959961, "step": 25660 }, { "epoch": 0.9739541740893175, "grad_norm": 0.302734375, "learning_rate": 0.00020819824016580322, "loss": 2.775551605224609, "step": 25670 }, { "epoch": 0.9743335874800808, "grad_norm": 0.287109375, "learning_rate": 0.00020807914503522996, "loss": 2.8026058197021486, "step": 25680 }, { "epoch": 0.9747130008708441, "grad_norm": 0.29296875, "learning_rate": 0.00020796004703511076, "loss": 2.818112373352051, "step": 25690 }, { "epoch": 0.9750924142616073, "grad_norm": 0.29296875, "learning_rate": 0.00020784094620774667, "loss": 2.8032838821411135, "step": 25700 }, { "epoch": 0.9754718276523706, "grad_norm": 0.298828125, "learning_rate": 0.0002077218425954399, "loss": 2.829441452026367, "step": 25710 }, { "epoch": 0.9758512410431339, "grad_norm": 0.296875, "learning_rate": 0.00020760273624049366, "loss": 2.824639892578125, "step": 25720 }, { "epoch": 0.9762306544338971, "grad_norm": 0.294921875, "learning_rate": 0.0002074836271852119, "loss": 2.8013381958007812, "step": 25730 }, { "epoch": 0.9766100678246604, "grad_norm": 0.30078125, "learning_rate": 0.0002073645154718999, "loss": 2.8154729843139648, "step": 25740 }, { "epoch": 0.9769894812154237, "grad_norm": 0.29296875, "learning_rate": 0.00020724540114286355, "loss": 2.8008852005004883, "step": 25750 }, { "epoch": 0.9769894812154237, "eval_loss": 2.7993133068084717, "eval_runtime": 189.744, "eval_samples_per_second": 20.074, "eval_steps_per_second": 3.347, "step": 25750 }, { "epoch": 0.9773688946061869, "grad_norm": 0.29296875, "learning_rate": 0.0002071262842404099, "loss": 2.7981916427612306, "step": 25760 }, { "epoch": 0.9777483079969502, "grad_norm": 0.3125, "learning_rate": 0.0002070071648068467, "loss": 2.792654037475586, "step": 25770 }, { "epoch": 0.9781277213877135, "grad_norm": 0.30078125, "learning_rate": 0.00020688804288448287, "loss": 2.8171180725097655, "step": 25780 }, { "epoch": 0.9785071347784767, "grad_norm": 0.30859375, "learning_rate": 0.00020676891851562795, "loss": 2.7876731872558596, "step": 25790 }, { "epoch": 0.97888654816924, "grad_norm": 0.287109375, "learning_rate": 0.00020664979174259257, "loss": 2.7866477966308594, "step": 25800 }, { "epoch": 0.9792659615600033, "grad_norm": 0.29296875, "learning_rate": 0.00020653066260768797, "loss": 2.794148063659668, "step": 25810 }, { "epoch": 0.9796453749507666, "grad_norm": 0.291015625, "learning_rate": 0.0002064115311532265, "loss": 2.781221389770508, "step": 25820 }, { "epoch": 0.9800247883415298, "grad_norm": 0.29296875, "learning_rate": 0.00020629239742152113, "loss": 2.788425064086914, "step": 25830 }, { "epoch": 0.9804042017322931, "grad_norm": 0.30078125, "learning_rate": 0.0002061732614548857, "loss": 2.8279541015625, "step": 25840 }, { "epoch": 0.9807836151230565, "grad_norm": 0.326171875, "learning_rate": 0.0002060541232956349, "loss": 2.7840692520141603, "step": 25850 }, { "epoch": 0.9811630285138196, "grad_norm": 0.291015625, "learning_rate": 0.00020593498298608412, "loss": 2.8079132080078124, "step": 25860 }, { "epoch": 0.981542441904583, "grad_norm": 0.28515625, "learning_rate": 0.00020581584056854956, "loss": 2.8112836837768556, "step": 25870 }, { "epoch": 0.9819218552953463, "grad_norm": 0.30078125, "learning_rate": 0.00020569669608534808, "loss": 2.815271759033203, "step": 25880 }, { "epoch": 0.9823012686861095, "grad_norm": 0.3046875, "learning_rate": 0.00020557754957879754, "loss": 2.8022842407226562, "step": 25890 }, { "epoch": 0.9826806820768728, "grad_norm": 0.30078125, "learning_rate": 0.0002054584010912161, "loss": 2.794334602355957, "step": 25900 }, { "epoch": 0.9830600954676361, "grad_norm": 0.291015625, "learning_rate": 0.00020533925066492298, "loss": 2.7857635498046873, "step": 25910 }, { "epoch": 0.9834395088583993, "grad_norm": 0.3203125, "learning_rate": 0.00020522009834223796, "loss": 2.8154842376708986, "step": 25920 }, { "epoch": 0.9838189222491626, "grad_norm": 0.2890625, "learning_rate": 0.00020510094416548142, "loss": 2.781041717529297, "step": 25930 }, { "epoch": 0.9841983356399259, "grad_norm": 0.306640625, "learning_rate": 0.00020498178817697458, "loss": 2.788304328918457, "step": 25940 }, { "epoch": 0.9845777490306892, "grad_norm": 0.306640625, "learning_rate": 0.00020486263041903904, "loss": 2.820603370666504, "step": 25950 }, { "epoch": 0.9849571624214524, "grad_norm": 0.294921875, "learning_rate": 0.00020474347093399735, "loss": 2.7859111785888673, "step": 25960 }, { "epoch": 0.9853365758122157, "grad_norm": 0.306640625, "learning_rate": 0.00020462430976417239, "loss": 2.7769342422485352, "step": 25970 }, { "epoch": 0.985715989202979, "grad_norm": 0.287109375, "learning_rate": 0.00020450514695188776, "loss": 2.7742378234863283, "step": 25980 }, { "epoch": 0.9860954025937422, "grad_norm": 0.294921875, "learning_rate": 0.0002043859825394677, "loss": 2.7790973663330076, "step": 25990 }, { "epoch": 0.9864748159845055, "grad_norm": 0.333984375, "learning_rate": 0.0002042668165692369, "loss": 2.772689437866211, "step": 26000 }, { "epoch": 0.9864748159845055, "eval_loss": 2.7985999584198, "eval_runtime": 189.7505, "eval_samples_per_second": 20.074, "eval_steps_per_second": 3.346, "step": 26000 }, { "epoch": 0.9868542293752688, "grad_norm": 0.29296875, "learning_rate": 0.00020414764908352075, "loss": 2.807981491088867, "step": 26010 }, { "epoch": 0.987233642766032, "grad_norm": 0.30859375, "learning_rate": 0.000204028480124645, "loss": 2.769340705871582, "step": 26020 }, { "epoch": 0.9876130561567953, "grad_norm": 0.291015625, "learning_rate": 0.000203909309734936, "loss": 2.781769371032715, "step": 26030 }, { "epoch": 0.9879924695475586, "grad_norm": 0.306640625, "learning_rate": 0.00020379013795672067, "loss": 2.768941879272461, "step": 26040 }, { "epoch": 0.9883718829383218, "grad_norm": 0.302734375, "learning_rate": 0.00020367096483232637, "loss": 2.802210807800293, "step": 26050 }, { "epoch": 0.9887512963290851, "grad_norm": 0.296875, "learning_rate": 0.0002035517904040809, "loss": 2.8223005294799806, "step": 26060 }, { "epoch": 0.9891307097198484, "grad_norm": 0.3125, "learning_rate": 0.00020343261471431263, "loss": 2.7818571090698243, "step": 26070 }, { "epoch": 0.9895101231106117, "grad_norm": 0.28515625, "learning_rate": 0.00020331343780535018, "loss": 2.8370433807373048, "step": 26080 }, { "epoch": 0.9898895365013749, "grad_norm": 0.294921875, "learning_rate": 0.00020319425971952293, "loss": 2.837141227722168, "step": 26090 }, { "epoch": 0.9902689498921382, "grad_norm": 0.291015625, "learning_rate": 0.00020307508049916036, "loss": 2.8407773971557617, "step": 26100 }, { "epoch": 0.9906483632829015, "grad_norm": 0.296875, "learning_rate": 0.00020295590018659247, "loss": 2.818437194824219, "step": 26110 }, { "epoch": 0.9910277766736647, "grad_norm": 0.294921875, "learning_rate": 0.0002028367188241497, "loss": 2.8091814041137697, "step": 26120 }, { "epoch": 0.991407190064428, "grad_norm": 0.2890625, "learning_rate": 0.0002027175364541628, "loss": 2.806157875061035, "step": 26130 }, { "epoch": 0.9917866034551913, "grad_norm": 0.294921875, "learning_rate": 0.00020259835311896285, "loss": 2.8080286026000976, "step": 26140 }, { "epoch": 0.9921660168459545, "grad_norm": 0.294921875, "learning_rate": 0.0002024791688608814, "loss": 2.808907890319824, "step": 26150 }, { "epoch": 0.9925454302367178, "grad_norm": 0.298828125, "learning_rate": 0.00020235998372225016, "loss": 2.803487777709961, "step": 26160 }, { "epoch": 0.9929248436274811, "grad_norm": 0.291015625, "learning_rate": 0.00020224079774540133, "loss": 2.831182670593262, "step": 26170 }, { "epoch": 0.9933042570182443, "grad_norm": 0.291015625, "learning_rate": 0.00020212161097266722, "loss": 2.79931755065918, "step": 26180 }, { "epoch": 0.9936836704090076, "grad_norm": 0.294921875, "learning_rate": 0.00020200242344638064, "loss": 2.819341468811035, "step": 26190 }, { "epoch": 0.9940630837997709, "grad_norm": 0.302734375, "learning_rate": 0.00020188323520887438, "loss": 2.802592658996582, "step": 26200 }, { "epoch": 0.9944424971905341, "grad_norm": 0.29296875, "learning_rate": 0.00020176404630248182, "loss": 2.824872589111328, "step": 26210 }, { "epoch": 0.9948219105812974, "grad_norm": 0.294921875, "learning_rate": 0.0002016448567695363, "loss": 2.790685272216797, "step": 26220 }, { "epoch": 0.9952013239720607, "grad_norm": 0.3046875, "learning_rate": 0.0002015256666523715, "loss": 2.8162240982055664, "step": 26230 }, { "epoch": 0.995580737362824, "grad_norm": 0.310546875, "learning_rate": 0.00020140647599332129, "loss": 2.7746536254882814, "step": 26240 }, { "epoch": 0.9959601507535872, "grad_norm": 0.2890625, "learning_rate": 0.0002012872848347198, "loss": 2.805500793457031, "step": 26250 }, { "epoch": 0.9959601507535872, "eval_loss": 2.797369956970215, "eval_runtime": 188.5707, "eval_samples_per_second": 20.199, "eval_steps_per_second": 3.367, "step": 26250 }, { "epoch": 0.9963395641443505, "grad_norm": 0.337890625, "learning_rate": 0.0002011680932189012, "loss": 2.8162769317626952, "step": 26260 }, { "epoch": 0.9967189775351138, "grad_norm": 0.3125, "learning_rate": 0.00020104890118819997, "loss": 2.7984901428222657, "step": 26270 }, { "epoch": 0.997098390925877, "grad_norm": 0.3046875, "learning_rate": 0.00020092970878495057, "loss": 2.7996074676513674, "step": 26280 }, { "epoch": 0.9974778043166403, "grad_norm": 0.294921875, "learning_rate": 0.0002008105160514878, "loss": 2.8066356658935545, "step": 26290 }, { "epoch": 0.9978572177074037, "grad_norm": 0.291015625, "learning_rate": 0.00020069132303014638, "loss": 2.79492073059082, "step": 26300 }, { "epoch": 0.9982366310981668, "grad_norm": 0.29296875, "learning_rate": 0.00020057212976326132, "loss": 2.7722360610961916, "step": 26310 }, { "epoch": 0.9986160444889302, "grad_norm": 0.294921875, "learning_rate": 0.00020045293629316748, "loss": 2.764409828186035, "step": 26320 }, { "epoch": 0.9989954578796935, "grad_norm": 0.294921875, "learning_rate": 0.00020033374266220002, "loss": 2.8312709808349608, "step": 26330 }, { "epoch": 0.9993748712704567, "grad_norm": 0.28515625, "learning_rate": 0.00020021454891269404, "loss": 2.7892553329467775, "step": 26340 }, { "epoch": 0.99975428466122, "grad_norm": 0.298828125, "learning_rate": 0.00020009535508698468, "loss": 2.786690902709961, "step": 26350 }, { "epoch": 1.000113824017229, "grad_norm": 0.298828125, "learning_rate": 0.00019997616122740717, "loss": 2.790797233581543, "step": 26360 }, { "epoch": 1.0004932374079922, "grad_norm": 0.296875, "learning_rate": 0.00019985696737629665, "loss": 2.7873693466186524, "step": 26370 }, { "epoch": 1.0008726507987555, "grad_norm": 0.310546875, "learning_rate": 0.00019973777357598838, "loss": 2.7789758682250976, "step": 26380 }, { "epoch": 1.0012520641895188, "grad_norm": 0.296875, "learning_rate": 0.00019961857986881766, "loss": 2.7762781143188477, "step": 26390 }, { "epoch": 1.0016314775802821, "grad_norm": 0.302734375, "learning_rate": 0.0001994993862971194, "loss": 2.7864112854003906, "step": 26400 }, { "epoch": 1.0020108909710452, "grad_norm": 0.283203125, "learning_rate": 0.00019938019290322885, "loss": 2.7720930099487306, "step": 26410 }, { "epoch": 1.0023903043618085, "grad_norm": 0.298828125, "learning_rate": 0.0001992609997294811, "loss": 2.7796754837036133, "step": 26420 }, { "epoch": 1.0027697177525718, "grad_norm": 0.30078125, "learning_rate": 0.00019914180681821094, "loss": 2.7950613021850588, "step": 26430 }, { "epoch": 1.0031491311433351, "grad_norm": 0.33203125, "learning_rate": 0.00019902261421175332, "loss": 2.7959789276123046, "step": 26440 }, { "epoch": 1.0035285445340985, "grad_norm": 0.302734375, "learning_rate": 0.00019890342195244304, "loss": 2.7795486450195312, "step": 26450 }, { "epoch": 1.0039079579248618, "grad_norm": 0.31640625, "learning_rate": 0.00019878423008261477, "loss": 2.7750032424926756, "step": 26460 }, { "epoch": 1.004287371315625, "grad_norm": 0.29296875, "learning_rate": 0.0001986650386446029, "loss": 2.7827592849731446, "step": 26470 }, { "epoch": 1.0046667847063881, "grad_norm": 0.30078125, "learning_rate": 0.0001985458476807418, "loss": 2.79776554107666, "step": 26480 }, { "epoch": 1.0050461980971515, "grad_norm": 0.306640625, "learning_rate": 0.00019842665723336563, "loss": 2.7568889617919923, "step": 26490 }, { "epoch": 1.0054256114879148, "grad_norm": 0.294921875, "learning_rate": 0.00019830746734480857, "loss": 2.7671274185180663, "step": 26500 }, { "epoch": 1.0054256114879148, "eval_loss": 2.7966036796569824, "eval_runtime": 189.1427, "eval_samples_per_second": 20.138, "eval_steps_per_second": 3.357, "step": 26500 }, { "epoch": 1.005805024878678, "grad_norm": 0.28515625, "learning_rate": 0.00019818827805740416, "loss": 2.7736795425415037, "step": 26510 }, { "epoch": 1.0061844382694414, "grad_norm": 0.294921875, "learning_rate": 0.0001980690894134861, "loss": 2.815703773498535, "step": 26520 }, { "epoch": 1.0065638516602047, "grad_norm": 0.29296875, "learning_rate": 0.00019794990145538782, "loss": 2.7874298095703125, "step": 26530 }, { "epoch": 1.0069432650509678, "grad_norm": 0.3046875, "learning_rate": 0.00019783071422544224, "loss": 2.7837993621826174, "step": 26540 }, { "epoch": 1.007322678441731, "grad_norm": 0.294921875, "learning_rate": 0.00019771152776598233, "loss": 2.7406620025634765, "step": 26550 }, { "epoch": 1.0077020918324944, "grad_norm": 0.2890625, "learning_rate": 0.00019759234211934063, "loss": 2.7918296813964845, "step": 26560 }, { "epoch": 1.0080815052232577, "grad_norm": 0.318359375, "learning_rate": 0.00019747315732784954, "loss": 2.789214515686035, "step": 26570 }, { "epoch": 1.008460918614021, "grad_norm": 0.306640625, "learning_rate": 0.00019735397343384088, "loss": 2.7878097534179687, "step": 26580 }, { "epoch": 1.0088403320047843, "grad_norm": 0.296875, "learning_rate": 0.00019723479047964638, "loss": 2.8145353317260744, "step": 26590 }, { "epoch": 1.0092197453955476, "grad_norm": 0.302734375, "learning_rate": 0.00019711560850759738, "loss": 2.8036483764648437, "step": 26600 }, { "epoch": 1.0095991587863107, "grad_norm": 0.306640625, "learning_rate": 0.00019699642756002496, "loss": 2.795553207397461, "step": 26610 }, { "epoch": 1.009978572177074, "grad_norm": 0.294921875, "learning_rate": 0.00019687724767925954, "loss": 2.7830251693725585, "step": 26620 }, { "epoch": 1.0103579855678373, "grad_norm": 0.310546875, "learning_rate": 0.00019675806890763147, "loss": 2.785091781616211, "step": 26630 }, { "epoch": 1.0107373989586006, "grad_norm": 0.3125, "learning_rate": 0.0001966388912874707, "loss": 2.7897857666015624, "step": 26640 }, { "epoch": 1.011116812349364, "grad_norm": 0.29296875, "learning_rate": 0.00019651971486110646, "loss": 2.799842643737793, "step": 26650 }, { "epoch": 1.0114962257401272, "grad_norm": 0.302734375, "learning_rate": 0.00019640053967086784, "loss": 2.773910140991211, "step": 26660 }, { "epoch": 1.0118756391308903, "grad_norm": 0.30859375, "learning_rate": 0.00019628136575908345, "loss": 2.7757116317749024, "step": 26670 }, { "epoch": 1.0122550525216536, "grad_norm": 0.29296875, "learning_rate": 0.0001961621931680815, "loss": 2.8191410064697267, "step": 26680 }, { "epoch": 1.012634465912417, "grad_norm": 0.296875, "learning_rate": 0.00019604302194018943, "loss": 2.7949411392211916, "step": 26690 }, { "epoch": 1.0130138793031802, "grad_norm": 0.298828125, "learning_rate": 0.0001959238521177345, "loss": 2.7698183059692383, "step": 26700 }, { "epoch": 1.0133932926939435, "grad_norm": 0.29296875, "learning_rate": 0.00019580468374304338, "loss": 2.8054794311523437, "step": 26710 }, { "epoch": 1.0137727060847068, "grad_norm": 0.296875, "learning_rate": 0.00019568551685844232, "loss": 2.7987112045288085, "step": 26720 }, { "epoch": 1.0141521194754701, "grad_norm": 0.306640625, "learning_rate": 0.00019556635150625687, "loss": 2.766096305847168, "step": 26730 }, { "epoch": 1.0145315328662332, "grad_norm": 0.294921875, "learning_rate": 0.000195447187728812, "loss": 2.770389175415039, "step": 26740 }, { "epoch": 1.0149109462569965, "grad_norm": 0.291015625, "learning_rate": 0.00019532802556843242, "loss": 2.803853988647461, "step": 26750 }, { "epoch": 1.0149109462569965, "eval_loss": 2.7955868244171143, "eval_runtime": 188.8546, "eval_samples_per_second": 20.169, "eval_steps_per_second": 3.362, "step": 26750 }, { "epoch": 1.0152903596477598, "grad_norm": 0.296875, "learning_rate": 0.00019520886506744193, "loss": 2.7892723083496094, "step": 26760 }, { "epoch": 1.0156697730385231, "grad_norm": 0.3046875, "learning_rate": 0.00019508970626816398, "loss": 2.8105138778686523, "step": 26770 }, { "epoch": 1.0160491864292864, "grad_norm": 0.306640625, "learning_rate": 0.00019497054921292125, "loss": 2.8360006332397463, "step": 26780 }, { "epoch": 1.0164285998200497, "grad_norm": 0.298828125, "learning_rate": 0.00019485139394403596, "loss": 2.796798515319824, "step": 26790 }, { "epoch": 1.0168080132108128, "grad_norm": 0.302734375, "learning_rate": 0.00019473224050382953, "loss": 2.7814876556396486, "step": 26800 }, { "epoch": 1.0171874266015761, "grad_norm": 0.298828125, "learning_rate": 0.00019461308893462285, "loss": 2.7797262191772463, "step": 26810 }, { "epoch": 1.0175668399923394, "grad_norm": 0.30859375, "learning_rate": 0.00019449393927873604, "loss": 2.794768714904785, "step": 26820 }, { "epoch": 1.0179462533831027, "grad_norm": 0.29296875, "learning_rate": 0.00019437479157848874, "loss": 2.805461883544922, "step": 26830 }, { "epoch": 1.018325666773866, "grad_norm": 0.298828125, "learning_rate": 0.0001942556458761996, "loss": 2.7992889404296877, "step": 26840 }, { "epoch": 1.0187050801646294, "grad_norm": 0.291015625, "learning_rate": 0.00019413650221418688, "loss": 2.7891908645629884, "step": 26850 }, { "epoch": 1.0190844935553927, "grad_norm": 0.318359375, "learning_rate": 0.00019401736063476784, "loss": 2.772376823425293, "step": 26860 }, { "epoch": 1.0194639069461557, "grad_norm": 0.298828125, "learning_rate": 0.00019389822118025913, "loss": 2.785756301879883, "step": 26870 }, { "epoch": 1.019843320336919, "grad_norm": 0.28515625, "learning_rate": 0.00019377908389297665, "loss": 2.783197021484375, "step": 26880 }, { "epoch": 1.0202227337276824, "grad_norm": 0.298828125, "learning_rate": 0.00019365994881523552, "loss": 2.789397048950195, "step": 26890 }, { "epoch": 1.0206021471184457, "grad_norm": 0.294921875, "learning_rate": 0.00019354081598935014, "loss": 2.7750423431396483, "step": 26900 }, { "epoch": 1.020981560509209, "grad_norm": 0.296875, "learning_rate": 0.0001934216854576339, "loss": 2.788864517211914, "step": 26910 }, { "epoch": 1.0213609738999723, "grad_norm": 0.33203125, "learning_rate": 0.00019330255726239954, "loss": 2.7891208648681642, "step": 26920 }, { "epoch": 1.0217403872907354, "grad_norm": 0.29296875, "learning_rate": 0.000193183431445959, "loss": 2.790058898925781, "step": 26930 }, { "epoch": 1.0221198006814987, "grad_norm": 0.310546875, "learning_rate": 0.00019306430805062337, "loss": 2.786491584777832, "step": 26940 }, { "epoch": 1.022499214072262, "grad_norm": 0.3046875, "learning_rate": 0.00019294518711870264, "loss": 2.7597915649414064, "step": 26950 }, { "epoch": 1.0228786274630253, "grad_norm": 0.291015625, "learning_rate": 0.00019282606869250625, "loss": 2.8056135177612305, "step": 26960 }, { "epoch": 1.0232580408537886, "grad_norm": 0.28515625, "learning_rate": 0.00019270695281434268, "loss": 2.7845911026000976, "step": 26970 }, { "epoch": 1.0236374542445519, "grad_norm": 0.337890625, "learning_rate": 0.00019258783952651927, "loss": 2.7698858261108397, "step": 26980 }, { "epoch": 1.0240168676353152, "grad_norm": 0.306640625, "learning_rate": 0.00019246872887134265, "loss": 2.7862312316894533, "step": 26990 }, { "epoch": 1.0243962810260783, "grad_norm": 0.3046875, "learning_rate": 0.00019234962089111852, "loss": 2.7366365432739257, "step": 27000 }, { "epoch": 1.0243962810260783, "eval_loss": 2.795020818710327, "eval_runtime": 206.8379, "eval_samples_per_second": 18.415, "eval_steps_per_second": 3.07, "step": 27000 }, { "epoch": 1.0247756944168416, "grad_norm": 0.302734375, "learning_rate": 0.00019223051562815166, "loss": 2.7520429611206056, "step": 27010 }, { "epoch": 1.0251551078076049, "grad_norm": 0.30078125, "learning_rate": 0.00019211141312474564, "loss": 2.823873519897461, "step": 27020 }, { "epoch": 1.0255345211983682, "grad_norm": 0.302734375, "learning_rate": 0.00019199231342320329, "loss": 2.786263084411621, "step": 27030 }, { "epoch": 1.0259139345891315, "grad_norm": 0.302734375, "learning_rate": 0.00019187321656582645, "loss": 2.783249855041504, "step": 27040 }, { "epoch": 1.0262933479798948, "grad_norm": 0.3046875, "learning_rate": 0.00019175412259491573, "loss": 2.786861038208008, "step": 27050 }, { "epoch": 1.0266727613706579, "grad_norm": 0.30078125, "learning_rate": 0.0001916350315527709, "loss": 2.808721923828125, "step": 27060 }, { "epoch": 1.0270521747614212, "grad_norm": 0.30078125, "learning_rate": 0.00019151594348169068, "loss": 2.792275047302246, "step": 27070 }, { "epoch": 1.0274315881521845, "grad_norm": 0.30078125, "learning_rate": 0.00019139685842397278, "loss": 2.8024396896362305, "step": 27080 }, { "epoch": 1.0278110015429478, "grad_norm": 0.298828125, "learning_rate": 0.0001912777764219136, "loss": 2.7871400833129885, "step": 27090 }, { "epoch": 1.028190414933711, "grad_norm": 0.302734375, "learning_rate": 0.0001911586975178087, "loss": 2.7587167739868166, "step": 27100 }, { "epoch": 1.0285698283244744, "grad_norm": 0.3046875, "learning_rate": 0.00019103962175395244, "loss": 2.7602840423583985, "step": 27110 }, { "epoch": 1.0289492417152375, "grad_norm": 0.2890625, "learning_rate": 0.00019092054917263819, "loss": 2.797502136230469, "step": 27120 }, { "epoch": 1.0293286551060008, "grad_norm": 0.291015625, "learning_rate": 0.00019080147981615785, "loss": 2.7574010848999024, "step": 27130 }, { "epoch": 1.029708068496764, "grad_norm": 0.298828125, "learning_rate": 0.00019068241372680256, "loss": 2.789565086364746, "step": 27140 }, { "epoch": 1.0300874818875274, "grad_norm": 0.296875, "learning_rate": 0.00019056335094686219, "loss": 2.7917268753051756, "step": 27150 }, { "epoch": 1.0304668952782907, "grad_norm": 0.314453125, "learning_rate": 0.00019044429151862522, "loss": 2.8230751037597654, "step": 27160 }, { "epoch": 1.030846308669054, "grad_norm": 0.30859375, "learning_rate": 0.0001903252354843792, "loss": 2.801799011230469, "step": 27170 }, { "epoch": 1.0312257220598173, "grad_norm": 0.296875, "learning_rate": 0.00019020618288641042, "loss": 2.7761640548706055, "step": 27180 }, { "epoch": 1.0316051354505804, "grad_norm": 0.298828125, "learning_rate": 0.00019008713376700393, "loss": 2.7797014236450197, "step": 27190 }, { "epoch": 1.0319845488413437, "grad_norm": 0.296875, "learning_rate": 0.0001899680881684434, "loss": 2.788605308532715, "step": 27200 }, { "epoch": 1.032363962232107, "grad_norm": 0.29296875, "learning_rate": 0.00018984904613301144, "loss": 2.7788196563720704, "step": 27210 }, { "epoch": 1.0327433756228703, "grad_norm": 0.298828125, "learning_rate": 0.00018973000770298933, "loss": 2.8102643966674803, "step": 27220 }, { "epoch": 1.0331227890136336, "grad_norm": 0.294921875, "learning_rate": 0.0001896109729206572, "loss": 2.763945198059082, "step": 27230 }, { "epoch": 1.033502202404397, "grad_norm": 0.302734375, "learning_rate": 0.00018949194182829352, "loss": 2.7868488311767576, "step": 27240 }, { "epoch": 1.03388161579516, "grad_norm": 0.296875, "learning_rate": 0.00018937291446817581, "loss": 2.8058820724487306, "step": 27250 }, { "epoch": 1.03388161579516, "eval_loss": 2.7944321632385254, "eval_runtime": 203.0447, "eval_samples_per_second": 18.759, "eval_steps_per_second": 3.127, "step": 27250 }, { "epoch": 1.0342610291859233, "grad_norm": 0.279296875, "learning_rate": 0.00018925389088258023, "loss": 2.805589485168457, "step": 27260 }, { "epoch": 1.0346404425766866, "grad_norm": 0.29296875, "learning_rate": 0.0001891348711137813, "loss": 2.764800453186035, "step": 27270 }, { "epoch": 1.03501985596745, "grad_norm": 0.3046875, "learning_rate": 0.0001890158552040525, "loss": 2.791369819641113, "step": 27280 }, { "epoch": 1.0353992693582132, "grad_norm": 0.310546875, "learning_rate": 0.0001888968431956658, "loss": 2.782131004333496, "step": 27290 }, { "epoch": 1.0357786827489766, "grad_norm": 0.333984375, "learning_rate": 0.0001887778351308919, "loss": 2.8198217391967773, "step": 27300 }, { "epoch": 1.0361580961397399, "grad_norm": 0.296875, "learning_rate": 0.00018865883105199997, "loss": 2.8000478744506836, "step": 27310 }, { "epoch": 1.036537509530503, "grad_norm": 0.29296875, "learning_rate": 0.00018853983100125776, "loss": 2.782887649536133, "step": 27320 }, { "epoch": 1.0369169229212662, "grad_norm": 0.298828125, "learning_rate": 0.00018842083502093167, "loss": 2.8103784561157226, "step": 27330 }, { "epoch": 1.0372963363120296, "grad_norm": 0.3046875, "learning_rate": 0.00018830184315328665, "loss": 2.762551689147949, "step": 27340 }, { "epoch": 1.0376757497027929, "grad_norm": 0.306640625, "learning_rate": 0.00018818285544058616, "loss": 2.780237007141113, "step": 27350 }, { "epoch": 1.0380551630935562, "grad_norm": 0.291015625, "learning_rate": 0.0001880638719250921, "loss": 2.792671775817871, "step": 27360 }, { "epoch": 1.0384345764843195, "grad_norm": 0.30078125, "learning_rate": 0.00018794489264906513, "loss": 2.793067741394043, "step": 27370 }, { "epoch": 1.0388139898750826, "grad_norm": 0.306640625, "learning_rate": 0.00018782591765476405, "loss": 2.7990488052368163, "step": 27380 }, { "epoch": 1.0391934032658459, "grad_norm": 0.3125, "learning_rate": 0.00018770694698444647, "loss": 2.7786928176879884, "step": 27390 }, { "epoch": 1.0395728166566092, "grad_norm": 0.28515625, "learning_rate": 0.00018758798068036823, "loss": 2.7664922714233398, "step": 27400 }, { "epoch": 1.0399522300473725, "grad_norm": 0.287109375, "learning_rate": 0.0001874690187847838, "loss": 2.8086526870727537, "step": 27410 }, { "epoch": 1.0403316434381358, "grad_norm": 0.30078125, "learning_rate": 0.00018735006133994593, "loss": 2.8185882568359375, "step": 27420 }, { "epoch": 1.040711056828899, "grad_norm": 0.298828125, "learning_rate": 0.0001872311083881059, "loss": 2.789616012573242, "step": 27430 }, { "epoch": 1.0410904702196624, "grad_norm": 0.29296875, "learning_rate": 0.00018711215997151327, "loss": 2.7750288009643556, "step": 27440 }, { "epoch": 1.0414698836104255, "grad_norm": 0.306640625, "learning_rate": 0.00018699321613241618, "loss": 2.7872861862182616, "step": 27450 }, { "epoch": 1.0418492970011888, "grad_norm": 0.298828125, "learning_rate": 0.00018687427691306096, "loss": 2.790182113647461, "step": 27460 }, { "epoch": 1.042228710391952, "grad_norm": 0.31640625, "learning_rate": 0.00018675534235569236, "loss": 2.763536071777344, "step": 27470 }, { "epoch": 1.0426081237827154, "grad_norm": 0.291015625, "learning_rate": 0.00018663641250255363, "loss": 2.758711051940918, "step": 27480 }, { "epoch": 1.0429875371734787, "grad_norm": 0.291015625, "learning_rate": 0.00018651748739588602, "loss": 2.785404396057129, "step": 27490 }, { "epoch": 1.043366950564242, "grad_norm": 0.314453125, "learning_rate": 0.00018639856707792932, "loss": 2.787277030944824, "step": 27500 }, { "epoch": 1.043366950564242, "eval_loss": 2.793674945831299, "eval_runtime": 191.9327, "eval_samples_per_second": 19.846, "eval_steps_per_second": 3.308, "step": 27500 }, { "epoch": 1.043746363955005, "grad_norm": 0.296875, "learning_rate": 0.00018627965159092162, "loss": 2.7965402603149414, "step": 27510 }, { "epoch": 1.0441257773457684, "grad_norm": 0.294921875, "learning_rate": 0.00018616074097709933, "loss": 2.7740978240966796, "step": 27520 }, { "epoch": 1.0445051907365317, "grad_norm": 0.30078125, "learning_rate": 0.00018604183527869692, "loss": 2.754684066772461, "step": 27530 }, { "epoch": 1.044884604127295, "grad_norm": 0.310546875, "learning_rate": 0.00018592293453794726, "loss": 2.8150354385375977, "step": 27540 }, { "epoch": 1.0452640175180583, "grad_norm": 0.287109375, "learning_rate": 0.00018580403879708157, "loss": 2.7908191680908203, "step": 27550 }, { "epoch": 1.0456434309088216, "grad_norm": 0.306640625, "learning_rate": 0.000185685148098329, "loss": 2.7611183166503905, "step": 27560 }, { "epoch": 1.046022844299585, "grad_norm": 0.318359375, "learning_rate": 0.00018556626248391716, "loss": 2.7989458084106444, "step": 27570 }, { "epoch": 1.046402257690348, "grad_norm": 0.30859375, "learning_rate": 0.00018544738199607178, "loss": 2.8006824493408202, "step": 27580 }, { "epoch": 1.0467816710811113, "grad_norm": 0.302734375, "learning_rate": 0.00018532850667701684, "loss": 2.806045722961426, "step": 27590 }, { "epoch": 1.0471610844718746, "grad_norm": 0.29296875, "learning_rate": 0.00018520963656897424, "loss": 2.7865825653076173, "step": 27600 }, { "epoch": 1.047540497862638, "grad_norm": 0.30078125, "learning_rate": 0.00018509077171416426, "loss": 2.789866638183594, "step": 27610 }, { "epoch": 1.0479199112534012, "grad_norm": 0.29296875, "learning_rate": 0.00018497191215480528, "loss": 2.8305301666259766, "step": 27620 }, { "epoch": 1.0482993246441645, "grad_norm": 0.29296875, "learning_rate": 0.00018485305793311383, "loss": 2.8010631561279298, "step": 27630 }, { "epoch": 1.0486787380349276, "grad_norm": 0.302734375, "learning_rate": 0.0001847342090913043, "loss": 2.784342384338379, "step": 27640 }, { "epoch": 1.049058151425691, "grad_norm": 0.29296875, "learning_rate": 0.0001846153656715895, "loss": 2.765933609008789, "step": 27650 }, { "epoch": 1.0494375648164542, "grad_norm": 0.298828125, "learning_rate": 0.0001844965277161802, "loss": 2.812592315673828, "step": 27660 }, { "epoch": 1.0498169782072175, "grad_norm": 0.298828125, "learning_rate": 0.00018437769526728504, "loss": 2.7577585220336913, "step": 27670 }, { "epoch": 1.0501963915979808, "grad_norm": 0.298828125, "learning_rate": 0.00018425886836711093, "loss": 2.762236976623535, "step": 27680 }, { "epoch": 1.0505758049887441, "grad_norm": 0.29296875, "learning_rate": 0.00018414004705786277, "loss": 2.7728593826293944, "step": 27690 }, { "epoch": 1.0509552183795074, "grad_norm": 0.296875, "learning_rate": 0.0001840212313817435, "loss": 2.7809051513671874, "step": 27700 }, { "epoch": 1.0513346317702705, "grad_norm": 0.328125, "learning_rate": 0.00018390242138095384, "loss": 2.7812191009521485, "step": 27710 }, { "epoch": 1.0517140451610338, "grad_norm": 0.3046875, "learning_rate": 0.00018378361709769271, "loss": 2.7673789978027346, "step": 27720 }, { "epoch": 1.0520934585517971, "grad_norm": 0.29296875, "learning_rate": 0.00018366481857415702, "loss": 2.808651351928711, "step": 27730 }, { "epoch": 1.0524728719425605, "grad_norm": 0.30859375, "learning_rate": 0.0001835460258525416, "loss": 2.7631832122802735, "step": 27740 }, { "epoch": 1.0528522853333238, "grad_norm": 0.298828125, "learning_rate": 0.000183427238975039, "loss": 2.7844612121582033, "step": 27750 }, { "epoch": 1.0528522853333238, "eval_loss": 2.7926461696624756, "eval_runtime": 190.2338, "eval_samples_per_second": 20.023, "eval_steps_per_second": 3.338, "step": 27750 }, { "epoch": 1.053231698724087, "grad_norm": 0.30078125, "learning_rate": 0.00018330845798383998, "loss": 2.8178684234619142, "step": 27760 }, { "epoch": 1.0536111121148501, "grad_norm": 0.30078125, "learning_rate": 0.0001831896829211332, "loss": 2.7774377822875977, "step": 27770 }, { "epoch": 1.0539905255056135, "grad_norm": 0.330078125, "learning_rate": 0.00018307091382910488, "loss": 2.759101676940918, "step": 27780 }, { "epoch": 1.0543699388963768, "grad_norm": 0.294921875, "learning_rate": 0.00018295215074993953, "loss": 2.771928596496582, "step": 27790 }, { "epoch": 1.05474935228714, "grad_norm": 0.298828125, "learning_rate": 0.00018283339372581928, "loss": 2.78176326751709, "step": 27800 }, { "epoch": 1.0551287656779034, "grad_norm": 0.296875, "learning_rate": 0.00018271464279892433, "loss": 2.773866653442383, "step": 27810 }, { "epoch": 1.0555081790686667, "grad_norm": 0.29296875, "learning_rate": 0.00018259589801143233, "loss": 2.783024215698242, "step": 27820 }, { "epoch": 1.0558875924594298, "grad_norm": 0.287109375, "learning_rate": 0.0001824771594055191, "loss": 2.806703567504883, "step": 27830 }, { "epoch": 1.056267005850193, "grad_norm": 0.30078125, "learning_rate": 0.00018235842702335817, "loss": 2.753369140625, "step": 27840 }, { "epoch": 1.0566464192409564, "grad_norm": 0.30078125, "learning_rate": 0.0001822397009071209, "loss": 2.808294677734375, "step": 27850 }, { "epoch": 1.0570258326317197, "grad_norm": 0.29296875, "learning_rate": 0.00018212098109897617, "loss": 2.809322738647461, "step": 27860 }, { "epoch": 1.057405246022483, "grad_norm": 0.296875, "learning_rate": 0.00018200226764109094, "loss": 2.777980613708496, "step": 27870 }, { "epoch": 1.0577846594132463, "grad_norm": 0.330078125, "learning_rate": 0.0001818835605756298, "loss": 2.796761894226074, "step": 27880 }, { "epoch": 1.0581640728040096, "grad_norm": 0.326171875, "learning_rate": 0.00018176485994475503, "loss": 2.7710702896118162, "step": 27890 }, { "epoch": 1.0585434861947727, "grad_norm": 0.328125, "learning_rate": 0.0001816461657906266, "loss": 2.786495018005371, "step": 27900 }, { "epoch": 1.058922899585536, "grad_norm": 0.39453125, "learning_rate": 0.00018152747815540227, "loss": 2.773598289489746, "step": 27910 }, { "epoch": 1.0593023129762993, "grad_norm": 0.29296875, "learning_rate": 0.0001814087970812375, "loss": 2.776407241821289, "step": 27920 }, { "epoch": 1.0596817263670626, "grad_norm": 0.294921875, "learning_rate": 0.0001812901226102853, "loss": 2.774435806274414, "step": 27930 }, { "epoch": 1.060061139757826, "grad_norm": 0.29296875, "learning_rate": 0.00018117145478469638, "loss": 2.7467792510986326, "step": 27940 }, { "epoch": 1.0604405531485892, "grad_norm": 0.30078125, "learning_rate": 0.00018105279364661917, "loss": 2.7863759994506836, "step": 27950 }, { "epoch": 1.0608199665393523, "grad_norm": 0.28515625, "learning_rate": 0.00018093413923819964, "loss": 2.7889711380004885, "step": 27960 }, { "epoch": 1.0611993799301156, "grad_norm": 0.298828125, "learning_rate": 0.0001808154916015814, "loss": 2.7891555786132813, "step": 27970 }, { "epoch": 1.061578793320879, "grad_norm": 0.302734375, "learning_rate": 0.0001806968507789056, "loss": 2.7642147064208986, "step": 27980 }, { "epoch": 1.0619582067116422, "grad_norm": 0.333984375, "learning_rate": 0.00018057821681231108, "loss": 2.789502716064453, "step": 27990 }, { "epoch": 1.0623376201024055, "grad_norm": 0.318359375, "learning_rate": 0.00018045958974393414, "loss": 2.79998779296875, "step": 28000 }, { "epoch": 1.0623376201024055, "eval_loss": 2.791923999786377, "eval_runtime": 190.1808, "eval_samples_per_second": 20.028, "eval_steps_per_second": 3.339, "step": 28000 }, { "epoch": 1.0627170334931688, "grad_norm": 0.31640625, "learning_rate": 0.00018034096961590872, "loss": 2.779283142089844, "step": 28010 }, { "epoch": 1.0630964468839321, "grad_norm": 0.3046875, "learning_rate": 0.00018022235647036615, "loss": 2.803208923339844, "step": 28020 }, { "epoch": 1.0634758602746952, "grad_norm": 0.306640625, "learning_rate": 0.00018010375034943553, "loss": 2.8058504104614257, "step": 28030 }, { "epoch": 1.0638552736654585, "grad_norm": 0.291015625, "learning_rate": 0.00017998515129524317, "loss": 2.8120962142944337, "step": 28040 }, { "epoch": 1.0642346870562218, "grad_norm": 0.296875, "learning_rate": 0.00017986655934991308, "loss": 2.807118225097656, "step": 28050 }, { "epoch": 1.0646141004469851, "grad_norm": 0.314453125, "learning_rate": 0.00017974797455556677, "loss": 2.78634033203125, "step": 28060 }, { "epoch": 1.0649935138377484, "grad_norm": 0.294921875, "learning_rate": 0.0001796293969543229, "loss": 2.812597465515137, "step": 28070 }, { "epoch": 1.0653729272285117, "grad_norm": 0.2890625, "learning_rate": 0.0001795108265882979, "loss": 2.74217529296875, "step": 28080 }, { "epoch": 1.0657523406192748, "grad_norm": 0.298828125, "learning_rate": 0.00017939226349960555, "loss": 2.757254791259766, "step": 28090 }, { "epoch": 1.0661317540100381, "grad_norm": 0.306640625, "learning_rate": 0.00017927370773035712, "loss": 2.7794496536254885, "step": 28100 }, { "epoch": 1.0665111674008014, "grad_norm": 0.3125, "learning_rate": 0.00017915515932266096, "loss": 2.7863113403320314, "step": 28110 }, { "epoch": 1.0668905807915647, "grad_norm": 0.306640625, "learning_rate": 0.0001790366183186231, "loss": 2.7720129013061525, "step": 28120 }, { "epoch": 1.067269994182328, "grad_norm": 0.328125, "learning_rate": 0.0001789180847603469, "loss": 2.807218551635742, "step": 28130 }, { "epoch": 1.0676494075730913, "grad_norm": 0.296875, "learning_rate": 0.00017879955868993315, "loss": 2.782308578491211, "step": 28140 }, { "epoch": 1.0680288209638547, "grad_norm": 0.294921875, "learning_rate": 0.0001786810401494796, "loss": 2.7744329452514647, "step": 28150 }, { "epoch": 1.0684082343546177, "grad_norm": 0.29296875, "learning_rate": 0.0001785625291810818, "loss": 2.7726091384887694, "step": 28160 }, { "epoch": 1.068787647745381, "grad_norm": 0.3046875, "learning_rate": 0.00017844402582683241, "loss": 2.7687265396118166, "step": 28170 }, { "epoch": 1.0691670611361443, "grad_norm": 0.298828125, "learning_rate": 0.00017832553012882128, "loss": 2.794355583190918, "step": 28180 }, { "epoch": 1.0695464745269077, "grad_norm": 0.3046875, "learning_rate": 0.00017820704212913563, "loss": 2.8507577896118166, "step": 28190 }, { "epoch": 1.069925887917671, "grad_norm": 0.2890625, "learning_rate": 0.00017808856186986004, "loss": 2.7628957748413088, "step": 28200 }, { "epoch": 1.0703053013084343, "grad_norm": 0.287109375, "learning_rate": 0.00017797008939307633, "loss": 2.7634273529052735, "step": 28210 }, { "epoch": 1.0706847146991973, "grad_norm": 0.291015625, "learning_rate": 0.0001778516247408633, "loss": 2.790788269042969, "step": 28220 }, { "epoch": 1.0710641280899607, "grad_norm": 0.29296875, "learning_rate": 0.00017773316795529725, "loss": 2.758692741394043, "step": 28230 }, { "epoch": 1.071443541480724, "grad_norm": 0.294921875, "learning_rate": 0.00017761471907845158, "loss": 2.7947994232177735, "step": 28240 }, { "epoch": 1.0718229548714873, "grad_norm": 0.287109375, "learning_rate": 0.000177496278152397, "loss": 2.7694318771362303, "step": 28250 }, { "epoch": 1.0718229548714873, "eval_loss": 2.7911360263824463, "eval_runtime": 190.4638, "eval_samples_per_second": 19.999, "eval_steps_per_second": 3.334, "step": 28250 }, { "epoch": 1.0722023682622506, "grad_norm": 0.296875, "learning_rate": 0.0001773778452192011, "loss": 2.804987144470215, "step": 28260 }, { "epoch": 1.0725817816530139, "grad_norm": 0.30078125, "learning_rate": 0.0001772594203209289, "loss": 2.7704351425170897, "step": 28270 }, { "epoch": 1.072961195043777, "grad_norm": 0.298828125, "learning_rate": 0.00017714100349964257, "loss": 2.786202621459961, "step": 28280 }, { "epoch": 1.0733406084345403, "grad_norm": 0.302734375, "learning_rate": 0.00017702259479740118, "loss": 2.7684293746948243, "step": 28290 }, { "epoch": 1.0737200218253036, "grad_norm": 0.30859375, "learning_rate": 0.00017690419425626114, "loss": 2.7582157135009764, "step": 28300 }, { "epoch": 1.0740994352160669, "grad_norm": 0.298828125, "learning_rate": 0.00017678580191827587, "loss": 2.7917430877685545, "step": 28310 }, { "epoch": 1.0744788486068302, "grad_norm": 0.298828125, "learning_rate": 0.000176667417825496, "loss": 2.7860788345336913, "step": 28320 }, { "epoch": 1.0748582619975935, "grad_norm": 0.291015625, "learning_rate": 0.00017654904201996896, "loss": 2.7869583129882813, "step": 28330 }, { "epoch": 1.0752376753883568, "grad_norm": 0.302734375, "learning_rate": 0.00017643067454373943, "loss": 2.779834747314453, "step": 28340 }, { "epoch": 1.0756170887791199, "grad_norm": 0.294921875, "learning_rate": 0.00017631231543884917, "loss": 2.771871566772461, "step": 28350 }, { "epoch": 1.0759965021698832, "grad_norm": 0.30078125, "learning_rate": 0.00017619396474733692, "loss": 2.796026039123535, "step": 28360 }, { "epoch": 1.0763759155606465, "grad_norm": 0.29296875, "learning_rate": 0.0001760756225112383, "loss": 2.7796875, "step": 28370 }, { "epoch": 1.0767553289514098, "grad_norm": 0.296875, "learning_rate": 0.00017595728877258603, "loss": 2.8010723114013674, "step": 28380 }, { "epoch": 1.077134742342173, "grad_norm": 0.294921875, "learning_rate": 0.00017583896357341002, "loss": 2.788990592956543, "step": 28390 }, { "epoch": 1.0775141557329364, "grad_norm": 0.2890625, "learning_rate": 0.00017572064695573668, "loss": 2.7884668350219726, "step": 28400 }, { "epoch": 1.0778935691236997, "grad_norm": 0.29296875, "learning_rate": 0.0001756023389615898, "loss": 2.8070772171020506, "step": 28410 }, { "epoch": 1.0782729825144628, "grad_norm": 0.306640625, "learning_rate": 0.00017548403963298987, "loss": 2.7836828231811523, "step": 28420 }, { "epoch": 1.078652395905226, "grad_norm": 0.302734375, "learning_rate": 0.0001753657490119545, "loss": 2.7997331619262695, "step": 28430 }, { "epoch": 1.0790318092959894, "grad_norm": 0.2890625, "learning_rate": 0.00017524746714049793, "loss": 2.782060432434082, "step": 28440 }, { "epoch": 1.0794112226867527, "grad_norm": 0.302734375, "learning_rate": 0.00017512919406063152, "loss": 2.7851690292358398, "step": 28450 }, { "epoch": 1.079790636077516, "grad_norm": 0.294921875, "learning_rate": 0.00017501092981436338, "loss": 2.7920713424682617, "step": 28460 }, { "epoch": 1.0801700494682793, "grad_norm": 0.31640625, "learning_rate": 0.00017489267444369868, "loss": 2.7931827545166015, "step": 28470 }, { "epoch": 1.0805494628590424, "grad_norm": 0.294921875, "learning_rate": 0.00017477442799063912, "loss": 2.7538848876953126, "step": 28480 }, { "epoch": 1.0809288762498057, "grad_norm": 0.3046875, "learning_rate": 0.00017465619049718346, "loss": 2.763479232788086, "step": 28490 }, { "epoch": 1.081308289640569, "grad_norm": 0.298828125, "learning_rate": 0.0001745379620053273, "loss": 2.7742265701293944, "step": 28500 }, { "epoch": 1.081308289640569, "eval_loss": 2.7903730869293213, "eval_runtime": 189.7697, "eval_samples_per_second": 20.072, "eval_steps_per_second": 3.346, "step": 28500 }, { "epoch": 1.0816877030313323, "grad_norm": 0.306640625, "learning_rate": 0.00017441974255706291, "loss": 2.780679130554199, "step": 28510 }, { "epoch": 1.0820671164220956, "grad_norm": 0.3046875, "learning_rate": 0.00017430153219437936, "loss": 2.785821723937988, "step": 28520 }, { "epoch": 1.082446529812859, "grad_norm": 0.3046875, "learning_rate": 0.00017418333095926258, "loss": 2.7931150436401366, "step": 28530 }, { "epoch": 1.082825943203622, "grad_norm": 0.296875, "learning_rate": 0.00017406513889369526, "loss": 2.798258399963379, "step": 28540 }, { "epoch": 1.0832053565943853, "grad_norm": 0.2890625, "learning_rate": 0.00017394695603965677, "loss": 2.789473533630371, "step": 28550 }, { "epoch": 1.0835847699851486, "grad_norm": 0.30078125, "learning_rate": 0.0001738287824391231, "loss": 2.805739974975586, "step": 28560 }, { "epoch": 1.083964183375912, "grad_norm": 0.296875, "learning_rate": 0.00017371061813406725, "loss": 2.7964056015014647, "step": 28570 }, { "epoch": 1.0843435967666752, "grad_norm": 0.298828125, "learning_rate": 0.00017359246316645866, "loss": 2.7997331619262695, "step": 28580 }, { "epoch": 1.0847230101574385, "grad_norm": 0.30859375, "learning_rate": 0.0001734743175782636, "loss": 2.792850685119629, "step": 28590 }, { "epoch": 1.0851024235482019, "grad_norm": 0.31640625, "learning_rate": 0.00017335618141144487, "loss": 2.760815238952637, "step": 28600 }, { "epoch": 1.085481836938965, "grad_norm": 0.30859375, "learning_rate": 0.0001732380547079621, "loss": 2.7820867538452148, "step": 28610 }, { "epoch": 1.0858612503297282, "grad_norm": 0.302734375, "learning_rate": 0.00017311993750977135, "loss": 2.811574172973633, "step": 28620 }, { "epoch": 1.0862406637204916, "grad_norm": 0.31640625, "learning_rate": 0.0001730018298588255, "loss": 2.782013702392578, "step": 28630 }, { "epoch": 1.0866200771112549, "grad_norm": 0.296875, "learning_rate": 0.00017288373179707398, "loss": 2.788167190551758, "step": 28640 }, { "epoch": 1.0869994905020182, "grad_norm": 0.296875, "learning_rate": 0.00017276564336646274, "loss": 2.778248405456543, "step": 28650 }, { "epoch": 1.0873789038927815, "grad_norm": 0.30859375, "learning_rate": 0.00017264756460893432, "loss": 2.767258071899414, "step": 28660 }, { "epoch": 1.0877583172835448, "grad_norm": 0.306640625, "learning_rate": 0.00017252949556642791, "loss": 2.787116050720215, "step": 28670 }, { "epoch": 1.0881377306743079, "grad_norm": 0.30078125, "learning_rate": 0.0001724114362808793, "loss": 2.761539840698242, "step": 28680 }, { "epoch": 1.0885171440650712, "grad_norm": 0.296875, "learning_rate": 0.00017229338679422055, "loss": 2.7836633682250977, "step": 28690 }, { "epoch": 1.0888965574558345, "grad_norm": 0.30078125, "learning_rate": 0.00017217534714838046, "loss": 2.7731863021850587, "step": 28700 }, { "epoch": 1.0892759708465978, "grad_norm": 0.302734375, "learning_rate": 0.0001720573173852843, "loss": 2.769741439819336, "step": 28710 }, { "epoch": 1.089655384237361, "grad_norm": 0.310546875, "learning_rate": 0.0001719392975468539, "loss": 2.7744155883789063, "step": 28720 }, { "epoch": 1.0900347976281244, "grad_norm": 0.30078125, "learning_rate": 0.00017182128767500733, "loss": 2.7889915466308595, "step": 28730 }, { "epoch": 1.0904142110188875, "grad_norm": 0.294921875, "learning_rate": 0.00017170328781165932, "loss": 2.815344047546387, "step": 28740 }, { "epoch": 1.0907936244096508, "grad_norm": 0.30078125, "learning_rate": 0.00017158529799872097, "loss": 2.770720863342285, "step": 28750 }, { "epoch": 1.0907936244096508, "eval_loss": 2.789764165878296, "eval_runtime": 189.9259, "eval_samples_per_second": 20.055, "eval_steps_per_second": 3.343, "step": 28750 }, { "epoch": 1.091173037800414, "grad_norm": 0.306640625, "learning_rate": 0.00017146731827810002, "loss": 2.78243350982666, "step": 28760 }, { "epoch": 1.0915524511911774, "grad_norm": 0.296875, "learning_rate": 0.00017134934869170014, "loss": 2.8160200119018555, "step": 28770 }, { "epoch": 1.0919318645819407, "grad_norm": 0.30859375, "learning_rate": 0.00017123138928142186, "loss": 2.7999719619750976, "step": 28780 }, { "epoch": 1.092311277972704, "grad_norm": 0.3125, "learning_rate": 0.000171113440089162, "loss": 2.7860111236572265, "step": 28790 }, { "epoch": 1.092690691363467, "grad_norm": 0.30859375, "learning_rate": 0.00017099550115681352, "loss": 2.7981550216674806, "step": 28800 }, { "epoch": 1.0930701047542304, "grad_norm": 0.30078125, "learning_rate": 0.00017087757252626595, "loss": 2.7788421630859377, "step": 28810 }, { "epoch": 1.0934495181449937, "grad_norm": 0.296875, "learning_rate": 0.00017075965423940517, "loss": 2.78209228515625, "step": 28820 }, { "epoch": 1.093828931535757, "grad_norm": 0.314453125, "learning_rate": 0.0001706417463381134, "loss": 2.780582809448242, "step": 28830 }, { "epoch": 1.0942083449265203, "grad_norm": 0.29296875, "learning_rate": 0.00017052384886426892, "loss": 2.7766395568847657, "step": 28840 }, { "epoch": 1.0945877583172836, "grad_norm": 0.291015625, "learning_rate": 0.00017040596185974652, "loss": 2.773018646240234, "step": 28850 }, { "epoch": 1.094967171708047, "grad_norm": 0.296875, "learning_rate": 0.00017028808536641735, "loss": 2.764349365234375, "step": 28860 }, { "epoch": 1.09534658509881, "grad_norm": 0.298828125, "learning_rate": 0.0001701702194261487, "loss": 2.796035385131836, "step": 28870 }, { "epoch": 1.0957259984895733, "grad_norm": 0.30078125, "learning_rate": 0.000170052364080804, "loss": 2.7673349380493164, "step": 28880 }, { "epoch": 1.0961054118803366, "grad_norm": 0.3046875, "learning_rate": 0.0001699345193722431, "loss": 2.7946893692016603, "step": 28890 }, { "epoch": 1.0964848252711, "grad_norm": 0.3125, "learning_rate": 0.00016981668534232213, "loss": 2.7871126174926757, "step": 28900 }, { "epoch": 1.0968642386618632, "grad_norm": 0.29296875, "learning_rate": 0.00016969886203289315, "loss": 2.829139518737793, "step": 28910 }, { "epoch": 1.0972436520526265, "grad_norm": 0.296875, "learning_rate": 0.0001695810494858046, "loss": 2.7891250610351563, "step": 28920 }, { "epoch": 1.0976230654433896, "grad_norm": 0.298828125, "learning_rate": 0.00016946324774290113, "loss": 2.817679786682129, "step": 28930 }, { "epoch": 1.098002478834153, "grad_norm": 0.294921875, "learning_rate": 0.00016934545684602355, "loss": 2.788760185241699, "step": 28940 }, { "epoch": 1.0983818922249162, "grad_norm": 0.30078125, "learning_rate": 0.0001692276768370086, "loss": 2.7721471786499023, "step": 28950 }, { "epoch": 1.0987613056156795, "grad_norm": 0.3125, "learning_rate": 0.00016910990775768943, "loss": 2.7893402099609377, "step": 28960 }, { "epoch": 1.0991407190064428, "grad_norm": 0.294921875, "learning_rate": 0.00016899214964989515, "loss": 2.771762657165527, "step": 28970 }, { "epoch": 1.0995201323972061, "grad_norm": 0.294921875, "learning_rate": 0.0001688744025554511, "loss": 2.8199508666992186, "step": 28980 }, { "epoch": 1.0998995457879694, "grad_norm": 0.30078125, "learning_rate": 0.00016875666651617848, "loss": 2.7733743667602537, "step": 28990 }, { "epoch": 1.1002789591787325, "grad_norm": 0.296875, "learning_rate": 0.00016863894157389476, "loss": 2.767172431945801, "step": 29000 }, { "epoch": 1.1002789591787325, "eval_loss": 2.7889413833618164, "eval_runtime": 189.7304, "eval_samples_per_second": 20.076, "eval_steps_per_second": 3.347, "step": 29000 }, { "epoch": 1.1006583725694958, "grad_norm": 0.294921875, "learning_rate": 0.00016852122777041351, "loss": 2.7551666259765626, "step": 29010 }, { "epoch": 1.1010377859602591, "grad_norm": 0.30078125, "learning_rate": 0.00016840352514754415, "loss": 2.7403453826904296, "step": 29020 }, { "epoch": 1.1014171993510224, "grad_norm": 0.30078125, "learning_rate": 0.00016828583374709223, "loss": 2.784125328063965, "step": 29030 }, { "epoch": 1.1017966127417858, "grad_norm": 0.306640625, "learning_rate": 0.00016816815361085934, "loss": 2.7885162353515627, "step": 29040 }, { "epoch": 1.102176026132549, "grad_norm": 0.30859375, "learning_rate": 0.0001680504847806431, "loss": 2.7691627502441407, "step": 29050 }, { "epoch": 1.1025554395233121, "grad_norm": 0.298828125, "learning_rate": 0.00016793282729823697, "loss": 2.772763252258301, "step": 29060 }, { "epoch": 1.1029348529140754, "grad_norm": 0.302734375, "learning_rate": 0.00016781518120543044, "loss": 2.79200439453125, "step": 29070 }, { "epoch": 1.1033142663048388, "grad_norm": 0.322265625, "learning_rate": 0.00016769754654400904, "loss": 2.779529571533203, "step": 29080 }, { "epoch": 1.103693679695602, "grad_norm": 0.296875, "learning_rate": 0.00016757992335575427, "loss": 2.76479549407959, "step": 29090 }, { "epoch": 1.1040730930863654, "grad_norm": 0.3046875, "learning_rate": 0.00016746231168244329, "loss": 2.7866819381713865, "step": 29100 }, { "epoch": 1.1044525064771287, "grad_norm": 0.296875, "learning_rate": 0.00016734471156584942, "loss": 2.7974843978881836, "step": 29110 }, { "epoch": 1.104831919867892, "grad_norm": 0.3046875, "learning_rate": 0.00016722712304774185, "loss": 2.786004066467285, "step": 29120 }, { "epoch": 1.105211333258655, "grad_norm": 0.291015625, "learning_rate": 0.00016710954616988555, "loss": 2.7902122497558595, "step": 29130 }, { "epoch": 1.1055907466494184, "grad_norm": 0.2890625, "learning_rate": 0.00016699198097404135, "loss": 2.759429931640625, "step": 29140 }, { "epoch": 1.1059701600401817, "grad_norm": 0.291015625, "learning_rate": 0.00016687442750196608, "loss": 2.792978858947754, "step": 29150 }, { "epoch": 1.106349573430945, "grad_norm": 0.298828125, "learning_rate": 0.00016675688579541234, "loss": 2.771105194091797, "step": 29160 }, { "epoch": 1.1067289868217083, "grad_norm": 0.30078125, "learning_rate": 0.00016663935589612843, "loss": 2.788691520690918, "step": 29170 }, { "epoch": 1.1071084002124716, "grad_norm": 0.29296875, "learning_rate": 0.00016652183784585857, "loss": 2.7625707626342773, "step": 29180 }, { "epoch": 1.1074878136032347, "grad_norm": 0.302734375, "learning_rate": 0.00016640433168634282, "loss": 2.7684179306030274, "step": 29190 }, { "epoch": 1.107867226993998, "grad_norm": 0.302734375, "learning_rate": 0.00016628683745931685, "loss": 2.778386116027832, "step": 29200 }, { "epoch": 1.1082466403847613, "grad_norm": 0.306640625, "learning_rate": 0.00016616935520651224, "loss": 2.7620277404785156, "step": 29210 }, { "epoch": 1.1086260537755246, "grad_norm": 0.302734375, "learning_rate": 0.0001660518849696563, "loss": 2.791958045959473, "step": 29220 }, { "epoch": 1.109005467166288, "grad_norm": 0.3046875, "learning_rate": 0.000165934426790472, "loss": 2.7753017425537108, "step": 29230 }, { "epoch": 1.1093848805570512, "grad_norm": 0.326171875, "learning_rate": 0.00016581698071067806, "loss": 2.7742597579956056, "step": 29240 }, { "epoch": 1.1097642939478143, "grad_norm": 0.302734375, "learning_rate": 0.00016569954677198893, "loss": 2.768603706359863, "step": 29250 }, { "epoch": 1.1097642939478143, "eval_loss": 2.791550397872925, "eval_runtime": 189.7157, "eval_samples_per_second": 20.077, "eval_steps_per_second": 3.347, "step": 29250 }, { "epoch": 1.1101437073385776, "grad_norm": 0.3125, "learning_rate": 0.00016558212501611476, "loss": 2.7843252182006837, "step": 29260 }, { "epoch": 1.110523120729341, "grad_norm": 0.294921875, "learning_rate": 0.00016546471548476132, "loss": 2.7901559829711915, "step": 29270 }, { "epoch": 1.1109025341201042, "grad_norm": 0.294921875, "learning_rate": 0.00016534731821963, "loss": 2.7804866790771485, "step": 29280 }, { "epoch": 1.1112819475108675, "grad_norm": 0.294921875, "learning_rate": 0.00016522993326241792, "loss": 2.7736316680908204, "step": 29290 }, { "epoch": 1.1116613609016308, "grad_norm": 0.306640625, "learning_rate": 0.00016511256065481787, "loss": 2.8028955459594727, "step": 29300 }, { "epoch": 1.1120407742923941, "grad_norm": 0.30078125, "learning_rate": 0.0001649952004385181, "loss": 2.773177909851074, "step": 29310 }, { "epoch": 1.1124201876831572, "grad_norm": 0.3125, "learning_rate": 0.00016487785265520253, "loss": 2.803989219665527, "step": 29320 }, { "epoch": 1.1127996010739205, "grad_norm": 0.296875, "learning_rate": 0.00016476051734655076, "loss": 2.7698843002319338, "step": 29330 }, { "epoch": 1.1131790144646838, "grad_norm": 0.3046875, "learning_rate": 0.00016464319455423792, "loss": 2.792783737182617, "step": 29340 }, { "epoch": 1.1135584278554471, "grad_norm": 0.294921875, "learning_rate": 0.00016452588431993445, "loss": 2.7519039154052733, "step": 29350 }, { "epoch": 1.1139378412462104, "grad_norm": 0.298828125, "learning_rate": 0.00016440858668530667, "loss": 2.7763578414916994, "step": 29360 }, { "epoch": 1.1143172546369737, "grad_norm": 0.291015625, "learning_rate": 0.0001642913016920163, "loss": 2.769949722290039, "step": 29370 }, { "epoch": 1.114696668027737, "grad_norm": 0.296875, "learning_rate": 0.00016417402938172053, "loss": 2.778183364868164, "step": 29380 }, { "epoch": 1.1150760814185001, "grad_norm": 0.29296875, "learning_rate": 0.000164056769796072, "loss": 2.7839927673339844, "step": 29390 }, { "epoch": 1.1154554948092634, "grad_norm": 0.328125, "learning_rate": 0.00016393952297671897, "loss": 2.8126983642578125, "step": 29400 }, { "epoch": 1.1158349082000267, "grad_norm": 0.302734375, "learning_rate": 0.00016382228896530518, "loss": 2.7815315246582033, "step": 29410 }, { "epoch": 1.11621432159079, "grad_norm": 0.306640625, "learning_rate": 0.00016370506780346954, "loss": 2.7813796997070312, "step": 29420 }, { "epoch": 1.1165937349815533, "grad_norm": 0.298828125, "learning_rate": 0.00016358785953284673, "loss": 2.7985830307006836, "step": 29430 }, { "epoch": 1.1169731483723166, "grad_norm": 0.28515625, "learning_rate": 0.00016347066419506668, "loss": 2.785284423828125, "step": 29440 }, { "epoch": 1.1173525617630797, "grad_norm": 0.29296875, "learning_rate": 0.00016335348183175484, "loss": 2.7636661529541016, "step": 29450 }, { "epoch": 1.117731975153843, "grad_norm": 0.3046875, "learning_rate": 0.0001632363124845318, "loss": 2.7807945251464843, "step": 29460 }, { "epoch": 1.1181113885446063, "grad_norm": 0.29296875, "learning_rate": 0.0001631191561950138, "loss": 2.7913835525512694, "step": 29470 }, { "epoch": 1.1184908019353696, "grad_norm": 0.294921875, "learning_rate": 0.00016300201300481235, "loss": 2.777690124511719, "step": 29480 }, { "epoch": 1.118870215326133, "grad_norm": 0.3046875, "learning_rate": 0.00016288488295553436, "loss": 2.77139949798584, "step": 29490 }, { "epoch": 1.1192496287168963, "grad_norm": 0.314453125, "learning_rate": 0.00016276776608878187, "loss": 2.788038635253906, "step": 29500 }, { "epoch": 1.1192496287168963, "eval_loss": 2.7879996299743652, "eval_runtime": 189.6969, "eval_samples_per_second": 20.079, "eval_steps_per_second": 3.347, "step": 29500 }, { "epoch": 1.1196290421076593, "grad_norm": 0.287109375, "learning_rate": 0.00016265066244615248, "loss": 2.7513957977294923, "step": 29510 }, { "epoch": 1.1200084554984227, "grad_norm": 0.296875, "learning_rate": 0.00016253357206923904, "loss": 2.7855958938598633, "step": 29520 }, { "epoch": 1.120387868889186, "grad_norm": 0.302734375, "learning_rate": 0.00016241649499962945, "loss": 2.818332481384277, "step": 29530 }, { "epoch": 1.1207672822799493, "grad_norm": 0.3046875, "learning_rate": 0.00016229943127890723, "loss": 2.816667175292969, "step": 29540 }, { "epoch": 1.1211466956707126, "grad_norm": 0.29296875, "learning_rate": 0.00016218238094865093, "loss": 2.78240966796875, "step": 29550 }, { "epoch": 1.1215261090614759, "grad_norm": 0.306640625, "learning_rate": 0.00016206534405043457, "loss": 2.765675354003906, "step": 29560 }, { "epoch": 1.1219055224522392, "grad_norm": 0.3125, "learning_rate": 0.00016194832062582708, "loss": 2.7861862182617188, "step": 29570 }, { "epoch": 1.1222849358430023, "grad_norm": 0.291015625, "learning_rate": 0.00016183131071639278, "loss": 2.759438896179199, "step": 29580 }, { "epoch": 1.1226643492337656, "grad_norm": 0.30078125, "learning_rate": 0.00016171431436369122, "loss": 2.78537654876709, "step": 29590 }, { "epoch": 1.1230437626245289, "grad_norm": 0.30078125, "learning_rate": 0.0001615973316092772, "loss": 2.7682445526123045, "step": 29600 }, { "epoch": 1.1234231760152922, "grad_norm": 0.3046875, "learning_rate": 0.00016148036249470041, "loss": 2.752110481262207, "step": 29610 }, { "epoch": 1.1238025894060555, "grad_norm": 0.30859375, "learning_rate": 0.00016136340706150596, "loss": 2.8039758682250975, "step": 29620 }, { "epoch": 1.1241820027968188, "grad_norm": 0.30859375, "learning_rate": 0.00016124646535123404, "loss": 2.803653526306152, "step": 29630 }, { "epoch": 1.124561416187582, "grad_norm": 0.29296875, "learning_rate": 0.00016112953740541984, "loss": 2.7557321548461915, "step": 29640 }, { "epoch": 1.1249408295783452, "grad_norm": 0.302734375, "learning_rate": 0.00016101262326559382, "loss": 2.7823776245117187, "step": 29650 }, { "epoch": 1.1253202429691085, "grad_norm": 0.294921875, "learning_rate": 0.0001608957229732815, "loss": 2.773545265197754, "step": 29660 }, { "epoch": 1.1256996563598718, "grad_norm": 0.3359375, "learning_rate": 0.0001607788365700035, "loss": 2.7435897827148437, "step": 29670 }, { "epoch": 1.126079069750635, "grad_norm": 0.306640625, "learning_rate": 0.0001606619640972753, "loss": 2.8148513793945313, "step": 29680 }, { "epoch": 1.1264584831413984, "grad_norm": 0.310546875, "learning_rate": 0.0001605451055966077, "loss": 2.770188903808594, "step": 29690 }, { "epoch": 1.1268378965321615, "grad_norm": 0.302734375, "learning_rate": 0.00016042826110950647, "loss": 2.7557559967041017, "step": 29700 }, { "epoch": 1.1272173099229248, "grad_norm": 0.298828125, "learning_rate": 0.00016031143067747237, "loss": 2.755065155029297, "step": 29710 }, { "epoch": 1.127596723313688, "grad_norm": 0.30859375, "learning_rate": 0.00016019461434200102, "loss": 2.7652984619140626, "step": 29720 }, { "epoch": 1.1279761367044514, "grad_norm": 0.306640625, "learning_rate": 0.0001600778121445833, "loss": 2.7827526092529298, "step": 29730 }, { "epoch": 1.1283555500952147, "grad_norm": 0.294921875, "learning_rate": 0.00015996102412670497, "loss": 2.812664604187012, "step": 29740 }, { "epoch": 1.128734963485978, "grad_norm": 0.294921875, "learning_rate": 0.00015984425032984666, "loss": 2.798584747314453, "step": 29750 }, { "epoch": 1.128734963485978, "eval_loss": 2.7876031398773193, "eval_runtime": 189.7365, "eval_samples_per_second": 20.075, "eval_steps_per_second": 3.347, "step": 29750 }, { "epoch": 1.1291143768767413, "grad_norm": 0.30078125, "learning_rate": 0.000159727490795484, "loss": 2.7903308868408203, "step": 29760 }, { "epoch": 1.1294937902675044, "grad_norm": 0.298828125, "learning_rate": 0.00015961074556508764, "loss": 2.7981582641601563, "step": 29770 }, { "epoch": 1.1298732036582677, "grad_norm": 0.3203125, "learning_rate": 0.0001594940146801231, "loss": 2.7476934432983398, "step": 29780 }, { "epoch": 1.130252617049031, "grad_norm": 0.298828125, "learning_rate": 0.00015937729818205068, "loss": 2.7978631973266603, "step": 29790 }, { "epoch": 1.1306320304397943, "grad_norm": 0.31640625, "learning_rate": 0.0001592605961123258, "loss": 2.7848638534545898, "step": 29800 }, { "epoch": 1.1310114438305576, "grad_norm": 0.302734375, "learning_rate": 0.00015914390851239858, "loss": 2.776778984069824, "step": 29810 }, { "epoch": 1.131390857221321, "grad_norm": 0.306640625, "learning_rate": 0.00015902723542371398, "loss": 2.7786970138549805, "step": 29820 }, { "epoch": 1.1317702706120842, "grad_norm": 0.302734375, "learning_rate": 0.00015891057688771198, "loss": 2.7815832138061523, "step": 29830 }, { "epoch": 1.1321496840028473, "grad_norm": 0.30078125, "learning_rate": 0.00015879393294582726, "loss": 2.7850978851318358, "step": 29840 }, { "epoch": 1.1325290973936106, "grad_norm": 0.30078125, "learning_rate": 0.00015867730363948942, "loss": 2.770014190673828, "step": 29850 }, { "epoch": 1.132908510784374, "grad_norm": 0.291015625, "learning_rate": 0.0001585606890101226, "loss": 2.780798149108887, "step": 29860 }, { "epoch": 1.1332879241751372, "grad_norm": 0.322265625, "learning_rate": 0.00015844408909914607, "loss": 2.775553512573242, "step": 29870 }, { "epoch": 1.1336673375659005, "grad_norm": 0.314453125, "learning_rate": 0.0001583275039479737, "loss": 2.7713949203491213, "step": 29880 }, { "epoch": 1.1340467509566639, "grad_norm": 0.291015625, "learning_rate": 0.00015821093359801413, "loss": 2.7889101028442385, "step": 29890 }, { "epoch": 1.1344261643474272, "grad_norm": 0.29296875, "learning_rate": 0.0001580943780906707, "loss": 2.783517837524414, "step": 29900 }, { "epoch": 1.1348055777381902, "grad_norm": 0.302734375, "learning_rate": 0.00015797783746734155, "loss": 2.7745214462280274, "step": 29910 }, { "epoch": 1.1351849911289535, "grad_norm": 0.302734375, "learning_rate": 0.0001578613117694196, "loss": 2.7680170059204103, "step": 29920 }, { "epoch": 1.1355644045197169, "grad_norm": 0.29296875, "learning_rate": 0.0001577448010382922, "loss": 2.776707077026367, "step": 29930 }, { "epoch": 1.1359438179104802, "grad_norm": 0.302734375, "learning_rate": 0.00015762830531534168, "loss": 2.795788383483887, "step": 29940 }, { "epoch": 1.1363232313012435, "grad_norm": 0.291015625, "learning_rate": 0.00015751182464194492, "loss": 2.740387535095215, "step": 29950 }, { "epoch": 1.1367026446920065, "grad_norm": 0.302734375, "learning_rate": 0.00015739535905947354, "loss": 2.7558320999145507, "step": 29960 }, { "epoch": 1.1370820580827699, "grad_norm": 0.296875, "learning_rate": 0.0001572789086092935, "loss": 2.7955881118774415, "step": 29970 }, { "epoch": 1.1374614714735332, "grad_norm": 0.306640625, "learning_rate": 0.00015716247333276575, "loss": 2.7744237899780275, "step": 29980 }, { "epoch": 1.1378408848642965, "grad_norm": 0.310546875, "learning_rate": 0.00015704605327124566, "loss": 2.7746454238891602, "step": 29990 }, { "epoch": 1.1382202982550598, "grad_norm": 0.298828125, "learning_rate": 0.00015692964846608336, "loss": 2.789381408691406, "step": 30000 }, { "epoch": 1.1382202982550598, "eval_loss": 2.786647319793701, "eval_runtime": 189.767, "eval_samples_per_second": 20.072, "eval_steps_per_second": 3.346, "step": 30000 }, { "epoch": 1.138599711645823, "grad_norm": 0.314453125, "learning_rate": 0.00015681325895862324, "loss": 2.766264533996582, "step": 30010 }, { "epoch": 1.1389791250365864, "grad_norm": 0.296875, "learning_rate": 0.00015669688479020457, "loss": 2.7526357650756834, "step": 30020 }, { "epoch": 1.1393585384273495, "grad_norm": 0.294921875, "learning_rate": 0.00015658052600216112, "loss": 2.7906240463256835, "step": 30030 }, { "epoch": 1.1397379518181128, "grad_norm": 0.291015625, "learning_rate": 0.00015646418263582097, "loss": 2.776152420043945, "step": 30040 }, { "epoch": 1.140117365208876, "grad_norm": 0.298828125, "learning_rate": 0.00015634785473250698, "loss": 2.770591926574707, "step": 30050 }, { "epoch": 1.1404967785996394, "grad_norm": 0.3046875, "learning_rate": 0.00015623154233353643, "loss": 2.772708702087402, "step": 30060 }, { "epoch": 1.1408761919904027, "grad_norm": 0.298828125, "learning_rate": 0.00015611524548022116, "loss": 2.792009735107422, "step": 30070 }, { "epoch": 1.141255605381166, "grad_norm": 0.30078125, "learning_rate": 0.00015599896421386727, "loss": 2.81104736328125, "step": 30080 }, { "epoch": 1.1416350187719293, "grad_norm": 0.296875, "learning_rate": 0.00015588269857577555, "loss": 2.7798933029174804, "step": 30090 }, { "epoch": 1.1420144321626924, "grad_norm": 0.302734375, "learning_rate": 0.00015576644860724117, "loss": 2.785451126098633, "step": 30100 }, { "epoch": 1.1423938455534557, "grad_norm": 0.287109375, "learning_rate": 0.00015565021434955377, "loss": 2.787906837463379, "step": 30110 }, { "epoch": 1.142773258944219, "grad_norm": 0.31640625, "learning_rate": 0.00015553399584399723, "loss": 2.7504335403442384, "step": 30120 }, { "epoch": 1.1431526723349823, "grad_norm": 0.29296875, "learning_rate": 0.00015541779313185008, "loss": 2.788090133666992, "step": 30130 }, { "epoch": 1.1435320857257456, "grad_norm": 0.298828125, "learning_rate": 0.00015530160625438515, "loss": 2.748646354675293, "step": 30140 }, { "epoch": 1.143911499116509, "grad_norm": 0.298828125, "learning_rate": 0.0001551854352528695, "loss": 2.7837736129760744, "step": 30150 }, { "epoch": 1.144290912507272, "grad_norm": 0.306640625, "learning_rate": 0.00015506928016856476, "loss": 2.8000246047973634, "step": 30160 }, { "epoch": 1.1446703258980353, "grad_norm": 0.3046875, "learning_rate": 0.00015495314104272683, "loss": 2.770271873474121, "step": 30170 }, { "epoch": 1.1450497392887986, "grad_norm": 0.294921875, "learning_rate": 0.00015483701791660603, "loss": 2.7747236251831056, "step": 30180 }, { "epoch": 1.145429152679562, "grad_norm": 0.302734375, "learning_rate": 0.00015472091083144668, "loss": 2.7664737701416016, "step": 30190 }, { "epoch": 1.1458085660703252, "grad_norm": 0.314453125, "learning_rate": 0.00015460481982848778, "loss": 2.7726133346557615, "step": 30200 }, { "epoch": 1.1461879794610885, "grad_norm": 0.29296875, "learning_rate": 0.00015448874494896245, "loss": 2.781939888000488, "step": 30210 }, { "epoch": 1.1465673928518516, "grad_norm": 0.296875, "learning_rate": 0.00015437268623409812, "loss": 2.7441003799438475, "step": 30220 }, { "epoch": 1.146946806242615, "grad_norm": 0.30859375, "learning_rate": 0.00015425664372511637, "loss": 2.7906940460205076, "step": 30230 }, { "epoch": 1.1473262196333782, "grad_norm": 0.302734375, "learning_rate": 0.00015414061746323312, "loss": 2.7784749984741213, "step": 30240 }, { "epoch": 1.1477056330241415, "grad_norm": 0.296875, "learning_rate": 0.00015402460748965865, "loss": 2.813048553466797, "step": 30250 }, { "epoch": 1.1477056330241415, "eval_loss": 2.787014961242676, "eval_runtime": 189.7647, "eval_samples_per_second": 20.072, "eval_steps_per_second": 3.346, "step": 30250 }, { "epoch": 1.1480850464149048, "grad_norm": 0.3046875, "learning_rate": 0.00015390861384559717, "loss": 2.7542816162109376, "step": 30260 }, { "epoch": 1.1484644598056681, "grad_norm": 0.30078125, "learning_rate": 0.00015379263657224723, "loss": 2.7809288024902346, "step": 30270 }, { "epoch": 1.1488438731964314, "grad_norm": 0.29296875, "learning_rate": 0.00015367667571080165, "loss": 2.7746137619018554, "step": 30280 }, { "epoch": 1.1492232865871945, "grad_norm": 0.296875, "learning_rate": 0.00015356073130244738, "loss": 2.7704517364501955, "step": 30290 }, { "epoch": 1.1496026999779578, "grad_norm": 0.291015625, "learning_rate": 0.0001534448033883653, "loss": 2.7817966461181642, "step": 30300 }, { "epoch": 1.1499821133687211, "grad_norm": 0.302734375, "learning_rate": 0.00015332889200973078, "loss": 2.761709976196289, "step": 30310 }, { "epoch": 1.1503615267594844, "grad_norm": 0.29296875, "learning_rate": 0.00015321299720771314, "loss": 2.7558801651000975, "step": 30320 }, { "epoch": 1.1507409401502477, "grad_norm": 0.298828125, "learning_rate": 0.0001530971190234758, "loss": 2.7730291366577147, "step": 30330 }, { "epoch": 1.151120353541011, "grad_norm": 0.3203125, "learning_rate": 0.00015298125749817624, "loss": 2.803190803527832, "step": 30340 }, { "epoch": 1.1514997669317744, "grad_norm": 0.298828125, "learning_rate": 0.0001528654126729662, "loss": 2.779417610168457, "step": 30350 }, { "epoch": 1.1518791803225374, "grad_norm": 0.298828125, "learning_rate": 0.00015274958458899135, "loss": 2.747758483886719, "step": 30360 }, { "epoch": 1.1522585937133007, "grad_norm": 0.29296875, "learning_rate": 0.00015263377328739137, "loss": 2.7647146224975585, "step": 30370 }, { "epoch": 1.152638007104064, "grad_norm": 0.2890625, "learning_rate": 0.00015251797880930015, "loss": 2.7770204544067383, "step": 30380 }, { "epoch": 1.1530174204948274, "grad_norm": 0.298828125, "learning_rate": 0.0001524022011958454, "loss": 2.8046106338500976, "step": 30390 }, { "epoch": 1.1533968338855907, "grad_norm": 0.31640625, "learning_rate": 0.00015228644048814906, "loss": 2.786570358276367, "step": 30400 }, { "epoch": 1.1537762472763538, "grad_norm": 0.306640625, "learning_rate": 0.00015217069672732688, "loss": 2.7472164154052736, "step": 30410 }, { "epoch": 1.154155660667117, "grad_norm": 0.30859375, "learning_rate": 0.00015205496995448868, "loss": 2.796977996826172, "step": 30420 }, { "epoch": 1.1545350740578804, "grad_norm": 0.306640625, "learning_rate": 0.0001519392602107382, "loss": 2.7743114471435546, "step": 30430 }, { "epoch": 1.1549144874486437, "grad_norm": 0.30078125, "learning_rate": 0.0001518235675371732, "loss": 2.7892177581787108, "step": 30440 }, { "epoch": 1.155293900839407, "grad_norm": 0.291015625, "learning_rate": 0.00015170789197488529, "loss": 2.7681161880493166, "step": 30450 }, { "epoch": 1.1556733142301703, "grad_norm": 0.3046875, "learning_rate": 0.0001515922335649601, "loss": 2.805204391479492, "step": 30460 }, { "epoch": 1.1560527276209336, "grad_norm": 0.302734375, "learning_rate": 0.0001514765923484771, "loss": 2.802520751953125, "step": 30470 }, { "epoch": 1.1564321410116967, "grad_norm": 0.298828125, "learning_rate": 0.00015136096836650963, "loss": 2.7859737396240236, "step": 30480 }, { "epoch": 1.15681155440246, "grad_norm": 0.302734375, "learning_rate": 0.000151245361660125, "loss": 2.7620113372802733, "step": 30490 }, { "epoch": 1.1571909677932233, "grad_norm": 0.3046875, "learning_rate": 0.00015112977227038432, "loss": 2.7748268127441404, "step": 30500 }, { "epoch": 1.1571909677932233, "eval_loss": 2.7857508659362793, "eval_runtime": 189.8391, "eval_samples_per_second": 20.064, "eval_steps_per_second": 3.345, "step": 30500 }, { "epoch": 1.1575703811839866, "grad_norm": 0.294921875, "learning_rate": 0.00015101420023834255, "loss": 2.7684406280517577, "step": 30510 }, { "epoch": 1.15794979457475, "grad_norm": 0.310546875, "learning_rate": 0.00015089864560504848, "loss": 2.785778045654297, "step": 30520 }, { "epoch": 1.1583292079655132, "grad_norm": 0.29296875, "learning_rate": 0.00015078310841154475, "loss": 2.7682409286499023, "step": 30530 }, { "epoch": 1.1587086213562765, "grad_norm": 0.294921875, "learning_rate": 0.0001506675886988679, "loss": 2.754154396057129, "step": 30540 }, { "epoch": 1.1590880347470396, "grad_norm": 0.302734375, "learning_rate": 0.00015055208650804796, "loss": 2.768457794189453, "step": 30550 }, { "epoch": 1.159467448137803, "grad_norm": 0.28515625, "learning_rate": 0.000150436601880109, "loss": 2.7523273468017577, "step": 30560 }, { "epoch": 1.1598468615285662, "grad_norm": 0.30078125, "learning_rate": 0.00015032113485606886, "loss": 2.7683454513549806, "step": 30570 }, { "epoch": 1.1602262749193295, "grad_norm": 0.302734375, "learning_rate": 0.00015020568547693906, "loss": 2.760143852233887, "step": 30580 }, { "epoch": 1.1606056883100928, "grad_norm": 0.3046875, "learning_rate": 0.0001500902537837247, "loss": 2.80804500579834, "step": 30590 }, { "epoch": 1.1609851017008561, "grad_norm": 0.3125, "learning_rate": 0.0001499748398174248, "loss": 2.777779960632324, "step": 30600 }, { "epoch": 1.1613645150916194, "grad_norm": 0.3046875, "learning_rate": 0.00014985944361903207, "loss": 2.771732139587402, "step": 30610 }, { "epoch": 1.1617439284823825, "grad_norm": 0.3203125, "learning_rate": 0.0001497440652295329, "loss": 2.7830026626586912, "step": 30620 }, { "epoch": 1.1621233418731458, "grad_norm": 0.294921875, "learning_rate": 0.00014962870468990721, "loss": 2.786970329284668, "step": 30630 }, { "epoch": 1.1625027552639091, "grad_norm": 0.298828125, "learning_rate": 0.00014951336204112873, "loss": 2.758565902709961, "step": 30640 }, { "epoch": 1.1628821686546724, "grad_norm": 0.294921875, "learning_rate": 0.00014939803732416489, "loss": 2.7615299224853516, "step": 30650 }, { "epoch": 1.1632615820454357, "grad_norm": 0.29296875, "learning_rate": 0.0001492827305799765, "loss": 2.8085205078125, "step": 30660 }, { "epoch": 1.1636409954361988, "grad_norm": 0.314453125, "learning_rate": 0.0001491674418495182, "loss": 2.7439044952392577, "step": 30670 }, { "epoch": 1.1640204088269621, "grad_norm": 0.306640625, "learning_rate": 0.00014905217117373816, "loss": 2.7748249053955076, "step": 30680 }, { "epoch": 1.1643998222177254, "grad_norm": 0.310546875, "learning_rate": 0.00014893691859357828, "loss": 2.796868324279785, "step": 30690 }, { "epoch": 1.1647792356084887, "grad_norm": 0.302734375, "learning_rate": 0.0001488216841499737, "loss": 2.747126579284668, "step": 30700 }, { "epoch": 1.165158648999252, "grad_norm": 0.296875, "learning_rate": 0.00014870646788385346, "loss": 2.8088346481323243, "step": 30710 }, { "epoch": 1.1655380623900153, "grad_norm": 0.30078125, "learning_rate": 0.00014859126983613992, "loss": 2.797402763366699, "step": 30720 }, { "epoch": 1.1659174757807786, "grad_norm": 0.30078125, "learning_rate": 0.00014847609004774924, "loss": 2.7700521469116213, "step": 30730 }, { "epoch": 1.1662968891715417, "grad_norm": 0.302734375, "learning_rate": 0.0001483609285595907, "loss": 2.764089584350586, "step": 30740 }, { "epoch": 1.166676302562305, "grad_norm": 0.298828125, "learning_rate": 0.00014824578541256731, "loss": 2.79050350189209, "step": 30750 }, { "epoch": 1.166676302562305, "eval_loss": 2.784939765930176, "eval_runtime": 190.7484, "eval_samples_per_second": 19.969, "eval_steps_per_second": 3.329, "step": 30750 }, { "epoch": 1.1670557159530683, "grad_norm": 0.30859375, "learning_rate": 0.00014813066064757571, "loss": 2.802741050720215, "step": 30760 }, { "epoch": 1.1674351293438316, "grad_norm": 0.294921875, "learning_rate": 0.00014801555430550567, "loss": 2.770320701599121, "step": 30770 }, { "epoch": 1.167814542734595, "grad_norm": 0.30859375, "learning_rate": 0.00014790046642724065, "loss": 2.7888444900512694, "step": 30780 }, { "epoch": 1.1681939561253583, "grad_norm": 0.296875, "learning_rate": 0.00014778539705365747, "loss": 2.757907485961914, "step": 30790 }, { "epoch": 1.1685733695161216, "grad_norm": 0.306640625, "learning_rate": 0.0001476703462256266, "loss": 2.7560298919677733, "step": 30800 }, { "epoch": 1.1689527829068846, "grad_norm": 0.30859375, "learning_rate": 0.0001475553139840114, "loss": 2.7728364944458006, "step": 30810 }, { "epoch": 1.169332196297648, "grad_norm": 0.306640625, "learning_rate": 0.00014744030036966917, "loss": 2.785826873779297, "step": 30820 }, { "epoch": 1.1697116096884113, "grad_norm": 0.296875, "learning_rate": 0.00014732530542345038, "loss": 2.811594581604004, "step": 30830 }, { "epoch": 1.1700910230791746, "grad_norm": 0.31640625, "learning_rate": 0.00014721032918619878, "loss": 2.751936912536621, "step": 30840 }, { "epoch": 1.1704704364699379, "grad_norm": 0.294921875, "learning_rate": 0.00014709537169875153, "loss": 2.7689655303955076, "step": 30850 }, { "epoch": 1.1708498498607012, "grad_norm": 0.294921875, "learning_rate": 0.00014698043300193927, "loss": 2.795510673522949, "step": 30860 }, { "epoch": 1.1712292632514643, "grad_norm": 0.298828125, "learning_rate": 0.0001468655131365859, "loss": 2.779537582397461, "step": 30870 }, { "epoch": 1.1716086766422276, "grad_norm": 0.302734375, "learning_rate": 0.00014675061214350843, "loss": 2.7711172103881836, "step": 30880 }, { "epoch": 1.1719880900329909, "grad_norm": 0.298828125, "learning_rate": 0.00014663573006351742, "loss": 2.7799554824829102, "step": 30890 }, { "epoch": 1.1723675034237542, "grad_norm": 0.30078125, "learning_rate": 0.00014652086693741655, "loss": 2.7882125854492186, "step": 30900 }, { "epoch": 1.1727469168145175, "grad_norm": 0.298828125, "learning_rate": 0.00014640602280600304, "loss": 2.7954288482666017, "step": 30910 }, { "epoch": 1.1731263302052808, "grad_norm": 0.294921875, "learning_rate": 0.00014629119771006693, "loss": 2.761754035949707, "step": 30920 }, { "epoch": 1.1735057435960439, "grad_norm": 0.2890625, "learning_rate": 0.0001461763916903918, "loss": 2.813496208190918, "step": 30930 }, { "epoch": 1.1738851569868072, "grad_norm": 0.306640625, "learning_rate": 0.00014606160478775445, "loss": 2.784537506103516, "step": 30940 }, { "epoch": 1.1742645703775705, "grad_norm": 0.310546875, "learning_rate": 0.0001459468370429247, "loss": 2.7938121795654296, "step": 30950 }, { "epoch": 1.1746439837683338, "grad_norm": 0.298828125, "learning_rate": 0.00014583208849666587, "loss": 2.7926023483276365, "step": 30960 }, { "epoch": 1.175023397159097, "grad_norm": 0.30078125, "learning_rate": 0.0001457173591897341, "loss": 2.8199155807495115, "step": 30970 }, { "epoch": 1.1754028105498604, "grad_norm": 0.302734375, "learning_rate": 0.00014560264916287902, "loss": 2.793244743347168, "step": 30980 }, { "epoch": 1.1757822239406237, "grad_norm": 0.30078125, "learning_rate": 0.00014548795845684316, "loss": 2.7894113540649412, "step": 30990 }, { "epoch": 1.1761616373313868, "grad_norm": 0.298828125, "learning_rate": 0.0001453732871123624, "loss": 2.792725372314453, "step": 31000 }, { "epoch": 1.1761616373313868, "eval_loss": 2.7843639850616455, "eval_runtime": 189.9337, "eval_samples_per_second": 20.054, "eval_steps_per_second": 3.343, "step": 31000 }, { "epoch": 1.17654105072215, "grad_norm": 0.30078125, "learning_rate": 0.0001452586351701655, "loss": 2.773011016845703, "step": 31010 }, { "epoch": 1.1769204641129134, "grad_norm": 0.296875, "learning_rate": 0.00014514400267097463, "loss": 2.7602230072021485, "step": 31020 }, { "epoch": 1.1772998775036767, "grad_norm": 0.2890625, "learning_rate": 0.0001450293896555048, "loss": 2.7494041442871096, "step": 31030 }, { "epoch": 1.17767929089444, "grad_norm": 0.28515625, "learning_rate": 0.00014491479616446425, "loss": 2.7926143646240233, "step": 31040 }, { "epoch": 1.1780587042852033, "grad_norm": 0.30078125, "learning_rate": 0.0001448002222385542, "loss": 2.797669792175293, "step": 31050 }, { "epoch": 1.1784381176759666, "grad_norm": 0.298828125, "learning_rate": 0.00014468566791846885, "loss": 2.777814483642578, "step": 31060 }, { "epoch": 1.1788175310667297, "grad_norm": 0.298828125, "learning_rate": 0.00014457113324489566, "loss": 2.7649248123168944, "step": 31070 }, { "epoch": 1.179196944457493, "grad_norm": 0.298828125, "learning_rate": 0.00014445661825851502, "loss": 2.7883142471313476, "step": 31080 }, { "epoch": 1.1795763578482563, "grad_norm": 0.302734375, "learning_rate": 0.0001443421230000002, "loss": 2.781380844116211, "step": 31090 }, { "epoch": 1.1799557712390196, "grad_norm": 0.306640625, "learning_rate": 0.00014422764751001758, "loss": 2.778353691101074, "step": 31100 }, { "epoch": 1.180335184629783, "grad_norm": 0.302734375, "learning_rate": 0.00014411319182922652, "loss": 2.791376495361328, "step": 31110 }, { "epoch": 1.1807145980205462, "grad_norm": 0.287109375, "learning_rate": 0.00014399875599827933, "loss": 2.805023193359375, "step": 31120 }, { "epoch": 1.1810940114113093, "grad_norm": 0.291015625, "learning_rate": 0.00014388434005782128, "loss": 2.791310691833496, "step": 31130 }, { "epoch": 1.1814734248020726, "grad_norm": 0.29296875, "learning_rate": 0.00014376994404849045, "loss": 2.799587059020996, "step": 31140 }, { "epoch": 1.181852838192836, "grad_norm": 0.29296875, "learning_rate": 0.00014365556801091802, "loss": 2.787159538269043, "step": 31150 }, { "epoch": 1.1822322515835992, "grad_norm": 0.30078125, "learning_rate": 0.00014354121198572808, "loss": 2.780709075927734, "step": 31160 }, { "epoch": 1.1826116649743625, "grad_norm": 0.296875, "learning_rate": 0.0001434268760135374, "loss": 2.7834543228149413, "step": 31170 }, { "epoch": 1.1829910783651258, "grad_norm": 0.291015625, "learning_rate": 0.00014331256013495582, "loss": 2.8156526565551756, "step": 31180 }, { "epoch": 1.183370491755889, "grad_norm": 0.30078125, "learning_rate": 0.00014319826439058596, "loss": 2.771438789367676, "step": 31190 }, { "epoch": 1.1837499051466522, "grad_norm": 0.3046875, "learning_rate": 0.00014308398882102345, "loss": 2.7724966049194335, "step": 31200 }, { "epoch": 1.1841293185374155, "grad_norm": 0.3046875, "learning_rate": 0.00014296973346685646, "loss": 2.7965869903564453, "step": 31210 }, { "epoch": 1.1845087319281788, "grad_norm": 0.296875, "learning_rate": 0.00014285549836866616, "loss": 2.769389343261719, "step": 31220 }, { "epoch": 1.1848881453189422, "grad_norm": 0.28515625, "learning_rate": 0.00014274128356702654, "loss": 2.7658939361572266, "step": 31230 }, { "epoch": 1.1852675587097055, "grad_norm": 0.294921875, "learning_rate": 0.00014262708910250447, "loss": 2.8163681030273438, "step": 31240 }, { "epoch": 1.1856469721004688, "grad_norm": 0.296875, "learning_rate": 0.0001425129150156592, "loss": 2.7668058395385744, "step": 31250 }, { "epoch": 1.1856469721004688, "eval_loss": 2.783811330795288, "eval_runtime": 235.8736, "eval_samples_per_second": 16.148, "eval_steps_per_second": 2.692, "step": 31250 }, { "epoch": 1.1860263854912318, "grad_norm": 0.296875, "learning_rate": 0.00014239876134704323, "loss": 2.7539859771728517, "step": 31260 }, { "epoch": 1.1864057988819952, "grad_norm": 0.294921875, "learning_rate": 0.0001422846281372016, "loss": 2.7802976608276366, "step": 31270 }, { "epoch": 1.1867852122727585, "grad_norm": 0.287109375, "learning_rate": 0.0001421705154266719, "loss": 2.775971221923828, "step": 31280 }, { "epoch": 1.1871646256635218, "grad_norm": 0.2890625, "learning_rate": 0.00014205642325598475, "loss": 2.7837154388427736, "step": 31290 }, { "epoch": 1.187544039054285, "grad_norm": 0.291015625, "learning_rate": 0.00014194235166566332, "loss": 2.745716667175293, "step": 31300 }, { "epoch": 1.1879234524450484, "grad_norm": 0.294921875, "learning_rate": 0.00014182830069622355, "loss": 2.7658546447753904, "step": 31310 }, { "epoch": 1.1883028658358117, "grad_norm": 0.298828125, "learning_rate": 0.00014171427038817385, "loss": 2.764560890197754, "step": 31320 }, { "epoch": 1.1886822792265748, "grad_norm": 0.302734375, "learning_rate": 0.0001416002607820155, "loss": 2.8104047775268555, "step": 31330 }, { "epoch": 1.189061692617338, "grad_norm": 0.3046875, "learning_rate": 0.00014148627191824236, "loss": 2.798623466491699, "step": 31340 }, { "epoch": 1.1894411060081014, "grad_norm": 0.310546875, "learning_rate": 0.0001413723038373411, "loss": 2.770613670349121, "step": 31350 }, { "epoch": 1.1898205193988647, "grad_norm": 0.298828125, "learning_rate": 0.00014125835657979055, "loss": 2.783980941772461, "step": 31360 }, { "epoch": 1.190199932789628, "grad_norm": 0.291015625, "learning_rate": 0.00014114443018606264, "loss": 2.7792074203491213, "step": 31370 }, { "epoch": 1.190579346180391, "grad_norm": 0.296875, "learning_rate": 0.00014103052469662162, "loss": 2.8035831451416016, "step": 31380 }, { "epoch": 1.1909587595711544, "grad_norm": 0.318359375, "learning_rate": 0.00014091664015192433, "loss": 2.7771537780761717, "step": 31390 }, { "epoch": 1.1913381729619177, "grad_norm": 0.30078125, "learning_rate": 0.0001408027765924202, "loss": 2.773911476135254, "step": 31400 }, { "epoch": 1.191717586352681, "grad_norm": 0.3046875, "learning_rate": 0.00014068893405855132, "loss": 2.785068702697754, "step": 31410 }, { "epoch": 1.1920969997434443, "grad_norm": 0.2890625, "learning_rate": 0.00014057511259075222, "loss": 2.7896522521972655, "step": 31420 }, { "epoch": 1.1924764131342076, "grad_norm": 0.29296875, "learning_rate": 0.0001404613122294498, "loss": 2.7688392639160155, "step": 31430 }, { "epoch": 1.192855826524971, "grad_norm": 0.294921875, "learning_rate": 0.0001403475330150637, "loss": 2.7656686782836912, "step": 31440 }, { "epoch": 1.193235239915734, "grad_norm": 0.302734375, "learning_rate": 0.00014023377498800606, "loss": 2.7552637100219726, "step": 31450 }, { "epoch": 1.1936146533064973, "grad_norm": 0.298828125, "learning_rate": 0.00014012003818868117, "loss": 2.793931007385254, "step": 31460 }, { "epoch": 1.1939940666972606, "grad_norm": 0.306640625, "learning_rate": 0.00014000632265748613, "loss": 2.7896183013916014, "step": 31470 }, { "epoch": 1.194373480088024, "grad_norm": 0.2890625, "learning_rate": 0.00013989262843481032, "loss": 2.7542179107666014, "step": 31480 }, { "epoch": 1.1947528934787872, "grad_norm": 0.3046875, "learning_rate": 0.0001397789555610357, "loss": 2.7619279861450194, "step": 31490 }, { "epoch": 1.1951323068695505, "grad_norm": 0.30859375, "learning_rate": 0.00013966530407653633, "loss": 2.778411865234375, "step": 31500 }, { "epoch": 1.1951323068695505, "eval_loss": 2.7833809852600098, "eval_runtime": 245.1368, "eval_samples_per_second": 15.538, "eval_steps_per_second": 2.59, "step": 31500 }, { "epoch": 1.1955117202603138, "grad_norm": 0.294921875, "learning_rate": 0.00013955167402167898, "loss": 2.7885210037231447, "step": 31510 }, { "epoch": 1.195891133651077, "grad_norm": 0.3046875, "learning_rate": 0.0001394380654368227, "loss": 2.7770565032958983, "step": 31520 }, { "epoch": 1.1962705470418402, "grad_norm": 0.294921875, "learning_rate": 0.00013932447836231907, "loss": 2.7848228454589843, "step": 31530 }, { "epoch": 1.1966499604326035, "grad_norm": 0.298828125, "learning_rate": 0.00013921091283851164, "loss": 2.8019027709960938, "step": 31540 }, { "epoch": 1.1970293738233668, "grad_norm": 0.298828125, "learning_rate": 0.00013909736890573663, "loss": 2.7973222732543945, "step": 31550 }, { "epoch": 1.1974087872141301, "grad_norm": 0.302734375, "learning_rate": 0.0001389838466043226, "loss": 2.7821826934814453, "step": 31560 }, { "epoch": 1.1977882006048934, "grad_norm": 0.302734375, "learning_rate": 0.00013887034597459018, "loss": 2.8053855895996094, "step": 31570 }, { "epoch": 1.1981676139956567, "grad_norm": 0.302734375, "learning_rate": 0.0001387568670568526, "loss": 2.7719890594482424, "step": 31580 }, { "epoch": 1.1985470273864198, "grad_norm": 0.294921875, "learning_rate": 0.00013864340989141514, "loss": 2.7452741622924806, "step": 31590 }, { "epoch": 1.1989264407771831, "grad_norm": 0.30859375, "learning_rate": 0.0001385299745185755, "loss": 2.774085807800293, "step": 31600 }, { "epoch": 1.1993058541679464, "grad_norm": 0.30078125, "learning_rate": 0.00013841656097862356, "loss": 2.7830198287963865, "step": 31610 }, { "epoch": 1.1996852675587097, "grad_norm": 0.310546875, "learning_rate": 0.00013830316931184156, "loss": 2.78675537109375, "step": 31620 }, { "epoch": 1.200064680949473, "grad_norm": 0.302734375, "learning_rate": 0.0001381897995585037, "loss": 2.74835319519043, "step": 31630 }, { "epoch": 1.2004440943402361, "grad_norm": 0.294921875, "learning_rate": 0.0001380764517588768, "loss": 2.779549789428711, "step": 31640 }, { "epoch": 1.2008235077309994, "grad_norm": 0.29296875, "learning_rate": 0.0001379631259532195, "loss": 2.7615472793579103, "step": 31650 }, { "epoch": 1.2012029211217627, "grad_norm": 0.306640625, "learning_rate": 0.0001378498221817829, "loss": 2.758219528198242, "step": 31660 }, { "epoch": 1.201582334512526, "grad_norm": 0.298828125, "learning_rate": 0.0001377365404848101, "loss": 2.760417366027832, "step": 31670 }, { "epoch": 1.2019617479032894, "grad_norm": 0.302734375, "learning_rate": 0.00013762328090253641, "loss": 2.7839548110961916, "step": 31680 }, { "epoch": 1.2023411612940527, "grad_norm": 0.30078125, "learning_rate": 0.00013751004347518935, "loss": 2.806779670715332, "step": 31690 }, { "epoch": 1.202720574684816, "grad_norm": 0.3125, "learning_rate": 0.0001373968282429886, "loss": 2.78314208984375, "step": 31700 }, { "epoch": 1.203099988075579, "grad_norm": 0.302734375, "learning_rate": 0.00013728363524614573, "loss": 2.75712890625, "step": 31710 }, { "epoch": 1.2034794014663424, "grad_norm": 0.31640625, "learning_rate": 0.00013717046452486464, "loss": 2.760175323486328, "step": 31720 }, { "epoch": 1.2038588148571057, "grad_norm": 0.302734375, "learning_rate": 0.00013705731611934118, "loss": 2.79012393951416, "step": 31730 }, { "epoch": 1.204238228247869, "grad_norm": 0.302734375, "learning_rate": 0.00013694419006976343, "loss": 2.7997241973876954, "step": 31740 }, { "epoch": 1.2046176416386323, "grad_norm": 0.287109375, "learning_rate": 0.00013683108641631144, "loss": 2.758224678039551, "step": 31750 }, { "epoch": 1.2046176416386323, "eval_loss": 2.7825045585632324, "eval_runtime": 240.6707, "eval_samples_per_second": 15.827, "eval_steps_per_second": 2.638, "step": 31750 }, { "epoch": 1.2049970550293956, "grad_norm": 0.3046875, "learning_rate": 0.0001367180051991572, "loss": 2.7597095489501955, "step": 31760 }, { "epoch": 1.2053764684201589, "grad_norm": 0.302734375, "learning_rate": 0.0001366049464584648, "loss": 2.7726818084716798, "step": 31770 }, { "epoch": 1.205755881810922, "grad_norm": 0.29296875, "learning_rate": 0.0001364919102343906, "loss": 2.7639301300048826, "step": 31780 }, { "epoch": 1.2061352952016853, "grad_norm": 0.30078125, "learning_rate": 0.00013637889656708253, "loss": 2.7309404373168946, "step": 31790 }, { "epoch": 1.2065147085924486, "grad_norm": 0.30078125, "learning_rate": 0.00013626590549668073, "loss": 2.762765312194824, "step": 31800 }, { "epoch": 1.2068941219832119, "grad_norm": 0.296875, "learning_rate": 0.00013615293706331736, "loss": 2.775743293762207, "step": 31810 }, { "epoch": 1.2072735353739752, "grad_norm": 0.298828125, "learning_rate": 0.0001360399913071166, "loss": 2.7938499450683594, "step": 31820 }, { "epoch": 1.2076529487647385, "grad_norm": 0.302734375, "learning_rate": 0.0001359270682681942, "loss": 2.7638126373291017, "step": 31830 }, { "epoch": 1.2080323621555016, "grad_norm": 0.298828125, "learning_rate": 0.0001358141679866583, "loss": 2.7746877670288086, "step": 31840 }, { "epoch": 1.2084117755462649, "grad_norm": 0.29296875, "learning_rate": 0.00013570129050260863, "loss": 2.787561225891113, "step": 31850 }, { "epoch": 1.2087911889370282, "grad_norm": 0.3046875, "learning_rate": 0.00013558843585613711, "loss": 2.799609565734863, "step": 31860 }, { "epoch": 1.2091706023277915, "grad_norm": 0.30078125, "learning_rate": 0.0001354756040873272, "loss": 2.7950508117675783, "step": 31870 }, { "epoch": 1.2095500157185548, "grad_norm": 0.30078125, "learning_rate": 0.00013536279523625452, "loss": 2.7592098236083986, "step": 31880 }, { "epoch": 1.209929429109318, "grad_norm": 0.30078125, "learning_rate": 0.00013525000934298652, "loss": 2.7881425857543944, "step": 31890 }, { "epoch": 1.2103088425000812, "grad_norm": 0.294921875, "learning_rate": 0.00013513724644758228, "loss": 2.786981201171875, "step": 31900 }, { "epoch": 1.2106882558908445, "grad_norm": 0.30078125, "learning_rate": 0.0001350245065900929, "loss": 2.758708381652832, "step": 31910 }, { "epoch": 1.2110676692816078, "grad_norm": 0.294921875, "learning_rate": 0.00013491178981056128, "loss": 2.778666687011719, "step": 31920 }, { "epoch": 1.2114470826723711, "grad_norm": 0.314453125, "learning_rate": 0.0001347990961490222, "loss": 2.7488231658935547, "step": 31930 }, { "epoch": 1.2118264960631344, "grad_norm": 0.298828125, "learning_rate": 0.00013468642564550196, "loss": 2.7883411407470704, "step": 31940 }, { "epoch": 1.2122059094538977, "grad_norm": 0.294921875, "learning_rate": 0.00013457377834001885, "loss": 2.765018272399902, "step": 31950 }, { "epoch": 1.212585322844661, "grad_norm": 0.302734375, "learning_rate": 0.00013446115427258302, "loss": 2.791919708251953, "step": 31960 }, { "epoch": 1.2129647362354241, "grad_norm": 0.30859375, "learning_rate": 0.000134348553483196, "loss": 2.791251373291016, "step": 31970 }, { "epoch": 1.2133441496261874, "grad_norm": 0.3046875, "learning_rate": 0.00013423597601185135, "loss": 2.772542953491211, "step": 31980 }, { "epoch": 1.2137235630169507, "grad_norm": 0.302734375, "learning_rate": 0.0001341234218985343, "loss": 2.7805059432983397, "step": 31990 }, { "epoch": 1.214102976407714, "grad_norm": 0.298828125, "learning_rate": 0.00013401089118322186, "loss": 2.7751031875610352, "step": 32000 }, { "epoch": 1.214102976407714, "eval_loss": 2.782331943511963, "eval_runtime": 190.4768, "eval_samples_per_second": 19.997, "eval_steps_per_second": 3.334, "step": 32000 }, { "epoch": 1.2144823897984773, "grad_norm": 0.294921875, "learning_rate": 0.0001338983839058824, "loss": 2.74182243347168, "step": 32010 }, { "epoch": 1.2148618031892406, "grad_norm": 0.322265625, "learning_rate": 0.00013378590010647629, "loss": 2.7412900924682617, "step": 32020 }, { "epoch": 1.215241216580004, "grad_norm": 0.302734375, "learning_rate": 0.00013367343982495551, "loss": 2.780571174621582, "step": 32030 }, { "epoch": 1.215620629970767, "grad_norm": 0.29296875, "learning_rate": 0.00013356100310126366, "loss": 2.7738128662109376, "step": 32040 }, { "epoch": 1.2160000433615303, "grad_norm": 0.296875, "learning_rate": 0.0001334485899753358, "loss": 2.754019927978516, "step": 32050 }, { "epoch": 1.2163794567522936, "grad_norm": 0.298828125, "learning_rate": 0.00013333620048709883, "loss": 2.7683753967285156, "step": 32060 }, { "epoch": 1.216758870143057, "grad_norm": 0.296875, "learning_rate": 0.0001332238346764713, "loss": 2.7941097259521483, "step": 32070 }, { "epoch": 1.2171382835338203, "grad_norm": 0.291015625, "learning_rate": 0.00013311149258336304, "loss": 2.806388092041016, "step": 32080 }, { "epoch": 1.2175176969245833, "grad_norm": 0.298828125, "learning_rate": 0.00013299917424767577, "loss": 2.7735832214355467, "step": 32090 }, { "epoch": 1.2178971103153466, "grad_norm": 0.30078125, "learning_rate": 0.00013288687970930258, "loss": 2.7806476593017577, "step": 32100 }, { "epoch": 1.21827652370611, "grad_norm": 0.294921875, "learning_rate": 0.00013277460900812832, "loss": 2.789962387084961, "step": 32110 }, { "epoch": 1.2186559370968733, "grad_norm": 0.30078125, "learning_rate": 0.00013266236218402902, "loss": 2.8050642013549805, "step": 32120 }, { "epoch": 1.2190353504876366, "grad_norm": 0.306640625, "learning_rate": 0.00013255013927687258, "loss": 2.7720947265625, "step": 32130 }, { "epoch": 1.2194147638783999, "grad_norm": 0.30078125, "learning_rate": 0.00013243794032651826, "loss": 2.7912405014038084, "step": 32140 }, { "epoch": 1.2197941772691632, "grad_norm": 0.291015625, "learning_rate": 0.0001323257653728168, "loss": 2.7826210021972657, "step": 32150 }, { "epoch": 1.2201735906599263, "grad_norm": 0.298828125, "learning_rate": 0.00013221361445561045, "loss": 2.816517448425293, "step": 32160 }, { "epoch": 1.2205530040506896, "grad_norm": 0.29296875, "learning_rate": 0.00013210148761473285, "loss": 2.718488311767578, "step": 32170 }, { "epoch": 1.2209324174414529, "grad_norm": 0.2890625, "learning_rate": 0.00013198938489000921, "loss": 2.7532020568847657, "step": 32180 }, { "epoch": 1.2213118308322162, "grad_norm": 0.291015625, "learning_rate": 0.00013187730632125604, "loss": 2.78692512512207, "step": 32190 }, { "epoch": 1.2216912442229795, "grad_norm": 0.298828125, "learning_rate": 0.00013176525194828143, "loss": 2.7969644546508787, "step": 32200 }, { "epoch": 1.2220706576137428, "grad_norm": 0.30078125, "learning_rate": 0.0001316532218108847, "loss": 2.789232063293457, "step": 32210 }, { "epoch": 1.222450071004506, "grad_norm": 0.291015625, "learning_rate": 0.00013154121594885674, "loss": 2.7683006286621095, "step": 32220 }, { "epoch": 1.2228294843952692, "grad_norm": 0.298828125, "learning_rate": 0.00013142923440197964, "loss": 2.786480712890625, "step": 32230 }, { "epoch": 1.2232088977860325, "grad_norm": 0.310546875, "learning_rate": 0.00013131727721002703, "loss": 2.7976966857910157, "step": 32240 }, { "epoch": 1.2235883111767958, "grad_norm": 0.296875, "learning_rate": 0.00013120534441276368, "loss": 2.806254005432129, "step": 32250 }, { "epoch": 1.2235883111767958, "eval_loss": 2.781614065170288, "eval_runtime": 190.3078, "eval_samples_per_second": 20.015, "eval_steps_per_second": 3.337, "step": 32250 }, { "epoch": 1.223967724567559, "grad_norm": 0.3125, "learning_rate": 0.00013109343604994592, "loss": 2.776316261291504, "step": 32260 }, { "epoch": 1.2243471379583224, "grad_norm": 0.298828125, "learning_rate": 0.00013098155216132126, "loss": 2.7456024169921873, "step": 32270 }, { "epoch": 1.2247265513490857, "grad_norm": 0.29296875, "learning_rate": 0.00013086969278662856, "loss": 2.7734373092651365, "step": 32280 }, { "epoch": 1.225105964739849, "grad_norm": 0.328125, "learning_rate": 0.00013075785796559796, "loss": 2.7489046096801757, "step": 32290 }, { "epoch": 1.225485378130612, "grad_norm": 0.302734375, "learning_rate": 0.00013064604773795085, "loss": 2.77445011138916, "step": 32300 }, { "epoch": 1.2258647915213754, "grad_norm": 0.30078125, "learning_rate": 0.00013053426214339997, "loss": 2.790690231323242, "step": 32310 }, { "epoch": 1.2262442049121387, "grad_norm": 0.306640625, "learning_rate": 0.00013042250122164925, "loss": 2.8134511947631835, "step": 32320 }, { "epoch": 1.226623618302902, "grad_norm": 0.306640625, "learning_rate": 0.00013031076501239395, "loss": 2.7878353118896486, "step": 32330 }, { "epoch": 1.2270030316936653, "grad_norm": 0.294921875, "learning_rate": 0.00013019905355532026, "loss": 2.7947463989257812, "step": 32340 }, { "epoch": 1.2273824450844284, "grad_norm": 0.291015625, "learning_rate": 0.000130087366890106, "loss": 2.787133979797363, "step": 32350 }, { "epoch": 1.2277618584751917, "grad_norm": 0.29296875, "learning_rate": 0.00012997570505641982, "loss": 2.7748701095581056, "step": 32360 }, { "epoch": 1.228141271865955, "grad_norm": 0.296875, "learning_rate": 0.00012986406809392193, "loss": 2.7587152481079102, "step": 32370 }, { "epoch": 1.2285206852567183, "grad_norm": 0.30078125, "learning_rate": 0.0001297524560422632, "loss": 2.8152847290039062, "step": 32380 }, { "epoch": 1.2289000986474816, "grad_norm": 0.296875, "learning_rate": 0.0001296408689410861, "loss": 2.7633293151855467, "step": 32390 }, { "epoch": 1.229279512038245, "grad_norm": 0.3203125, "learning_rate": 0.00012952930683002412, "loss": 2.8032049179077148, "step": 32400 }, { "epoch": 1.2296589254290082, "grad_norm": 0.302734375, "learning_rate": 0.00012941776974870163, "loss": 2.7601457595825196, "step": 32410 }, { "epoch": 1.2300383388197713, "grad_norm": 0.296875, "learning_rate": 0.00012930625773673442, "loss": 2.7729286193847655, "step": 32420 }, { "epoch": 1.2304177522105346, "grad_norm": 0.294921875, "learning_rate": 0.0001291947708337293, "loss": 2.7780410766601564, "step": 32430 }, { "epoch": 1.230797165601298, "grad_norm": 0.314453125, "learning_rate": 0.0001290833090792841, "loss": 2.7619483947753904, "step": 32440 }, { "epoch": 1.2311765789920612, "grad_norm": 0.30859375, "learning_rate": 0.0001289718725129877, "loss": 2.778491401672363, "step": 32450 }, { "epoch": 1.2315559923828245, "grad_norm": 0.314453125, "learning_rate": 0.00012886046117442006, "loss": 2.8131231307983398, "step": 32460 }, { "epoch": 1.2319354057735878, "grad_norm": 0.298828125, "learning_rate": 0.00012874907510315232, "loss": 2.8091617584228517, "step": 32470 }, { "epoch": 1.2323148191643511, "grad_norm": 0.298828125, "learning_rate": 0.00012863771433874633, "loss": 2.7972713470458985, "step": 32480 }, { "epoch": 1.2326942325551142, "grad_norm": 0.296875, "learning_rate": 0.00012852637892075527, "loss": 2.784572410583496, "step": 32490 }, { "epoch": 1.2330736459458775, "grad_norm": 0.310546875, "learning_rate": 0.00012841506888872312, "loss": 2.7592700958251952, "step": 32500 }, { "epoch": 1.2330736459458775, "eval_loss": 2.781217098236084, "eval_runtime": 190.2325, "eval_samples_per_second": 20.023, "eval_steps_per_second": 3.338, "step": 32500 }, { "epoch": 1.2334530593366408, "grad_norm": 0.294921875, "learning_rate": 0.0001283037842821851, "loss": 2.7771881103515623, "step": 32510 }, { "epoch": 1.2338324727274042, "grad_norm": 0.298828125, "learning_rate": 0.0001281925251406669, "loss": 2.7716047286987306, "step": 32520 }, { "epoch": 1.2342118861181675, "grad_norm": 0.298828125, "learning_rate": 0.00012808129150368571, "loss": 2.7819019317626954, "step": 32530 }, { "epoch": 1.2345912995089308, "grad_norm": 0.291015625, "learning_rate": 0.00012797008341074935, "loss": 2.7561668395996093, "step": 32540 }, { "epoch": 1.2349707128996938, "grad_norm": 0.29296875, "learning_rate": 0.0001278589009013567, "loss": 2.797807312011719, "step": 32550 }, { "epoch": 1.2353501262904572, "grad_norm": 0.30078125, "learning_rate": 0.0001277477440149974, "loss": 2.7721351623535155, "step": 32560 }, { "epoch": 1.2357295396812205, "grad_norm": 0.291015625, "learning_rate": 0.00012763661279115214, "loss": 2.784488487243652, "step": 32570 }, { "epoch": 1.2361089530719838, "grad_norm": 0.291015625, "learning_rate": 0.0001275255072692925, "loss": 2.7955245971679688, "step": 32580 }, { "epoch": 1.236488366462747, "grad_norm": 0.2890625, "learning_rate": 0.00012741442748888076, "loss": 2.7507707595825197, "step": 32590 }, { "epoch": 1.2368677798535104, "grad_norm": 0.345703125, "learning_rate": 0.0001273033734893702, "loss": 2.7677885055541993, "step": 32600 }, { "epoch": 1.2372471932442735, "grad_norm": 0.31640625, "learning_rate": 0.00012719234531020497, "loss": 2.7655614852905273, "step": 32610 }, { "epoch": 1.2376266066350368, "grad_norm": 0.30078125, "learning_rate": 0.00012708134299082004, "loss": 2.8106544494628904, "step": 32620 }, { "epoch": 1.2380060200258, "grad_norm": 0.30078125, "learning_rate": 0.00012697036657064101, "loss": 2.7705398559570313, "step": 32630 }, { "epoch": 1.2383854334165634, "grad_norm": 0.31640625, "learning_rate": 0.00012685941608908447, "loss": 2.7647226333618162, "step": 32640 }, { "epoch": 1.2387648468073267, "grad_norm": 0.294921875, "learning_rate": 0.00012674849158555778, "loss": 2.7697189331054686, "step": 32650 }, { "epoch": 1.23914426019809, "grad_norm": 0.30078125, "learning_rate": 0.00012663759309945909, "loss": 2.8110271453857423, "step": 32660 }, { "epoch": 1.2395236735888533, "grad_norm": 0.296875, "learning_rate": 0.00012652672067017711, "loss": 2.7456893920898438, "step": 32670 }, { "epoch": 1.2399030869796164, "grad_norm": 0.3046875, "learning_rate": 0.00012641587433709155, "loss": 2.7999797821044923, "step": 32680 }, { "epoch": 1.2402825003703797, "grad_norm": 0.302734375, "learning_rate": 0.0001263050541395728, "loss": 2.8051252365112305, "step": 32690 }, { "epoch": 1.240661913761143, "grad_norm": 0.296875, "learning_rate": 0.00012619426011698174, "loss": 2.7851545333862306, "step": 32700 }, { "epoch": 1.2410413271519063, "grad_norm": 0.287109375, "learning_rate": 0.00012608349230867024, "loss": 2.768702507019043, "step": 32710 }, { "epoch": 1.2414207405426696, "grad_norm": 0.298828125, "learning_rate": 0.0001259727507539807, "loss": 2.7655025482177735, "step": 32720 }, { "epoch": 1.241800153933433, "grad_norm": 0.291015625, "learning_rate": 0.00012586203549224634, "loss": 2.780513954162598, "step": 32730 }, { "epoch": 1.2421795673241962, "grad_norm": 0.31640625, "learning_rate": 0.00012575134656279087, "loss": 2.7481100082397463, "step": 32740 }, { "epoch": 1.2425589807149593, "grad_norm": 0.314453125, "learning_rate": 0.00012564068400492862, "loss": 2.7776418685913087, "step": 32750 }, { "epoch": 1.2425589807149593, "eval_loss": 2.7808778285980225, "eval_runtime": 190.2397, "eval_samples_per_second": 20.022, "eval_steps_per_second": 3.338, "step": 32750 }, { "epoch": 1.2429383941057226, "grad_norm": 0.30078125, "learning_rate": 0.00012553004785796482, "loss": 2.745952606201172, "step": 32760 }, { "epoch": 1.243317807496486, "grad_norm": 0.298828125, "learning_rate": 0.0001254194381611951, "loss": 2.7969465255737305, "step": 32770 }, { "epoch": 1.2436972208872492, "grad_norm": 0.302734375, "learning_rate": 0.00012530885495390577, "loss": 2.7797943115234376, "step": 32780 }, { "epoch": 1.2440766342780125, "grad_norm": 0.298828125, "learning_rate": 0.0001251982982753736, "loss": 2.7770160675048827, "step": 32790 }, { "epoch": 1.2444560476687756, "grad_norm": 0.30859375, "learning_rate": 0.00012508776816486615, "loss": 2.7587959289550783, "step": 32800 }, { "epoch": 1.244835461059539, "grad_norm": 0.3046875, "learning_rate": 0.0001249772646616414, "loss": 2.7893548965454102, "step": 32810 }, { "epoch": 1.2452148744503022, "grad_norm": 0.291015625, "learning_rate": 0.0001248667878049479, "loss": 2.7745525360107424, "step": 32820 }, { "epoch": 1.2455942878410655, "grad_norm": 0.298828125, "learning_rate": 0.0001247563376340248, "loss": 2.7591268539428713, "step": 32830 }, { "epoch": 1.2459737012318288, "grad_norm": 0.298828125, "learning_rate": 0.00012464591418810174, "loss": 2.7495222091674805, "step": 32840 }, { "epoch": 1.2463531146225921, "grad_norm": 0.298828125, "learning_rate": 0.0001245355175063988, "loss": 2.8157159805297853, "step": 32850 }, { "epoch": 1.2467325280133554, "grad_norm": 0.294921875, "learning_rate": 0.00012442514762812666, "loss": 2.7341346740722656, "step": 32860 }, { "epoch": 1.2471119414041185, "grad_norm": 0.296875, "learning_rate": 0.00012431480459248638, "loss": 2.7473775863647463, "step": 32870 }, { "epoch": 1.2474913547948818, "grad_norm": 0.296875, "learning_rate": 0.00012420448843866955, "loss": 2.7663475036621095, "step": 32880 }, { "epoch": 1.2478707681856451, "grad_norm": 0.30078125, "learning_rate": 0.00012409419920585818, "loss": 2.7679691314697266, "step": 32890 }, { "epoch": 1.2482501815764084, "grad_norm": 0.296875, "learning_rate": 0.00012398393693322478, "loss": 2.801413726806641, "step": 32900 }, { "epoch": 1.2486295949671717, "grad_norm": 0.30078125, "learning_rate": 0.00012387370165993232, "loss": 2.794879341125488, "step": 32910 }, { "epoch": 1.249009008357935, "grad_norm": 0.294921875, "learning_rate": 0.00012376349342513388, "loss": 2.7522727966308596, "step": 32920 }, { "epoch": 1.2493884217486984, "grad_norm": 0.30859375, "learning_rate": 0.00012365331226797327, "loss": 2.801259231567383, "step": 32930 }, { "epoch": 1.2497678351394614, "grad_norm": 0.30859375, "learning_rate": 0.0001235431582275846, "loss": 2.7557106018066406, "step": 32940 }, { "epoch": 1.2501472485302247, "grad_norm": 0.296875, "learning_rate": 0.00012343303134309237, "loss": 2.7420465469360353, "step": 32950 }, { "epoch": 1.250526661920988, "grad_norm": 0.2890625, "learning_rate": 0.00012332293165361123, "loss": 2.7768239974975586, "step": 32960 }, { "epoch": 1.2509060753117514, "grad_norm": 0.298828125, "learning_rate": 0.0001232128591982464, "loss": 2.7838342666625975, "step": 32970 }, { "epoch": 1.2512854887025147, "grad_norm": 0.29296875, "learning_rate": 0.0001231028140160933, "loss": 2.8000213623046877, "step": 32980 }, { "epoch": 1.2516649020932777, "grad_norm": 0.30078125, "learning_rate": 0.00012299279614623788, "loss": 2.783274459838867, "step": 32990 }, { "epoch": 1.2520443154840413, "grad_norm": 0.302734375, "learning_rate": 0.00012288280562775601, "loss": 2.766650390625, "step": 33000 }, { "epoch": 1.2520443154840413, "eval_loss": 2.780487537384033, "eval_runtime": 190.2764, "eval_samples_per_second": 20.018, "eval_steps_per_second": 3.337, "step": 33000 }, { "epoch": 1.2524237288748044, "grad_norm": 0.30078125, "learning_rate": 0.0001227728424997142, "loss": 2.7825571060180665, "step": 33010 }, { "epoch": 1.2528031422655677, "grad_norm": 0.302734375, "learning_rate": 0.00012266290680116906, "loss": 2.7833423614501953, "step": 33020 }, { "epoch": 1.253182555656331, "grad_norm": 0.291015625, "learning_rate": 0.00012255299857116743, "loss": 2.7693519592285156, "step": 33030 }, { "epoch": 1.2535619690470943, "grad_norm": 0.298828125, "learning_rate": 0.0001224431178487465, "loss": 2.7522443771362304, "step": 33040 }, { "epoch": 1.2539413824378576, "grad_norm": 0.294921875, "learning_rate": 0.0001223332646729336, "loss": 2.7590347290039063, "step": 33050 }, { "epoch": 1.2543207958286207, "grad_norm": 0.30859375, "learning_rate": 0.00012222343908274642, "loss": 2.7410507202148438, "step": 33060 }, { "epoch": 1.2547002092193842, "grad_norm": 0.302734375, "learning_rate": 0.00012211364111719262, "loss": 2.7698455810546876, "step": 33070 }, { "epoch": 1.2550796226101473, "grad_norm": 0.31640625, "learning_rate": 0.00012200387081527023, "loss": 2.777422332763672, "step": 33080 }, { "epoch": 1.2554590360009106, "grad_norm": 0.3125, "learning_rate": 0.00012189412821596749, "loss": 2.777694511413574, "step": 33090 }, { "epoch": 1.2558384493916739, "grad_norm": 0.30078125, "learning_rate": 0.00012178441335826253, "loss": 2.7904253005981445, "step": 33100 }, { "epoch": 1.2562178627824372, "grad_norm": 0.3203125, "learning_rate": 0.0001216747262811239, "loss": 2.760318374633789, "step": 33110 }, { "epoch": 1.2565972761732005, "grad_norm": 0.306640625, "learning_rate": 0.00012156506702351023, "loss": 2.7844913482666014, "step": 33120 }, { "epoch": 1.2569766895639636, "grad_norm": 0.3125, "learning_rate": 0.00012145543562437027, "loss": 2.788127326965332, "step": 33130 }, { "epoch": 1.2573561029547269, "grad_norm": 0.306640625, "learning_rate": 0.00012134583212264268, "loss": 2.7857177734375, "step": 33140 }, { "epoch": 1.2577355163454902, "grad_norm": 0.296875, "learning_rate": 0.00012123625655725645, "loss": 2.7666826248168945, "step": 33150 }, { "epoch": 1.2581149297362535, "grad_norm": 0.3046875, "learning_rate": 0.00012112670896713059, "loss": 2.759499359130859, "step": 33160 }, { "epoch": 1.2584943431270168, "grad_norm": 0.298828125, "learning_rate": 0.0001210171893911742, "loss": 2.767199897766113, "step": 33170 }, { "epoch": 1.25887375651778, "grad_norm": 0.30078125, "learning_rate": 0.00012090769786828619, "loss": 2.7522964477539062, "step": 33180 }, { "epoch": 1.2592531699085434, "grad_norm": 0.296875, "learning_rate": 0.00012079823443735584, "loss": 2.7668718338012694, "step": 33190 }, { "epoch": 1.2596325832993065, "grad_norm": 0.3046875, "learning_rate": 0.00012068879913726237, "loss": 2.8232885360717774, "step": 33200 }, { "epoch": 1.2600119966900698, "grad_norm": 0.30078125, "learning_rate": 0.00012057939200687478, "loss": 2.7671533584594727, "step": 33210 }, { "epoch": 1.260391410080833, "grad_norm": 0.302734375, "learning_rate": 0.00012047001308505228, "loss": 2.761138916015625, "step": 33220 }, { "epoch": 1.2607708234715964, "grad_norm": 0.296875, "learning_rate": 0.00012036066241064406, "loss": 2.772122383117676, "step": 33230 }, { "epoch": 1.2611502368623597, "grad_norm": 0.29296875, "learning_rate": 0.00012025134002248926, "loss": 2.757292938232422, "step": 33240 }, { "epoch": 1.2615296502531228, "grad_norm": 0.298828125, "learning_rate": 0.00012014204595941683, "loss": 2.761079025268555, "step": 33250 }, { "epoch": 1.2615296502531228, "eval_loss": 2.7800815105438232, "eval_runtime": 190.341, "eval_samples_per_second": 20.011, "eval_steps_per_second": 3.336, "step": 33250 }, { "epoch": 1.2619090636438863, "grad_norm": 0.30078125, "learning_rate": 0.00012003278026024582, "loss": 2.783158874511719, "step": 33260 }, { "epoch": 1.2622884770346494, "grad_norm": 0.298828125, "learning_rate": 0.00011992354296378519, "loss": 2.7897424697875977, "step": 33270 }, { "epoch": 1.2626678904254127, "grad_norm": 0.3125, "learning_rate": 0.00011981433410883381, "loss": 2.7706342697143556, "step": 33280 }, { "epoch": 1.263047303816176, "grad_norm": 0.306640625, "learning_rate": 0.00011970515373418027, "loss": 2.7592288970947267, "step": 33290 }, { "epoch": 1.2634267172069393, "grad_norm": 0.298828125, "learning_rate": 0.00011959600187860331, "loss": 2.75936336517334, "step": 33300 }, { "epoch": 1.2638061305977026, "grad_norm": 0.294921875, "learning_rate": 0.00011948687858087141, "loss": 2.7991029739379885, "step": 33310 }, { "epoch": 1.2641855439884657, "grad_norm": 0.296875, "learning_rate": 0.00011937778387974297, "loss": 2.7797962188720704, "step": 33320 }, { "epoch": 1.264564957379229, "grad_norm": 0.306640625, "learning_rate": 0.00011926871781396602, "loss": 2.762119674682617, "step": 33330 }, { "epoch": 1.2649443707699923, "grad_norm": 0.3046875, "learning_rate": 0.00011915968042227868, "loss": 2.7714298248291014, "step": 33340 }, { "epoch": 1.2653237841607556, "grad_norm": 0.30078125, "learning_rate": 0.00011905067174340884, "loss": 2.774629020690918, "step": 33350 }, { "epoch": 1.265703197551519, "grad_norm": 0.30078125, "learning_rate": 0.00011894169181607408, "loss": 2.794961166381836, "step": 33360 }, { "epoch": 1.2660826109422822, "grad_norm": 0.302734375, "learning_rate": 0.0001188327406789818, "loss": 2.769544219970703, "step": 33370 }, { "epoch": 1.2664620243330456, "grad_norm": 0.291015625, "learning_rate": 0.00011872381837082924, "loss": 2.7557844161987304, "step": 33380 }, { "epoch": 1.2668414377238086, "grad_norm": 0.296875, "learning_rate": 0.0001186149249303034, "loss": 2.7845062255859374, "step": 33390 }, { "epoch": 1.267220851114572, "grad_norm": 0.298828125, "learning_rate": 0.00011850606039608092, "loss": 2.7415597915649412, "step": 33400 }, { "epoch": 1.2676002645053353, "grad_norm": 0.306640625, "learning_rate": 0.00011839722480682825, "loss": 2.7795694351196287, "step": 33410 }, { "epoch": 1.2679796778960986, "grad_norm": 0.298828125, "learning_rate": 0.00011828841820120158, "loss": 2.7733701705932616, "step": 33420 }, { "epoch": 1.2683590912868619, "grad_norm": 0.306640625, "learning_rate": 0.00011817964061784674, "loss": 2.786083984375, "step": 33430 }, { "epoch": 1.2687385046776252, "grad_norm": 0.296875, "learning_rate": 0.00011807089209539934, "loss": 2.8059598922729494, "step": 33440 }, { "epoch": 1.2691179180683885, "grad_norm": 0.296875, "learning_rate": 0.00011796217267248457, "loss": 2.76937255859375, "step": 33450 }, { "epoch": 1.2694973314591516, "grad_norm": 0.296875, "learning_rate": 0.00011785348238771736, "loss": 2.803596305847168, "step": 33460 }, { "epoch": 1.2698767448499149, "grad_norm": 0.296875, "learning_rate": 0.00011774482127970223, "loss": 2.7765880584716798, "step": 33470 }, { "epoch": 1.2702561582406782, "grad_norm": 0.298828125, "learning_rate": 0.00011763618938703338, "loss": 2.782704162597656, "step": 33480 }, { "epoch": 1.2706355716314415, "grad_norm": 0.29296875, "learning_rate": 0.00011752758674829469, "loss": 2.789693260192871, "step": 33490 }, { "epoch": 1.2710149850222048, "grad_norm": 0.30859375, "learning_rate": 0.0001174190134020595, "loss": 2.780994415283203, "step": 33500 }, { "epoch": 1.2710149850222048, "eval_loss": 2.779590606689453, "eval_runtime": 197.4245, "eval_samples_per_second": 19.293, "eval_steps_per_second": 3.216, "step": 33500 }, { "epoch": 1.2713943984129679, "grad_norm": 0.29296875, "learning_rate": 0.0001173104693868908, "loss": 2.7572874069213866, "step": 33510 }, { "epoch": 1.2717738118037314, "grad_norm": 0.296875, "learning_rate": 0.00011720195474134128, "loss": 2.792861557006836, "step": 33520 }, { "epoch": 1.2721532251944945, "grad_norm": 0.30078125, "learning_rate": 0.00011709346950395313, "loss": 2.7917335510253904, "step": 33530 }, { "epoch": 1.2725326385852578, "grad_norm": 0.291015625, "learning_rate": 0.00011698501371325794, "loss": 2.7656330108642577, "step": 33540 }, { "epoch": 1.272912051976021, "grad_norm": 0.302734375, "learning_rate": 0.00011687658740777703, "loss": 2.762348937988281, "step": 33550 }, { "epoch": 1.2732914653667844, "grad_norm": 0.314453125, "learning_rate": 0.00011676819062602122, "loss": 2.7706966400146484, "step": 33560 }, { "epoch": 1.2736708787575477, "grad_norm": 0.30859375, "learning_rate": 0.00011665982340649094, "loss": 2.791518974304199, "step": 33570 }, { "epoch": 1.2740502921483108, "grad_norm": 0.298828125, "learning_rate": 0.00011655148578767567, "loss": 2.7907581329345703, "step": 33580 }, { "epoch": 1.274429705539074, "grad_norm": 0.296875, "learning_rate": 0.00011644317780805498, "loss": 2.7728647232055663, "step": 33590 }, { "epoch": 1.2748091189298374, "grad_norm": 0.291015625, "learning_rate": 0.00011633489950609758, "loss": 2.7739593505859377, "step": 33600 }, { "epoch": 1.2751885323206007, "grad_norm": 0.30078125, "learning_rate": 0.00011622665092026164, "loss": 2.8097599029541014, "step": 33610 }, { "epoch": 1.275567945711364, "grad_norm": 0.298828125, "learning_rate": 0.00011611843208899475, "loss": 2.7608694076538085, "step": 33620 }, { "epoch": 1.2759473591021273, "grad_norm": 0.294921875, "learning_rate": 0.0001160102430507342, "loss": 2.760478973388672, "step": 33630 }, { "epoch": 1.2763267724928906, "grad_norm": 0.29296875, "learning_rate": 0.00011590208384390644, "loss": 2.7534374237060546, "step": 33640 }, { "epoch": 1.2767061858836537, "grad_norm": 0.298828125, "learning_rate": 0.00011579395450692736, "loss": 2.7697317123413088, "step": 33650 }, { "epoch": 1.277085599274417, "grad_norm": 0.30078125, "learning_rate": 0.0001156858550782022, "loss": 2.7537958145141603, "step": 33660 }, { "epoch": 1.2774650126651803, "grad_norm": 0.30078125, "learning_rate": 0.00011557778559612584, "loss": 2.7303422927856444, "step": 33670 }, { "epoch": 1.2778444260559436, "grad_norm": 0.291015625, "learning_rate": 0.00011546974609908223, "loss": 2.789947509765625, "step": 33680 }, { "epoch": 1.278223839446707, "grad_norm": 0.296875, "learning_rate": 0.00011536173662544479, "loss": 2.7673372268676757, "step": 33690 }, { "epoch": 1.27860325283747, "grad_norm": 0.294921875, "learning_rate": 0.00011525375721357616, "loss": 2.78076286315918, "step": 33700 }, { "epoch": 1.2789826662282335, "grad_norm": 0.291015625, "learning_rate": 0.00011514580790182859, "loss": 2.7686412811279295, "step": 33710 }, { "epoch": 1.2793620796189966, "grad_norm": 0.30078125, "learning_rate": 0.00011503788872854336, "loss": 2.781269073486328, "step": 33720 }, { "epoch": 1.27974149300976, "grad_norm": 0.30078125, "learning_rate": 0.00011492999973205111, "loss": 2.7821802139282226, "step": 33730 }, { "epoch": 1.2801209064005232, "grad_norm": 0.2890625, "learning_rate": 0.00011482214095067178, "loss": 2.731707954406738, "step": 33740 }, { "epoch": 1.2805003197912865, "grad_norm": 0.3046875, "learning_rate": 0.0001147143124227147, "loss": 2.7622407913208007, "step": 33750 }, { "epoch": 1.2805003197912865, "eval_loss": 2.7793657779693604, "eval_runtime": 197.403, "eval_samples_per_second": 19.296, "eval_steps_per_second": 3.217, "step": 33750 }, { "epoch": 1.2808797331820498, "grad_norm": 0.30078125, "learning_rate": 0.00011460651418647827, "loss": 2.765658378601074, "step": 33760 }, { "epoch": 1.281259146572813, "grad_norm": 0.30078125, "learning_rate": 0.00011449874628025015, "loss": 2.7683923721313475, "step": 33770 }, { "epoch": 1.2816385599635765, "grad_norm": 0.3046875, "learning_rate": 0.00011439100874230741, "loss": 2.798405075073242, "step": 33780 }, { "epoch": 1.2820179733543395, "grad_norm": 0.306640625, "learning_rate": 0.00011428330161091611, "loss": 2.7962112426757812, "step": 33790 }, { "epoch": 1.2823973867451028, "grad_norm": 0.3046875, "learning_rate": 0.00011417562492433167, "loss": 2.7905603408813477, "step": 33800 }, { "epoch": 1.2827768001358661, "grad_norm": 0.296875, "learning_rate": 0.00011406797872079849, "loss": 2.7646188735961914, "step": 33810 }, { "epoch": 1.2831562135266295, "grad_norm": 0.30078125, "learning_rate": 0.00011396036303855053, "loss": 2.7628385543823244, "step": 33820 }, { "epoch": 1.2835356269173928, "grad_norm": 0.32421875, "learning_rate": 0.00011385277791581042, "loss": 2.776811218261719, "step": 33830 }, { "epoch": 1.2839150403081558, "grad_norm": 0.291015625, "learning_rate": 0.00011374522339079033, "loss": 2.7845245361328126, "step": 33840 }, { "epoch": 1.2842944536989191, "grad_norm": 0.298828125, "learning_rate": 0.00011363769950169128, "loss": 2.7581653594970703, "step": 33850 }, { "epoch": 1.2846738670896825, "grad_norm": 0.291015625, "learning_rate": 0.00011353020628670382, "loss": 2.7461042404174805, "step": 33860 }, { "epoch": 1.2850532804804458, "grad_norm": 0.3125, "learning_rate": 0.00011342274378400697, "loss": 2.7716253280639647, "step": 33870 }, { "epoch": 1.285432693871209, "grad_norm": 0.306640625, "learning_rate": 0.00011331531203176944, "loss": 2.786782646179199, "step": 33880 }, { "epoch": 1.2858121072619724, "grad_norm": 0.298828125, "learning_rate": 0.00011320791106814863, "loss": 2.7670610427856444, "step": 33890 }, { "epoch": 1.2861915206527357, "grad_norm": 0.291015625, "learning_rate": 0.00011310054093129139, "loss": 2.793046760559082, "step": 33900 }, { "epoch": 1.2865709340434988, "grad_norm": 0.298828125, "learning_rate": 0.00011299320165933305, "loss": 2.788967323303223, "step": 33910 }, { "epoch": 1.286950347434262, "grad_norm": 0.298828125, "learning_rate": 0.00011288589329039856, "loss": 2.798797607421875, "step": 33920 }, { "epoch": 1.2873297608250254, "grad_norm": 0.3046875, "learning_rate": 0.00011277861586260157, "loss": 2.771337699890137, "step": 33930 }, { "epoch": 1.2877091742157887, "grad_norm": 0.30859375, "learning_rate": 0.0001126713694140448, "loss": 2.793935012817383, "step": 33940 }, { "epoch": 1.288088587606552, "grad_norm": 0.30078125, "learning_rate": 0.00011256415398281989, "loss": 2.77231502532959, "step": 33950 }, { "epoch": 1.288468000997315, "grad_norm": 0.314453125, "learning_rate": 0.00011245696960700778, "loss": 2.784613609313965, "step": 33960 }, { "epoch": 1.2888474143880786, "grad_norm": 0.306640625, "learning_rate": 0.00011234981632467798, "loss": 2.7741989135742187, "step": 33970 }, { "epoch": 1.2892268277788417, "grad_norm": 0.30078125, "learning_rate": 0.0001122426941738892, "loss": 2.771740531921387, "step": 33980 }, { "epoch": 1.289606241169605, "grad_norm": 0.29296875, "learning_rate": 0.00011213560319268892, "loss": 2.800338935852051, "step": 33990 }, { "epoch": 1.2899856545603683, "grad_norm": 0.296875, "learning_rate": 0.00011202854341911378, "loss": 2.783330535888672, "step": 34000 }, { "epoch": 1.2899856545603683, "eval_loss": 2.7788021564483643, "eval_runtime": 198.5325, "eval_samples_per_second": 19.186, "eval_steps_per_second": 3.198, "step": 34000 }, { "epoch": 1.2903650679511316, "grad_norm": 0.296875, "learning_rate": 0.00011192151489118919, "loss": 2.7633642196655273, "step": 34010 }, { "epoch": 1.290744481341895, "grad_norm": 0.314453125, "learning_rate": 0.00011181451764692943, "loss": 2.7653278350830077, "step": 34020 }, { "epoch": 1.291123894732658, "grad_norm": 0.30859375, "learning_rate": 0.00011170755172433766, "loss": 2.740658760070801, "step": 34030 }, { "epoch": 1.2915033081234213, "grad_norm": 0.29296875, "learning_rate": 0.00011160061716140622, "loss": 2.7837114334106445, "step": 34040 }, { "epoch": 1.2918827215141846, "grad_norm": 0.294921875, "learning_rate": 0.00011149371399611573, "loss": 2.7838422775268556, "step": 34050 }, { "epoch": 1.292262134904948, "grad_norm": 0.314453125, "learning_rate": 0.00011138684226643624, "loss": 2.8122732162475588, "step": 34060 }, { "epoch": 1.2926415482957112, "grad_norm": 0.296875, "learning_rate": 0.00011128000201032622, "loss": 2.775711250305176, "step": 34070 }, { "epoch": 1.2930209616864745, "grad_norm": 0.296875, "learning_rate": 0.00011117319326573341, "loss": 2.7805658340454102, "step": 34080 }, { "epoch": 1.2934003750772378, "grad_norm": 0.296875, "learning_rate": 0.00011106641607059369, "loss": 2.7611513137817383, "step": 34090 }, { "epoch": 1.293779788468001, "grad_norm": 0.306640625, "learning_rate": 0.00011095967046283242, "loss": 2.7799476623535155, "step": 34100 }, { "epoch": 1.2941592018587642, "grad_norm": 0.298828125, "learning_rate": 0.00011085295648036334, "loss": 2.7925670623779295, "step": 34110 }, { "epoch": 1.2945386152495275, "grad_norm": 0.298828125, "learning_rate": 0.00011074627416108905, "loss": 2.8068300247192384, "step": 34120 }, { "epoch": 1.2949180286402908, "grad_norm": 0.298828125, "learning_rate": 0.00011063962354290085, "loss": 2.7807952880859377, "step": 34130 }, { "epoch": 1.2952974420310541, "grad_norm": 0.298828125, "learning_rate": 0.000110533004663679, "loss": 2.778495025634766, "step": 34140 }, { "epoch": 1.2956768554218174, "grad_norm": 0.3203125, "learning_rate": 0.00011042641756129221, "loss": 2.7653680801391602, "step": 34150 }, { "epoch": 1.2960562688125807, "grad_norm": 0.298828125, "learning_rate": 0.00011031986227359812, "loss": 2.7558094024658204, "step": 34160 }, { "epoch": 1.2964356822033438, "grad_norm": 0.302734375, "learning_rate": 0.0001102133388384428, "loss": 2.774689483642578, "step": 34170 }, { "epoch": 1.2968150955941071, "grad_norm": 0.310546875, "learning_rate": 0.00011010684729366141, "loss": 2.7863292694091797, "step": 34180 }, { "epoch": 1.2971945089848704, "grad_norm": 0.30078125, "learning_rate": 0.00011000038767707743, "loss": 2.7735788345336916, "step": 34190 }, { "epoch": 1.2975739223756337, "grad_norm": 0.296875, "learning_rate": 0.00010989396002650315, "loss": 2.7460000991821287, "step": 34200 }, { "epoch": 1.297953335766397, "grad_norm": 0.30078125, "learning_rate": 0.00010978756437973938, "loss": 2.7795949935913087, "step": 34210 }, { "epoch": 1.2983327491571601, "grad_norm": 0.294921875, "learning_rate": 0.00010968120077457582, "loss": 2.7695075988769533, "step": 34220 }, { "epoch": 1.2987121625479237, "grad_norm": 0.298828125, "learning_rate": 0.00010957486924879059, "loss": 2.7750617980957033, "step": 34230 }, { "epoch": 1.2990915759386867, "grad_norm": 0.291015625, "learning_rate": 0.00010946856984015044, "loss": 2.763701629638672, "step": 34240 }, { "epoch": 1.29947098932945, "grad_norm": 0.298828125, "learning_rate": 0.0001093623025864106, "loss": 2.7443784713745116, "step": 34250 }, { "epoch": 1.29947098932945, "eval_loss": 2.7785723209381104, "eval_runtime": 238.9485, "eval_samples_per_second": 15.941, "eval_steps_per_second": 2.657, "step": 34250 }, { "epoch": 1.2998504027202133, "grad_norm": 0.296875, "learning_rate": 0.00010925606752531528, "loss": 2.792083740234375, "step": 34260 }, { "epoch": 1.3002298161109767, "grad_norm": 0.296875, "learning_rate": 0.0001091498646945968, "loss": 2.7427431106567384, "step": 34270 }, { "epoch": 1.30060922950174, "grad_norm": 0.318359375, "learning_rate": 0.00010904369413197628, "loss": 2.7977365493774413, "step": 34280 }, { "epoch": 1.300988642892503, "grad_norm": 0.296875, "learning_rate": 0.00010893755587516323, "loss": 2.7925338745117188, "step": 34290 }, { "epoch": 1.3013680562832664, "grad_norm": 0.291015625, "learning_rate": 0.00010883144996185593, "loss": 2.754009819030762, "step": 34300 }, { "epoch": 1.3017474696740297, "grad_norm": 0.294921875, "learning_rate": 0.00010872537642974092, "loss": 2.7550333023071287, "step": 34310 }, { "epoch": 1.302126883064793, "grad_norm": 0.30859375, "learning_rate": 0.00010861933531649335, "loss": 2.7757757186889647, "step": 34320 }, { "epoch": 1.3025062964555563, "grad_norm": 0.3125, "learning_rate": 0.00010851332665977685, "loss": 2.734751892089844, "step": 34330 }, { "epoch": 1.3028857098463196, "grad_norm": 0.302734375, "learning_rate": 0.00010840735049724342, "loss": 2.781819152832031, "step": 34340 }, { "epoch": 1.3032651232370829, "grad_norm": 0.298828125, "learning_rate": 0.00010830140686653376, "loss": 2.7879188537597654, "step": 34350 }, { "epoch": 1.303644536627846, "grad_norm": 0.294921875, "learning_rate": 0.00010819549580527682, "loss": 2.7699121475219726, "step": 34360 }, { "epoch": 1.3040239500186093, "grad_norm": 0.294921875, "learning_rate": 0.00010808961735109, "loss": 2.7371002197265626, "step": 34370 }, { "epoch": 1.3044033634093726, "grad_norm": 0.30078125, "learning_rate": 0.00010798377154157907, "loss": 2.7976442337036134, "step": 34380 }, { "epoch": 1.3047827768001359, "grad_norm": 0.306640625, "learning_rate": 0.00010787795841433845, "loss": 2.7682931900024412, "step": 34390 }, { "epoch": 1.3051621901908992, "grad_norm": 0.291015625, "learning_rate": 0.00010777217800695072, "loss": 2.7827295303344726, "step": 34400 }, { "epoch": 1.3055416035816625, "grad_norm": 0.298828125, "learning_rate": 0.00010766643035698685, "loss": 2.7657197952270507, "step": 34410 }, { "epoch": 1.3059210169724258, "grad_norm": 0.296875, "learning_rate": 0.00010756071550200624, "loss": 2.762852096557617, "step": 34420 }, { "epoch": 1.3063004303631889, "grad_norm": 0.298828125, "learning_rate": 0.00010745503347955667, "loss": 2.7748672485351564, "step": 34430 }, { "epoch": 1.3066798437539522, "grad_norm": 0.291015625, "learning_rate": 0.00010734938432717427, "loss": 2.774866485595703, "step": 34440 }, { "epoch": 1.3070592571447155, "grad_norm": 0.306640625, "learning_rate": 0.00010724376808238332, "loss": 2.7405136108398436, "step": 34450 }, { "epoch": 1.3074386705354788, "grad_norm": 0.294921875, "learning_rate": 0.00010713818478269657, "loss": 2.7854679107666014, "step": 34460 }, { "epoch": 1.307818083926242, "grad_norm": 0.298828125, "learning_rate": 0.00010703263446561511, "loss": 2.7679651260375975, "step": 34470 }, { "epoch": 1.3081974973170052, "grad_norm": 0.30078125, "learning_rate": 0.0001069271171686282, "loss": 2.7588031768798826, "step": 34480 }, { "epoch": 1.3085769107077687, "grad_norm": 0.2890625, "learning_rate": 0.00010682163292921339, "loss": 2.766629219055176, "step": 34490 }, { "epoch": 1.3089563240985318, "grad_norm": 0.30859375, "learning_rate": 0.00010671618178483646, "loss": 2.7901681900024413, "step": 34500 }, { "epoch": 1.3089563240985318, "eval_loss": 2.778262138366699, "eval_runtime": 238.7545, "eval_samples_per_second": 15.954, "eval_steps_per_second": 2.66, "step": 34500 }, { "epoch": 1.309335737489295, "grad_norm": 0.294921875, "learning_rate": 0.0001066107637729516, "loss": 2.7767948150634765, "step": 34510 }, { "epoch": 1.3097151508800584, "grad_norm": 0.30078125, "learning_rate": 0.0001065053789310011, "loss": 2.7888465881347657, "step": 34520 }, { "epoch": 1.3100945642708217, "grad_norm": 0.2890625, "learning_rate": 0.00010640002729641542, "loss": 2.7391164779663084, "step": 34530 }, { "epoch": 1.310473977661585, "grad_norm": 0.3046875, "learning_rate": 0.00010629470890661326, "loss": 2.7888038635253904, "step": 34540 }, { "epoch": 1.310853391052348, "grad_norm": 0.291015625, "learning_rate": 0.00010618942379900176, "loss": 2.778070831298828, "step": 34550 }, { "epoch": 1.3112328044431114, "grad_norm": 0.306640625, "learning_rate": 0.00010608417201097572, "loss": 2.7660036087036133, "step": 34560 }, { "epoch": 1.3116122178338747, "grad_norm": 0.298828125, "learning_rate": 0.00010597895357991865, "loss": 2.7961105346679687, "step": 34570 }, { "epoch": 1.311991631224638, "grad_norm": 0.30859375, "learning_rate": 0.00010587376854320178, "loss": 2.7670385360717775, "step": 34580 }, { "epoch": 1.3123710446154013, "grad_norm": 0.294921875, "learning_rate": 0.00010576861693818494, "loss": 2.763417625427246, "step": 34590 }, { "epoch": 1.3127504580061646, "grad_norm": 0.296875, "learning_rate": 0.00010566349880221548, "loss": 2.801090049743652, "step": 34600 }, { "epoch": 1.313129871396928, "grad_norm": 0.30078125, "learning_rate": 0.00010555841417262946, "loss": 2.774119567871094, "step": 34610 }, { "epoch": 1.313509284787691, "grad_norm": 0.294921875, "learning_rate": 0.0001054533630867506, "loss": 2.7587631225585936, "step": 34620 }, { "epoch": 1.3138886981784543, "grad_norm": 0.29296875, "learning_rate": 0.0001053483455818911, "loss": 2.7458959579467774, "step": 34630 }, { "epoch": 1.3142681115692176, "grad_norm": 0.302734375, "learning_rate": 0.00010524336169535076, "loss": 2.7486103057861326, "step": 34640 }, { "epoch": 1.314647524959981, "grad_norm": 0.30078125, "learning_rate": 0.00010513841146441787, "loss": 2.7570568084716798, "step": 34650 }, { "epoch": 1.3150269383507442, "grad_norm": 0.30078125, "learning_rate": 0.00010503349492636853, "loss": 2.7530284881591798, "step": 34660 }, { "epoch": 1.3154063517415073, "grad_norm": 0.298828125, "learning_rate": 0.00010492861211846697, "loss": 2.785405731201172, "step": 34670 }, { "epoch": 1.3157857651322709, "grad_norm": 0.310546875, "learning_rate": 0.00010482376307796529, "loss": 2.774991035461426, "step": 34680 }, { "epoch": 1.316165178523034, "grad_norm": 0.298828125, "learning_rate": 0.00010471894784210386, "loss": 2.7676395416259765, "step": 34690 }, { "epoch": 1.3165445919137972, "grad_norm": 0.298828125, "learning_rate": 0.00010461416644811084, "loss": 2.7750728607177733, "step": 34700 }, { "epoch": 1.3169240053045606, "grad_norm": 0.30859375, "learning_rate": 0.0001045094189332024, "loss": 2.7922744750976562, "step": 34710 }, { "epoch": 1.3173034186953239, "grad_norm": 0.302734375, "learning_rate": 0.00010440470533458266, "loss": 2.7546600341796874, "step": 34720 }, { "epoch": 1.3176828320860872, "grad_norm": 0.302734375, "learning_rate": 0.00010430002568944395, "loss": 2.7463201522827148, "step": 34730 }, { "epoch": 1.3180622454768502, "grad_norm": 0.302734375, "learning_rate": 0.00010419538003496601, "loss": 2.7548770904541016, "step": 34740 }, { "epoch": 1.3184416588676136, "grad_norm": 0.294921875, "learning_rate": 0.00010409076840831704, "loss": 2.7711191177368164, "step": 34750 }, { "epoch": 1.3184416588676136, "eval_loss": 2.777784824371338, "eval_runtime": 239.0476, "eval_samples_per_second": 15.934, "eval_steps_per_second": 2.656, "step": 34750 }, { "epoch": 1.3188210722583769, "grad_norm": 0.302734375, "learning_rate": 0.00010398619084665282, "loss": 2.8024654388427734, "step": 34760 }, { "epoch": 1.3192004856491402, "grad_norm": 0.3125, "learning_rate": 0.00010388164738711736, "loss": 2.7465465545654295, "step": 34770 }, { "epoch": 1.3195798990399035, "grad_norm": 0.294921875, "learning_rate": 0.00010377713806684202, "loss": 2.782278060913086, "step": 34780 }, { "epoch": 1.3199593124306668, "grad_norm": 0.30078125, "learning_rate": 0.00010367266292294662, "loss": 2.7681581497192385, "step": 34790 }, { "epoch": 1.32033872582143, "grad_norm": 0.3046875, "learning_rate": 0.00010356822199253837, "loss": 2.781281280517578, "step": 34800 }, { "epoch": 1.3207181392121932, "grad_norm": 0.3046875, "learning_rate": 0.00010346381531271286, "loss": 2.7454206466674806, "step": 34810 }, { "epoch": 1.3210975526029565, "grad_norm": 0.314453125, "learning_rate": 0.00010335944292055281, "loss": 2.762623405456543, "step": 34820 }, { "epoch": 1.3214769659937198, "grad_norm": 0.302734375, "learning_rate": 0.00010325510485312942, "loss": 2.7845970153808595, "step": 34830 }, { "epoch": 1.321856379384483, "grad_norm": 0.296875, "learning_rate": 0.00010315080114750131, "loss": 2.7701690673828123, "step": 34840 }, { "epoch": 1.3222357927752464, "grad_norm": 0.294921875, "learning_rate": 0.00010304653184071504, "loss": 2.795565605163574, "step": 34850 }, { "epoch": 1.3226152061660097, "grad_norm": 0.2890625, "learning_rate": 0.00010294229696980484, "loss": 2.7978899002075197, "step": 34860 }, { "epoch": 1.322994619556773, "grad_norm": 0.296875, "learning_rate": 0.00010283809657179291, "loss": 2.7724071502685548, "step": 34870 }, { "epoch": 1.323374032947536, "grad_norm": 0.296875, "learning_rate": 0.00010273393068368906, "loss": 2.7441791534423827, "step": 34880 }, { "epoch": 1.3237534463382994, "grad_norm": 0.30078125, "learning_rate": 0.00010262979934249084, "loss": 2.792716217041016, "step": 34890 }, { "epoch": 1.3241328597290627, "grad_norm": 0.298828125, "learning_rate": 0.00010252570258518348, "loss": 2.780387115478516, "step": 34900 }, { "epoch": 1.324512273119826, "grad_norm": 0.287109375, "learning_rate": 0.00010242164044874018, "loss": 2.767241096496582, "step": 34910 }, { "epoch": 1.3248916865105893, "grad_norm": 0.455078125, "learning_rate": 0.00010231761297012156, "loss": 2.7867053985595702, "step": 34920 }, { "epoch": 1.3252710999013524, "grad_norm": 0.302734375, "learning_rate": 0.00010221362018627601, "loss": 2.7598480224609374, "step": 34930 }, { "epoch": 1.325650513292116, "grad_norm": 0.2890625, "learning_rate": 0.00010210966213413972, "loss": 2.7762826919555663, "step": 34940 }, { "epoch": 1.326029926682879, "grad_norm": 0.294921875, "learning_rate": 0.00010200573885063642, "loss": 2.757663345336914, "step": 34950 }, { "epoch": 1.3264093400736423, "grad_norm": 0.296875, "learning_rate": 0.00010190185037267749, "loss": 2.7922170639038084, "step": 34960 }, { "epoch": 1.3267887534644056, "grad_norm": 0.302734375, "learning_rate": 0.00010179799673716195, "loss": 2.788397216796875, "step": 34970 }, { "epoch": 1.327168166855169, "grad_norm": 0.29296875, "learning_rate": 0.00010169417798097657, "loss": 2.7610313415527346, "step": 34980 }, { "epoch": 1.3275475802459322, "grad_norm": 0.29296875, "learning_rate": 0.0001015903941409956, "loss": 2.7711814880371093, "step": 34990 }, { "epoch": 1.3279269936366953, "grad_norm": 0.29296875, "learning_rate": 0.00010148664525408089, "loss": 2.7562623977661134, "step": 35000 }, { "epoch": 1.3279269936366953, "eval_loss": 2.777465343475342, "eval_runtime": 239.0089, "eval_samples_per_second": 15.937, "eval_steps_per_second": 2.657, "step": 35000 }, { "epoch": 1.3283064070274586, "grad_norm": 0.294921875, "learning_rate": 0.00010138293135708186, "loss": 2.757858657836914, "step": 35010 }, { "epoch": 1.328685820418222, "grad_norm": 0.3046875, "learning_rate": 0.0001012792524868357, "loss": 2.762888717651367, "step": 35020 }, { "epoch": 1.3290652338089852, "grad_norm": 0.30078125, "learning_rate": 0.00010117560868016692, "loss": 2.748147201538086, "step": 35030 }, { "epoch": 1.3294446471997485, "grad_norm": 0.30078125, "learning_rate": 0.00010107199997388765, "loss": 2.7800674438476562, "step": 35040 }, { "epoch": 1.3298240605905118, "grad_norm": 0.302734375, "learning_rate": 0.00010096842640479755, "loss": 2.7784889221191404, "step": 35050 }, { "epoch": 1.3302034739812751, "grad_norm": 0.302734375, "learning_rate": 0.00010086488800968399, "loss": 2.7559396743774416, "step": 35060 }, { "epoch": 1.3305828873720382, "grad_norm": 0.291015625, "learning_rate": 0.00010076138482532142, "loss": 2.7655403137207033, "step": 35070 }, { "epoch": 1.3309623007628015, "grad_norm": 0.296875, "learning_rate": 0.00010065791688847223, "loss": 2.7580310821533205, "step": 35080 }, { "epoch": 1.3313417141535648, "grad_norm": 0.29296875, "learning_rate": 0.00010055448423588598, "loss": 2.7623186111450195, "step": 35090 }, { "epoch": 1.3317211275443281, "grad_norm": 0.296875, "learning_rate": 0.00010045108690430004, "loss": 2.7950412750244142, "step": 35100 }, { "epoch": 1.3321005409350914, "grad_norm": 0.298828125, "learning_rate": 0.00010034772493043867, "loss": 2.7801862716674806, "step": 35110 }, { "epoch": 1.3324799543258548, "grad_norm": 0.302734375, "learning_rate": 0.00010024439835101423, "loss": 2.781943511962891, "step": 35120 }, { "epoch": 1.332859367716618, "grad_norm": 0.29296875, "learning_rate": 0.00010014110720272593, "loss": 2.777988815307617, "step": 35130 }, { "epoch": 1.3332387811073811, "grad_norm": 0.294921875, "learning_rate": 0.00010003785152226099, "loss": 2.7596792221069335, "step": 35140 }, { "epoch": 1.3336181944981444, "grad_norm": 0.30078125, "learning_rate": 9.993463134629333e-05, "loss": 2.792178726196289, "step": 35150 }, { "epoch": 1.3339976078889078, "grad_norm": 0.294921875, "learning_rate": 9.983144671148487e-05, "loss": 2.767119789123535, "step": 35160 }, { "epoch": 1.334377021279671, "grad_norm": 0.302734375, "learning_rate": 9.972829765448464e-05, "loss": 2.775430679321289, "step": 35170 }, { "epoch": 1.3347564346704344, "grad_norm": 0.296875, "learning_rate": 9.962518421192902e-05, "loss": 2.767574691772461, "step": 35180 }, { "epoch": 1.3351358480611974, "grad_norm": 0.296875, "learning_rate": 9.952210642044171e-05, "loss": 2.758332061767578, "step": 35190 }, { "epoch": 1.335515261451961, "grad_norm": 0.298828125, "learning_rate": 9.941906431663397e-05, "loss": 2.7663671493530275, "step": 35200 }, { "epoch": 1.335894674842724, "grad_norm": 0.296875, "learning_rate": 9.931605793710417e-05, "loss": 2.756064796447754, "step": 35210 }, { "epoch": 1.3362740882334874, "grad_norm": 0.3046875, "learning_rate": 9.921308731843803e-05, "loss": 2.751726531982422, "step": 35220 }, { "epoch": 1.3366535016242507, "grad_norm": 0.302734375, "learning_rate": 9.911015249720856e-05, "loss": 2.7624998092651367, "step": 35230 }, { "epoch": 1.337032915015014, "grad_norm": 0.291015625, "learning_rate": 9.90072535099763e-05, "loss": 2.768180274963379, "step": 35240 }, { "epoch": 1.3374123284057773, "grad_norm": 0.310546875, "learning_rate": 9.890439039328854e-05, "loss": 2.739475059509277, "step": 35250 }, { "epoch": 1.3374123284057773, "eval_loss": 2.7771129608154297, "eval_runtime": 190.1972, "eval_samples_per_second": 20.027, "eval_steps_per_second": 3.339, "step": 35250 }, { "epoch": 1.3377917417965404, "grad_norm": 0.296875, "learning_rate": 9.880156318368037e-05, "loss": 2.7787734985351564, "step": 35260 }, { "epoch": 1.3381711551873037, "grad_norm": 0.3046875, "learning_rate": 9.869877191767377e-05, "loss": 2.7565235137939452, "step": 35270 }, { "epoch": 1.338550568578067, "grad_norm": 0.29296875, "learning_rate": 9.859601663177825e-05, "loss": 2.7874698638916016, "step": 35280 }, { "epoch": 1.3389299819688303, "grad_norm": 0.3046875, "learning_rate": 9.849329736249013e-05, "loss": 2.7844417572021483, "step": 35290 }, { "epoch": 1.3393093953595936, "grad_norm": 0.296875, "learning_rate": 9.839061414629338e-05, "loss": 2.756439781188965, "step": 35300 }, { "epoch": 1.339688808750357, "grad_norm": 0.2890625, "learning_rate": 9.828796701965881e-05, "loss": 2.758011054992676, "step": 35310 }, { "epoch": 1.3400682221411202, "grad_norm": 0.298828125, "learning_rate": 9.818535601904482e-05, "loss": 2.7790714263916017, "step": 35320 }, { "epoch": 1.3404476355318833, "grad_norm": 0.294921875, "learning_rate": 9.80827811808964e-05, "loss": 2.7486616134643556, "step": 35330 }, { "epoch": 1.3408270489226466, "grad_norm": 0.294921875, "learning_rate": 9.798024254164624e-05, "loss": 2.7645074844360353, "step": 35340 }, { "epoch": 1.34120646231341, "grad_norm": 0.294921875, "learning_rate": 9.787774013771392e-05, "loss": 2.769732856750488, "step": 35350 }, { "epoch": 1.3415858757041732, "grad_norm": 0.296875, "learning_rate": 9.777527400550613e-05, "loss": 2.759073829650879, "step": 35360 }, { "epoch": 1.3419652890949365, "grad_norm": 0.302734375, "learning_rate": 9.767284418141669e-05, "loss": 2.743440055847168, "step": 35370 }, { "epoch": 1.3423447024856996, "grad_norm": 0.296875, "learning_rate": 9.757045070182669e-05, "loss": 2.8006237030029295, "step": 35380 }, { "epoch": 1.3427241158764631, "grad_norm": 0.29296875, "learning_rate": 9.746809360310416e-05, "loss": 2.7505281448364256, "step": 35390 }, { "epoch": 1.3431035292672262, "grad_norm": 0.298828125, "learning_rate": 9.736577292160417e-05, "loss": 2.7769500732421877, "step": 35400 }, { "epoch": 1.3434829426579895, "grad_norm": 0.298828125, "learning_rate": 9.726348869366889e-05, "loss": 2.7662590026855467, "step": 35410 }, { "epoch": 1.3438623560487528, "grad_norm": 0.2890625, "learning_rate": 9.716124095562771e-05, "loss": 2.7367374420166017, "step": 35420 }, { "epoch": 1.3442417694395161, "grad_norm": 0.294921875, "learning_rate": 9.705902974379684e-05, "loss": 2.7806066513061523, "step": 35430 }, { "epoch": 1.3446211828302794, "grad_norm": 0.29296875, "learning_rate": 9.695685509447961e-05, "loss": 2.77655143737793, "step": 35440 }, { "epoch": 1.3450005962210425, "grad_norm": 0.302734375, "learning_rate": 9.685471704396627e-05, "loss": 2.784286689758301, "step": 35450 }, { "epoch": 1.345380009611806, "grad_norm": 0.30078125, "learning_rate": 9.67526156285343e-05, "loss": 2.7627546310424806, "step": 35460 }, { "epoch": 1.3457594230025691, "grad_norm": 0.296875, "learning_rate": 9.665055088444797e-05, "loss": 2.7663673400878905, "step": 35470 }, { "epoch": 1.3461388363933324, "grad_norm": 0.32421875, "learning_rate": 9.654852284795852e-05, "loss": 2.7726552963256834, "step": 35480 }, { "epoch": 1.3465182497840957, "grad_norm": 0.298828125, "learning_rate": 9.644653155530419e-05, "loss": 2.75240421295166, "step": 35490 }, { "epoch": 1.346897663174859, "grad_norm": 0.294921875, "learning_rate": 9.634457704271031e-05, "loss": 2.758844757080078, "step": 35500 }, { "epoch": 1.346897663174859, "eval_loss": 2.777334451675415, "eval_runtime": 190.0507, "eval_samples_per_second": 20.042, "eval_steps_per_second": 3.341, "step": 35500 }, { "epoch": 1.3472770765656223, "grad_norm": 0.30078125, "learning_rate": 9.624265934638899e-05, "loss": 2.77371768951416, "step": 35510 }, { "epoch": 1.3476564899563854, "grad_norm": 0.298828125, "learning_rate": 9.614077850253922e-05, "loss": 2.7727659225463865, "step": 35520 }, { "epoch": 1.3480359033471487, "grad_norm": 0.302734375, "learning_rate": 9.603893454734698e-05, "loss": 2.783967399597168, "step": 35530 }, { "epoch": 1.348415316737912, "grad_norm": 0.29296875, "learning_rate": 9.593712751698525e-05, "loss": 2.782316780090332, "step": 35540 }, { "epoch": 1.3487947301286753, "grad_norm": 0.30078125, "learning_rate": 9.583535744761371e-05, "loss": 2.7872827529907225, "step": 35550 }, { "epoch": 1.3491741435194387, "grad_norm": 0.310546875, "learning_rate": 9.573362437537905e-05, "loss": 2.778621864318848, "step": 35560 }, { "epoch": 1.349553556910202, "grad_norm": 0.298828125, "learning_rate": 9.563192833641472e-05, "loss": 2.757267951965332, "step": 35570 }, { "epoch": 1.3499329703009653, "grad_norm": 0.30078125, "learning_rate": 9.5530269366841e-05, "loss": 2.757533645629883, "step": 35580 }, { "epoch": 1.3503123836917283, "grad_norm": 0.302734375, "learning_rate": 9.542864750276519e-05, "loss": 2.753961753845215, "step": 35590 }, { "epoch": 1.3506917970824917, "grad_norm": 0.298828125, "learning_rate": 9.532706278028124e-05, "loss": 2.77132568359375, "step": 35600 }, { "epoch": 1.351071210473255, "grad_norm": 0.306640625, "learning_rate": 9.52255152354699e-05, "loss": 2.7579395294189455, "step": 35610 }, { "epoch": 1.3514506238640183, "grad_norm": 0.302734375, "learning_rate": 9.512400490439877e-05, "loss": 2.785846710205078, "step": 35620 }, { "epoch": 1.3518300372547816, "grad_norm": 0.298828125, "learning_rate": 9.502253182312232e-05, "loss": 2.7498249053955077, "step": 35630 }, { "epoch": 1.3522094506455447, "grad_norm": 0.298828125, "learning_rate": 9.492109602768165e-05, "loss": 2.7427444458007812, "step": 35640 }, { "epoch": 1.3525888640363082, "grad_norm": 0.298828125, "learning_rate": 9.481969755410468e-05, "loss": 2.746337127685547, "step": 35650 }, { "epoch": 1.3529682774270713, "grad_norm": 0.296875, "learning_rate": 9.471833643840593e-05, "loss": 2.7842433929443358, "step": 35660 }, { "epoch": 1.3533476908178346, "grad_norm": 0.291015625, "learning_rate": 9.461701271658701e-05, "loss": 2.7451948165893554, "step": 35670 }, { "epoch": 1.3537271042085979, "grad_norm": 0.291015625, "learning_rate": 9.451572642463587e-05, "loss": 2.7855724334716796, "step": 35680 }, { "epoch": 1.3541065175993612, "grad_norm": 0.296875, "learning_rate": 9.441447759852737e-05, "loss": 2.754529571533203, "step": 35690 }, { "epoch": 1.3544859309901245, "grad_norm": 0.3046875, "learning_rate": 9.43132662742229e-05, "loss": 2.7653823852539063, "step": 35700 }, { "epoch": 1.3548653443808876, "grad_norm": 0.29296875, "learning_rate": 9.421209248767083e-05, "loss": 2.7926118850708006, "step": 35710 }, { "epoch": 1.3552447577716509, "grad_norm": 0.296875, "learning_rate": 9.411095627480586e-05, "loss": 2.75591983795166, "step": 35720 }, { "epoch": 1.3556241711624142, "grad_norm": 0.298828125, "learning_rate": 9.400985767154957e-05, "loss": 2.776497650146484, "step": 35730 }, { "epoch": 1.3560035845531775, "grad_norm": 0.29296875, "learning_rate": 9.390879671380999e-05, "loss": 2.7617395401000975, "step": 35740 }, { "epoch": 1.3563829979439408, "grad_norm": 0.291015625, "learning_rate": 9.38077734374821e-05, "loss": 2.779347229003906, "step": 35750 }, { "epoch": 1.3563829979439408, "eval_loss": 2.776610851287842, "eval_runtime": 197.4688, "eval_samples_per_second": 19.289, "eval_steps_per_second": 3.216, "step": 35750 }, { "epoch": 1.356762411334704, "grad_norm": 0.302734375, "learning_rate": 9.370678787844703e-05, "loss": 2.7436113357543945, "step": 35760 }, { "epoch": 1.3571418247254674, "grad_norm": 0.294921875, "learning_rate": 9.360584007257296e-05, "loss": 2.79432373046875, "step": 35770 }, { "epoch": 1.3575212381162305, "grad_norm": 0.302734375, "learning_rate": 9.350493005571435e-05, "loss": 2.791601371765137, "step": 35780 }, { "epoch": 1.3579006515069938, "grad_norm": 0.291015625, "learning_rate": 9.340405786371258e-05, "loss": 2.7955806732177733, "step": 35790 }, { "epoch": 1.358280064897757, "grad_norm": 0.294921875, "learning_rate": 9.330322353239505e-05, "loss": 2.7913970947265625, "step": 35800 }, { "epoch": 1.3586594782885204, "grad_norm": 0.296875, "learning_rate": 9.320242709757629e-05, "loss": 2.799349784851074, "step": 35810 }, { "epoch": 1.3590388916792837, "grad_norm": 0.291015625, "learning_rate": 9.310166859505695e-05, "loss": 2.775654602050781, "step": 35820 }, { "epoch": 1.359418305070047, "grad_norm": 0.302734375, "learning_rate": 9.300094806062465e-05, "loss": 2.743771553039551, "step": 35830 }, { "epoch": 1.3597977184608103, "grad_norm": 0.291015625, "learning_rate": 9.290026553005288e-05, "loss": 2.764354705810547, "step": 35840 }, { "epoch": 1.3601771318515734, "grad_norm": 0.302734375, "learning_rate": 9.279962103910227e-05, "loss": 2.772987937927246, "step": 35850 }, { "epoch": 1.3605565452423367, "grad_norm": 0.29296875, "learning_rate": 9.26990146235196e-05, "loss": 2.7696590423583984, "step": 35860 }, { "epoch": 1.3609359586331, "grad_norm": 0.30078125, "learning_rate": 9.259844631903818e-05, "loss": 2.778325080871582, "step": 35870 }, { "epoch": 1.3613153720238633, "grad_norm": 0.294921875, "learning_rate": 9.249791616137771e-05, "loss": 2.8242877960205077, "step": 35880 }, { "epoch": 1.3616947854146266, "grad_norm": 0.294921875, "learning_rate": 9.239742418624462e-05, "loss": 2.7647476196289062, "step": 35890 }, { "epoch": 1.3620741988053897, "grad_norm": 0.28515625, "learning_rate": 9.229697042933152e-05, "loss": 2.7643291473388674, "step": 35900 }, { "epoch": 1.3624536121961532, "grad_norm": 0.29296875, "learning_rate": 9.219655492631747e-05, "loss": 2.76641960144043, "step": 35910 }, { "epoch": 1.3628330255869163, "grad_norm": 0.2890625, "learning_rate": 9.209617771286796e-05, "loss": 2.78900260925293, "step": 35920 }, { "epoch": 1.3632124389776796, "grad_norm": 0.306640625, "learning_rate": 9.199583882463503e-05, "loss": 2.7456632614135743, "step": 35930 }, { "epoch": 1.363591852368443, "grad_norm": 0.29296875, "learning_rate": 9.189553829725692e-05, "loss": 2.7792230606079102, "step": 35940 }, { "epoch": 1.3639712657592062, "grad_norm": 0.294921875, "learning_rate": 9.179527616635832e-05, "loss": 2.788455772399902, "step": 35950 }, { "epoch": 1.3643506791499695, "grad_norm": 0.291015625, "learning_rate": 9.169505246755021e-05, "loss": 2.7611717224121093, "step": 35960 }, { "epoch": 1.3647300925407326, "grad_norm": 0.291015625, "learning_rate": 9.15948672364302e-05, "loss": 2.775840950012207, "step": 35970 }, { "epoch": 1.365109505931496, "grad_norm": 0.298828125, "learning_rate": 9.14947205085817e-05, "loss": 2.7424638748168944, "step": 35980 }, { "epoch": 1.3654889193222592, "grad_norm": 0.2890625, "learning_rate": 9.139461231957502e-05, "loss": 2.773838996887207, "step": 35990 }, { "epoch": 1.3658683327130225, "grad_norm": 0.294921875, "learning_rate": 9.129454270496639e-05, "loss": 2.782877731323242, "step": 36000 }, { "epoch": 1.3658683327130225, "eval_loss": 2.776322841644287, "eval_runtime": 190.6, "eval_samples_per_second": 19.984, "eval_steps_per_second": 3.332, "step": 36000 }, { "epoch": 1.3662477461037859, "grad_norm": 0.302734375, "learning_rate": 9.119451170029865e-05, "loss": 2.816996383666992, "step": 36010 }, { "epoch": 1.3666271594945492, "grad_norm": 0.29296875, "learning_rate": 9.10945193411005e-05, "loss": 2.7784223556518555, "step": 36020 }, { "epoch": 1.3670065728853125, "grad_norm": 0.30078125, "learning_rate": 9.099456566288736e-05, "loss": 2.7633268356323244, "step": 36030 }, { "epoch": 1.3673859862760755, "grad_norm": 0.296875, "learning_rate": 9.089465070116058e-05, "loss": 2.780078887939453, "step": 36040 }, { "epoch": 1.3677653996668389, "grad_norm": 0.30078125, "learning_rate": 9.079477449140817e-05, "loss": 2.745391082763672, "step": 36050 }, { "epoch": 1.3681448130576022, "grad_norm": 0.291015625, "learning_rate": 9.069493706910372e-05, "loss": 2.7804088592529297, "step": 36060 }, { "epoch": 1.3685242264483655, "grad_norm": 0.296875, "learning_rate": 9.05951384697077e-05, "loss": 2.766518402099609, "step": 36070 }, { "epoch": 1.3689036398391288, "grad_norm": 0.32421875, "learning_rate": 9.049537872866646e-05, "loss": 2.741163635253906, "step": 36080 }, { "epoch": 1.3692830532298919, "grad_norm": 0.30078125, "learning_rate": 9.03956578814125e-05, "loss": 2.7730825424194334, "step": 36090 }, { "epoch": 1.3696624666206554, "grad_norm": 0.291015625, "learning_rate": 9.029597596336479e-05, "loss": 2.7365594863891602, "step": 36100 }, { "epoch": 1.3700418800114185, "grad_norm": 0.296875, "learning_rate": 9.019633300992816e-05, "loss": 2.7447656631469726, "step": 36110 }, { "epoch": 1.3704212934021818, "grad_norm": 0.29296875, "learning_rate": 9.00967290564938e-05, "loss": 2.7633312225341795, "step": 36120 }, { "epoch": 1.370800706792945, "grad_norm": 0.29296875, "learning_rate": 8.999716413843891e-05, "loss": 2.755478858947754, "step": 36130 }, { "epoch": 1.3711801201837084, "grad_norm": 0.294921875, "learning_rate": 8.989763829112698e-05, "loss": 2.771398735046387, "step": 36140 }, { "epoch": 1.3715595335744717, "grad_norm": 0.296875, "learning_rate": 8.979815154990753e-05, "loss": 2.7757076263427733, "step": 36150 }, { "epoch": 1.3719389469652348, "grad_norm": 0.30078125, "learning_rate": 8.969870395011621e-05, "loss": 2.7868398666381835, "step": 36160 }, { "epoch": 1.3723183603559983, "grad_norm": 0.298828125, "learning_rate": 8.959929552707465e-05, "loss": 2.8166168212890623, "step": 36170 }, { "epoch": 1.3726977737467614, "grad_norm": 0.298828125, "learning_rate": 8.949992631609083e-05, "loss": 2.7569637298583984, "step": 36180 }, { "epoch": 1.3730771871375247, "grad_norm": 0.287109375, "learning_rate": 8.940059635245859e-05, "loss": 2.7877159118652344, "step": 36190 }, { "epoch": 1.373456600528288, "grad_norm": 0.296875, "learning_rate": 8.930130567145786e-05, "loss": 2.761771011352539, "step": 36200 }, { "epoch": 1.3738360139190513, "grad_norm": 0.28515625, "learning_rate": 8.92020543083546e-05, "loss": 2.779192352294922, "step": 36210 }, { "epoch": 1.3742154273098146, "grad_norm": 0.294921875, "learning_rate": 8.9102842298401e-05, "loss": 2.7806232452392576, "step": 36220 }, { "epoch": 1.3745948407005777, "grad_norm": 0.296875, "learning_rate": 8.900366967683502e-05, "loss": 2.7343490600585936, "step": 36230 }, { "epoch": 1.374974254091341, "grad_norm": 0.291015625, "learning_rate": 8.890453647888077e-05, "loss": 2.7685203552246094, "step": 36240 }, { "epoch": 1.3753536674821043, "grad_norm": 0.306640625, "learning_rate": 8.880544273974823e-05, "loss": 2.7639678955078124, "step": 36250 }, { "epoch": 1.3753536674821043, "eval_loss": 2.7760376930236816, "eval_runtime": 191.0593, "eval_samples_per_second": 19.936, "eval_steps_per_second": 3.324, "step": 36250 }, { "epoch": 1.3757330808728676, "grad_norm": 0.3046875, "learning_rate": 8.87063884946336e-05, "loss": 2.7565919876098635, "step": 36260 }, { "epoch": 1.376112494263631, "grad_norm": 0.298828125, "learning_rate": 8.860737377871883e-05, "loss": 2.752772331237793, "step": 36270 }, { "epoch": 1.3764919076543942, "grad_norm": 0.314453125, "learning_rate": 8.850839862717193e-05, "loss": 2.7927040100097655, "step": 36280 }, { "epoch": 1.3768713210451575, "grad_norm": 0.2890625, "learning_rate": 8.840946307514675e-05, "loss": 2.742411804199219, "step": 36290 }, { "epoch": 1.3772507344359206, "grad_norm": 0.296875, "learning_rate": 8.831056715778338e-05, "loss": 2.7574010848999024, "step": 36300 }, { "epoch": 1.377630147826684, "grad_norm": 0.294921875, "learning_rate": 8.821171091020733e-05, "loss": 2.7856969833374023, "step": 36310 }, { "epoch": 1.3780095612174472, "grad_norm": 0.29296875, "learning_rate": 8.811289436753051e-05, "loss": 2.81508731842041, "step": 36320 }, { "epoch": 1.3783889746082105, "grad_norm": 0.296875, "learning_rate": 8.801411756485038e-05, "loss": 2.752048873901367, "step": 36330 }, { "epoch": 1.3787683879989738, "grad_norm": 0.294921875, "learning_rate": 8.791538053725068e-05, "loss": 2.7523813247680664, "step": 36340 }, { "epoch": 1.379147801389737, "grad_norm": 0.291015625, "learning_rate": 8.781668331980042e-05, "loss": 2.770431327819824, "step": 36350 }, { "epoch": 1.3795272147805004, "grad_norm": 0.3125, "learning_rate": 8.771802594755507e-05, "loss": 2.759814643859863, "step": 36360 }, { "epoch": 1.3799066281712635, "grad_norm": 0.294921875, "learning_rate": 8.761940845555564e-05, "loss": 2.763759994506836, "step": 36370 }, { "epoch": 1.3802860415620268, "grad_norm": 0.298828125, "learning_rate": 8.752083087882898e-05, "loss": 2.798875427246094, "step": 36380 }, { "epoch": 1.3806654549527901, "grad_norm": 0.3046875, "learning_rate": 8.74222932523878e-05, "loss": 2.79052848815918, "step": 36390 }, { "epoch": 1.3810448683435534, "grad_norm": 0.298828125, "learning_rate": 8.732379561123076e-05, "loss": 2.7755359649658202, "step": 36400 }, { "epoch": 1.3814242817343168, "grad_norm": 0.298828125, "learning_rate": 8.722533799034214e-05, "loss": 2.7655569076538087, "step": 36410 }, { "epoch": 1.3818036951250798, "grad_norm": 0.296875, "learning_rate": 8.712692042469202e-05, "loss": 2.7466714859008787, "step": 36420 }, { "epoch": 1.3821831085158431, "grad_norm": 0.294921875, "learning_rate": 8.702854294923624e-05, "loss": 2.736204147338867, "step": 36430 }, { "epoch": 1.3825625219066064, "grad_norm": 0.298828125, "learning_rate": 8.69302055989166e-05, "loss": 2.746609687805176, "step": 36440 }, { "epoch": 1.3829419352973698, "grad_norm": 0.296875, "learning_rate": 8.683190840866045e-05, "loss": 2.772617530822754, "step": 36450 }, { "epoch": 1.383321348688133, "grad_norm": 0.2890625, "learning_rate": 8.67336514133809e-05, "loss": 2.7861690521240234, "step": 36460 }, { "epoch": 1.3837007620788964, "grad_norm": 0.294921875, "learning_rate": 8.663543464797675e-05, "loss": 2.7811466217041017, "step": 36470 }, { "epoch": 1.3840801754696597, "grad_norm": 0.30078125, "learning_rate": 8.653725814733278e-05, "loss": 2.7898639678955077, "step": 36480 }, { "epoch": 1.3844595888604228, "grad_norm": 0.30078125, "learning_rate": 8.6439121946319e-05, "loss": 2.742196273803711, "step": 36490 }, { "epoch": 1.384839002251186, "grad_norm": 0.294921875, "learning_rate": 8.634102607979157e-05, "loss": 2.78470458984375, "step": 36500 }, { "epoch": 1.384839002251186, "eval_loss": 2.7757537364959717, "eval_runtime": 192.1364, "eval_samples_per_second": 19.824, "eval_steps_per_second": 3.305, "step": 36500 }, { "epoch": 1.3852184156419494, "grad_norm": 0.2890625, "learning_rate": 8.624297058259196e-05, "loss": 2.750570869445801, "step": 36510 }, { "epoch": 1.3855978290327127, "grad_norm": 0.326171875, "learning_rate": 8.61449554895477e-05, "loss": 2.7899639129638674, "step": 36520 }, { "epoch": 1.385977242423476, "grad_norm": 0.294921875, "learning_rate": 8.604698083547147e-05, "loss": 2.757917594909668, "step": 36530 }, { "epoch": 1.3863566558142393, "grad_norm": 0.294921875, "learning_rate": 8.594904665516202e-05, "loss": 2.7510147094726562, "step": 36540 }, { "epoch": 1.3867360692050026, "grad_norm": 0.302734375, "learning_rate": 8.585115298340343e-05, "loss": 2.797645378112793, "step": 36550 }, { "epoch": 1.3871154825957657, "grad_norm": 0.310546875, "learning_rate": 8.575329985496574e-05, "loss": 2.7910009384155274, "step": 36560 }, { "epoch": 1.387494895986529, "grad_norm": 0.287109375, "learning_rate": 8.565548730460406e-05, "loss": 2.755792808532715, "step": 36570 }, { "epoch": 1.3878743093772923, "grad_norm": 0.294921875, "learning_rate": 8.555771536705965e-05, "loss": 2.7554580688476564, "step": 36580 }, { "epoch": 1.3882537227680556, "grad_norm": 0.294921875, "learning_rate": 8.545998407705897e-05, "loss": 2.781211090087891, "step": 36590 }, { "epoch": 1.388633136158819, "grad_norm": 0.291015625, "learning_rate": 8.53622934693142e-05, "loss": 2.782056999206543, "step": 36600 }, { "epoch": 1.389012549549582, "grad_norm": 0.296875, "learning_rate": 8.52646435785229e-05, "loss": 2.762912559509277, "step": 36610 }, { "epoch": 1.3893919629403455, "grad_norm": 0.3046875, "learning_rate": 8.516703443936853e-05, "loss": 2.790074348449707, "step": 36620 }, { "epoch": 1.3897713763311086, "grad_norm": 0.296875, "learning_rate": 8.506946608651972e-05, "loss": 2.755118179321289, "step": 36630 }, { "epoch": 1.390150789721872, "grad_norm": 0.296875, "learning_rate": 8.497193855463075e-05, "loss": 2.782937240600586, "step": 36640 }, { "epoch": 1.3905302031126352, "grad_norm": 0.296875, "learning_rate": 8.487445187834133e-05, "loss": 2.7663362503051756, "step": 36650 }, { "epoch": 1.3909096165033985, "grad_norm": 0.30078125, "learning_rate": 8.477700609227688e-05, "loss": 2.771579933166504, "step": 36660 }, { "epoch": 1.3912890298941618, "grad_norm": 0.29296875, "learning_rate": 8.467960123104801e-05, "loss": 2.740419387817383, "step": 36670 }, { "epoch": 1.391668443284925, "grad_norm": 0.2890625, "learning_rate": 8.458223732925096e-05, "loss": 2.762233352661133, "step": 36680 }, { "epoch": 1.3920478566756882, "grad_norm": 0.294921875, "learning_rate": 8.448491442146734e-05, "loss": 2.7526973724365233, "step": 36690 }, { "epoch": 1.3924272700664515, "grad_norm": 0.298828125, "learning_rate": 8.438763254226431e-05, "loss": 2.7609926223754884, "step": 36700 }, { "epoch": 1.3928066834572148, "grad_norm": 0.29296875, "learning_rate": 8.429039172619439e-05, "loss": 2.7887332916259764, "step": 36710 }, { "epoch": 1.3931860968479781, "grad_norm": 0.30078125, "learning_rate": 8.41931920077955e-05, "loss": 2.7436391830444338, "step": 36720 }, { "epoch": 1.3935655102387414, "grad_norm": 0.294921875, "learning_rate": 8.40960334215909e-05, "loss": 2.753631591796875, "step": 36730 }, { "epoch": 1.3939449236295047, "grad_norm": 0.302734375, "learning_rate": 8.399891600208942e-05, "loss": 2.7711496353149414, "step": 36740 }, { "epoch": 1.3943243370202678, "grad_norm": 0.2890625, "learning_rate": 8.390183978378516e-05, "loss": 2.7762868881225584, "step": 36750 }, { "epoch": 1.3943243370202678, "eval_loss": 2.775576591491699, "eval_runtime": 190.5247, "eval_samples_per_second": 19.992, "eval_steps_per_second": 3.333, "step": 36750 }, { "epoch": 1.3947037504110311, "grad_norm": 0.30078125, "learning_rate": 8.38048048011576e-05, "loss": 2.767830657958984, "step": 36760 }, { "epoch": 1.3950831638017944, "grad_norm": 0.294921875, "learning_rate": 8.370781108867142e-05, "loss": 2.7593416213989257, "step": 36770 }, { "epoch": 1.3954625771925577, "grad_norm": 0.3046875, "learning_rate": 8.361085868077703e-05, "loss": 2.756595802307129, "step": 36780 }, { "epoch": 1.395841990583321, "grad_norm": 0.29296875, "learning_rate": 8.351394761190978e-05, "loss": 2.7926435470581055, "step": 36790 }, { "epoch": 1.3962214039740843, "grad_norm": 0.310546875, "learning_rate": 8.341707791649051e-05, "loss": 2.7904485702514648, "step": 36800 }, { "epoch": 1.3966008173648476, "grad_norm": 0.296875, "learning_rate": 8.332024962892537e-05, "loss": 2.7680206298828125, "step": 36810 }, { "epoch": 1.3969802307556107, "grad_norm": 0.294921875, "learning_rate": 8.32234627836057e-05, "loss": 2.77176513671875, "step": 36820 }, { "epoch": 1.397359644146374, "grad_norm": 0.29296875, "learning_rate": 8.312671741490827e-05, "loss": 2.762699317932129, "step": 36830 }, { "epoch": 1.3977390575371373, "grad_norm": 0.296875, "learning_rate": 8.303001355719497e-05, "loss": 2.7620571136474608, "step": 36840 }, { "epoch": 1.3981184709279006, "grad_norm": 0.310546875, "learning_rate": 8.293335124481322e-05, "loss": 2.7768869400024414, "step": 36850 }, { "epoch": 1.398497884318664, "grad_norm": 0.298828125, "learning_rate": 8.28367305120952e-05, "loss": 2.7857362747192385, "step": 36860 }, { "epoch": 1.398877297709427, "grad_norm": 0.298828125, "learning_rate": 8.27401513933588e-05, "loss": 2.750942611694336, "step": 36870 }, { "epoch": 1.3992567111001906, "grad_norm": 0.29296875, "learning_rate": 8.26436139229069e-05, "loss": 2.775135803222656, "step": 36880 }, { "epoch": 1.3996361244909536, "grad_norm": 0.298828125, "learning_rate": 8.254711813502755e-05, "loss": 2.8043174743652344, "step": 36890 }, { "epoch": 1.400015537881717, "grad_norm": 0.294921875, "learning_rate": 8.245066406399409e-05, "loss": 2.772347640991211, "step": 36900 }, { "epoch": 1.4003949512724803, "grad_norm": 0.296875, "learning_rate": 8.235425174406508e-05, "loss": 2.752819061279297, "step": 36910 }, { "epoch": 1.4007743646632436, "grad_norm": 0.29296875, "learning_rate": 8.225788120948419e-05, "loss": 2.8008647918701173, "step": 36920 }, { "epoch": 1.4011537780540069, "grad_norm": 0.298828125, "learning_rate": 8.216155249448018e-05, "loss": 2.7414743423461916, "step": 36930 }, { "epoch": 1.40153319144477, "grad_norm": 0.3046875, "learning_rate": 8.206526563326705e-05, "loss": 2.7860124588012694, "step": 36940 }, { "epoch": 1.4019126048355333, "grad_norm": 0.29296875, "learning_rate": 8.196902066004395e-05, "loss": 2.784760093688965, "step": 36950 }, { "epoch": 1.4022920182262966, "grad_norm": 0.306640625, "learning_rate": 8.18728176089951e-05, "loss": 2.7735265731811523, "step": 36960 }, { "epoch": 1.4026714316170599, "grad_norm": 0.30078125, "learning_rate": 8.177665651428983e-05, "loss": 2.759780502319336, "step": 36970 }, { "epoch": 1.4030508450078232, "grad_norm": 0.2890625, "learning_rate": 8.16805374100825e-05, "loss": 2.7622194290161133, "step": 36980 }, { "epoch": 1.4034302583985865, "grad_norm": 0.29296875, "learning_rate": 8.158446033051288e-05, "loss": 2.7730789184570312, "step": 36990 }, { "epoch": 1.4038096717893498, "grad_norm": 0.294921875, "learning_rate": 8.148842530970529e-05, "loss": 2.7752216339111326, "step": 37000 }, { "epoch": 1.4038096717893498, "eval_loss": 2.7753376960754395, "eval_runtime": 190.5832, "eval_samples_per_second": 19.986, "eval_steps_per_second": 3.332, "step": 37000 }, { "epoch": 1.4041890851801129, "grad_norm": 0.296875, "learning_rate": 8.139243238176954e-05, "loss": 2.7655725479125977, "step": 37010 }, { "epoch": 1.4045684985708762, "grad_norm": 0.298828125, "learning_rate": 8.129648158080028e-05, "loss": 2.7884757995605467, "step": 37020 }, { "epoch": 1.4049479119616395, "grad_norm": 0.291015625, "learning_rate": 8.120057294087742e-05, "loss": 2.785819244384766, "step": 37030 }, { "epoch": 1.4053273253524028, "grad_norm": 0.2890625, "learning_rate": 8.110470649606547e-05, "loss": 2.743495750427246, "step": 37040 }, { "epoch": 1.405706738743166, "grad_norm": 0.294921875, "learning_rate": 8.100888228041444e-05, "loss": 2.757626533508301, "step": 37050 }, { "epoch": 1.4060861521339292, "grad_norm": 0.291015625, "learning_rate": 8.091310032795896e-05, "loss": 2.7530832290649414, "step": 37060 }, { "epoch": 1.4064655655246927, "grad_norm": 0.294921875, "learning_rate": 8.081736067271901e-05, "loss": 2.769978904724121, "step": 37070 }, { "epoch": 1.4068449789154558, "grad_norm": 0.306640625, "learning_rate": 8.07216633486991e-05, "loss": 2.7554670333862306, "step": 37080 }, { "epoch": 1.407224392306219, "grad_norm": 0.30859375, "learning_rate": 8.062600838988912e-05, "loss": 2.7453908920288086, "step": 37090 }, { "epoch": 1.4076038056969824, "grad_norm": 0.30078125, "learning_rate": 8.053039583026371e-05, "loss": 2.7543792724609375, "step": 37100 }, { "epoch": 1.4079832190877457, "grad_norm": 0.296875, "learning_rate": 8.043482570378247e-05, "loss": 2.741659927368164, "step": 37110 }, { "epoch": 1.408362632478509, "grad_norm": 0.291015625, "learning_rate": 8.033929804438985e-05, "loss": 2.7713333129882813, "step": 37120 }, { "epoch": 1.408742045869272, "grad_norm": 0.296875, "learning_rate": 8.024381288601546e-05, "loss": 2.7904306411743165, "step": 37130 }, { "epoch": 1.4091214592600354, "grad_norm": 0.291015625, "learning_rate": 8.01483702625736e-05, "loss": 2.7615785598754883, "step": 37140 }, { "epoch": 1.4095008726507987, "grad_norm": 0.29296875, "learning_rate": 8.005297020796354e-05, "loss": 2.7521024703979493, "step": 37150 }, { "epoch": 1.409880286041562, "grad_norm": 0.296875, "learning_rate": 7.99576127560693e-05, "loss": 2.772168731689453, "step": 37160 }, { "epoch": 1.4102596994323253, "grad_norm": 0.298828125, "learning_rate": 7.986229794076008e-05, "loss": 2.7735790252685546, "step": 37170 }, { "epoch": 1.4106391128230886, "grad_norm": 0.29296875, "learning_rate": 7.976702579588962e-05, "loss": 2.775068473815918, "step": 37180 }, { "epoch": 1.411018526213852, "grad_norm": 0.294921875, "learning_rate": 7.967179635529666e-05, "loss": 2.7705501556396483, "step": 37190 }, { "epoch": 1.411397939604615, "grad_norm": 0.291015625, "learning_rate": 7.957660965280465e-05, "loss": 2.777090072631836, "step": 37200 }, { "epoch": 1.4117773529953783, "grad_norm": 0.298828125, "learning_rate": 7.948146572222217e-05, "loss": 2.768603515625, "step": 37210 }, { "epoch": 1.4121567663861416, "grad_norm": 0.298828125, "learning_rate": 7.938636459734208e-05, "loss": 2.7482099533081055, "step": 37220 }, { "epoch": 1.412536179776905, "grad_norm": 0.294921875, "learning_rate": 7.929130631194256e-05, "loss": 2.771947479248047, "step": 37230 }, { "epoch": 1.4129155931676682, "grad_norm": 0.29296875, "learning_rate": 7.919629089978627e-05, "loss": 2.7610151290893556, "step": 37240 }, { "epoch": 1.4132950065584315, "grad_norm": 0.287109375, "learning_rate": 7.910131839462078e-05, "loss": 2.7754884719848634, "step": 37250 }, { "epoch": 1.4132950065584315, "eval_loss": 2.7750306129455566, "eval_runtime": 190.391, "eval_samples_per_second": 20.006, "eval_steps_per_second": 3.335, "step": 37250 }, { "epoch": 1.4136744199491948, "grad_norm": 0.287109375, "learning_rate": 7.900638883017836e-05, "loss": 2.79837589263916, "step": 37260 }, { "epoch": 1.414053833339958, "grad_norm": 0.296875, "learning_rate": 7.891150224017603e-05, "loss": 2.7608034133911135, "step": 37270 }, { "epoch": 1.4144332467307212, "grad_norm": 0.2890625, "learning_rate": 7.881665865831547e-05, "loss": 2.768375205993652, "step": 37280 }, { "epoch": 1.4148126601214845, "grad_norm": 0.296875, "learning_rate": 7.87218581182833e-05, "loss": 2.765009307861328, "step": 37290 }, { "epoch": 1.4151920735122479, "grad_norm": 0.30078125, "learning_rate": 7.862710065375065e-05, "loss": 2.750996780395508, "step": 37300 }, { "epoch": 1.4155714869030112, "grad_norm": 0.2890625, "learning_rate": 7.853238629837346e-05, "loss": 2.762058448791504, "step": 37310 }, { "epoch": 1.4159509002937742, "grad_norm": 0.291015625, "learning_rate": 7.843771508579228e-05, "loss": 2.777128219604492, "step": 37320 }, { "epoch": 1.4163303136845378, "grad_norm": 0.30078125, "learning_rate": 7.834308704963232e-05, "loss": 2.772600555419922, "step": 37330 }, { "epoch": 1.4167097270753009, "grad_norm": 0.296875, "learning_rate": 7.824850222350364e-05, "loss": 2.775465202331543, "step": 37340 }, { "epoch": 1.4170891404660642, "grad_norm": 0.294921875, "learning_rate": 7.815396064100078e-05, "loss": 2.746545600891113, "step": 37350 }, { "epoch": 1.4174685538568275, "grad_norm": 0.298828125, "learning_rate": 7.805946233570292e-05, "loss": 2.7565155029296875, "step": 37360 }, { "epoch": 1.4178479672475908, "grad_norm": 0.3125, "learning_rate": 7.79650073411739e-05, "loss": 2.785464286804199, "step": 37370 }, { "epoch": 1.418227380638354, "grad_norm": 0.298828125, "learning_rate": 7.787059569096227e-05, "loss": 2.7866357803344726, "step": 37380 }, { "epoch": 1.4186067940291172, "grad_norm": 0.294921875, "learning_rate": 7.777622741860106e-05, "loss": 2.8032756805419923, "step": 37390 }, { "epoch": 1.4189862074198805, "grad_norm": 0.296875, "learning_rate": 7.768190255760796e-05, "loss": 2.7895175933837892, "step": 37400 }, { "epoch": 1.4193656208106438, "grad_norm": 0.294921875, "learning_rate": 7.758762114148512e-05, "loss": 2.7710649490356447, "step": 37410 }, { "epoch": 1.419745034201407, "grad_norm": 0.291015625, "learning_rate": 7.749338320371953e-05, "loss": 2.7826330184936525, "step": 37420 }, { "epoch": 1.4201244475921704, "grad_norm": 0.29296875, "learning_rate": 7.739918877778245e-05, "loss": 2.7946481704711914, "step": 37430 }, { "epoch": 1.4205038609829337, "grad_norm": 0.296875, "learning_rate": 7.730503789712984e-05, "loss": 2.8006567001342773, "step": 37440 }, { "epoch": 1.420883274373697, "grad_norm": 0.2890625, "learning_rate": 7.721093059520205e-05, "loss": 2.7798179626464843, "step": 37450 }, { "epoch": 1.42126268776446, "grad_norm": 0.2890625, "learning_rate": 7.711686690542423e-05, "loss": 2.743752288818359, "step": 37460 }, { "epoch": 1.4216421011552234, "grad_norm": 0.30078125, "learning_rate": 7.702284686120578e-05, "loss": 2.749449920654297, "step": 37470 }, { "epoch": 1.4220215145459867, "grad_norm": 0.306640625, "learning_rate": 7.692887049594069e-05, "loss": 2.758524513244629, "step": 37480 }, { "epoch": 1.42240092793675, "grad_norm": 0.291015625, "learning_rate": 7.683493784300734e-05, "loss": 2.805738830566406, "step": 37490 }, { "epoch": 1.4227803413275133, "grad_norm": 0.296875, "learning_rate": 7.674104893576892e-05, "loss": 2.7887605667114257, "step": 37500 }, { "epoch": 1.4227803413275133, "eval_loss": 2.774888515472412, "eval_runtime": 190.3923, "eval_samples_per_second": 20.006, "eval_steps_per_second": 3.335, "step": 37500 }, { "epoch": 1.4231597547182766, "grad_norm": 0.291015625, "learning_rate": 7.664720380757253e-05, "loss": 2.7928489685058593, "step": 37510 }, { "epoch": 1.42353916810904, "grad_norm": 0.296875, "learning_rate": 7.655340249175025e-05, "loss": 2.7819091796875, "step": 37520 }, { "epoch": 1.423918581499803, "grad_norm": 0.291015625, "learning_rate": 7.645964502161824e-05, "loss": 2.74365291595459, "step": 37530 }, { "epoch": 1.4242979948905663, "grad_norm": 0.294921875, "learning_rate": 7.636593143047742e-05, "loss": 2.734737014770508, "step": 37540 }, { "epoch": 1.4246774082813296, "grad_norm": 0.296875, "learning_rate": 7.627226175161266e-05, "loss": 2.7310956954956054, "step": 37550 }, { "epoch": 1.425056821672093, "grad_norm": 0.291015625, "learning_rate": 7.61786360182937e-05, "loss": 2.7715892791748047, "step": 37560 }, { "epoch": 1.4254362350628562, "grad_norm": 0.296875, "learning_rate": 7.608505426377434e-05, "loss": 2.7884363174438476, "step": 37570 }, { "epoch": 1.4258156484536193, "grad_norm": 0.296875, "learning_rate": 7.59915165212931e-05, "loss": 2.7764129638671875, "step": 37580 }, { "epoch": 1.4261950618443828, "grad_norm": 0.298828125, "learning_rate": 7.58980228240724e-05, "loss": 2.745808410644531, "step": 37590 }, { "epoch": 1.426574475235146, "grad_norm": 0.294921875, "learning_rate": 7.580457320531947e-05, "loss": 2.7599159240722657, "step": 37600 }, { "epoch": 1.4269538886259092, "grad_norm": 0.296875, "learning_rate": 7.571116769822562e-05, "loss": 2.802289581298828, "step": 37610 }, { "epoch": 1.4273333020166725, "grad_norm": 0.291015625, "learning_rate": 7.561780633596653e-05, "loss": 2.75314884185791, "step": 37620 }, { "epoch": 1.4277127154074358, "grad_norm": 0.30078125, "learning_rate": 7.552448915170225e-05, "loss": 2.7620141983032225, "step": 37630 }, { "epoch": 1.4280921287981991, "grad_norm": 0.30078125, "learning_rate": 7.543121617857719e-05, "loss": 2.7821584701538087, "step": 37640 }, { "epoch": 1.4284715421889622, "grad_norm": 0.29296875, "learning_rate": 7.533798744971993e-05, "loss": 2.7629297256469725, "step": 37650 }, { "epoch": 1.4288509555797255, "grad_norm": 0.29296875, "learning_rate": 7.524480299824342e-05, "loss": 2.794757080078125, "step": 37660 }, { "epoch": 1.4292303689704888, "grad_norm": 0.294921875, "learning_rate": 7.515166285724477e-05, "loss": 2.8000741958618165, "step": 37670 }, { "epoch": 1.4296097823612521, "grad_norm": 0.29296875, "learning_rate": 7.50585670598056e-05, "loss": 2.7333065032958985, "step": 37680 }, { "epoch": 1.4299891957520154, "grad_norm": 0.30078125, "learning_rate": 7.496551563899152e-05, "loss": 2.7673456192016603, "step": 37690 }, { "epoch": 1.4303686091427787, "grad_norm": 0.291015625, "learning_rate": 7.48725086278525e-05, "loss": 2.7632572174072267, "step": 37700 }, { "epoch": 1.430748022533542, "grad_norm": 0.296875, "learning_rate": 7.477954605942268e-05, "loss": 2.7735776901245117, "step": 37710 }, { "epoch": 1.4311274359243051, "grad_norm": 0.296875, "learning_rate": 7.468662796672058e-05, "loss": 2.7649667739868162, "step": 37720 }, { "epoch": 1.4315068493150684, "grad_norm": 0.29296875, "learning_rate": 7.459375438274858e-05, "loss": 2.7684144973754883, "step": 37730 }, { "epoch": 1.4318862627058317, "grad_norm": 0.2890625, "learning_rate": 7.450092534049365e-05, "loss": 2.792466735839844, "step": 37740 }, { "epoch": 1.432265676096595, "grad_norm": 0.302734375, "learning_rate": 7.44081408729266e-05, "loss": 2.7655805587768554, "step": 37750 }, { "epoch": 1.432265676096595, "eval_loss": 2.774787425994873, "eval_runtime": 191.6262, "eval_samples_per_second": 19.877, "eval_steps_per_second": 3.314, "step": 37750 } ], "logging_steps": 10, "max_steps": 52714, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.157848355581198e+18, "train_batch_size": 6, "trial_name": null, "trial_params": null }